Example #1
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I)
                        will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between training
                        iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env",
                                         "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor",
                                              "Must provide task horizon.")
        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(
                DotMap(
                    env=self.env,
                    noisy_actions=True,
                    noise_stddev=get_required_argument(
                        params.sim_cfg, "noise_std",
                        "Must provide noise standard deviation in the case of a stochastic environment."
                    )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters",
            "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy",
                                            "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir",
                                  "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H_%M_%S",
                     localtime())  # Compatiable format in windows
        )
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment.
        """
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []

        # Perform initial rollouts
        samples = []
        for i in range(self.ninit_rollouts):
            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])

        if self.ninit_rollouts > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):
            print(
                "####################################################################"
            )
            print("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"),
                                   os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))
            for j in range(
                    max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(self.agent.sample(self.task_hor, self.policy))
            print("Rewards obtained:",
                  [sample["reward_sum"] for sample in samples[:self.neval]])
            traj_obs.extend([
                sample["obs"] for sample in samples[:self.nrollouts_per_iter]
            ])
            traj_acs.extend(
                [sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend(
                [sample["reward_sum"] for sample in samples[:self.neval]])
            traj_rews.extend([
                sample["rewards"]
                for sample in samples[:self.nrollouts_per_iter]
            ])
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"), {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
Example #2
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I)
                        will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between training
                        iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.
                    .demo_low_cost (int): Minimum allowed cost for demonstrations
                    .demo_high_cost (int): Maximum allowed cost for demonstrations
                    .num_demos (int): Number of demonstrations
                    .ss_buffer_size (int): Size of buffer of safe states that density model is
                        trained on
                    .gym_robotics (bool): Indicates whether env is a gym robotics env, in which
                        case there are some small differences in data loading and environment
                        parameters
                    .value: Value function that will be trained.
                    .value_target: Target network for the value function.
                    .load_samples (bool): (optional) If True, demonstrations are loaded from
                        demo_load_path; otherwise they are collected from the teacher.
                        Defaults to True.
                    .demo_load_path (str): (optional) Path that demonstrations are loaded from.
                        Defaults to None.
                    .use_value (bool): (optional) If True, a value function is also trained.
                        Defaults to True.
                    .teacher: (optional) Teacher used to generate demonstrations when
                        load_samples is False.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env",
                                         "Must provide environment.")
        self.demo_low_cost = params.exp_cfg.demo_low_cost
        self.demo_high_cost = params.exp_cfg.demo_high_cost
        self.num_demos = params.exp_cfg.num_demos
        self.ss_buffer_size = params.exp_cfg.ss_buffer_size
        self.gym_robotics = params.exp_cfg.gym_robotics

        self.task_hor = get_required_argument(params.sim_cfg, "task_hor",
                                              "Must provide task horizon.")
        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(
                DotMap(
                    env=self.env,
                    noisy_actions=True,
                    noise_stddev=get_required_argument(
                        params.sim_cfg, "noise_std",
                        "Must provide noise standard deviation in the case of a stochastic environment."
                    )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters",
            "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy",
                                            "Must provide a policy.")
        self.value = get_required_argument(params.exp_cfg, "value",
                                           "Must provide a value function.")
        self.target = get_required_argument(params.exp_cfg, "value_target",
                                            "Must provide a value function.")
        self.value.target = self.target

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir",
                                  "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)
        self.load_samples = params.exp_cfg.get("load_samples", True)
        self.demo_load_path = params.exp_cfg.get("demo_load_path", None)
        self.use_value = params.exp_cfg.get("use_value", True)
        self.teacher = params.exp_cfg.get("teacher")
        self.stabilizable_observations = []
        self.tvalue_schedule = LinearSchedule(3, 3, 500)
        self.stabilized_model = knn(n_neighbors=1)
        self.target_update_freq = 1

    def run_experiment(self):
        """Perform experiment.
        """
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_costs = [], [], [], []

        # Perform initial rollouts
        samples = []
        if self.load_samples:
            if not self.gym_robotics:
                samples = load_teacher_samples_gym(self.demo_load_path,
                                                   self.env,
                                                   self.demo_low_cost,
                                                   self.demo_high_cost,
                                                   self.num_demos)
            else:
                samples = load_teacher_samples_gym_robotics(
                    self.demo_load_path, self.env, self.demo_low_cost,
                    self.demo_high_cost, self.num_demos)

            for i in range(len(samples)):
                traj_obs.append(samples[i]["obs"])
                traj_acs.append(samples[i]["ac"])
                traj_costs.append(samples[i]["costs"])
                self.stabilizable_observations.extend(
                    samples[i]["stabilizable_obs"])
        else:
            for i in range(self.num_demos):
                s = self.teacher.get_rollout()
                samples.append(s)
                traj_obs.append(samples[-1]["obs"])
                traj_acs.append(samples[-1]["ac"])
                traj_costs.append(samples[-1]["costs"])
                self.stabilizable_observations.extend(
                    samples[-1]["stabilizable_obs"])

        # Fit density model to demonstrations
        if self.stabilized_model is not None:
            self.stabilized_model.fit(np.array(self.stabilizable_observations))
        self.policy.set_stabilized_model(self.stabilized_model)

        if self.num_demos > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["costs"] for sample in samples],
                              np.array(self.stabilizable_observations))
            if self.use_value:
                # Train value function using teacher rollouts
                self.value.train([sample["obs"][:-1] for sample in samples],
                                 [sample["costs"] for sample in samples],
                                 [sample["obs"][1:] for sample in samples],
                                 [sample["values"] for sample in samples],
                                 use_TD=False,
                                 terminal_states=[
                                     sample["obs"][-1] for sample in samples
                                 ],
                                 copy_target=True)

        demo_samples = deepcopy(samples)

        # Training loop
        for i in range(self.ntrain_iters):
            print(
                "####################################################################"
            )
            print("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"),
                                   os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))
            for j in range(
                    max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(self.agent.sample(self.task_hor, self.policy))
            print("Costs obtained:",
                  [sample["cost_sum"] for sample in samples[:self.neval]])
            traj_obs.extend([
                sample["obs"] for sample in samples[:self.nrollouts_per_iter]
            ])
            traj_acs.extend(
                [sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend(
                [sample["cost_sum"] for sample in samples[:self.neval]])
            traj_costs.extend([
                sample["costs"] for sample in samples[:self.nrollouts_per_iter]
            ])

            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            if self.use_value:
                self.value.dump_logs(self.logdir, iter_dir)

            savemat(
                os.path.join(self.logdir, "logs.mat"), {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "costs": traj_costs
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["costs"] for sample in samples],
                                  np.array(self.stabilizable_observations))

                if self.gym_robotics:
                    current_stabilizable_obs = np.array([
                        sample["stabilizable_obs"] for sample in samples
                    ]).reshape((-1, self.env.observation_space.
                                spaces['observation'].low.size))
                else:
                    current_stabilizable_obs = np.array([
                        sample["stabilizable_obs"] for sample in samples
                    ]).reshape((-1, self.env.observation_space.shape[0]))
                if self.use_value:
                    copy_target = i % self.target_update_freq == 0
                    # Train value function using the rollouts from this iteration
                    self.value.train(
                        [sample["obs"][:-1] for sample in samples],
                        [sample["costs"] for sample in samples],
                        [sample["obs"][1:] for sample in samples],
                        [sample["values"] for sample in samples],
                        use_TD=True,
                        terminal_states=[
                            sample["obs"][-1] for sample in samples
                        ],
                        copy_target=copy_target)

                if len(current_stabilizable_obs):
                    current_stabilizable_obs = [
                        c for c in current_stabilizable_obs
                    ]
                    self.stabilizable_observations.extend(
                        current_stabilizable_obs)
                    self.stabilizable_observations = self.stabilizable_observations[
                        -self.ss_buffer_size:]

                if self.stabilized_model is not None:
                    self.stabilized_model.fit(
                        np.array(self.stabilizable_observations))
                    self.policy.set_stabilized_model(self.stabilized_model)
                    pickle.dump(
                        self.stabilized_model,
                        open(os.path.join(self.logdir, "stabilized_model.pkl"),
                             "wb"))
Example #3
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I)
                        will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between training
                        iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env",
                                         "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor",
                                              "Must provide task horizon.")
        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(
                DotMap(
                    env=self.env,
                    noisy_actions=True,
                    noise_stddev=get_required_argument(
                        params.sim_cfg, "noise_std",
                        "Must provide noise standard deviation in the case of a stochastic environment."
                    )))
        else:
            self.agent = Agent(DotMap(env=self.env, noisy_actions=False))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters",
            "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy",
                                            "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir",
                                  "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment.
        """
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews, traj_cost = [], [], [], [], []

        # Perform initial rollouts
        # uses policy.act() to come up with action, should be uniform random
        samples = []
        print(f"Acting randomly for {self.ninit_rollouts} rollouts")
        for i in range(self.ninit_rollouts):

            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            traj_cost.append(samples[-1]["cost"])
        print(f"{len(traj_rews)} Number of initial samples")

        # jsw: "Initialize data D with a random controller for one trial"
        if self.ninit_rollouts > 0:
            print("Training on random actions")
            # jsw this trains the NN model for the very first time
            # policy is of type Controller, which MPC inherits from
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])
        # Plot the train/validation curves
        self.policy.model.plot_train_val(self.logdir)

        # Training loop:
        # jsw: "for Trial k = 1 to K do:"
        for i in range(self.ntrain_iters):
            print(
                "####################################################################"
            )
            print("Starting training iteration %d." % (i + 1))

            # jsw note that NN model trained above, initialized with action from a random policy
            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            for j in range(self.nrecord):
                samples.append(
                    #####################
                    # This call does a lot! uses policy.act() to come up with action
                    # policy.act() solves open loop finite time problem
                    # Uses environment to actually act.
                    #####################
                    self.agent.sample(
                        self.task_hor, self.policy,
                        os.path.join(iter_dir, "rollout%d.mp4" % j)))
            if self.nrecord > 0:
                for item in filter(lambda f: f.endswith(".json"),
                                   os.listdir(iter_dir)):
                    os.remove(os.path.join(iter_dir, item))

            # jsw: actually executing action from optimal actions, log it
            # sample() calls Agent.py's sample which gets the best action from the policy, and
            # uses the environment to see what happens when you use that action. it repeats
            # this for the entire horizon. Actually exploring the true environment
            for j in range(
                    max(self.neval, self.nrollouts_per_iter) - self.nrecord):
                samples.append(
                    # jsw for time t = 0 to task horizon
                    self.agent.sample(self.task_hor, self.policy))
            print("Rewards obtained:",
                  [sample["reward_sum"] for sample in samples[:self.neval]])

            traj_obs.extend([
                sample["obs"] for sample in samples[:self.nrollouts_per_iter]
            ])
            traj_acs.extend(
                [sample["ac"] for sample in samples[:self.nrollouts_per_iter]])
            traj_rets.extend(
                [sample["reward_sum"] for sample in samples[:self.neval]])
            traj_rews.extend([
                sample["rewards"]
                for sample in samples[:self.nrollouts_per_iter]
            ])
            traj_cost.extend([
                sample["cost"] for sample in samples[:self.nrollouts_per_iter]
            ])
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"), {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "cost": traj_cost,
                })

            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
Example #4
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I)
                        will be added.
                    .delay_hor (int): (optional) Delay horizon passed to agent.sample().
                        Defaults to 0.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between training
                        iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env",
                                         "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor",
                                              "Must provide task horizon.")
        self._params = params
        params.sim_cfg.misc = copy.copy(params)
        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(
                DotMap(
                    env=self.env,
                    noisy_actions=True,
                    noise_stddev=get_required_argument(
                        params.sim_cfg, "noise_std",
                        "Must provide noise standard deviation in the case of a stochastic environment."
                    ),
                    params=params))
        else:
            self.agent = Agent(
                DotMap(env=self.env, noisy_actions=False, params=params))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters",
            "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy",
                                            "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir",
                                  "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H-%M-%S", localtime()))
        logger.set_file_handler(path=self.logdir)
        logger.info('Starting the experiments')
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)
        self.delay_hor = params.sim_cfg.get("delay_hor", 0)

    def run_experiment(self):
        """Perform experiment.
        """
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_traj_obs, test_traj_acs, test_traj_rets = [], [], []
        episode_iter_id = []

        # Perform initial rollouts
        samples = []
        needed_num_steps = self.ninit_rollouts * self.task_hor
        finished_num_steps = 0
        """
        # TODO DEBUG
        needed_num_steps = 64
        self.task_hor = 64
        """
        while True:
            samples.append(
                self.agent.sample(self.task_hor, self.policy, self.delay_hor))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            finished_num_steps += len(samples[-1]["ac"])
            print(finished_num_steps)

            if finished_num_steps >= needed_num_steps:
                break

        if self.ninit_rollouts > 0:
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):

            logger.info(
                "####################################################################"
            )
            logger.info("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            assert self.nrecord == 0

            needed_num_steps = self.task_hor * \
                (max(self.neval, self.nrollouts_per_iter) - self.nrecord)
            finished_num_steps = 0
            while True:
                samples.append(
                    self.agent.sample(self.task_hor, self.policy,
                                      self.delay_hor))
                finished_num_steps += len(samples[-1]["ac"])

                if finished_num_steps >= needed_num_steps:
                    break
            logger.info("Rewards obtained: {}".format(
                [sample["reward_sum"] for sample in samples[:self.neval]]))
            # test the policy if needed
            if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0:
                test_data = []
                for _ in range(5):
                    test_data.append(
                        self.agent.sample(self.task_hor,
                                          self.policy,
                                          test_policy=True,
                                          average=False))
                test_traj_rets.extend([
                    np.mean([
                        i_test_data["reward_sum"] for i_test_data in test_data
                    ])
                ])
                test_traj_obs.extend(
                    [i_test_data["obs"] for i_test_data in test_data])
                test_traj_acs.extend(
                    [i_test_data["ac"] for i_test_data in test_data])

            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])
            episode_iter_id.extend([i] * len(samples))
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"), {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "test_returns": test_traj_rets,
                    "test_obs": test_traj_obs,
                    "test_acs": test_traj_acs,
                    'episode_iter_id': episode_iter_id
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])
Example #5
class MBExperiment:
    def __init__(self, params):
        """Initializes class instance.

        Argument:
            params (DotMap): A DotMap containing the following:
                .sim_cfg:
                    .env (gym.env): Environment for this experiment
                    .task_hor (int): Task horizon
                    .stochastic (bool): (optional) If True, agent adds noise to its actions.
                        Must provide noise_std (see below). Defaults to False.
                    .noise_std (float): for stochastic agents, noise of the form N(0, noise_std^2I)
                        will be added.

                .exp_cfg:
                    .ntrain_iters (int): Number of training iterations to be performed.
                    .nrollouts_per_iter (int): (optional) Number of rollouts done between training
                        iterations. Defaults to 1.
                    .ninit_rollouts (int): (optional) Number of initial rollouts. Defaults to 1.
                    .policy (controller): Policy that will be trained.

                .log_cfg:
                    .logdir (str): Parent of directory path where experiment data will be saved.
                        Experiment will be saved in logdir/<date+time of experiment start>
                    .nrecord (int): (optional) Number of rollouts to record for every iteration.
                        Defaults to 0.
                    .neval (int): (optional) Number of rollouts for performance evaluation.
                        Defaults to 1.
        """
        self.env = get_required_argument(params.sim_cfg, "env",
                                         "Must provide environment.")
        self.task_hor = get_required_argument(params.sim_cfg, "task_hor",
                                              "Must provide task horizon.")
        self._params = params
        # params.sim_cfg.misc = copy.copy(params)
        if params.sim_cfg.get("stochastic", False):
            self.agent = Agent(
                DotMap(
                    env=self.env,
                    noisy_actions=True,
                    noise_stddev=get_required_argument(
                        params.sim_cfg, "noise_std",
                        "Must provide noise standard deviation in the case of a stochastic environment."
                    ),
                    params=params))
        else:
            self.agent = Agent(
                DotMap(env=self.env, noisy_actions=False, params=params))

        self.ntrain_iters = get_required_argument(
            params.exp_cfg, "ntrain_iters",
            "Must provide number of training iterations.")
        self.nrollouts_per_iter = params.exp_cfg.get("nrollouts_per_iter", 1)
        self.ninit_rollouts = params.exp_cfg.get("ninit_rollouts", 1)
        self.policy = get_required_argument(params.exp_cfg, "policy",
                                            "Must provide a policy.")

        self.logdir = os.path.join(
            get_required_argument(params.log_cfg, "logdir",
                                  "Must provide log parent directory."),
            strftime("%Y-%m-%d--%H:%M:%S", localtime()))
        logger.set_file_handler(path=self.logdir)
        logger.info('Starting the experiments')
        self.nrecord = params.log_cfg.get("nrecord", 0)
        self.neval = params.log_cfg.get("neval", 1)

    def run_experiment(self):
        """Perform experiment.
        """
        os.makedirs(self.logdir, exist_ok=True)

        traj_obs, traj_acs, traj_rets, traj_rews = [], [], [], []
        test_traj_obs, test_traj_acs, test_traj_rets = [], [], []
        episode_iter_id = []

        # Perform initial rollouts
        samples = []
        needed_num_steps = self.ninit_rollouts * self.task_hor
        finished_num_steps = 0
        """
        # TODO DEBUG
        needed_num_steps = 64
        self.task_hor = 64
        """

        # logger.info("Collecting n_init rollout before policy trainning")
        while True:
            samples.append(self.agent.sample(self.task_hor, self.policy))
            traj_obs.append(samples[-1]["obs"])
            traj_acs.append(samples[-1]["ac"])
            traj_rews.append(samples[-1]["rewards"])
            finished_num_steps += len(samples[-1]["ac"])

            if finished_num_steps >= needed_num_steps:
                break

        if self.ninit_rollouts > 0:
            # logger.info("Performing init policy trianing")
            self.policy.train([sample["obs"] for sample in samples],
                              [sample["ac"] for sample in samples],
                              [sample["rewards"] for sample in samples])

        # Training loop
        for i in range(self.ntrain_iters):

            logger.info(
                "####################################################################"
            )
            logger.info("Starting training iteration %d." % (i + 1))

            iter_dir = os.path.join(self.logdir, "train_iter%d" % (i + 1))
            os.makedirs(iter_dir, exist_ok=True)

            samples = []
            assert self.nrecord == 0

            needed_num_steps = self.task_hor * \
                (max(self.neval, self.nrollouts_per_iter) - self.nrecord)
            finished_num_steps = 0
            while True:
                samples.append(self.agent.sample(self.task_hor, self.policy))
                finished_num_steps += len(samples[-1]["ac"])

                if finished_num_steps >= needed_num_steps:
                    break
            logger.info("Rewards obtained: {}".format(
                [sample["reward_sum"] for sample in samples[:self.neval]]))
            # test the policy if needed

            # Commented out by ShenShuo:
            # passing the whole config through misc is much too messy, so this block is
            # disabled for now. If policy testing is needed, find a cleaner way to pass
            # the test_policy argument.

            # if self._params.misc.ctrl_cfg.cem_cfg.test_policy > 0:
            #     test_data = []
            #     for _ in range(5):
            #         test_data.append(
            #             self.agent.sample(self.task_hor, self.policy,
            #                               test_policy=True, average=False)
            #         )
            #     test_traj_rets.extend([
            #         np.mean([i_test_data["reward_sum"] for i_test_data in test_data])
            #     ])
            #     test_traj_obs.extend(
            #         [i_test_data["obs"] for i_test_data in test_data]
            #     )
            #     test_traj_acs.extend(
            #         [i_test_data["ac"] for i_test_data in test_data]
            #     )

            traj_obs.extend([sample["obs"] for sample in samples])
            traj_acs.extend([sample["ac"] for sample in samples])
            traj_rets.extend([sample["reward_sum"] for sample in samples])
            traj_rews.extend([sample["rewards"] for sample in samples])
            episode_iter_id.extend([i] * len(samples))
            samples = samples[:self.nrollouts_per_iter]

            self.policy.dump_logs(self.logdir, iter_dir)
            savemat(
                os.path.join(self.logdir, "logs.mat"), {
                    "observations": traj_obs,
                    "actions": traj_acs,
                    "returns": traj_rets,
                    "rewards": traj_rews,
                    "test_returns": test_traj_rets,
                    "test_obs": test_traj_obs,
                    "test_acs": test_traj_acs,
                    'episode_iter_id': episode_iter_id
                })
            # Delete iteration directory if not used
            if len(os.listdir(iter_dir)) == 0:
                os.rmdir(iter_dir)

            # train policy and model together
            if i < self.ntrain_iters - 1:
                self.policy.train([sample["obs"] for sample in samples],
                                  [sample["ac"] for sample in samples],
                                  [sample["rewards"] for sample in samples])

            if i % 10 == 0:
                self.log_model_predictions(i)

    def log_model_predictions(self, itr):
        import matplotlib.pyplot as plt

        action_sequence = self.policy.sample_random_action_sequences(
            num_sequences=1, horizon=200)  # use horizon=20 for the reacher env
        action_sequence = action_sequence[0]

        mpe, true_states, pred_states = self.agent.calculate_mean_prediction_error(
            action_sequence, self.policy)
        savemat(os.path.join(self.logdir, str(itr), "states.mat"), {
            "t_state": true_states,
            "p_state": pred_states,
        })
        assert self.env.observation_space.shape[0] == true_states.shape[
            1] == pred_states.shape[1]
        ob_dim = self.env.observation_space.shape[0]
        self.fig = plt.figure(figsize=(10, 1 * ob_dim))

        if ob_dim > 16:
            ob_dim = 16
        if ob_dim % 2 == 1:
            ob_dim -= 1

        # plot the predictions
        self.fig.clf()
        for i in range(ob_dim):
            plt.subplot(ob_dim // 2, 2, i + 1)  # integer division: subplot grid sizes must be ints
            plt.plot(true_states[:, i], 'g')
            plt.plot(pred_states[:, i], 'r')
        self.fig.suptitle('MPE: ' + str(mpe))
        self.fig.savefig(self.logdir + '/itr_' + str(itr) + '_predictions.png',
                         dpi=200,
                         bbox_inches='tight')
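A self-contained sketch of the true-vs-predicted plot produced by log_model_predictions, run on synthetic data. The two-column subplot grid and the green/red color scheme mirror the code above; the mean squared error in the title is only a stand-in for whatever calculate_mean_prediction_error actually returns.

# Standalone sketch of the prediction plot on synthetic (T, ob_dim) arrays.
import numpy as np
import matplotlib.pyplot as plt

true_states = np.cumsum(np.random.randn(200, 4), axis=0)   # stand-in true trajectory
pred_states = true_states + 0.1 * np.random.randn(200, 4)  # stand-in model predictions
ob_dim = true_states.shape[1]

fig = plt.figure(figsize=(10, 1 * ob_dim))
for i in range(ob_dim):
    plt.subplot(ob_dim // 2, 2, i + 1)                     # integer grid dimensions
    plt.plot(true_states[:, i], 'g')
    plt.plot(pred_states[:, i], 'r')
fig.suptitle('MPE: %.4f' % np.mean((true_states - pred_states) ** 2))
fig.savefig('predictions.png', dpi=200, bbox_inches='tight')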