Example #1
    def regularization_paths(self, methods, n_samples=1000, n_eps=1,
                             seed=1, criteria=["RMSBE"], verbose=0):

        # Initialization
        self._init_methods(methods)
        err_f = [self._init_error_fun(criterion) for criterion in criteria]

        errors = dict([(crit, [[] for m in methods]) for crit in criteria])
        for m in methods:
            m.reset_trace()

        # Generate trajectories
        s, a, r, s_n, restarts = self.mdp.samples_cached(n_iter=n_samples,
                                                         n_restarts=n_eps,
                                                         policy=self.behavior_policy,
                                                         seed=seed)
        if self.off_policy:
            m_a_beh = policies.mean_action_trajectory(self.behavior_policy, s)
            m_a_tar = policies.mean_action_trajectory(self.target_policy, s)
            rhos = np.zeros_like(r)
            self.rhos = rhos

        # Method learning
        with ProgressBar(enabled=(verbose > 2.)) as p:
            for i in xrange(n_samples * n_eps):
                p.update(i, n_samples * n_eps)
                f0 = self.phi(s[i])
                f1 = self.phi(s_n[i])
                if restarts[i]:
                    for k, m in enumerate(methods):
                        m.reset_trace()

                for k, m in enumerate(methods):
                    if self.off_policy:
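                        # importance-sampling weight: rho = pi_target(a|s) / pi_behavior(a|s)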
                        rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                        m.update_V(s[i], s_n[i], r[i],
                                    rho=rhos[i],
                                    f0=f0, f1=f1)
                    else:
                        m.update_V(s[i], s_n[i], r[i], f0=f0, f1=f1)


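        # Walk each method's regularization path and evaluate every
        # (tau, theta) pair under all error criteria.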
        for i, m in enumerate(methods):
            v = m.regularization_path()
            for tau, theta in v:
                for i_e, crit in enumerate(criteria):
                    errors[crit][i].append((tau, theta, err_f[i_e](theta)))

        return errors
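The returned dict maps each criterion to one regularization path per method, and every path entry is a (tau, theta, error) tuple. The following minimal sketch shows how a caller might consume that structure; task (an experiment object exposing the method above) and my_methods (a list of compatible value-function estimators) are assumed names, not taken from the source.

    # Hypothetical usage sketch; task and my_methods are assumed names.
    paths = task.regularization_paths(my_methods, n_samples=2000, n_eps=1,
                                       criteria=["RMSBE"], verbose=3)

    for crit, per_method in paths.items():
        for method, path in zip(my_methods, per_method):
            # Each path entry is (tau, theta, error); pick the regularization
            # strength with the smallest error under this criterion.
            best_tau, _, best_err = min(path, key=lambda entry: entry[2])
            print("{0} / {1}: best tau={2}, error={3:.4f}".format(
                crit, method, best_tau, best_err))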
Example #2
    def error_traces(self, methods, n_samples=1000, n_eps=1, verbose=0.,
                     seed=1, criteria=["RMSBE"], error_every=1, episodic=False,
                     eval_on_traces=False, n_samples_eval=None):

        # Initialization
        self._init_methods(methods)
        err_f = [self._init_error_fun(criterion) for criterion in criteria]
        err_f_gen = [self._init_error_fun(
            criterion, general=True) for criterion in criteria]

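        # Errors are recorded once per finished episode (episodic=True) or
        # every error_every transitions otherwise.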
        if episodic:
            n_e = n_eps
            k_e = 0
        else:
            n_e = int(np.ceil(float(n_samples * n_eps) / error_every))

        errors = np.ones((len(methods), len(criteria), n_e)) * np.inf
        for m in methods:
            m.reset_trace()

        # Generate trajectories
        with Timer("Generate Samples", active=(verbose > 1.)):
            s, a, r, s_n, restarts = self.mdp.samples_cached(n_iter=n_samples,
                                                             n_restarts=n_eps,
                                                             policy=self.behavior_policy,
                                                             seed=seed, verbose=verbose)
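        # Draw a second sampled transition (a2, r2, s_n2) from each visited
        # state; these feed the rt / f1t / s1t arguments of update_V below.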
        with Timer("Generate Double Samples", active=(verbose > 1.)):
            a2, r2, s_n2 = self.mdp.samples_cached_transitions(
                policy=self.behavior_policy,
                states=s, seed=seed)
        if eval_on_traces:
            print "Evaluation on trace samples"
            self.set_mu_from_states(
                seed=self.mu_seed, s=s, n_samples_eval=n_samples_eval)

        if self.off_policy:
            with Timer("Generate off-policy weights", active=(verbose > 1.)):
                m_a_beh = policies.mean_action_trajectory(
                    self.behavior_policy, s)
                m_a_tar = policies.mean_action_trajectory(
                    self.target_policy, s)
                rhos = np.zeros_like(r)
                rhos2 = np.zeros_like(r2)
                self.rhos = rhos

        # Method learning
        with ProgressBar(enabled=(verbose > 2.)) as p:
            for i in xrange(n_samples * n_eps):
                p.update(i, n_samples * n_eps)
                f0 = self.phi(s[i])
                f1 = self.phi(s_n[i])
                f1t = self.phi(s_n2[i])
                if restarts[i]:
                    for k, m in enumerate(methods):
                        m.reset_trace()
                        if episodic:
                            cur_theta = m.theta
                            if not np.isfinite(np.sum(cur_theta)):
                                errors[k, :, k_e] = np.nan
                                continue
                            for i_e in range(len(criteria)):
                                if isinstance(m, td.LinearValueFunctionPredictor):
                                    errors[k, i_e, k_e] = err_f[i_e](cur_theta)
                                else:
                                    errors[k, i_e, k_e] = err_f_gen[i_e](m.V)

                    if episodic:
                        k_e += 1
                        if k_e >= n_e:
                            break

                for k, m in enumerate(methods):
                    if self.off_policy:
                        rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                        rhos2[i] = self.target_policy.p(s[i], a2[i], mean=m_a_tar[i]) / self.behavior_policy.p(s[i], a2[i], mean=m_a_beh[i])
                        m.update_V(s[i], s_n[i], r[i],
                                   rho=rhos[i], rhot=rhos2[i],
                                   f0=f0, f1=f1, f1t=f1t, s1t=s_n2[i], rt=r2[i])
                    else:
                        m.update_V(s[i], s_n[i], r[i],
                                   f0=f0, f1=f1, s1t=s_n2[i], f1t=f1t, rt=r2[i])
                    if i % error_every == 0 and not episodic:
                        cur_theta = m.theta
                        if not np.isfinite(np.sum(cur_theta)):
                            errors[k, :, i // error_every] = np.nan
                            continue
                        for i_e in range(len(criteria)):
                            if isinstance(m, td.LinearValueFunctionPredictor):
                                errors[k, i_e, i // error_every] = err_f[i_e](cur_theta)
                            else:
                                errors[k, i_e, i // error_every] = err_f_gen[i_e](m.V)

        return errors
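error_traces returns an array of shape (len(methods), len(criteria), n_e); in the non-episodic case the error of every method under every criterion is recorded each error_every transitions along the last axis. A hedged plotting sketch follows, reusing the assumed names task and my_methods from the earlier sketch; matplotlib is used only for illustration.

    import numpy as np
    import matplotlib.pyplot as plt

    # Assumed names: task exposes error_traces() as above and my_methods is
    # a list of compatible value-function estimators.
    err = task.error_traces(my_methods, n_samples=5000, n_eps=1,
                            criteria=["RMSBE"], error_every=50)

    x = np.arange(err.shape[2]) * 50   # transitions seen at each checkpoint
    for k, method in enumerate(my_methods):
        plt.plot(x, err[k, 0, :], label=str(method))   # criterion 0 is "RMSBE"
    plt.xlabel("transitions")
    plt.ylabel("RMSBE")
    plt.legend()
    plt.show()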
Example #3
    def error_traces_cpu_time(self, method, max_t=600, max_passes=None, min_diff=0.1, n_samples=1000, n_eps=1, verbose=0.,
                     seed=1, criteria=["RMSBE"], error_every=1,
                     eval_on_traces=False, n_samples_eval=None, eval_once=False):

        # Initialization
        self._init_methods([method])
        err_f = [self._init_error_fun(criterion) for criterion in criteria]
        err_f_gen = [self._init_error_fun(
            criterion, general=True) for criterion in criteria]

        times = []
        errors = []
        processed = []

        method.reset_trace()
        if hasattr(method, "lam") and method.lam > 0.:
            print "WARNING: reuse of samples only works without e-traces"

        # Generate trajectories
        with Timer("Generate Samples", active=(verbose > 1.)):
            s, a, r, s_n, restarts = self.mdp.samples_cached(n_iter=n_samples,
                                                             n_restarts=n_eps,
                                                             policy=self.behavior_policy,
                                                             seed=seed, verbose=verbose)
        with Timer("Generate Double Samples", active=(verbose > 1.)):
            a2, r2, s_n2 = self.mdp.samples_cached_transitions(
                policy=self.behavior_policy,
                states=s, seed=seed)
        if eval_on_traces:
            print "Evaluation on trace samples"
            self.set_mu_from_states(
                seed=self.mu_seed, s=s, n_samples_eval=n_samples_eval)

        if self.off_policy:
            with Timer("Generate off-policy weights", active=(verbose > 1.)):
                m_a_beh = policies.mean_action_trajectory(
                    self.behavior_policy, s)
                m_a_tar = policies.mean_action_trajectory(
                    self.target_policy, s)
                rhos = np.zeros_like(r)
                rhos2 = np.zeros_like(r2)
                self.rhos = rhos

        # Method learning
        i = 0
        last_t = 0.
        passes = 0
        u = 0
        with ProgressBar(enabled=(verbose > 2.)) as p:
            while method.time < max_t:


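                # Replay the cached samples cyclically until the method has
                # consumed max_t seconds of CPU time or max_passes full sweeps
                # over the data have been made.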
                f0 = self.phi(s[i])
                f1 = self.phi(s_n[i])
                f1t = self.phi(s_n2[i])
                #assert not np.any(np.isnan(f0))
                #assert not np.any(np.isnan(f1))
                #assert not np.any(np.isnan(f1t))
                if restarts[i]:
                    method.reset_trace()
                if self.off_policy:
                    rhos[i] = self.target_policy.p(s[i], a[i], mean=m_a_tar[i]) / self.behavior_policy.p(s[i], a[i], mean=m_a_beh[i])
                    rhos2[i] = self.target_policy.p(s[i], a2[i], mean=m_a_tar[i]) / self.behavior_policy.p(s[i], a2[i], mean=m_a_beh[i])
                    method.update_V(s[i], s_n[i], r[i],
                                rho=rhos[i], rhot=rhos2[i],
                                f0=f0, f1=f1, f1t=f1t, s1t=s_n2[i], rt=r2[i])
                else:
                    method.update_V(s[i], s_n[i], r[i],
                                f0=f0, f1=f1, s1t=s_n2[i], f1t=f1t, rt=r2[i])
                u += 1
                assert method.time > last_t
                if method.time - last_t > min_diff:
                    p.update(method.time, max_t)
                    last_t = method.time
                    if not eval_once:
                        cur_theta = method.theta
                        e = np.empty(len(criteria))
                        for i_e in range(len(criteria)):
                            e[i_e] = err_f[i_e](cur_theta)
                        errors.append(e)
                        processed.append(u)
                        times.append(method.time)
                i += 1
                if i >= n_samples * n_eps:
                    passes += 1
                    if max_passes is not None and passes >= max_passes:
                        break
                i = i % (n_samples * n_eps)
        if eval_once:
            cur_theta = method.theta
            e = np.empty(len(criteria))
            for i_e in range(len(criteria)):
                e[i_e] = err_f[i_e](cur_theta)
            return e, method.time

        return errors, processed, times
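With the default eval_once=False, the method returns three parallel lists: one per-criterion error vector per checkpoint, the number of updates processed so far, and the accumulated CPU time of the method, so the error can be plotted against runtime rather than sample count. A sketch under the same assumptions as above; task and my_method are hypothetical names.

    import numpy as np
    import matplotlib.pyplot as plt

    # Assumed names: task exposes error_traces_cpu_time() as above and
    # my_method is a single compatible estimator.
    errors, processed, times = task.error_traces_cpu_time(
        my_method, max_t=120, max_passes=5, criteria=["RMSBE"])

    errors = np.vstack(errors)      # shape: (n_checkpoints, len(criteria))
    plt.plot(times, errors[:, 0])   # RMSBE against accumulated CPU time
    plt.xlabel("CPU time of the method [s]")
    plt.ylabel("RMSBE")
    plt.show()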