import numpy as np
from scipy.optimize import leastsq

# Helpers such as err_array, value_iteration, fit_T, discrete_policy,
# deterministic_continuous_to_discrete_model, mfmc_evaluation, hill_climb and
# the parallel module are defined elsewhere in this project.


def approx_model_policy(domain, data):
    """Fit the parametric dynamics model to the data by least squares, then plan in it."""
    # Least-squares fit of the dynamics parameters, initialized at the true parameters.
    f = lambda pars: err_array(domain, pars, data)
    pars0 = domain.true_pars
    pars = leastsq(f, pars0)[0]
    #print pars
    # Discretize the fitted continuous dynamics and solve the resulting MDP by value iteration.
    dynamics = lambda s, u: domain.approx_dynamics(s, u, pars)
    T = deterministic_continuous_to_discrete_model(dynamics, domain.state_centers,
                                                   domain.action_centers, domain.at_goal)
    states_to_actions, V = value_iteration(T, domain.state_centers, domain.reward,
                                           threshold=domain.value_iteration_threshold,
                                           pi_init=domain.pi_init)
    return discrete_policy(domain, states_to_actions)
def best_policy(domain, data):
    """Search over model parameters for the policy with the best MFMC-estimated return."""
    # Least-squares fit provides the starting point for the parameter search.
    f = lambda pars: err_array(domain, pars, data)
    pars0 = domain.true_pars
    ml_start = leastsq(f, pars0)[0]
    if True:
        # Hill-climb from the least-squares fit, scoring each candidate parameter vector
        # by model-free Monte Carlo evaluation of the policy planned under that model.
        f = lambda pars: [tuple(pars),
                          mfmc_evaluation(policy_wrt_approx_model(domain, pars), data,
                                          domain.distance_fn, domain.initstate,
                                          domain.episode_length, domain.at_goal)]
        pars = hill_climb(f, domain.optimization_pars, ml_start)
    else:
        # Disabled alternative: score an initial grid of parameter vectors in parallel
        # and keep the best one.
        f = lambda pars: [pars,
                          mfmc_evaluation(policy_wrt_approx_model(domain, pars), data,
                                          domain.distance_fn, domain.initstate,
                                          domain.episode_length, domain.at_goal)]
        raw_returns = parallel.largeparmap(f, domain.initial_par_search_space)
        ind = np.argmax([raw[1] for raw in raw_returns])
        pars = raw_returns[ind][0]
    #print pars
    # Plan in the selected model: discretize its dynamics and run value iteration.
    dynamics = lambda s, u: domain.approx_dynamics(s, u, pars)
    T = deterministic_continuous_to_discrete_model(dynamics, domain.state_centers,
                                                   domain.action_centers, domain.at_goal)
    states_to_actions, V = value_iteration(T, domain.state_centers, domain.reward,
                                           threshold=domain.value_iteration_threshold)
    return discrete_policy(domain, states_to_actions)
def discrete_model_policy(domain, data):
    """Fit a discrete transition model directly from the data and plan in it."""
    T = fit_T(data, domain.state_centers, domain.action_centers, domain.at_goal)
    states_to_actions, V = value_iteration(T, domain.state_centers, domain.reward,
                                           threshold=domain.value_iteration_threshold)
    return discrete_policy(domain, states_to_actions)
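

# Usage sketch (not part of the original file): illustrates how these policy
# constructors might be called on a batch of logged trajectories. `CartPoleDomain`
# and `collect_episodes` are hypothetical stand-ins for whatever domain class and
# data-collection helper this project actually provides.
if __name__ == '__main__':
    domain = CartPoleDomain()                       # hypothetical domain exposing the attributes used above
    data = collect_episodes(domain, n_episodes=50)  # hypothetical batch of (state, action, reward) trajectories
    for make_policy in (approx_model_policy, best_policy, discrete_model_policy):
        pi = make_policy(domain, data)
        ret = mfmc_evaluation(pi, data, domain.distance_fn, domain.initstate,
                              domain.episode_length, domain.at_goal)
        print('%s: %s' % (make_policy.__name__, ret))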