def generate_batch_data(self, N, policy=None):
    """Simulate N episodes and return the raw trajectories.

    If `policy` is None the episodes provide batch training data;
    otherwise they are rollouts under `policy` for evaluation.
    """
    # Episodes are independent, so farm the rollouts out in parallel.
    # The lambda ignores its argument; largeparmap only supplies indices.
    f = lambda _: self.simulate_episode(policy)
    raw_data = parallel.largeparmap(f, range(N))
    # Serial fallback, handy when debugging:
    # raw_data = map(f, range(N))
    return raw_data
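# A minimal usage sketch (hedged: `Domain` and `pi` are hypothetical stand-ins
# for whatever class defines generate_batch_data and a policy built elsewhere):
#
#   domain = Domain()
#   train_data = domain.generate_batch_data(100)            # exploration data
#   eval_data = domain.generate_batch_data(10, policy=pi)   # rollouts under pi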
def best_policy(domain, data):
    """Fit approximate dynamics to `data`, then plan in the fitted model.

    The parameters start at a maximum-likelihood (least-squares) fit and
    are then refined by hill climbing on the MFMC estimate of policy
    return, so the final model is chosen for control performance rather
    than one-step prediction error alone.
    """
    # Maximum-likelihood initialization: minimize the one-step model error.
    f = lambda pars: err_array(domain, pars, data)
    pars0 = domain.true_pars
    ml_start = leastsq(f, pars0)[0]

    # Refine by hill climbing on the model-free Monte Carlo (MFMC) estimate
    # of the return of the policy that is optimal in the model
    # parameterized by `pars`.
    f = lambda pars: [tuple(pars),
                      mfmc_evaluation(policy_wrt_approx_model(domain, pars), data,
                                      domain.distance_fn, domain.initstate,
                                      domain.episode_length, domain.at_goal)]
    pars = hill_climb(f, domain.optimization_pars, ml_start)
    # Alternative: exhaustive parallel search over a fixed grid of parameters.
    # raw_returns = parallel.largeparmap(f, domain.initial_par_search_space)
    # pars = raw_returns[np.argmax([raw[1] for raw in raw_returns])][0]

    # Discretize the fitted dynamics and solve the resulting MDP exactly.
    dynamics = lambda s, u: domain.approx_dynamics(s, u, pars)
    T = deterministic_continuous_to_discrete_model(dynamics, domain.state_centers,
                                                   domain.action_centers, domain.at_goal)
    states_to_actions, V = value_iteration(T, domain.state_centers, domain.reward,
                                           threshold=domain.value_iteration_threshold)
    return discrete_policy(domain, states_to_actions)
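# A minimal end-to-end sketch, assuming `domain` exposes the attributes used
# above (true_pars, distance_fn, initstate, episode_length, at_goal,
# state_centers, action_centers, reward, optimization_pars,
# value_iteration_threshold) and that leastsq is scipy.optimize.leastsq;
# the collect-fit-evaluate loop itself is illustrative, not part of this module:
#
#   data = domain.generate_batch_data(100)                # collect a batch
#   pi = best_policy(domain, data)                        # fit model + plan
#   rollouts = domain.generate_batch_data(10, policy=pi)  # evaluate the policy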