def generate_batch_data(self, N, policy=None):
    """Simulate N episodes and return them as a list of per-episode records.

    With policy=None the episodes are batch training data from the domain's
    default behaviour; with a policy they are evaluation rollouts under it.
    """
    run_episode = lambda _unused: self.simulate_episode(policy)  # episode index is ignored
    use_parallel = True  # toggle: distribute episode simulation across workers or run serially
    if use_parallel:
        raw_data = parallel.largeparmap(run_episode, range(N))
    else:
        raw_data = list(map(run_episode, range(N)))
    return raw_data
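# Usage sketch (illustrative only; `SomeDomain` is a stand-in for whatever
# domain class defines generate_batch_data/simulate_episode):
#
#   domain = SomeDomain()
#   train_data = domain.generate_batch_data(100)            # batch training episodes
#   eval_data = domain.generate_batch_data(20, policy=pi)   # episodes under a candidate policy `pi`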
def best_policy(domain, data):
    """Fit the approximate model's parameters to the batch data, then return
    the policy obtained by value iteration on the fitted model.
    """
    # Warm start: least-squares fit of the model parameters to the data.
    residuals = lambda pars: err_array(domain, pars, data)
    ml_start = leastsq(residuals, domain.true_pars)[0]
    use_hill_climb = True  # toggle: hill climbing from ml_start vs. parallel search over a fixed grid
    if use_hill_climb:
        # Score candidate parameters by the mfmc_evaluation estimate of the return
        # of the policy that is optimal under the approximate model with those parameters.
        score = lambda pars: [tuple(pars), mfmc_evaluation(policy_wrt_approx_model(domain, pars), data, domain.distance_fn, domain.initstate, domain.episode_length, domain.at_goal)]
        pars = hill_climb(score, domain.optimization_pars, ml_start)
    else:
        # Exhaustive parallel evaluation over the initial parameter search space.
        score = lambda pars: [pars, mfmc_evaluation(policy_wrt_approx_model(domain, pars), data, domain.distance_fn, domain.initstate, domain.episode_length, domain.at_goal)]
        raw_returns = parallel.largeparmap(score, domain.initial_par_search_space)
        ind = np.argmax([raw[1] for raw in raw_returns])
        pars = raw_returns[ind][0]
    # Plan in the fitted model: discretize its dynamics and run value iteration.
    dynamics = lambda s, u: domain.approx_dynamics(s, u, pars)
    T = deterministic_continuous_to_discrete_model(dynamics, domain.state_centers, domain.action_centers, domain.at_goal)
    states_to_actions, V = value_iteration(T, domain.state_centers, domain.reward, threshold=domain.value_iteration_threshold)
    return discrete_policy(domain, states_to_actions)
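# Usage sketch tying the two functions together (illustrative only; `domain`
# must provide the attributes used above: true_pars, optimization_pars,
# approx_dynamics, reward, state_centers, action_centers, distance_fn,
# initstate, episode_length, at_goal, value_iteration_threshold):
#
#   batch = domain.generate_batch_data(100)
#   pi = best_policy(domain, batch)
#   score = mfmc_evaluation(pi, batch, domain.distance_fn, domain.initstate,
#                           domain.episode_length, domain.at_goal)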