def full_info():
    logger.info('using perfect information')
    # generate stacked matrices I, P, P^2, ... out to the discount horizon
    R = numpy.array([])
    S = sp.eye(n_states, n_states)
    P = sp.eye(n_states, n_states)
    for i in xrange(calc_discount_horizon(lam, gam, eps)):
        R = numpy.append(R, P * m.R)
        P = m.P * P
        S = sp.vstack((S, P))
    X = encoder.encode(S)
    R = sp.csr_matrix(R[:, None])
    # with the true model there is no separate validation/test split
    X_val = X_test = X
    R_val = R_test = R
    # losses = ['true-bellman', 'true-reward', 'true-model']
    weighting = 'uniform'
    return (X, X_val, X_test), (R, R_val, R_test), weighting
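# Hedged sketch (assumption, not the original implementation): calc_discount_horizon
# is defined elsewhere in this module. From how it is used above, it plausibly
# returns the number of steps k after which the discounted weight (lam * gam)**k
# falls below eps, i.e. where further terms of the stacked I, P, P^2, ... series
# become negligible. The name _discount_horizon_sketch is hypothetical.
def _discount_horizon_sketch(lam, gam, eps):
    import math
    # smallest k with (lam * gam)**k < eps
    return int(math.ceil(math.log(eps) / math.log(lam * gam)))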
def sample(n):
    logger.info('sampling from a grid world')
    # currently defaults to on-policy sampling
    n_extra = calc_discount_horizon(lam, gam, eps) - 1
    # the mdp returns n+1 states and n rewards
    kw = dict(n_samples=n + n_extra, encoder=encoder, req_rew=req_rew)
    R, X, _ = mdp.sample_encoding(**kw)
    if req_rew:
        logger.info('reward required')
        assert R.sum() > 0
        logger.info('reward sum: %.2f' % R.sum())
    R_val, X_val, _ = mdp.sample_encoding(**kw)
    R_test, X_test, _ = mdp.sample_encoding(**kw)
    # losses = ['test-bellman', 'test-reward', 'test-model',
    #           'true-bellman', 'true-reward', 'true-model', 'true-lsq']  # test-training
    weighting = 'policy'
    return (X, X_val, X_test), (R, R_val, R_test), weighting
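# Hedged usage sketch (assumption, not code from this module): a caller would
# pick one of the two data sources and unpack the same triple of triples;
# use_true_model and n_samples are hypothetical names.
# (X, X_val, X_test), (R, R_val, R_test), weighting = (
#     full_info() if use_true_model else sample(n_samples))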