Example #1
    def __init__(self,
                 input_shape,
                 dim_of_actions,
                 gamma,
                 convergence_of_model_epsilon=1e-10,
                 model_type='cnn',
                 num_frame_stack=None,
                 frame_skip=None,
                 pic_size=None,
                 freeze_cnn_layers=False):
        super(CarNN, self).__init__()

        self.all_actions_func = None
        self.convergence_of_model_epsilon = convergence_of_model_epsilon
        self.model_type = model_type
        self.dim_of_actions = dim_of_actions
        self.input_shape = input_shape
        self.freeze_cnn_layers = freeze_cnn_layers
        self.model = self.create_model(input_shape)

        #debug purposes
        from config_car import action_space_map, env
        self.policy_evalutor = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            num_frame_stack=num_frame_stack,
            frame_skip=frame_skip,
            pic_size=pic_size)
Example #2
def main(policy_old, policy, model_type='cnn'):

    fqi = FittedQIteration(state_space_dim + action_space_dim,
                           map_size,
                           action_space_dim,
                           max_fitting_epochs,
                           gamma,
                           model_type=model_type)
    fqe = FittedQEvaluation(initial_states,
                            state_space_dim + action_space_dim,
                            map_size,
                            action_space_dim,
                            max_fitting_epochs,
                            gamma,
                            model_type=model_type)
    ips = InversePropensityScorer(action_space_dim)
    exact_evaluation = ExactPolicyEvaluator(initial_states, state_space_dim,
                                            gamma, env)

    max_epochs = np.array(
        [1000]
    )  # np.arange(50,1060,100) # max number of epochs over which to collect data
    epsilons = np.array([.25])  # np.array([.5])
    trials = np.array([1, 2])  # np.arange(20)
    eps_epochs_trials = cartesian_product(epsilons, max_epochs, trials)

    all_trials_estimators = []
    for epsilon in epsilons:

        trials_estimators = []
        for epochs in max_epochs:

            trial_estimators = []
            for trial in trials:
                estimators = run_trial(policy_old, policy, epochs, epsilon,
                                       fqi, fqe, ips, exact_evaluation)

                trial_estimators.append(estimators)
            trials_estimators.append(trial_estimators)

        all_trials_estimators.append(trials_estimators)

        # print epsilon, np.mean(all_trials_evaluated[-1]), np.mean(all_trials_approx_ips[-1]), np.mean(all_trials_exact_ips[-1]), np.mean(all_trials_exact[-1])

    results = np.hstack([
        eps_epochs_trials,
        np.array(all_trials_estimators).reshape(
            -1,
            np.array(all_trials_estimators).shape[-1])
    ])
    df = pd.DataFrame(
        results,
        columns=['epsilon', 'num_trajectories', 'trial_num', 'exact', 'fqe'])
    df.to_csv('fqe_quality.csv', index=False)
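
`cartesian_product` is not defined in this snippet; for the `results` table to line up, it only needs to enumerate every (epsilon, num_trajectories, trial) combination in the same nested-loop order as the loops above. A minimal NumPy sketch of such a helper (an assumption, not this project's implementation):

import numpy as np

def cartesian_product(*arrays):
    # One row per combination, in row-major (nested-loop) order, so rows
    # line up with the estimators appended by the epsilon/epochs/trial loops.
    grids = np.meshgrid(*arrays, indexing='ij')
    return np.stack([g.ravel() for g in grids], axis=-1)

# cartesian_product(np.array([.25]), np.array([1000]), np.array([1, 2]))
# -> array([[0.25, 1000., 1.], [0.25, 1000., 2.]])
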
Example #3
    def __init__(self,
                 num_inputs,
                 num_outputs,
                 grid_shape,
                 dim_of_actions,
                 gamma,
                 convergence_of_model_epsilon=1e-10,
                 model_type='mlp',
                 position_of_holes=None,
                 position_of_goals=None):
        '''
        An implementation of fitted Q iteration

        num_inputs: number of inputs
        num_outputs: number of outputs
        dim_of_actions: dimension of action space
        convergence_of_model_epsilon: small float. Defines when the model has converged.
        '''
        super(NN, self).__init__()
        self.convergence_of_model_epsilon = convergence_of_model_epsilon
        self.model_type = model_type
        self.dim_of_actions = dim_of_actions
        self.dim_of_state = grid_shape[0] * grid_shape[1]
        self.grid_shape = grid_shape

        if self.model_type == 'cnn':
            assert position_of_holes is not None
            assert position_of_goals is not None

        self.position_of_goals = position_of_goals

        if position_of_holes is not None:
            self.position_of_holes = np.zeros(self.dim_of_state)
            self.position_of_holes[position_of_holes] = 1
            self.position_of_holes = self.position_of_holes.reshape(
                self.grid_shape)
        else:
            self.position_of_holes = position_of_holes

        if position_of_goals is not None:
            self.position_of_goals = np.zeros(self.dim_of_state)
            self.position_of_goals[position_of_goals] = 1
            self.position_of_goals = self.position_of_goals.reshape(
                self.grid_shape)
        else:
            self.position_of_goals = position_of_goals

        self.model = self.create_model(num_inputs, num_outputs)
        #debug purposes
        self.policy_evalutor = ExactPolicyEvaluator([0], num_inputs -
                                                    dim_of_actions, gamma)
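
In the constructor above, `position_of_holes` and `position_of_goals` arrive as flat grid indices and get expanded into binary masks over the map (the form the 'cnn' model type expects). A standalone illustration with a hypothetical 4x4 grid and made-up indices:

import numpy as np

grid_shape = (4, 4)
position_of_holes = [5, 7, 11, 12]               # flat indices of 'H' cells
mask = np.zeros(grid_shape[0] * grid_shape[1])
mask[position_of_holes] = 1
mask = mask.reshape(grid_shape)
# mask now has 1s at grid cells (1, 1), (1, 3), (2, 3) and (3, 0)
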
Example #4
    def __init__(self,
                 input_shape,
                 dim_of_actions,
                 gamma,
                 convergence_of_model_epsilon=1e-10,
                 model_type='mlp',
                 position_of_holes=None,
                 position_of_goals=None,
                 num_frame_stack=None,
                 frame_skip=None,
                 pic_size=None,
                 freeze_cnn_layers=False,
                 **kw):
        '''
        An implementation of fitted Q iteration

        input_shape: shape of the state input
        dim_of_actions: dimension of action space
        convergence_of_model_epsilon: small float. Defines when the model has converged.
        '''
        #        super(PortfolioNN, self).__init__()
        super().__init__()
        self.convergence_of_model_epsilon = convergence_of_model_epsilon
        self.model_type = model_type
        self.dim_of_actions = dim_of_actions
        self.dim_of_state = input_shape
        self.freeze_cnn_layers = freeze_cnn_layers
        self.model = self.create_model(input_shape)
        self.all_actions_func = None
        #debug purposes
        from config_portfolio import env
        self.policy_evalutor = ExactPolicyEvaluator(
            None,
            gamma,
            env=env,
            num_frame_stack=num_frame_stack,
            frame_skip=frame_skip,
            pic_size=pic_size)
Example #5
    def __init__(self,
                 input_shape,
                 dim_of_actions,
                 gamma=0.95,
                 convergence_of_model_epsilon=1e-10,
                 model_type='mlp',
                 position_of_holes=None,
                 position_of_goals=None,
                 num_frame_stack=None,
                 frame_skip=None,
                 pic_size=None,
                 freeze_cnn_layers=False,
                 **kw):
        '''
        An implementation of fitted Q iteration

        input_shape: shape of the state input
        dim_of_actions: dimension of action space
        convergence_of_model_epsilon: small float. Defines when the model has converged.
        '''
        super().__init__()
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = .995
        self.gamma = gamma
        self.tau = .125
        self.convergence_of_model_epsilon = convergence_of_model_epsilon
        self.model_type = model_type
        #        print(dim_of_actions)
        self.dim_of_actions = dim_of_actions
        self.dim_of_state = input_shape
        #        print(self.dim_of_state )
        self.freeze_cnn_layers = freeze_cnn_layers

        # ===================================================================== #
        #                               Actor Model                             #
        # Chain rule: find the gradient of changing the actor network params in #
        # getting closest to the final value network predictions, i.e. de/dA    #
        # Calculate de/dA = de/dC * dC/dA, where e is error, C critic, A actor  #
        # ===================================================================== #
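        # In the code below, self.actor_critic_grad is a placeholder that gets
        # fed de/dC (computed from the critic), and tf.gradients(...) with
        # grad_ys = -self.actor_critic_grad chains it through the actor to get
        # the per-weight update direction; the minus sign makes Adam's descent
        # step ascend the critic's predicted value.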

        self.memory = deque(maxlen=2000)
        self.actor_state_input, self.actor_model = self.create_actor_model()
        _, self.target_actor_model = self.create_actor_model()
        self.actor_critic_grad = tf.placeholder(
            tf.float32, [None, dim_of_actions[0]
                         ])  # where we will feed de/dC (from critic)

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(
            self.actor_model.output, actor_model_weights,
            -self.actor_critic_grad)  # dC/dA (from actor)
        grads = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        # ===================================================================== #
        #                              Critic Model                             #
        # ===================================================================== #
        self.critic_state_input, self.critic_action_input, self.critic_model = self.create_critic_model(
        )
        _, _, self.target_critic_model = self.create_critic_model()
        self.critic_grads = tf.gradients(
            self.critic_model.output, self.critic_action_input
        )  # where we calculate de/dC for feeding above

        # Initialize for later gradient calculations
        #        self.sess.run(tf.initialize_all_variables())

        #        self.model = self.create_model(input_shape)
        self.all_actions_func = None
        #debug purposes
        from config_portfolio import env
        self.policy_evalutor = ExactPolicyEvaluator(
            None,
            gamma,
            env=env,
            num_frame_stack=num_frame_stack,
            frame_skip=frame_skip,
            pic_size=pic_size)
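
The constructor above only wires up the graph; this example does not show how actor_critic_grad gets fed. In the usual TF1 actor-critic pattern that this graph implies, a training step would look roughly like the sketch below (hypothetical `_update_actor` method and `sess` handle, not code from this example):

    def _update_actor(self, sess, states):
        # dQ/da from the critic, evaluated at the actor's current actions ...
        actions = self.actor_model.predict(states)
        grads = sess.run(self.critic_grads,
                         feed_dict={self.critic_state_input: states,
                                    self.critic_action_input: actions})[0]
        # ... fed into the placeholder so self.optimize can push the actor
        # weights in the direction that raises the critic's value.
        sess.run(self.optimize,
                 feed_dict={self.actor_state_input: states,
                            self.actor_critic_grad: grads})
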
Example #6
def main(policy_old, policy, model_type='mlp'):

    fqi = None  #FittedQIteration(state_space_dim + action_space_dim, map_size, action_space_dim, max_fitting_epochs, gamma,model_type =model_type )
    fqe = FittedQEvaluation(initial_states,
                            state_space_dim + action_space_dim,
                            map_size,
                            action_space_dim,
                            max_fitting_epochs,
                            gamma,
                            model_type=model_type)
    ips = InversePropensityScorer(env, state_space_dim, action_space_dim,
                                  map_size)
    exact_evaluation = ExactPolicyEvaluator(action_space_map,
                                            gamma,
                                            env=env,
                                            frame_skip=frame_skip,
                                            num_frame_stack=num_frame_stack,
                                            pic_size=pic_size)

    max_percentage = np.arange(
        .1, 1.05, .1)  # fractions of the collected dataset to use for each run
    epsilons = np.array([.95])
    trials = np.arange(128)
    eps_epochs_trials = cartesian_product(epsilons, max_percentage, trials)

    all_trials_estimators = []
    for epsilon in epsilons:

        trials_estimators = []
        dataset, exact = get_dataset(1500, epsilon, exact_evaluation)

        for trial_num, trial in enumerate(trials):

            trial_estimators = []
            for perc_num, percentage in enumerate(max_percentage):
                K.clear_session()
                idxs = np.random.permutation(np.arange(len(
                    dataset.episodes))).tolist()
                estimators = run_trial(idxs, dataset, policy_old, policy,
                                       percentage, epsilon, fqi, fqe, ips,
                                       exact)

                trial_estimators.append(estimators)
            trials_estimators.append(trial_estimators)
            results = np.hstack([
                cartesian_product(epsilons, max_percentage,
                                  trials[0:(trial_num + 1)]),
                np.array([trials_estimators
                          ]).reshape(-1,
                                     np.array([trials_estimators]).shape[-1])
            ])
            df = pd.DataFrame(results,
                              columns=[
                                  'epsilon', 'num_trajectories', 'trial_num',
                                  'exact', 'fqe', 'approx_ips', 'exact_ips',
                                  'approx_pdis', 'exact_pdis', 'doubly_robust',
                                  'weighted_doubly_robust', 'AM'
                              ])
            df.to_csv('fqe_quality_fixed_dr.csv', index=False)
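            # Writing the CSV after every trial checkpoints partial results,
            # so an interrupted sweep keeps the trials completed so far.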

        all_trials_estimators.append(trials_estimators)

        # print epsilon, np.mean(all_trials_evaluated[-1]), np.mean(all_trials_approx_ips[-1]), np.mean(all_trials_exact_ips[-1]), np.mean(all_trials_exact[-1])

    results = np.hstack([
        eps_epochs_trials,
        np.array(all_trials_estimators).reshape(
            -1,
            np.array(all_trials_estimators).shape[-1])
    ])
    df = pd.DataFrame(results,
                      columns=[
                          'epsilon', 'num_trajectories', 'trial_num', 'exact',
                          'fqe', 'approx_ips', 'exact_ips', 'approx_pdis',
                          'exact_pdis', 'doubly_robust',
                          'weighted_doubly_robust', 'AM'
                      ])
    df.to_csv('fqe_quality_fixed_dr.csv', index=False)
Example #7
#### Hyperparam
gamma = 0.9
max_fitting_epochs = 100  #max number of epochs over which to converge to Q^\ast
lambda_bound = 10.  # l1 bound on lagrange multipliers
action_space_dim = env.nA  # action space dimension
state_space_dim = env.nS  # state space dimension
eta = 10.  # param for exponentiated gradient algorithm
initial_states = [
    [0]
]  #The only initial state is [1,0...,0]. In general, this should be a list of initial states
from config_lake import action_space_map, frame_skip, num_frame_stack, pic_size

policy_evaluator = ExactPolicyEvaluator(action_space_map,
                                        gamma,
                                        env=env,
                                        frame_skip=frame_skip,
                                        num_frame_stack=num_frame_stack,
                                        pic_size=pic_size)
dqn_model_type = 'mlp'
testing_model_type = 'mlp'

#### Get a decent policy. Called pi_old because this will be the policy we use to gather data
policy_old = None
old_policy_path = os.path.join(
    model_dir, 'pi_old_map_size_{0}_{1}.h5'.format(map_size[0],
                                                   dqn_model_type))
policy_old = LakeDQN(env,
                     gamma,
                     model_type=dqn_model_type,
                     position_of_holes=position_of_holes,
                     position_of_goals=position_of_goals,
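
`lambda_bound` and `eta` above parameterize the exponentiated-gradient step that updates the Lagrange multipliers in the main constrained-optimization loop. As a point of reference only, a generic exponentiated-gradient update on an l1 ball of radius lambda_bound (an assumption about the idea, not this project's ExponentiatedGradient class) looks like:

import numpy as np

def eg_step(lam, grad, eta=10., lambda_bound=10.):
    # Multiplicative-weights update, then renormalize so the multipliers
    # keep total l1 mass equal to lambda_bound.
    w = lam * np.exp(-eta * grad)
    return lambda_bound * w / w.sum()

lam = np.array([5., 5.])                        # current multipliers
lam = eg_step(lam, grad=np.array([0.2, -0.1]))  # one update step
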
Example #8
gamma = .95
action_space_map = {}
for i, action in enumerate(
    [k for k in itertools.product([-1, 0, 1], [1, 0], [0.2, 0])]):
    action_space_map[i] = action
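# This discretizes CarRacing's continuous (steer, gas, brake) control into
# 3 * 2 * 2 = 12 indexed actions, e.g. action_space_map[0] == (-1, 1, 0.2)
# and action_space_map[11] == (1, 0, 0).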

init_seed = 2
stochastic_env = False  # = not deterministic
max_pos_costs = 12  # The maximum allowable positive cost before ending episode early
max_time_spent_in_episode = 2000
env = ExtendedCarRacing(init_seed, stochastic_env, max_pos_costs)
exact_policy_algorithm = ExactPolicyEvaluator(
    action_space_map,
    gamma,
    env=env,
    frame_skip=frame_skip,
    num_frame_stack=num_frame_stack,
    pic_size=pic_size,
    constraint_thresholds=constraint_thresholds,
    constraints_cared_about=constraints_cared_about)
env.reset()

GPU = 0
SEED = 0
np.random.seed(SEED)
import tensorflow as tf
tf.set_random_seed(SEED)
import random
random.seed(SEED)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)
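# Pinning both TF thread pools to 1 (together with the numpy/tensorflow/random
# seeds above) keeps the TF1 runs as reproducible as possible.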
Example #9
    def __init__(self,
                 num_inputs,
                 num_outputs,
                 grid_shape,
                 dim_of_actions,
                 gamma,
                 convergence_of_model_epsilon=1e-10,
                 model_type='mlp',
                 position_of_holes=None,
                 position_of_goals=None,
                 num_frame_stack=None,
                 frame_skip=None,
                 pic_size=None,
                 **kw):
        '''
        An implementation of fitted Q iteration

        num_inputs: number of inputs
        num_outputs: number of outputs
        dim_of_actions: dimension of action space
        convergence_of_model_epsilon: small float. Defines when the model has converged.
        '''
        super(LakeNN, self).__init__()
        self.convergence_of_model_epsilon = convergence_of_model_epsilon
        self.model_type = model_type
        self.dim_of_actions = dim_of_actions
        self.dim_of_state = grid_shape[0] * grid_shape[1]
        self.grid_shape = grid_shape

        if self.model_type == 'cnn':
            assert position_of_holes is not None
            assert position_of_goals is not None

        self.position_of_goals = position_of_goals

        if position_of_holes is not None:
            self.position_of_holes = np.zeros(self.dim_of_state)
            self.position_of_holes[position_of_holes] = 1
            self.position_of_holes = self.position_of_holes.reshape(
                self.grid_shape)
        else:
            self.position_of_holes = position_of_holes

        if position_of_goals is not None:
            self.position_of_goals = np.zeros(self.dim_of_state)
            self.position_of_goals[position_of_goals] = 1
            self.position_of_goals = self.position_of_goals.reshape(
                self.grid_shape)
        else:
            self.position_of_goals = position_of_goals

        self.model = self.create_model(num_inputs, num_outputs)
        #debug purposes
        from config_lake import action_space_map, env
        if 'exact' in kw:
            self.policy_evalutor = kw['exact']
        else:
            self.policy_evalutor = ExactPolicyEvaluator(
                action_space_map,
                gamma,
                env=env,
                num_frame_stack=num_frame_stack,
                frame_skip=frame_skip,
                pic_size=pic_size)
Example #10
def main(env_name, headless):

    if headless:
        display = Display(visible=0, size=(1280, 1024))
        display.start()
    ###
    #paths

    model_dir = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    ###
    if env_name == 'lake':
        from config_lake import *
    elif env_name == 'car':
        from config_car import *
    else:
        raise

    #### Get a decent policy.
    #### Called pi_old because this will be the policy we use to gather data
    policy_old = None
    old_policy_path = os.path.join(model_dir, old_policy_name)

    if env_name == 'lake':
        policy_old = LakeDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            model_type=model_type,
            position_of_holes=position_of_holes,
            position_of_goals=position_of_goals,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            copy_over_target_every_M_training_iterations=
            copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir,
                                     'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )
    elif env_name == 'car':
        policy_old = CarDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            action_space_dim=action_space_dim,
            model_type=model_type,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            copy_over_target_every_M_training_iterations=
            copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir,
                                     'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )

    else:
        raise

    if not os.path.isfile(old_policy_path):
        print 'Learning a policy using DQN'
        policy_old.learn()
        policy_old.Q.model.save(old_policy_path)
    else:
        print 'Loading a policy'
        policy_old.Q.model = load_model(old_policy_path)
        # if env_name == 'car':
        #     try:
        #         # using old style model. This can be deleted if not using provided .h5 file
        #         policy_old.Q.all_actions_func = K.function([self.model.get_layer('inp').input], [self.model.get_layer('dense_2').output])
        #     except:
        #         pass

    # import pdb; pdb.set_trace()
    if env_name == 'car':
        policy_old.Q.all_actions_func = K.function(
            [policy_old.Q.model.get_layer('inp').input],
            [policy_old.Q.model.get_layer('all_actions').output])

    if env_name == 'lake':
        policy_printer = PrintPolicy(size=[map_size, map_size], env=env)
        policy_printer.pprint(policy_old)

    #### Problem setup
    if env_name == 'lake':
        best_response_algorithm = LakeFittedQIteration(
            state_space_dim + action_space_dim, [map_size, map_size],
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)

        fitted_off_policy_evaluation_algorithm = LakeFittedQEvaluation(
            initial_states,
            state_space_dim + action_space_dim, [map_size, map_size],
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size)
    elif env_name == 'car':
        best_response_algorithm = CarFittedQIteration(
            state_space_dim,
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack,
            initialization=policy_old,
            freeze_cnn_layers=freeze_cnn_layers)  # for _ in range(2)]
        fitted_off_policy_evaluation_algorithm = CarFittedQEvaluation(
            state_space_dim,
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack
        )  # for _ in range(2*len(constraints_cared_about) + 2)]
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size,
            constraint_thresholds=constraint_thresholds,
            constraints_cared_about=constraints_cared_about)
    else:
        raise

    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints), eta, starting_lambda=starting_lambda)
    exploratory_policy_old = StochasticPolicy(
        policy_old,
        action_space_dim,
        exact_policy_algorithm,
        epsilon=deviation_from_old_policy_eps,
        prob=prob)
    problem = Program(
        constraints,
        action_space_dim,
        best_response_algorithm,
        online_convex_algorithm,
        fitted_off_policy_evaluation_algorithm,
        exact_policy_algorithm,
        lambda_bound,
        epsilon,
        env,
        max_number_of_main_algo_iterations,
        num_frame_stack,
        pic_size,
    )

    lambdas = []
    policies = []

    # print exact_policy_algorithm.run(policy_old.Q, to_monitor=True)

    #### Collect Data
    try:
        print 'Loading Prebuilt Data'
        tic = time.time()
        # problem.dataset.data = dd.io.load('%s_data.h5' % env_name)
        # print 'Loaded. Time elapsed: %s' % (time.time() - tic)
        # num of times braking + distance to center of track + zeros
        if env_name == 'car':
            tic = time.time()
            action_data = dd.io.load(
                './seed_2_data/car_data_actions_seed_2.h5')
            frame_data = dd.io.load('./seed_2_data/car_data_frames_seed_2.h5')
            done_data = dd.io.load('./seed_2_data/car_data_is_done_seed_2.h5')
            next_state_data = dd.io.load(
                './seed_2_data/car_data_next_states_seed_2.h5')
            current_state_data = dd.io.load(
                './seed_2_data/car_data_prev_states_seed_2.h5')
            cost_data = dd.io.load('./seed_2_data/car_data_rewards_seed_2.h5')

            frame_gray_scale = np.zeros(
                (len(frame_data), 96, 96)).astype('float32')
            for i in range(len(frame_data)):
                frame_gray_scale[i, :, :] = np.dot(
                    frame_data[i, :, :, :] / 255., [0.299, 0.587, 0.114])
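            # The loop above is equivalent to a single vectorized call:
            # np.dot(frame_data / 255., [0.299, 0.587, 0.114]).astype('float32')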

            problem.dataset.data = {
                'frames': frame_gray_scale,
                'prev_states': current_state_data,
                'next_states': next_state_data,
                'a': action_data,
                'c': cost_data[:, 0],
                'g': cost_data[:, 1:],
                'done': done_data
            }
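            # 'prev_states'/'next_states' are indices into the shared 'frames'
            # buffer (the lake branch below materializes x = frames[prev_states]);
            # 'c' is the scalar cost and 'g' the vector of constraint costs.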

            problem.dataset.data['g'] = problem.dataset.data[
                'g'][:, constraints_cared_about]
            # problem.dataset.data['g'] = (problem.dataset.data['g'] >= constraint_thresholds[:-1]).astype(int)
            print 'Preprocessed g. Time elapsed: %s' % (time.time() - tic)
        else:
            raise
    except:
        print 'Failed to load'
        print 'Recreating dataset'
        num_goal = 0
        num_hole = 0
        dataset_size = 0
        main_tic = time.time()
        # from layer_visualizer import LayerVisualizer; LV = LayerVisualizer(exploratory_policy_old.policy.Q.model)
        for i in range(max_epochs):
            tic = time.time()
            x = env.reset()
            problem.collect(x, start=True)
            dataset_size += 1
            if env_name in ['car']: env.render()
            done = False
            time_steps = 0
            episode_cost = 0
            while not done:
                time_steps += 1
                if env_name in ['car']:
                    # exploration schedule: epsilon is the probability of
                    # deviating from pi_old, ramping up as data collection progresses
                    exploratory_policy_old.epsilon = 1. - np.exp(
                        -3 * (i / float(max_epochs)))

                #LV.display_activation([problem.dataset.current_state()[np.newaxis,...], np.atleast_2d(np.eye(12)[0])], 2, 2, 0)
                action = exploratory_policy_old(
                    [problem.dataset.current_state()], x_preprocessed=False)[0]
                cost = []
                for _ in range(frame_skip):
                    if env_name in ['car']: env.render()
                    x_prime, costs, done, _ = env.step(
                        action_space_map[action])
                    cost.append(costs)
                    if done:
                        break
                cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0)
                early_done, punishment = env.is_early_episode_termination(
                    cost=cost[0],
                    time_steps=time_steps,
                    total_cost=episode_cost)
                # print cost, action_space_map[action] #env.car.fuel_spent/ENGINE_POWER, env.tile_visited_count, len(env.track), env.tile_visited_count/float(len(env.track))
                done = done or early_done

                # if done and reward: num_goal += 1
                # if done and not reward: num_hole += 1
                episode_cost += cost[0] + punishment
                c = (cost[0] + punishment).tolist()
                g = cost[1:].tolist()
                if len(g) < len(constraints): g = np.hstack([g, 0])
                problem.collect(
                    action,
                    x_prime,  #np.dot(x_prime/255. , [0.299, 0.587, 0.114]),
                    np.hstack([c, g]).reshape(-1).tolist(),
                    done)  #{(x,a,x',c(x,a), g(x,a)^T, done)}
                dataset_size += 1
                x = x_prime
            if (i % 1) == 0:
                print 'Epoch: %s. Exploration probability: %s' % (
                    i,
                    np.round(exploratory_policy_old.epsilon, 5),
                )
                print 'Dataset size: %s Time Elapsed: %s. Total time: %s' % (
                    dataset_size, time.time() - tic, time.time() - main_tic)
                if env_name in ['car']:
                    print 'Performance: %s/%s = %s' % (
                        env.tile_visited_count, len(env.track),
                        env.tile_visited_count / float(len(env.track)))
                print '*' * 20
        problem.finish_collection(env_name)

    if env_name in ['lake']:
        problem.dataset['x'] = problem.dataset['frames'][
            problem.dataset['prev_states']]
        problem.dataset['x_prime'] = problem.dataset['frames'][
            problem.dataset['next_states']]
        problem.dataset['g'] = problem.dataset['g'][:, 0:1]
        print 'x Distribution:'
        print np.histogram(problem.dataset['x'],
                           bins=np.arange(map_size**2 + 1) - .5)[0].reshape(
                               map_size, map_size)

        print 'x_prime Distribution:'
        print np.histogram(problem.dataset['x_prime'],
                           bins=np.arange(map_size**2 + 1) - .5)[0].reshape(
                               map_size, map_size)
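        # Each histogram bins the flat state indices 0..map_size**2 - 1 and
        # reshapes the counts back onto the grid, i.e. a per-cell visit count.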

        print 'Number episodes achieved goal: %s. Number episodes fell in hole: %s' % (
            -problem.dataset['c'].sum(axis=0),
            problem.dataset['g'].sum(axis=0)[0])

        number_of_total_state_action_pairs = (state_space_dim - np.sum(
            env.desc == 'H') - np.sum(env.desc == 'G')) * action_space_dim
        number_of_state_action_pairs_seen = len(
            np.unique(np.hstack([
                problem.dataset['x'].reshape(1, -1).T,
                problem.dataset['a'].reshape(1, -1).T
            ]),
                      axis=0))
        print 'Percentage of State/Action space seen: %s' % (
            number_of_state_action_pairs_seen /
            float(number_of_total_state_action_pairs))

    # print 'C(pi_old): %s. G(pi_old): %s' % (exact_policy_algorithm.run(exploratory_policy_old,policy_is_greedy=False, to_monitor=True) )
    ### Solve Batch Constrained Problem

    iteration = 0
    while not problem.is_over(policies,
                              lambdas,
                              infinite_loop=infinite_loop,
                              calculate_gap=calculate_gap,
                              results_name=results_name,
                              policy_improvement_name=policy_improvement_name):
        iteration += 1
        K.clear_session()
        for i in range(1):

            # policy_printer.pprint(policies)
            print '*' * 20
            print 'Iteration %s, %s' % (iteration, i)
            print
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print 'lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i)
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print 'lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(
                    iteration, iteration - 1, lambdas[-1], i)

            lambda_t = lambdas[-1]
            pi_t, values = problem.best_response(lambda_t,
                                                 desc='FQI pi_{0}_{1}'.format(
                                                     iteration, i),
                                                 exact=exact_policy_algorithm)

            # policies.append(pi_t)
            problem.update(pi_t, values,
                           iteration)  #Evaluate C(pi_t), G(pi_t) and save
position_of_goals = np.arange(np.prod(env.desc.shape)).reshape(
    env.desc.shape)[np.nonzero(env.desc == 'G')]

#### Hyperparam
gamma = 0.9
max_fitting_epochs = 10  #max number of epochs over which to converge to Q^\ast
lambda_bound = 10.  # l1 bound on lagrange multipliers
epsilon = .01  # termination condition for two-player game
deviation_from_old_policy_eps = .7  # With what probability to deviate from the old policy
# convergence_epsilon = 1e-6 # termination condition for model convergence
action_space_dim = env.nA  # action space dimension
state_space_dim = env.nS  # state space dimension
eta = 10.  # param for exponentiated gradient algorithm
initial_states = [
    [0]
]  #The only initial state is [1,0...,0]. In general, this should be a list of initial states
policy_evaluator = ExactPolicyEvaluator(initial_states, state_space_dim, gamma)

#### Get a decent policy. Called pi_old because this will be the policy we use to gather data
policy_old = None
old_policy_path = os.path.join(model_dir, 'pi_old.h5')
policy_old = DeepQLearning(env, gamma)
if not os.path.isfile(old_policy_path):
    print 'Learning a policy using DQN'
    policy_old.learn()
    policy_old.Q.model.save(old_policy_path)
    print policy_old.Q.evaluate(render=True)
else:
    print 'Loading a policy'
    policy_old.Q.model = load_model(old_policy_path)
    print policy_old.Q.evaluate(render=True)
def main(env_name, headless):

    model_dir = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    #### Get a decent policy.
    #### Called pi_old because this will be the policy we use to gather data
    policy_old = None
    old_policy_path = os.path.join(model_dir, old_policy_name)
    if env_name == 'portfolio':
        policy_old = PortfolioA2C(
            env,
            gamma,
            action_space_dim=action_space_dim,
            model_type=model_type,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            copy_over_target_every_M_training_iterations=
            copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            models_path=os.path.join(
                model_dir, 'test_weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )

    else:
        raise

    if not os.path.isfile(old_policy_path):
        print('Learning a policy using DQN')
        policy_old.learn()
        policy_old.Q.actor_model.save(
            os.path.join(model_dir, "pi_old_portfolio_actor.hdf5"))
        policy_old.Q.critic_model.save(
            os.path.join(model_dir, "pi_old_portfolio_critic.hdf5"))
    else:
        print('Loading a policy')
        #        policy_old.Q.model = load_model(old_policy_path)
        policy_old.Q.actor_model = load_model(
            os.path.join(model_dir, "pi_old_portfolio_actor.hdf5"))
        policy_old.Q.critic_model = load_model(
            os.path.join(model_dir, "pi_old_portfolio_critic.hdf5"))

    #### Problem setup
    if env_name == 'portfolio':
        state_space_dim = env.observation_space.shape
        best_response_algorithm = PortfolioFittedQIteration(
            state_space_dim,
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack,
            initialization=policy_old)
        #                                                      freeze_cnn_layers=freeze_cnn_layers)# for _ in range(2)]
        fitted_off_policy_evaluation_algorithm = PortfolioFittedQEvaluation(
            state_space_dim,
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack
        )  # for _ in range(2*len(constraints_cared_about) + 2)]
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map=None,
            gamma=gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size
        )  #, constraint_thresholds=constraint_thresholds, constraints_cared_about=constraints_cared_about)

    else:
        raise

    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints), eta, starting_lambda=starting_lambda)

    problem = Program(
        constraints,
        action_space_dim,
        best_response_algorithm,
        online_convex_algorithm,
        fitted_off_policy_evaluation_algorithm,
        exact_policy_algorithm,
        lambda_bound,
        epsilon,
        env,
        max_number_of_main_algo_iterations,
        num_frame_stack,
        pic_size,
    )

    lambdas = []
    policies = []

    #### Collect Data
    try:
        print('Loading Prebuilt Data')
        batch_idxs = np.random.choice(len(
            dd.io.load(r".\datasets\finance_a.h5")),
                                      sample_size,
                                      replace=False)
        tic = time.time()
        if env_name == 'portfolio':
            tic = time.time()
            action_data = dd.io.load(r".\datasets\finance_a.h5")
            #            frame_data = dd.io.load()
            done_data = dd.io.load(r".\datasets\finance_done.h5")
            next_state_data = dd.io.load(r".\datasets\finance_next_states.h5")
            current_state_data = dd.io.load(
                r".\datasets\finance_prev_states.h5")
            c = dd.io.load(r".\datasets\finance_c.h5")
            g = dd.io.load(r".\datasets\finance_g.h5")

            problem.dataset.data = {
                'prev_states': [current_state_data[i] for i in batch_idxs],
                'next_states': [next_state_data[i] for i in batch_idxs],
                'a': [action_data[i] for i in batch_idxs],
                'c': [c[i] for i in batch_idxs],
                'g': [g[i] for i in batch_idxs],
                'done': [done_data[i] for i in batch_idxs]
            }
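            # batch_idxs subsamples sample_size entries uniformly without
            # replacement; indexing every key with the same batch_idxs keeps
            # the entries aligned across keys.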
            print('Preprocessed g. Time elapsed: %s' % (time.time() - tic))

        else:
            raise
    except:
        print('Failed to load')
        print('Recreating dataset')
        dataset_size = 0
        main_tic = time.time()
        for i in range(max_epochs):
            tic = time.time()
            x = env.reset()
            problem.collect(x, start=True)
            dataset_size += 1
            done = False
            time_steps = 0
            episode_cost = 0
            while not done:
                punishment = 0
                time_steps += 1
                cur_state = x
                if len(cur_state.shape) == 3:
                    cur_state = np.expand_dims(cur_state, axis=0)
                action = policy_old.Q.actor_model.predict(cur_state)[0]
                cost = []
                for _ in range(frame_skip):
                    x_prime, rewards, done, info = env.step(action)
                    costs = rewards[0] * -1
                    if costs > 0:
                        punishment = 1
                    cost.append((costs))
                    if done:
                        break
                if frame_skip > 1:
                    cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0)

                episode_cost += cost[0] + punishment
                c = (cost[0] + punishment).tolist()

                g = rewards[1:][0]
                if len(g) < len(constraints): g = np.hstack([g, 0])
                problem.collect(info['action'], x_prime,
                                np.hstack([c, g]).reshape(-1).tolist(), done)
                dataset_size += 1
                x = x_prime
            if (i % 1) == 0:
                print('Epoch: %s' % i)
                print(
                    'Dataset size: %s Time Elapsed: %s. Total time: %s' %
                    (dataset_size, time.time() - tic, time.time() - main_tic))
                if env_name in ['car']:
                    print('Performance: %s/%s = %s' %
                          (env.tile_visited_count, len(env.track),
                           env.tile_visited_count / float(len(env.track))))
                else:
                    print('performance: %s/%s =%s' %
                          (episode_cost, time_steps,
                           float(episode_cost) / float(time_steps)))
                print('*' * 20)
        problem.finish_collection(env_name)

    ### Solve Batch Constrained Problem

    iteration = 0
    while not problem.is_over(policies,
                              lambdas,
                              infinite_loop=infinite_loop,
                              calculate_gap=calculate_gap,
                              results_name=results_name,
                              policy_improvement_name=policy_improvement_name):
        iteration += 1
        K.clear_session()
        for i in range(1):

            # policy_printer.pprint(policies)
            print('*' * 20)
            print('Iteration %s, %s' % (iteration, i))
            print()
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print('lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i))
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print('lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(
                    iteration, iteration - 1, lambdas[-1], i))

            lambda_t = lambdas[-1]
            #FQI here
            pi_t, values = problem.best_response(lambda_t,
                                                 desc='FQI pi_{0}_{1}'.format(
                                                     iteration, i),
                                                 exact=exact_policy_algorithm)
            torch.save(pi_t.state_dict(),
                       os.path.join(model_dir, "pi_final.hdf5"))
            #            pi_t.model_params.save(os.path.join(model_dir,"pi_final.hdf5"))
            #FQE
            problem.update(pi_t, values,
                           iteration)  #Evaluate C(pi_t), G(pi_t) and save