def __init__(self, tpolicy, config=None, bpolicy=None):

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        compute_bprobabilities  bool            False               whether to recompute bprobabilities or use
                                                                    the ones stored in the trajectory. This is the
                                                                    difference between on-policy and off-policy updates.
        truncate_rho            bool            False               whether to truncate the importance sampling ratio
                                                                    at 1    
        """
        self.n = check_attribute_else_default(config, 'n', 1)
        self.gamma = check_attribute_else_default(config, 'gamma', 1.0)
        self.compute_bprobabilities = check_attribute_else_default(
            config, 'compute_bprobabilities', False)
        self.truncate_rho = check_attribute_else_default(
            config, 'truncate_rho', False)
        """
        Other Parameters:
        tpolicy - The target policy
        bpolicy - Behaviour policy. Only required if compute_bprobabilities is True.
        """
        self.tpolicy = tpolicy
        self.bpolicy = bpolicy
        if self.compute_bprobabilities:
            assert self.bpolicy is not None
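For context, a minimal sketch of how compute_bprobabilities and truncate_rho interact when forming the importance sampling ratio; the helper and the probability_of_action interface are assumptions, not part of the class above:

def importance_sampling_ratio(tpolicy, bpolicy, state, action,
                              stored_bprob, compute_bprobabilities, truncate_rho):
    # Hypothetical helper; assumes each policy exposes
    # a probability_of_action(state, action) method.
    tprob = tpolicy.probability_of_action(state, action)
    if compute_bprobabilities:
        bprob = bpolicy.probability_of_action(state, action)  # recompute under bpolicy
    else:
        bprob = stored_bprob  # probability recorded in the trajectory
    rho = tprob / bprob
    return min(rho, 1.0) if truncate_rho else rho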
    def __init__(self, tpolicy, config=None, bpolicy=None):

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        compute_bprobs          bool            False               whether to recompute behaviour policy probabilities
                                                                    or use the ones stored in the buffer.
        onpolicy                bool            True                whether to compute the on-policy or the off-policy
                                                                    return, i.e. whether to omit or include the
                                                                    importance sampling ratio.
        """
        self.n = check_attribute_else_default(config, 'n', 1)
        self.gamma = check_attribute_else_default(config, 'gamma', 1.0)
        self.compute_bprobs = check_attribute_else_default(
            config, 'compute_bprobs', False)
        self.onpolicy = check_attribute_else_default(config, 'onpolicy', True)
        """
        Other Parameters:
        tpolicy - The target policy
        bpolicy - Behaviour policy. Only required if compute_bprobs is True.
        """
        self.tpolicy = tpolicy
        self.bpolicy = bpolicy
        if self.compute_bprobs:
            assert self.bpolicy is not None
    def __init__(self, tpolicy, config=None):

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        sigma                   float           0.5                 The sigma parameter; see De Asis et al. (2018)
        sigma_decay             float           1.0                 Decay rate of sigma. At the end of each episode
                                                                    we let: sigma *= sigma_decay
        use_buffer_sigma        bool            False               Whether to use the sigma retrieved from the buffer
                                                                    or use the current sigma               
        """
        self.config = config
        self.n = check_attribute_else_default(config, 'n', 1)
        self.gamma = check_attribute_else_default(config, 'gamma', 1.0)
        self.sigma = check_attribute_else_default(config, 'sigma', 0.5)
        self.sigma_decay = check_attribute_else_default(
            config, 'sigma_decay', 1.0)
        self.use_buffer_sigma = check_attribute_else_default(
            config, 'use_buffer_sigma', False)
        """
        Other Parameters:
        tpolicy - The target policy
        """
        self.tpolicy = tpolicy
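The decay schedule above is geometric: after k episodes, sigma equals the initial sigma times sigma_decay ** k. A tiny illustration with made-up values:

sigma, sigma_decay = 0.5, 0.95
for episode in range(3):
    sigma *= sigma_decay  # 0.475, 0.45125, 0.4286875 after successive episodes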
Example #4
    def __init__(self, config=None, summary=None):

        super().__init__()
        assert isinstance(config, Config)
        """ Parameters:
        Name:                       Type            Default:        Description(omitted when self-explanatory):
        max_actions                 int             5000            The max number of actions executed before forcing
                                                                    termination
        save_summary                bool            False           Whether to save a summary of the environment
        """
        self.max_actions = check_attribute_else_default(config, 'max_actions', 5000)
        self.save_summary = check_attribute_else_default(config, 'save_summary', False)
        self.summary = summary
        if self.save_summary:
            assert isinstance(self.summary, dict)
            check_dict_else_default(self.summary, "steps_per_episode", [])

        " Inner state of the environment "
        self.step_count = 0
        self.actions = np.array([0, 1, 2], dtype=int)  # 0 = backward, 1 = coast, 2 = forward
        self.high = np.array([0.5, 0.07], dtype=np.float32)
        self.low = np.array([-1.2, -0.07], dtype=np.float32)
        self.action_dictionary = {0: -1,   # accelerate backwards
                                  1: 0,    # coast
                                  2: 1}    # accelerate forwards
        self.current_state = self.reset()  # reset after the attributes it may rely on are set
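The bounds and actions above match the classic Mountain Car domain. A sketch of the usual dynamics under that assumption; shown for context only, not necessarily this class's step() implementation:

import numpy as np

def mountain_car_step(state, action, low, high):
    # Classic Mountain Car dynamics (Sutton & Barto).
    position, velocity = state
    velocity += 0.001 * action - 0.0025 * np.cos(3 * position)  # action in {-1, 0, 1}
    velocity = np.clip(velocity, low[1], high[1])
    position = np.clip(position + velocity, low[0], high[0])
    if position == low[0]:
        velocity = 0.0  # hitting the left wall zeroes the velocity
    return np.array([position, velocity], dtype=np.float32)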
Example #5
    def __init__(self, config=None):
        super().__init__()
        assert isinstance(config, Config)
        """
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        num_tilings             int             32                  Number of tilings
        tiling_side_length      int             8                   The length of the tiling side
        num_actions             int             3                   Number of actions
        num_dims                int             2                   Number of dimensions
        alpha                   float           0.1                 Learning rate
        """
        self.num_tilings = check_attribute_else_default(
            config, 'num_tilings', 32)
        self.tiling_side_length = check_attribute_else_default(
            config, 'tiling_side_length', 8)
        self.num_actions = check_attribute_else_default(
            config, 'num_actions', 3)
        self.num_dims = check_attribute_else_default(config, 'num_dims', 2)
        self.alpha = check_attribute_else_default(config, 'alpha', 0.1)

        self.tiles_per_tiling = self.tiling_side_length**self.num_dims
        self.num_tiles = (self.num_tilings * self.tiles_per_tiling)
        self.theta = 0.001 * random(self.num_tiles * self.num_actions)
        self.iht = IHT(self.num_tiles)
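Queries against the IHT above presumably go through Sutton's tiles3 utilities. A sketch of a value lookup, assuming state components are already normalized to [0, 1] before scaling; the method below is illustrative, not taken from the source:

from tiles3 import tiles  # Sutton's tile-coding software, which also defines IHT

def get_value(self, state, action):
    # Map the state onto the tiling grid, fetch one active tile per tiling,
    # and sum the matching weights.
    scaled = [s * self.tiling_side_length for s in state]
    active_tiles = tiles(self.iht, self.num_tilings, scaled)
    # theta stores the weights for all actions back to back:
    indices = [action * self.num_tiles + tile for tile in active_tiles]
    return self.theta[indices].sum()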
    def __init__(self, optimizer, neural_network, config=None, tf_session=None, restore=False, summary=None):
        super().__init__()
        """
        Summary Names:
            cumulative_loss
            training_steps
        """

        assert isinstance(config, Config)
        self.config = config
        """ 
        Parameters in config:
        Name:                       Type:           Default:            Description: (Omitted when self-explanatory)
        alpha                       float           0.001               step size parameter
        obs_dims                    list            [4, 84, 84]         Observations presented to the agent
        save_summary                bool            False               Save the summary of the network 
        """
        self.alpha = check_attribute_else_default(self.config, 'alpha', 0.001)
        self.obs_dims = check_attribute_else_default(self.config, 'obs_dims', [4,84,84])
        self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)

        self.td_error_sqrd = np.random.rand() * 0.0001

        self.number_of_percentiles = 100
        self.percentiles = np.zeros(self.number_of_percentiles, dtype=np.float64)
        self.initialized_percentiles = False
        self.percentiles_record = np.zeros(self.number_of_percentiles, dtype=np.float64)
        self.percentiles_count = 0

        if self.save_summary:
            assert isinstance(summary, dict)
            self.summary = summary
            check_dict_else_default(summary, 'cumulative_loss', [])
            check_dict_else_default(summary, 'training_steps', [])
            self.training_steps = 0
            self.cumulative_loss = 0

        " Neural Network Model "
        self.network = neural_network

        " Training and Learning Evaluation: Tensorflow and variables initializer "
        # self.optimizer = optimizer(self.alpha)
        self.sess = tf_session or tf.Session()

        " Train step "
        # self.learning_rate = tf.placeholder(tf.float64, shape=[])
        self.learning_rate = tf.placeholder(tf.float32, shape=None)
        self.decay = tf.placeholder(tf.float32, shape=None)
        self.train_step = optimizer(self.alpha).minimize(self.network.train_loss,
                                                         var_list=self.network.train_vars[0])

        # initializing variables in the graph
        if not restore:
            for var in tf.global_variables():
                self.sess.run(var.initializer)
    def __init__(self, config=None):

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        num_actions             int             3                   number of actions    
        """
        self.n = check_attribute_else_default(config, 'n', 1)
        self.gamma = check_attribute_else_default(config, 'gamma', 1.0)
        self.num_actions = check_attribute_else_default(
            config, 'num_actions', 3)
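For reference, n and gamma parameterize the n-step return G = R_{t+1} + gamma * R_{t+2} + ... + gamma**(n-1) * R_{t+n} + gamma**n * Q(S_{t+n}, A_{t+n}). A direct sketch (the function name is illustrative):

def n_step_return(rewards, bootstrap_value, gamma):
    # rewards: the n rewards R_{t+1}, ..., R_{t+n}
    # bootstrap_value: Q(S_{t+n}, A_{t+n}), or 0 if the episode terminated
    G = bootstrap_value
    for reward in reversed(rewards):
        G = reward + gamma * G
    return G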
Example #8
    def __init__(self,
                 environment,
                 function_approximator,
                 behaviour_policy,
                 er_buffer,
                 config=None,
                 summary=None):
        super().__init__()
        """
        Summary Name: return_per_episode
        """
        self.config = config or Config()
        assert isinstance(self.config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        num_actions             int             3                   number of actions
        initial_rand_steps      int             0                   number of random steps before training starts
        rand_steps_count        int             0                   number of random steps taken so far
        save_summary            bool            False               Save the summary of the agent (return per episode)
        """
        self.num_actions = check_attribute_else_default(
            self.config, 'num_actions', 3)
        self.initial_rand_steps = check_attribute_else_default(
            self.config, 'initial_rand_steps', 0)
        check_attribute_else_default(self.config, 'rand_steps_count', 0)
        self.save_summary = check_attribute_else_default(
            self.config, 'save_summary', False)

        if self.save_summary:
            assert isinstance(summary, dict)
            self.summary = summary
            check_dict_else_default(self.summary, 'return_per_episode', [])

        " Other Parameters "
        # Behaviour and Target Policies
        self.bpolicy = behaviour_policy

        # Experience Replay Buffer: used for storing and retrieving observations. Mainly for Deep RL
        self.er_buffer = er_buffer

        # Function Approximator: used to approximate the Q-Values
        self.fa = function_approximator

        # Environment that the agent is interacting with
        self.env = environment
Example #9
    def __init__(self, environment, function_approximator, target_policy, behaviour_policy, config=None, er_buffer=None,
                 summary=None):
        super().__init__()
        """
        Summary Name: return_per_episode
        """
        self.config = config or Config()
        assert isinstance(self.config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        beta                    float           1.0                 the decay factor of sigma
        sigma                   float           0.5                 see De Asis et al. in the AAAI 2018 proceedings
        use_er_buffer           bool            False               indicates whether to use experience replay buffer
        initial_rand_steps      int             0                   number of random steps before training starts
        rand_steps_count        int             0                   number of random steps taken so far
        save_summary            bool            False               Save the summary of the agent (return per episode)
        """
        self.n = check_attribute_else_default(self.config, 'n', 1)
        self.gamma = check_attribute_else_default(self.config, 'gamma', 1.0)
        self.beta = check_attribute_else_default(self.config, 'beta', 1.0)
        self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
        self.use_er_buffer = check_attribute_else_default(self.config, 'use_er_buffer', False)
        self.initial_rand_steps = check_attribute_else_default(self.config, 'initial_rand_steps', 0)
        check_attribute_else_default(self.config, 'rand_steps_count', 0)
        self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)

        if self.save_summary:
            assert isinstance(summary, dict)
            self.summary = summary
            check_dict_else_default(self.summary, 'return_per_episode', [])

        " Other Parameters "
        # Behaviour and Target Policies
        self.bpolicy = behaviour_policy
        self.tpolicy = target_policy

        # Experience Replay Buffer: used for storing and retrieving observations. Mainly for Deep RL
        self.er_buffer = er_buffer
        if self.use_er_buffer:
            assert self.er_buffer is not None

        # Function Approximator: used to approximate the Q-Values
        self.fa = function_approximator

        # Environment that the agent is interacting with
        self.env = environment
    def __init__(self, tpolicy, bpolicy, config=None):

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        n                       int             1                   the n of the n-step method
        gamma                   float           1.0                 the discount factor
        """
        self.n = check_attribute_else_default(config, 'n', 1)
        self.gamma = check_attribute_else_default(config, 'gamma', 1.0)

        """
        Other Parameters:
        tpolicy - The target policy
        bpolicy - The behaviour policy
        """
        self.tpolicy = tpolicy
        self.bpolicy = bpolicy
Example #11
    def __init__(self, config=None, behaviour_policy=False):
        super().__init__()
        """ 
        Parameters in config:
        Name:               Type:           Default:            Description: (Omitted when self-explanatory)
        num_actions         int             2                   Number of actions available to the agent
        initial_epsilon     float           0.1                 Epsilon before annealing
        anneal_epsilon      bool            False               Indicates whether to anneal epsilon
        final_epsilon       float           initial_epsilon     The value of epsilon after annealing    
        annealing_period    int             100,000             Number of steps before reaching final epsilon
        anneal_steps_count  int             0                   Number of times epsilon has been annealed  
                
        Other Parameters:
        Name:               Type:           Default:            Description:
        behaviour_policy    bool            False               Indicates whether this is the behaviour or target policy
        """
        self.config = config or Config()
        assert isinstance(self.config, Config)
        self.num_actions = check_attribute_else_default(
            self.config, 'num_actions', 2)
        if behaviour_policy:
            current_config = check_attribute_else_default(
                self.config, 'behaviour_policy', Config())
        else:
            current_config = check_attribute_else_default(
                self.config, 'target_policy', Config())
        self.initial_epsilon = check_attribute_else_default(
            current_config, 'initial_epsilon', 0.1)
        self.anneal_epsilon = check_attribute_else_default(
            current_config, 'anneal_epsilon', False)
        self.final_epsilon = check_attribute_else_default(
            current_config, 'final_epsilon', self.initial_epsilon)
        self.annealing_period = check_attribute_else_default(
            current_config, 'annealing_period', 100000)
        check_attribute_else_default(self.config, 'anneal_steps_count', 0)

        self.epsilon = self.initial_epsilon
        self.p_random = (self.epsilon / self.num_actions)
        self.p_optimal = self.p_random + (1 - self.epsilon)
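p_random and p_optimal are the standard epsilon-greedy action probabilities. A sketch of the full distribution they imply (not the original class's method):

import numpy as np

def action_probabilities(q_values, epsilon):
    # Every action gets the uniform share epsilon / num_actions (p_random above),
    # and the greedy action gets the remaining 1 - epsilon on top (p_optimal above).
    num_actions = len(q_values)
    probs = np.full(num_actions, epsilon / num_actions)
    probs[int(np.argmax(q_values))] += 1.0 - epsilon
    return probs

# e.g. sampling: np.random.choice(len(q_values), p=action_probabilities(q_values, 0.1))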
    def __init__(self, optimizer, target_network, update_network, er_buffer, config=None, tf_session=None,
                 restore=False, summary=None):
        """
        Summary Names:
            cumulative_loss
            training_steps
        """

        super().__init__()
        assert isinstance(config, Config)
        self.config = config
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        alpha                   float           0.00025             step size parameter
        obs_dims                list            [4,84,84]           the dimensions of the observations
        tnetwork_update_freq    int             10,000              number of updates before updating the target network
        update_count            int             0                   number of updates performed
        save_summary            bool            False               indicates whether to save a summary of training
        """
        self.alpha = check_attribute_else_default(self.config, 'alpha', 0.00025)
        self.obs_dims = check_attribute_else_default(self.config, 'obs_dims', [4, 84, 84])
        self.tnetwork_update_freq = check_attribute_else_default(self.config, 'tnetwork_update_freq', 10000)
        self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)
        check_attribute_else_default(self.config, 'update_count', 0)
        self.summary = summary
        if self.save_summary:
            assert isinstance(self.summary, dict)
            check_dict_else_default(self.summary, 'cumulative_loss', [])
            check_dict_else_default(self.summary, 'training_steps', [])
            self.training_steps = 0
            self.cumulative_loss = 0

        """ Other Parameters """
        " Experience Replay Buffer and Return Function "
        self.er_buffer = er_buffer

        " Neural Network Models "
        self.target_network = target_network    # Target Network
        self.update_network = update_network    # Update Network

        " Training and Learning Evaluation: Tensorflow and variables initializer "
        self.optimizer = optimizer(self.alpha)
        self.sess = tf_session or tf.Session()

        " Train step "
        self.train_step = self.optimizer.minimize(self.update_network.train_loss,
                                                  var_list=self.update_network.train_vars[0])

        " Initializing variables in the graph"
        if not restore:
            for var in tf.global_variables():
                self.sess.run(var.initializer)
            self.update_target_network()
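update_count and tnetwork_update_freq imply the usual DQN schedule: copy the update network into the target network every tnetwork_update_freq training updates. A hypothetical bookkeeping sketch, not the class's actual method:

def after_training_step(agent):
    # Sync the target network every tnetwork_update_freq training updates.
    agent.config.update_count += 1
    if agent.config.update_count % agent.tnetwork_update_freq == 0:
        agent.update_target_network()  # same call made above when not restoring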
Example #13
    def __init__(self, config=None, name="default", SEED=None):
        super().__init__()

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        dim_out                 list            [10,10,10]          the output dimensions of each layer, i.e. neurons
        filter_dims             list            [2,2]               the dimensions of each filter
        strides                 list            [4, 2]              strides used by each convolutional layer
        obs_dims                list            [4,84,84]           the dimensions of the observations seen by the agent
        num_actions             int             2                   the number of actions available to the agent
        gate_fun                tf gate fun     tf.nn.relu          the gate function used across the whole network
        conv_layers             int             2                   number of convolutional layers
        full_layers             int             1                   number of fully connected layers
        max_pool                bool            True                indicates whether to max pool between each conv layer
        frames_format           str             "NCHW"              Specifies the format of the frames fed to the network
        norm_factor             float           1                   Normalizes the frames by the value provided               
        """
        self.dim_out = check_attribute_else_default(config, 'dim_out', [10,10,10])
        self.filter_dims = check_attribute_else_default(config, 'filter_dims', [2,2])
        self.strides = check_attribute_else_default(config, 'strides', [4,2])
        channels, height, width = check_attribute_else_default(config, 'obs_dims', [4, 84, 84])
        num_actions = check_attribute_else_default(config, 'num_actions', 2)
        self.gate_fun = check_attribute_else_default(config, 'gate_fun', tf.nn.relu)
        self.convolutional_layers = check_attribute_else_default(config, 'conv_layers', 2)
        self.fully_connected_layers = check_attribute_else_default(config, 'full_layers', 1)
        self.max_pool = check_attribute_else_default(config, 'max_pool', True)
        self.frames_format = check_attribute_else_default(config, 'frames_format', 'NCHW')
        self.norm_factor = check_attribute_else_default(config, 'norm_factor', 1.)

        """
        Other Parameters:
        name - name of the network. Should be a string.
        """
        self.name = name
        row_and_action_number = 2
        total_layers = self.convolutional_layers + self.fully_connected_layers

        " Placehodler "
        self.x_frames = tf.placeholder(tf.float32, shape=(None, channels, height, width))   # input frames
        self.x_frames = tf.divide(self.x_frames, self.norm_factor)
        self.x_actions = tf.placeholder(tf.int32, shape=(None, row_and_action_number))      # input actions
        self.y = tf.placeholder(tf.float32, shape=None)                                     # target

        " Variables for Training "
        self.train_vars = []

        """ Convolutional layers """
        dim_in_conv = [channels] + self.dim_out[:self.convolutional_layers - 1]
        current_s_hat = normalized_frames
        if self.frames_format == "NHWC":
            current_s_hat = tf.transpose(current_s_hat, [0, 2, 3, 1])

        for i in range(self.convolutional_layers):
            if self.frames_format == "NHWC":
                out_height = np.ceil(current_s_hat.shape[1]._value / self.strides[i])
                out_width = np.ceil(current_s_hat.shape[2]._value / self.strides[i])
                centers_shape = np.array((out_height, out_width, self.dim_out[i]), dtype=np.uint32)
            else: # Format = "NCHW"
                out_height = np.ceil(current_s_hat.shape[2]._value / self.strides[i])
                out_width = np.ceil(current_s_hat.shape[3]._value / self.strides[i])
                centers_shape = np.array((self.dim_out[i], out_height, out_width), dtype=np.uint32)
            centers = tf.constant(np.random.uniform(0,1, size=centers_shape), dtype=tf.float32)
            stddev = tf.constant(1/self.dim_out[i], dtype=tf.float32)
            # layer n: convolutional
            W, b, z_hat, r_hat = layers.convolution_2d_rbf(
                self.name, "conv_rbf_"+str(i+1), current_s_hat, self.filter_dims[i], dim_in_conv[i], self.dim_out[i],
                tf.random_normal_initializer(stddev=1.0 / np.sqrt(self.filter_dims[i]**2 * dim_in_conv[i] + 1),
                                             seed=SEED),
                center=centers, stddev=stddev, stride=self.strides[i], format=self.frames_format)
            # layer n + 1/2: pool
            if self.max_pool:
                s_hat = tf.nn.max_pool(
                    r_hat, ksize=[1, 1, 2, 2], strides=[1, 1, 2, 2], padding="SAME")
            else:
                s_hat = r_hat

            current_s_hat = s_hat
            self.train_vars.extend([W, b])

        """ Fully Connected layers """
        shape = current_s_hat.get_shape().as_list()
        current_y_hat = tf.reshape(current_s_hat, [-1, shape[1] * shape[2] * shape[3]])
        # shape[-3:] are the last 3 dimensions. shape has 4 entries: entry 1 = batch size (None),
        # entries 2-4 = the feature map dimensions, whose product is the fully connected input size.
        dim_in_fully = [np.prod(shape[-3:])] + self.dim_out[self.convolutional_layers: total_layers-1]
        dim_out_fully = self.dim_out[self.convolutional_layers:]
        for j in range(self.fully_connected_layers):
            centers_shape = (dim_in_fully[j], dim_out_fully[j])
            centers = tf.constant(np.random.uniform(low=0, high=1, size=centers_shape), dtype=tf.float32)
            stddev = tf.constant(1/dim_out_fully[j], dtype=np.float32)

            # layer n + m: fully connected
            W, b, z_hat, y_hat = layers.fully_connected_rbf(
                self.name, "full_rbf_"+str(j+1), current_y_hat, dim_in_fully[j], dim_out_fully[j],
                tf.random_normal_initializer(stddev=1.0 / np.sqrt(dim_in_fully[j]), seed=SEED),
                center=centers, stddev=stddev)

            current_y_hat = y_hat
            self.train_vars.extend([W, b])

        """ Output layer """
        # output layer: fully connected
        W, b, z_hat, self.y_hat = layers.fully_connected(
            self.name, "output_layer", current_y_hat, self.dim_out[-1], num_actions,
            tf.random_normal_initializer(stddev=1.0 / np.sqrt(self.dim_out[-1]), seed=SEED), linear_transfer)
        self.train_vars.extend([W, b])
        self.train_vars = [self.train_vars]

        # Selecting the estimated value for each (batch row, action) pair in x_actions
        y_hat = tf.gather_nd(self.y_hat, self.x_actions)
        y = self.y
        # Temporal Difference Error
        self.td_error = tf.subtract(y, y_hat)
        # Loss
        self.train_loss = tf.reduce_sum(tf.pow(self.td_error, 2))
Example #14
    def __init__(self, config=None, name="default", SEED=None):
        super().__init__()

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                   Type:           Default:            Description: (Omitted when self-explanatory)
        dim_out                 list            [10,10,10]          the output dimensions of each layer, i.e. neurons
        obs_dims                list            [2]                 the dimensions of the observations seen by the agent
        num_actions             int             3                   the number of actions available to the agent
        gate_fun                tf gate fun     tf.nn.relu          the gate function used across the whole network
        full_layers             int             3                   number of fully connected layers
        """
        self.dim_out = check_attribute_else_default(config, 'dim_out', [10,10,10])
        self.obs_dims = check_attribute_else_default(config, 'obs_dims', [2])
        self.num_actions = check_attribute_else_default(config, 'num_actions', 3)
        self.gate_fun = check_attribute_else_default(config, 'gate_fun', tf.nn.relu)
        self.full_layers = check_attribute_else_default(config, 'full_layers', 3)

        """
        Other Parameters:
        name - name of the network. Should be a string.
        """
        self.name = name

        " Dimensions "
        dim_in = [np.prod(self.obs_dims)] + self.dim_out[:-1]
        row_and_action_number = 2
        " Placehodler "
        self.x_frames = tf.placeholder(tf.float32, shape=(None, dim_in[0]))             # input frames
        self.x_actions = tf.placeholder(tf.int32, shape=(None, row_and_action_number))  # input actions
        self.y = tf.placeholder(tf.float32, shape=None)                                 # target
        " Variables for Training "
        self.train_vars = []

        " Fully Connected Layers "
        current_y_hat = self.x_frames
        for j in range(self.full_layers):
            # layer n + m: fully connected
            W, b, z_hat, y_hat = layers.fully_connected(
                self.name, "full_" + str(j + 1), current_y_hat, dim_in[j], self.dim_out[j],
                tf.random_normal_initializer(stddev=1.0 / np.sqrt(dim_in[j]), seed=SEED), self.gate_fun)

            current_y_hat = y_hat
            self.train_vars.extend([W, b])

        """ Output layer """
        # output layer: fully connected
        W, b, z_hat, self.y_hat = layers.fully_connected(
            self.name, "output_layer", current_y_hat, self.dim_out[-1], self.num_actions,
            tf.random_normal_initializer(stddev=1.0 / np.sqrt(self.dim_out[-1]), seed=SEED), linear_transfer)
        self.train_vars.extend([W, b])
        self.train_vars = [self.train_vars]

        # Selecting the estimated value for each (batch row, action) pair in x_actions
        y_hat = tf.gather_nd(self.y_hat, self.x_actions)
        y = self.y
        # Temporal Difference Error
        self.td_error = tf.subtract(y, y_hat)
        # Loss
        self.train_loss = tf.reduce_sum(tf.pow(self.td_error, 2))
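The (None, 2)-shaped x_actions placeholder explains the gather used by both networks above: each row pairs a batch index with the action taken in it, so tf.gather_nd pulls Q(s_i, a_i) out of the y_hat matrix. A self-contained numpy equivalent with illustrative values:

import numpy as np

# numpy equivalent of the tf.gather_nd + squared TD-error loss above
q_matrix = np.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])              # y_hat: (batch, num_actions)
x_actions = np.array([[0, 2], [1, 0]])              # (batch row, action) pairs
y_hat = q_matrix[x_actions[:, 0], x_actions[:, 1]]  # -> [3.0, 4.0]
y = np.array([2.5, 4.5])                            # targets
train_loss = np.sum((y - y_hat) ** 2)               # 0.25 + 0.25 = 0.5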
Example #15
    def __init__(self,
                 optimizer,
                 neural_network,
                 config=None,
                 tf_session=None,
                 restore=False,
                 summary=None):
        super().__init__()
        """
        Summary Names:
            cumulative_loss
            training_steps
        """

        assert isinstance(config, Config)
        """ 
        Parameters in config:
        Name:                       Type:           Default:            Description: (Omitted when self-explanatory)
        alpha                       float           0.001               step size parameter
        batch_sz                    int             1                   
        obs_dims                    list            [4, 84, 84]         Observations presented to the agent
        train_percentile_index      int             0                   Above which percentile should the td_error be
                                                                        for the observation to be processed (trained on)
        num_percentiles             int             10                  number of percentiles to be estimated
        percentile_estimator        class           see description     Estimates the percentiles. The default is:
                                                                        Percentile_Estimator(num_percentiles). Use the
                                                                        default unless you're restoring an agent.
        adjust_alpha                bool            False               Indicates whether to use the percentiles
                                                                        information to adjust alpha     
        save_summary                bool            False               Save the summary of the network 
        """
        self.alpha = check_attribute_else_default(config, 'alpha', 0.001)
        self.batch_sz = check_attribute_else_default(config, 'batch_sz', 1)
        self.obs_dims = check_attribute_else_default(config, 'obs_dims',
                                                     [4, 84, 84])
        self.train_percentile_index = check_attribute_else_default(
            config, 'train_percentile_index', 0)
        self.num_percentiles = check_attribute_else_default(
            config, 'num_percentiles', 10)
        self.percentile_estimator = check_attribute_else_default(
            config, 'percentile_estimator',
            Percentile_Estimator(self.num_percentiles))
        self.adjust_alpha = check_attribute_else_default(
            config, 'adjust_alpha', False)
        self.save_summary = check_attribute_else_default(
            config, 'save_summary', False)
        if self.save_summary:
            assert isinstance(summary, dict)
            self.summary = summary
            check_dict_else_default(summary, 'cumulative_loss', [])
            check_dict_else_default(summary, 'training_steps', [])
            self.training_steps = 0
            self.cumulative_loss = 0

        " Neural Network Model "
        self.network = neural_network

        " Training and Learning Evaluation: Tensorflow and variables initializer "
        self.optimizer = optimizer(self.alpha)
        self.sess = tf_session or tf.Session()

        " Train step "
        self.train_step = self.optimizer.minimize(
            self.network.train_loss, var_list=self.network.train_vars[0])

        # initializing variables in the graph
        if not restore:
            for var in tf.global_variables():
                self.sess.run(var.initializer)

        " Buffer "
        self.buffer = Buffer(buffer_size=self.batch_sz,
                             observation_dimensions=self.obs_dims)
        self.train_p = 0.9
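train_percentile_index suggests a filter that trains only on sufficiently surprising observations. A sketch under the assumption that the estimator exposes its percentiles as an ascending array; the API below is guessed, not taken from the source:

def should_train(td_error, percentiles, train_percentile_index):
    # percentiles: ascending array of estimated percentile values;
    # train only when |td_error| clears the chosen threshold.
    return abs(td_error) >= percentiles[train_percentile_index]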