Example #1
    def get_action_results(self, last_states, actions, time_i):

        # state is a boolean vector: whether or not the i-th action
        # was tried already during this session;
        # the end-action bit (last_state[:, self.end_action_id]) stays 1 once it has been triggered

        last_state = check_list(last_states)[0]
        action = check_list(actions)[0]
        
        batch_range = T.arange(action.shape[0])

        session_active = T.eq(last_state[:,self.end_action_id],0)
        
        state_after_action = T.set_subtensor(last_state[batch_range,action],1)
        
        new_state = T.switch(
            session_active.reshape([-1,1]),
            state_after_action,
            last_state
        )
        
        session_terminated = T.eq(new_state[:,self.end_action_id],1)
        
        observation = T.concatenate([
                self.joint_data[batch_range,action,None],#uint8[batch,1]
                session_terminated.reshape([-1,1]), #whether session has been terminated by now
                T.extra_ops.to_one_hot(action,self.joint_data.shape[1]),
            ],axis=1)
        
        return new_state, observation
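To see what the state update above does, it can help to mirror it in plain NumPy: the state is a per-action "already tried" bitmask, and once the end-action bit is set the whole row freezes. The snippet below is only an illustrative sketch with made-up sizes (3 sessions, 5 actions, end action id 4), not part of the original class:

import numpy as np

# hypothetical toy setup: 3 sessions, 5 possible actions, action 4 ends the session
n_actions, end_action_id = 5, 4
last_state = np.zeros((3, n_actions), dtype=np.uint8)
last_state[1, end_action_id] = 1           # session 1 has already ended
actions = np.array([2, 0, end_action_id])  # action chosen by each session this tick

batch_range = np.arange(actions.shape[0])
session_active = (last_state[:, end_action_id] == 0)

# mark the chosen action as tried, but only for sessions that were still active
state_after_action = last_state.copy()
state_after_action[batch_range, actions] = 1
new_state = np.where(session_active[:, None], state_after_action, last_state)

print(new_state)
# session 0 marked action 2, session 1 stays frozen, session 2 just terminated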
Example #2
    def get_reward(self, state_sequences, action_sequences, batch_i):
        """
        WARNING! this runs on a single session, not on a batch
        reward given for taking the action in current environment state
        arguments:
            state_sequence float[time, memory_id]: environment state before taking action
            action_sequence int[time]: agent action at this tick
        returns:
            reward float[time]: reward for taking action from the given state
        """
        
        state_sequence = check_list(state_sequences)[0]
        action_sequence = check_list(action_sequences)[0]
        
        time_range = T.arange(action_sequence.shape[0])
        

        has_tried_already = state_sequence[time_range,action_sequence]
        session_is_active = T.eq(state_sequence[:,self.end_action_id],0)
        has_finished_now = T.eq(action_sequence,self.end_action_id)
        action_is_categorical = in1d(action_sequence, self.category_action_ids)
        
        response = self.joint_data[batch_i,action_sequence].ravel()
        
        #categorical and attributes
        reward_for_intermediate_action = T.switch(
            action_is_categorical,
            response*6-3,
            response*2-1
        )
        #include end action
        reward_for_action = T.switch(
            has_finished_now,
            0,
            reward_for_intermediate_action,
        )
        
        reward_if_first_time = T.switch(
                has_tried_already,
                -0.5,
                reward_for_action,
            )
        
        final_reward = T.switch(
            session_is_active,
            reward_if_first_time,
            0,
        )

        return final_reward.astype(theano.config.floatX)
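The two affine maps in reward_for_intermediate_action are plain rescalings of the response: assuming a {0, 1} response (as the uint8 joint_data suggests), response*6-3 sends categorical answers to +3/-3 and response*2-1 sends attribute answers to +1/-1. A small NumPy sketch of that switch, with made-up responses:

import numpy as np

response = np.array([1, 0, 1, 0])                # hypothetical {0, 1} answers from the data
action_is_categorical = np.array([True, True, False, False])

reward = np.where(action_is_categorical,
                  response * 6 - 3,   # categorical: correct -> +3, wrong -> -3
                  response * 2 - 1)   # attribute:   correct -> +1, wrong -> -1
print(reward)                         # [ 3 -3  1 -1]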
Example #3
 def get_whether_alive(self, observation_tensors):
     """Given observations, returns whether session has or has not ended.
     Returns uint8 [batch,time_tick] where 1 means session is alive and 0 means session ended already.
     Note that the session is considered still alive while the agent is committing the terminal action
     """
     observation_tensors = check_list(observation_tensors)
     return T.eq(observation_tensors[0][:, :, -1], 0)
Example #4
    def __init__(self,
                 incoming,
                 broadcasted_axes,
                 force_broadcastable_batch=True,
                 **kwargs):

        self.incoming_ndim = len(incoming.output_shape)

        # axes that are to be broadcasted -- in ascending order
        # ax % self.incoming_ndim replaces a negative axis ax with incoming_ndim + ax, so that -1 becomes the last axis
        self.broadcasted_axes = sorted(
            [ax % self.incoming_ndim for ax in check_list(broadcasted_axes)])

        # sanity checks
        assert max(self.broadcasted_axes) < self.incoming_ndim
        assert len(self.broadcasted_axes) > 0
        if force_broadcastable_batch and (0 not in self.broadcasted_axes):
            raise ValueError(
                "BroadcastLayer was asked NOT to broadcast over batch (0'th) axis.\n"
                "If you know what you're doing, set force_broadcastable_batch=False.\n"
                "Otherwise just add 0 to the broadcasted_axes")

        # axes that are NOT broadcasted = all other axes, in their original order
        self.non_broadcasted_axes = [
            ax for ax in range(self.incoming_ndim)
            if ax not in self.broadcasted_axes
        ]

        super(BroadcastLayer, self).__init__(incoming, **kwargs)
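The ax % self.incoming_ndim normalization in the constructor is the usual trick for accepting negative axis indices. A standalone check of that arithmetic (independent of the layer, with made-up values) for a 4-dimensional input:

# normalizing negative axes for a hypothetical 4-dimensional input, as the constructor above does
incoming_ndim = 4
broadcasted_axes = [-1, 0]  # hypothetical user input mixing negative and positive axes

normalized = sorted(ax % incoming_ndim for ax in broadcasted_axes)
print(normalized)           # [0, 3] -- axis -1 became the last axis (3)

non_broadcasted = [ax for ax in range(incoming_ndim) if ax not in normalized]
print(non_broadcasted)      # [1, 2]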
Example #5
 def get_whether_alive(self,observation_tensors):
     """Given observations, returns whether session has or has not ended.
     Returns uint8 [batch,time_tick] where 1 means session is alive and 0 means session ended already.
     Note that the session is considered still alive while the agent is committing the end_action
     """
     observation_tensors = check_list(observation_tensors)
     return T.eq(observation_tensors[0][:,:,1],0)
Example #6
    def get_action_results(self, last_states, actions, **kwargs):

        #unpack state and action
        last_state = check_list(last_states)[0]
        action = check_list(actions)[0]

        # state is a boolean vector: whether or not the i-th action
        # was tried already during this session;
        # the last element (last_state[:, -1]) stays 1 once it has been triggered

        #whether session was active before tick
        session_active = T.eq(last_state[:, -1], 0)
        #whether session was terminated by the end of this tick
        session_terminated = T.or_(T.eq(session_active, 0),
                                   in1d(action, self.terminal_action_ids))

        batch_range = T.arange(action.shape[0])
        state_after_action = T.set_subtensor(last_state[batch_range, action],
                                             1)
        state_after_action = T.set_subtensor(state_after_action[:, -1],
                                             session_terminated)

        new_state = T.switch(session_active.reshape([-1, 1]),
                             state_after_action, last_state)

        #if allowed to see attribute
        observed_attrs = T.switch(
            state_after_action[:, :self.attributes.shape[1]], self.attributes,
            -1)

        observation = T.concatenate(
            [
                observed_attrs,  # float32[batch, n_attributes]: attribute values, -1 where not yet observed
                T.extra_ops.to_one_hot(
                    action,
                    self.joint_data.shape[1]),  # what action was committed
                session_terminated.reshape(
                    [-1, 1]),  # whether session is terminated by now
            ],
            axis=1)

        return new_state, observation
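The observation built above concatenates three channels along axis 1: the masked attribute values, a one-hot encoding of the committed action, and a termination flag in the last column (which is what the matching get_whether_alive reads back). A NumPy mock of that layout with made-up sizes (2 sessions, 3 attributes, 6 actions):

import numpy as np

n_attrs, n_actions = 3, 6                        # hypothetical sizes
observed_attrs = np.array([[0.7, -1., -1.],      # -1 marks attributes not yet queried
                           [-1., 0.2, 0.9]], dtype=np.float32)
action = np.array([1, 5])
session_terminated = np.array([0, 1], dtype=np.float32)

one_hot = np.eye(n_actions, dtype=np.float32)[action]
observation = np.concatenate(
    [observed_attrs, one_hot, session_terminated.reshape(-1, 1)], axis=1)
print(observation.shape)     # (2, 3 + 6 + 1) == (2, 10)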
Example #7
    def __init__(self, incoming, broadcasted_axes, force_broadcastable_batch=True, **kwargs):

        self.incoming_ndim = len(incoming.output_shape)

        # axes that are to be broadcasted -- in ascending order
        # ax % self.incoming_ndim replaces a negative axis ax with incoming_ndim + ax, so that -1 becomes the last axis
        self.broadcasted_axes = sorted([ax % self.incoming_ndim for ax in check_list(broadcasted_axes)])

        # sanity checks
        assert max(self.broadcasted_axes) < self.incoming_ndim
        assert len(self.broadcasted_axes) > 0
        if force_broadcastable_batch and (0 not in self.broadcasted_axes):
            raise ValueError("BroadcastLayer was asked NOT to broadcast over batch (0'th) axis.\n"
                             "If you know what you're doing, set force_broadcastable_batch=False.\n"
                             "Otherwise just add 0 to the broadcasted_axes")

        # axes that are NOT broadcasted = all other axes, in their original order
        self.non_broadcasted_axes = [ax for ax in range(self.incoming_ndim) if ax not in self.broadcasted_axes]


        super(BroadcastLayer, self).__init__(incoming, **kwargs)
Example #8
    def get_reward(self, session_states, session_actions, batch_id):
        """
        WARNING! this runs on a single session, not on a batch
        reward given for taking the action in current environment state
        arguments:
            session_states float[time, memory_id]: environment state before taking action
            session_actions int[time]: agent action at this tick
        returns:
            reward float[time]: reward for taking action from the given state
        """
        # unpack states and actions
        session_states = check_list(session_states)[0]
        session_actions = check_list(session_actions)[0]

        time_range = T.arange(session_actions.shape[0])

        has_tried_already = session_states[time_range, session_actions]
        session_is_active = T.eq(session_states[:, -1], 0)

        action_is_terminal = in1d(session_actions, self.terminal_action_ids)

        at_least_one_terminal_action = T.gt(
            T.cumsum(action_is_terminal, axis=0), 0)

        has_finished_now = T.set_subtensor(action_is_terminal[-1], 1)
        end_tick = has_finished_now.nonzero()[0][0]

        #categorical and attributes
        reward_for_intermediate_action = T.switch(
            has_tried_already,
            self.rw["repeated_poll"],
            self.rw["tick_reward"],
        )

        correct_stage = T.argmax(self.disease_stages[batch_id]) + 1
        predicted_stage = session_actions - self.attributes.shape[1]

        exaggeration_penalty = T.maximum(predicted_stage - correct_stage,0)*\
                               self.rw["stage_exaggerated_penalty_per_point"]
        underestimation_penalty = T.maximum(correct_stage - predicted_stage,0)*\
                                  self.rw["stage_underestimated_penalty_per_point"]

        diagnosis_reward = self.rw[
            "end_action_reward"] + exaggeration_penalty + underestimation_penalty

        #ending session
        reward_for_end_action = T.switch(
            at_least_one_terminal_action,  # if at least 1 diagnosis was chosen
            diagnosis_reward,  # then score the diagnosis
            self.rw["unfinished"])  # else punish for giving no diagnosis

        #include end action
        reward_for_action = T.switch(
            has_finished_now,
            reward_for_end_action,
            reward_for_intermediate_action,
        )

        final_reward = T.switch(
            session_is_active,
            reward_for_action,
            0,
        )

        return final_reward.astype(theano.config.floatX)
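The diagnosis reward above is the base end_action_reward plus per-point penalties for over- or under-estimating the disease stage (the penalty values in self.rw are presumably non-positive). A worked sketch of that arithmetic with hypothetical reward settings:

import numpy as np

# hypothetical reward configuration; the real values live in self.rw
rw = {"end_action_reward": 2.0,
      "stage_exaggerated_penalty_per_point": -1.0,
      "stage_underestimated_penalty_per_point": -2.0}

correct_stage = 2
predicted_stage = np.array([0, 2, 4])    # under by 2 points, exact, over by 2 points

exaggeration_penalty = np.maximum(predicted_stage - correct_stage, 0) * rw["stage_exaggerated_penalty_per_point"]
underestimation_penalty = np.maximum(correct_stage - predicted_stage, 0) * rw["stage_underestimated_penalty_per_point"]

diagnosis_reward = rw["end_action_reward"] + exaggeration_penalty + underestimation_penalty
print(diagnosis_reward)                  # [-2.  2.  0.]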
Example #9
def WrongLSTMCell(prev_cell,
                  prev_out,
                  input_or_inputs=tuple(),
                  num_units=None,
                  peepholes=True,
                  weight_init=init.Normal(),
                  bias_init=init.Constant(),
                  peepholes_W_init=init.Normal(),
                  forgetgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  inputgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  outputgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  cell_nonlinearity=lasagne.nonlinearities.tanh,
                  output_nonlinearity=lasagne.nonlinearities.tanh,
                  name='lstm',
                  grad_clipping=5.,
                  ):


    """

    Implements a one-step gated recurrent unit (GRU) with arbitrary number of units.


    :param prev_cell: input that denotes previous state (shape must be (None, n_units) )
    :type prev_cell: lasagne.layers.Layer
    :param input_or_inputs: a single layer or a list/tuple of layers that go as inputs
    :type input_or_inputs: lasagne.layers.Layer or list of such

    :param num_units: how many recurrent cells to use. None means "as in prev_state"
    :type num_units: int

    :param peepholes: If True, the LSTM uses peephole connections.
        When False, peepholes_W_init are ignored.
    :type peepholes: bool

    :param bias_init: either a single lasagne initializer to use for every gate's bias,
                        or a list of 4 initializers for [input gate, forget gate, cell, output gate]

    :param weight_init: either a single lasagne initializer to use for every gate's weights,
        or a list of two initializers:
        - first used for all weights from hidden -> <all>_gate and cell,
        - second used for all weights from input(s) -> <all>_gate and cell;
        or a list of two elements:
        - first, a list of initializers for hidden -> [input gate, forget gate, cell, output gate],
        - second, a list of lists where list[i][0,1,2,3] = input[i] -> [input gate, forget gate, cell, output gate]

    :param peepholes_W_init: either a lasagne initializer or a list of 3 initializers for
                        [input_gate, forget gate,output gate ] weights. If peepholes=False, this is ignored.
                        
    :param <any>_nonlinearity: which nonlinearity to use for a particular gate

    :param grad_clipping: maximum gradient absolute value. 0 or None means "no clipping"


    :returns: a tuple of (new_cell,new_output) layers
    :rtype: (lasagne.layers.Layer,lasagne.layers.Layer)


    for developers:
        Works by stacking other lasagne layers;
        it is a function that mimics a layer constructor, not an actual class.

    """

    assert len(prev_cell.output_shape) == 2
    # if required, infer num_units
    if num_units is None:
        num_units = prev_cell.output_shape[1]
    # else check it
    assert num_units == prev_cell.output_shape[1]


    # gates and cell (before nonlinearities)

    gates = GateLayer([prev_out] + check_list(input_or_inputs),
                      [num_units] * 4,
                      channel_names=["to_ingate", "to_forgetgate", "to_cell", "to_outgate"],
                      gate_nonlinearities=None,
                      bias_init=bias_init,
                      weight_init=weight_init,
                      name=name or "")

    ingate, forgetgate, cell_input, outputgate = gates.values()




    # clip grads #1
    if grad_clipping:
        ingate, forgetgate, cell_input, outputgate = [clip_grads(lyr, grad_clipping) for lyr in
                                                     [ingate, forgetgate, cell_input, outputgate]]

    if peepholes:
        # cast peephole weight init to a list
        peepholes_W_init = check_list(peepholes_W_init)
        assert len(peepholes_W_init) in (1, 3)
        if len(peepholes_W_init) == 1:
            peepholes_W_init *= 3
        W_cell_to_ingate_init,W_cell_to_forgetgate_init= peepholes_W_init[:2]

        peep_ingate = lasagne.layers.ScaleLayer(prev_cell,W_cell_to_ingate_init,shared_axes=[0,],
                                  name= (name or "") + ".W_cell_to_ingate_peephole")

        peep_forgetgate = lasagne.layers.ScaleLayer(prev_cell,W_cell_to_forgetgate_init,shared_axes=[0,],
                                  name= (name or "") + ".W_cell_to_forgetgate_peephole")


        ingate = add(ingate,peep_ingate)
        forgetgate = add(forgetgate,peep_forgetgate)




    # nonlinearities
    ingate = NonlinearityLayer(
        ingate,
        inputgate_nonlinearity,
        name=name+".inputgate"
    )
    forgetgate = NonlinearityLayer(
        forgetgate,
        forgetgate_nonlinearity,
        name=name+".forgetgate"
    )

    cell_input = NonlinearityLayer(cell_input,
                          nonlinearity=cell_nonlinearity,
                          name=name+'.cell_nonlinearity')


    # cell = input * ingate + prev_cell * forgetgate
    new_cell= add(mul(cell_input,ingate),
                  mul(prev_cell, forgetgate))

    # output gate
    if peepholes:
        W_cell_to_outgate_init = peepholes_W_init[2]

        peep_outgate = lasagne.layers.ScaleLayer(new_cell,W_cell_to_outgate_init,shared_axes=[0,],
                                  name= (name or "") + ".W_cell_to_outgate_peephole")

        outputgate = add(outputgate, peep_outgate)

    outputgate = NonlinearityLayer(
        outputgate,
        outputgate_nonlinearity,
        name=name+".outgate"
    )

    #cell output

    #!!!CHANGES START HERE!!!


    pre_output = mul(
        outputgate,
        new_cell,
        name=name+'.pre_output'
    )
    
    new_output=NonlinearityLayer(pre_output,
                                 output_nonlinearity,
                                 name=name+'.post_outgate_nonlinearity')

    #!!!CHANGES END HERE!!!


    return new_cell, new_output
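A possible way to wire the cell above, sketched with hypothetical names and sizes and assuming the module-level helpers it relies on (GateLayer, clip_grads, add, mul, NonlinearityLayer) are importable as in the original project:

from lasagne.layers import InputLayer

# hypothetical one-step wiring; the layer names and sizes are made up
n_units, n_inputs = 64, 32
prev_cell = InputLayer((None, n_units), name='prev_cell')
prev_out = InputLayer((None, n_units), name='prev_out')
step_input = InputLayer((None, n_inputs), name='step_input')

new_cell, new_output = WrongLSTMCell(prev_cell, prev_out, step_input,
                                     num_units=n_units, name='lstm_step')
# new_cell / new_output are ordinary lasagne layers; when the step is unrolled
# over time, they would be fed back as the next tick's prev_cell / prev_out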
Example #10
    def get_reward(self, session_states, session_actions, batch_i):
        """
        WARNING! this runs on a single session, not on a batch
        reward given for taking the action in current environment state
        arguments:
            session_states float[time, memory_id]: environment state before taking action
            session_actions int[time]: agent action at this tick
        returns:
            reward float[time]: reward for taking action from the given state
        """
        # unpack states and actions
        session_states = check_list(session_states)[0]
        session_actions = check_list(session_actions)[0]
        
        
        time_range = T.arange(session_actions.shape[0])
        

        has_tried_already = session_states[time_range,session_actions]
        session_is_active = T.eq(session_states[:,self.end_action_id],0)
        
        has_finished_now = T.eq(session_actions,self.end_action_id)
        has_finished_now = T.set_subtensor(has_finished_now[-1],1)
        end_tick = has_finished_now.nonzero()[0][0]
        
        action_is_categorical = in1d(session_actions, self.category_action_ids)
                
        response = self.joint_data[batch_i,session_actions].ravel()
        
        at_least_one_category_guessed = T.any(action_is_categorical[:end_tick] & (response[:end_tick]>0))

        
        #categorical and attributes
        reward_for_intermediate_action = T.switch(
            action_is_categorical,
            response*(self.rw["category_positive"]-self.rw["category_negative"]) + self.rw["category_negative"],
            response*(self.rw["attribute_positive"]-self.rw["attribute_negative"]) + self.rw["attribute_negative"]
        )
        reward_for_intermediate_action_first_time = T.switch(
                has_tried_already,
                self.rw["repeated_poll"],
                reward_for_intermediate_action,
            )

        #ending session
        reward_for_end_action = T.switch(at_least_one_category_guessed, #if chosen at least 1 category
                                          self.rw["end_action"],   #do not penalize
                                          self.rw["end_action_if_no_category_predicted"])  #else punish
        
        #include end action
        reward_for_action = T.switch(
            has_finished_now,
            reward_for_end_action,
            reward_for_intermediate_action_first_time,
        )
        
        
        final_reward = T.switch(
            session_is_active,
            reward_for_action,
            0,
        )

        return final_reward.astype(theano.config.floatX)
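The expression response*(positive - negative) + negative used above simply selects between the two configured rewards: a response of 0 yields the negative reward and a response of 1 yields the positive one. A quick check with hypothetical values standing in for self.rw:

# hypothetical reward settings; the real ones are stored in self.rw
rw = {"category_positive": 3.0, "category_negative": -3.0,
      "attribute_positive": 1.0, "attribute_negative": -1.0}

for response in (0, 1):
    print(response * (rw["category_positive"] - rw["category_negative"]) + rw["category_negative"])
# prints -3.0 for response=0 and 3.0 for response=1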
Example #11
    def get_reward(self, session_states, session_actions, batch_id):
        """
        WARNING! this runs on a single session, not on a batch
        reward given for taking the action in current environment state
        arguments:
            session_states float[time, memory_id]: environment state before taking action
            session_actions int[time]: agent action at this tick
        returns:
            reward float[time]: reward for taking action from the given state
        """
        # unpack states and actions
        session_states = check_list(session_states)[0]
        session_actions = check_list(session_actions)[0]

        time_range = T.arange(session_actions.shape[0])

        has_tried_already = session_states[time_range, session_actions]
        session_is_active = T.eq(session_states[:, self.end_action_id], 0)

        has_finished_now = T.eq(session_actions, self.end_action_id)
        has_finished_now = T.set_subtensor(has_finished_now[-1], 1)
        end_tick = has_finished_now.nonzero()[0][0]

        action_is_categorical = in1d(session_actions, self.category_action_ids)

        response = self.joint_data[batch_id, session_actions].ravel()

        at_least_one_category_guessed = T.any(action_is_categorical[:end_tick]
                                              & (response[:end_tick] > 0))

        #categorical and attributes
        reward_for_intermediate_action = T.switch(
            action_is_categorical,
            response *
            (self.rw["category_positive"] - self.rw["category_negative"]) +
            self.rw["category_negative"],
            response *
            (self.rw["attribute_positive"] - self.rw["attribute_negative"]) +
            self.rw["attribute_negative"])
        reward_for_intermediate_action_first_time = T.switch(
            has_tried_already,
            self.rw["repeated_poll"],
            reward_for_intermediate_action,
        )

        #ending session
        reward_for_end_action = T.switch(
            at_least_one_category_guessed,  #if chosen at least 1 category
            self.rw["end_action"],  #do not penalize
            self.rw["end_action_if_no_category_predicted"])  #else punish

        #include end action
        reward_for_action = T.switch(
            has_finished_now,
            reward_for_end_action,
            reward_for_intermediate_action_first_time,
        )

        final_reward = T.switch(
            session_is_active,
            reward_for_action,
            0,
        )

        return final_reward.astype(theano.config.floatX)