def get_action_results(self, last_states, actions, time_i):
    # state is a boolean vector: whether or not i-th action
    # was tried already during this session
    # last output[:,end_code] always remains 1 after first being triggered
    last_state = check_list(last_states)[0]
    action = check_list(actions)[0]

    batch_range = T.arange(action.shape[0])

    session_active = T.eq(last_state[:, self.end_action_id], 0)

    state_after_action = T.set_subtensor(last_state[batch_range, action], 1)

    # keep the old state for sessions that have already ended
    new_state = T.switch(
        session_active.reshape([-1, 1]),
        state_after_action,
        last_state
    )

    session_terminated = T.eq(new_state[:, self.end_action_id], 1)

    observation = T.concatenate([
        self.joint_data[batch_range, action, None],  # uint8[batch, 1]: response to the action
        session_terminated.reshape([-1, 1]),         # whether session has been terminated by now
        T.extra_ops.to_one_hot(action, self.joint_data.shape[1]),  # which action was taken
    ], axis=1)

    return new_state, observation
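# Standalone illustration (not part of the environment, all names hypothetical):
# the freeze-on-termination pattern used above. T.set_subtensor marks the chosen
# action as tried, and T.switch with a [batch, 1] mask leaves already-terminated
# sessions unchanged.
import numpy as np
import theano
import theano.tensor as T


def _demo_state_freeze():
    last_state = T.matrix('last_state')  # [batch, n_actions]
    action = T.ivector('action')         # [batch]
    end_action_id = 2                    # pretend action 2 is the end action

    batch_range = T.arange(action.shape[0])
    session_active = T.eq(last_state[:, end_action_id], 0)
    state_after_action = T.set_subtensor(last_state[batch_range, action], 1)
    new_state = T.switch(session_active.reshape([-1, 1]),
                         state_after_action, last_state)

    f = theano.function([last_state, action], new_state)
    states = np.array([[0, 0, 0],
                       [0, 0, 1]], dtype='float32')  # second session already ended
    print(f(states, np.array([0, 0], dtype='int32')))
    # row 0 gets action 0 marked as tried; row 1 stays frozen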
def get_reward(self, state_sequences, action_sequences, batch_i):
    """
    WARNING! this runs on a single session, not on a batch.
    Reward given for taking the action in the current environment state.

    arguments:
        state_sequence float[time, memory_id]: environment state before taking action
        action_sequence int[time]: agent action at this tick
    returns:
        reward float[time]: reward for taking action from the given state
    """
    state_sequence = check_list(state_sequences)[0]
    action_sequence = check_list(action_sequences)[0]

    time_range = T.arange(action_sequence.shape[0])

    has_tried_already = state_sequence[time_range, action_sequence]
    session_is_active = T.eq(state_sequence[:, self.end_action_id], 0)
    has_finished_now = T.eq(action_sequence, self.end_action_id)
    action_is_categorical = in1d(action_sequence, self.category_action_ids)

    response = self.joint_data[batch_i, action_sequence].ravel()

    # categorical and attribute actions: map a binary response onto a reward
    reward_for_intermediate_action = T.switch(
        action_is_categorical,
        response * 6 - 3,  # categorical: {0, 1} -> {-3, +3}
        response * 2 - 1   # attribute:   {0, 1} -> {-1, +1}
    )

    # the end action itself is not rewarded
    reward_for_action = T.switch(
        has_finished_now,
        0,
        reward_for_intermediate_action,
    )

    # punish polling the same action twice
    reward_if_first_time = T.switch(
        has_tried_already,
        -0.5,
        reward_for_action,
    )

    # no reward once the session has ended
    final_reward = T.switch(
        session_is_active,
        reward_if_first_time,
        0,
    )

    return final_reward.astype(theano.config.floatX)
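# Quick sanity check (illustrative, not part of the environment): with a binary
# response, the two affine maps above give -3/+3 for categorical actions and
# -1/+1 for attribute actions.
def _demo_reward_scales():
    for response in (0, 1):
        assert response * 6 - 3 in (-3, 3)
        assert response * 2 - 1 in (-1, 1)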
def get_whether_alive(self, observation_tensors):
    """Given observations, returns whether session has or has not ended.
    Returns uint8 [batch, time_tick] where 1 means session is alive
    and 0 means session ended already.
    Note that session is considered still alive while agent is committing
    terminal action.
    """
    observation_tensors = check_list(observation_tensors)
    return T.eq(observation_tensors[0][:, :, -1], 0)
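# Standalone illustration (hypothetical data): the last observation channel is
# the "terminated" flag, so comparing it against 0 yields the alive mask.
import numpy as np
import theano
import theano.tensor as T


def _demo_alive_mask():
    obs = T.tensor3('obs')  # [batch, time, channels]; last channel = terminated flag
    alive = T.eq(obs[:, :, -1], 0)
    f = theano.function([obs], alive)

    demo = np.zeros((1, 4, 3), dtype='float32')
    demo[0, 3, -1] = 1      # flag raised at the last tick
    print(f(demo))          # -> [[1 1 1 0]]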
def __init__(self, incoming, broadcasted_axes, force_broadcastable_batch=True, **kwargs):
    self.incoming_ndim = len(incoming.output_shape)

    # axes that are to be broadcasted -- in ascending order
    # ax % self.incoming_ndim replaces negative axes with ndim + ax, so that -1 becomes the last axis
    self.broadcasted_axes = sorted(
        [ax % self.incoming_ndim for ax in check_list(broadcasted_axes)])

    # sanity checks
    assert max(self.broadcasted_axes) < self.incoming_ndim
    assert len(self.broadcasted_axes) > 0

    if force_broadcastable_batch and (0 not in self.broadcasted_axes):
        raise ValueError(
            "BroadcastLayer was asked NOT to broadcast over batch (0'th) axis.\n"
            "If you know what you're doing, set force_broadcastable_batch=False.\n"
            "Otherwise just add 0 to the broadcasted_axes")

    # axes that are NOT broadcasted -- all remaining axes, in their original order
    self.non_broadcasted_axes = [
        ax for ax in range(self.incoming_ndim)
        if ax not in self.broadcasted_axes
    ]

    super(BroadcastLayer, self).__init__(incoming, **kwargs)
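# Standalone check of the axis normalization above: ax % ndim maps a negative
# axis to its positive counterpart, e.g. -1 becomes ndim - 1.
def _demo_axis_normalization():
    ndim = 4
    assert [ax % ndim for ax in (-1, -2, 0, 3)] == [3, 2, 0, 3]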
def get_whether_alive(self, observation_tensors):
    """Given observations, returns whether session has or has not ended.
    Returns uint8 [batch, time_tick] where 1 means session is alive
    and 0 means session ended already.
    Note that session is considered still alive while agent is committing
    end_action.
    """
    observation_tensors = check_list(observation_tensors)
    return T.eq(observation_tensors[0][:, :, 1], 0)
def get_action_results(self, last_states, actions, **kwargs):
    # unpack state and action
    last_state = check_list(last_states)[0]
    action = check_list(actions)[0]

    # state is a boolean vector: whether or not i-th action
    # was tried already during this session
    # last output[:,end_code] always remains 1 after first being triggered

    # whether session was active before this tick
    session_active = T.eq(last_state[:, -1], 0)

    # whether session is terminated by the end of this tick
    session_terminated = T.or_(T.eq(session_active, 0),
                               in1d(action, self.terminal_action_ids))

    batch_range = T.arange(action.shape[0])

    state_after_action = T.set_subtensor(last_state[batch_range, action], 1)
    state_after_action = T.set_subtensor(state_after_action[:, -1],
                                         session_terminated)

    new_state = T.switch(session_active.reshape([-1, 1]),
                         state_after_action,
                         last_state)

    # reveal an attribute's value only if the agent has polled it; -1 otherwise
    observed_attrs = T.switch(
        state_after_action[:, :self.attributes.shape[1]],
        self.attributes, -1)

    observation = T.concatenate(
        [
            observed_attrs,  # [batch, n_attributes] observed attribute values
            T.extra_ops.to_one_hot(action, self.joint_data.shape[1]),  # which action was committed
            session_terminated.reshape([-1, 1]),  # whether session is terminated by now
        ],
        axis=1)

    return new_state, observation
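# Standalone illustration (hypothetical shapes): masking unpolled attributes
# with -1, as done for observed_attrs above.
import numpy as np
import theano
import theano.tensor as T


def _demo_observed_attrs():
    tried = T.matrix('tried')  # [batch, n_attrs], 0/1: whether attribute was polled
    attrs = T.matrix('attrs')  # [batch, n_attrs], true attribute values
    observed = T.switch(tried, attrs, -1)

    f = theano.function([tried, attrs], observed)
    print(f(np.array([[1, 0]], dtype='float32'),
            np.array([[0.7, 0.9]], dtype='float32')))
    # -> [[ 0.7 -1. ]]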
def get_reward(self, session_states, session_actions, batch_id):
    """
    WARNING! this runs on a single session, not on a batch.
    Reward given for taking the action in the current environment state.

    arguments:
        session_states float[time, memory_id]: environment state before taking action
        session_actions int[time]: agent action at this tick
    returns:
        reward float[time]: reward for taking action from the given state
    """
    # unpack states and actions
    session_states = check_list(session_states)[0]
    session_actions = check_list(session_actions)[0]

    time_range = T.arange(session_actions.shape[0])

    has_tried_already = session_states[time_range, session_actions]
    session_is_active = T.eq(session_states[:, -1], 0)

    action_is_terminal = in1d(session_actions, self.terminal_action_ids)
    at_least_one_terminal_action = T.gt(T.cumsum(action_is_terminal, axis=0), 0)

    has_finished_now = T.set_subtensor(action_is_terminal[-1], 1)
    end_tick = has_finished_now.nonzero()[0][0]  # first tick at which the session counts as finished

    # intermediate (polling) actions: penalize repeats, otherwise per-tick reward
    reward_for_intermediate_action = T.switch(
        has_tried_already,
        self.rw["repeated_poll"],
        self.rw["tick_reward"],
    )

    correct_stage = T.argmax(self.disease_stages[batch_id]) + 1
    predicted_stage = session_actions - self.attributes.shape[1]

    # note: the penalties are *added*, so the per-point values in self.rw
    # are presumably non-positive
    exaggeration_penalty = T.maximum(predicted_stage - correct_stage, 0) * \
        self.rw["stage_exaggerated_penalty_per_point"]
    underestimation_penalty = T.maximum(correct_stage - predicted_stage, 0) * \
        self.rw["stage_underestimated_penalty_per_point"]

    diagnosis_reward = self.rw["end_action_reward"] + \
        exaggeration_penalty + underestimation_penalty

    # ending session
    reward_for_end_action = T.switch(
        at_least_one_terminal_action,  # if at least 1 diagnosis was chosen,
        diagnosis_reward,              # then score that diagnosis
        self.rw["unfinished"])         # else punish for giving no diagnosis

    # include end action
    reward_for_action = T.switch(
        has_finished_now,
        reward_for_end_action,
        reward_for_intermediate_action,
    )

    final_reward = T.switch(
        session_is_active,
        reward_for_action,
        0,
    )

    return final_reward.astype(theano.config.floatX)
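# Worked example (all numbers hypothetical, not the actual self.rw config):
# with end_action_reward = 10 and per-point penalties of -2 (exaggeration)
# and -1 (underestimation), diagnosing stage 4 for a true stage 2 gives
# 10 + 2 * (-2) = 6.
def _demo_diagnosis_reward():
    end_action_reward = 10
    exaggerated_per_point, underestimated_per_point = -2, -1
    correct_stage, predicted_stage = 2, 4
    reward = (end_action_reward
              + max(predicted_stage - correct_stage, 0) * exaggerated_per_point
              + max(correct_stage - predicted_stage, 0) * underestimated_per_point)
    assert reward == 6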
def WrongLSTMCell(prev_cell,
                  prev_out,
                  input_or_inputs=tuple(),
                  num_units=None,
                  peepholes=True,
                  weight_init=init.Normal(),
                  bias_init=init.Constant(),
                  peepholes_W_init=init.Normal(),
                  forgetgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  inputgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  outputgate_nonlinearity=lasagne.nonlinearities.sigmoid,
                  cell_nonlinearity=lasagne.nonlinearities.tanh,
                  output_nonlinearity=lasagne.nonlinearities.tanh,
                  name='lstm',
                  grad_clipping=5.,
                  ):
    """
    Implements a one-step LSTM cell with an arbitrary number of units.
    Unlike the standard LSTM, the output nonlinearity is applied *after*
    multiplying by the output gate (see the marked changes at the end of
    the body), which is what makes this cell "wrong".

    :param prev_cell: input layer that denotes the previous LSTM cell state (shape must be (None, n_units))
    :type prev_cell: lasagne.layers.Layer
    :param prev_out: input layer that denotes the previous LSTM output (shape must be (None, n_units))
    :type prev_out: lasagne.layers.Layer
    :param input_or_inputs: a single layer or a list/tuple of layers that go as inputs
    :type input_or_inputs: lasagne.layers.Layer or list of such
    :param num_units: how many recurrent cells to use. None means "as in prev_state"
    :type num_units: int
    :param peepholes: If True, the LSTM uses peephole connections.
        When False, peepholes_W_init is ignored.
    :type peepholes: bool
    :param bias_init: either a lasagne initializer to use for every gate's bias,
        or a list of 4 initializers for [input gate, forget gate, cell, output gate]
    :param weight_init: either
        - a single lasagne initializer to use for every gate's weights, or
        - a list of two initializers:
            - the first used for all weights from hidden -> <all>_gates and cell,
            - the second used for all weights from input(s) -> <all>_gates and cell, or
        - a list of two elements:
            - the first is a list: hidden -> [input gate, forget gate, cell, output gate],
            - the second is a list of lists where list[i][0,1,2,3] =
              input[i] -> [input gate, forget gate, cell, output gate]
    :param peepholes_W_init: either a lasagne initializer or a list of 3 initializers
        for [input gate, forget gate, output gate] weights. If peepholes=False, this is ignored.
    :param <any>_nonlinearity: which nonlinearity to use for a particular gate
    :param grad_clipping: maximum gradient absolute value. 0 or None means "no clipping"

    :returns: a tuple of (new_cell, new_output) layers
    :rtype: (lasagne.layers.Layer, lasagne.layers.Layer)

    for developers:
        Works by stacking other lasagne layers; is a function mock, not an actual class.
""" assert len(prev_cell.output_shape) == 2 # if required, infer num_units if num_units is None: num_units = prev_cell.output_shape[1] # else check it assert num_units == prev_cell.output_shape[1] # gates and cell (before nonlinearities) gates = GateLayer([prev_out] + check_list(input_or_inputs), [num_units] * 4, channel_names=["to_ingate", "to_forgetgate", "to_cell", "to_outgate"], gate_nonlinearities=None, bias_init=bias_init, weight_init=weight_init, name=name or "") ingate, forgetgate, cell_input, outputgate = gates.values() # clip grads #1 if grad_clipping: ingate, forgetgate, cell_input, outputgate = [clip_grads(lyr, grad_clipping) for lyr in [ingate, forgetgate, cell_input, outputgate]] if peepholes: # cast bias init to a list peepholes_W_init = check_list(peepholes_W_init) assert len(peepholes_W_init) in (1, 3) if len(peepholes_W_init) == 1: peepholes_W_init *= 3 W_cell_to_ingate_init,W_cell_to_forgetgate_init= peepholes_W_init[:2] peep_ingate = lasagne.layers.ScaleLayer(prev_cell,W_cell_to_ingate_init,shared_axes=[0,], name= (name or "") + ".W_cell_to_ingate_peephole") peep_forgetgate = lasagne.layers.ScaleLayer(prev_cell,W_cell_to_forgetgate_init,shared_axes=[0,], name= (name or "") + ".W_cell_to_forgetgate_peephole") ingate = add(ingate,peep_ingate) forgetgate = add(forgetgate,peep_forgetgate) # nonlinearities ingate = NonlinearityLayer( ingate, inputgate_nonlinearity, name=name+".inputgate" ) forgetgate = NonlinearityLayer( forgetgate, forgetgate_nonlinearity, name=name+".forgetgate" ) cell_input = NonlinearityLayer(cell_input, nonlinearity=cell_nonlinearity, name=name+'.cell_nonlinearity') # cell = input * ingate + prev_cell * forgetgate new_cell= add(mul(cell_input,ingate), mul(prev_cell, forgetgate)) # output gate if peepholes: W_cell_to_outgate_init = peepholes_W_init[2] peep_outgate = lasagne.layers.ScaleLayer(new_cell,W_cell_to_outgate_init,shared_axes=[0,], name= (name or "") + ".W_cell_to_outgate_peephole") outputgate = add(outputgate, peep_outgate) outputgate = NonlinearityLayer( outputgate, outputgate_nonlinearity, name=name+".outgate" ) #cell output #!!!CHANGES START HERE!!! pre_output = mul( outputgate, new_cell, name=name+'.pre_output' ) new_output=NonlinearityLayer(pre_output, output_nonlinearity, name=name+'.post_outgate_nonlinearity') #!!!CHANGES END HERE!!! return new_cell, new_output
def get_reward(self, session_states, session_actions, batch_i):
    """
    WARNING! this runs on a single session, not on a batch.
    Reward given for taking the action in the current environment state.

    arguments:
        session_states float[time, memory_id]: environment state before taking action
        session_actions int[time]: agent action at this tick
    returns:
        reward float[time]: reward for taking action from the given state
    """
    # unpack states and actions
    session_states = check_list(session_states)[0]
    session_actions = check_list(session_actions)[0]

    time_range = T.arange(session_actions.shape[0])

    has_tried_already = session_states[time_range, session_actions]
    session_is_active = T.eq(session_states[:, self.end_action_id], 0)

    has_finished_now = T.eq(session_actions, self.end_action_id)
    has_finished_now = T.set_subtensor(has_finished_now[-1], 1)
    end_tick = has_finished_now.nonzero()[0][0]

    action_is_categorical = in1d(session_actions, self.category_action_ids)

    response = self.joint_data[batch_i, session_actions].ravel()

    at_least_one_category_guessed = T.any(
        action_is_categorical[:end_tick] & (response[:end_tick] > 0))

    # categorical and attributes
    reward_for_intermediate_action = T.switch(
        action_is_categorical,
        response * (self.rw["category_positive"] - self.rw["category_negative"]) + self.rw["category_negative"],
        response * (self.rw["attribute_positive"] - self.rw["attribute_negative"]) + self.rw["attribute_negative"]
    )
    reward_for_intermediate_action_first_time = T.switch(
        has_tried_already,
        self.rw["repeated_poll"],
        reward_for_intermediate_action,
    )

    # ending session
    reward_for_end_action = T.switch(
        at_least_one_category_guessed,  # if at least 1 category was guessed,
        self.rw["end_action"],          # do not penalize
        self.rw["end_action_if_no_category_predicted"])  # else punish

    # include end action
    reward_for_action = T.switch(
        has_finished_now,
        reward_for_end_action,
        reward_for_intermediate_action_first_time,
    )

    final_reward = T.switch(
        session_is_active,
        reward_for_action,
        0,
    )

    return final_reward.astype(theano.config.floatX)
def get_reward(self, session_states, session_actions, batch_id):
    """
    WARNING! this runs on a single session, not on a batch.
    Reward given for taking the action in the current environment state.

    arguments:
        session_states float[time, memory_id]: environment state before taking action
        session_actions int[time]: agent action at this tick
    returns:
        reward float[time]: reward for taking action from the given state
    """
    # unpack states and actions
    session_states = check_list(session_states)[0]
    session_actions = check_list(session_actions)[0]

    time_range = T.arange(session_actions.shape[0])

    has_tried_already = session_states[time_range, session_actions]
    session_is_active = T.eq(session_states[:, self.end_action_id], 0)

    has_finished_now = T.eq(session_actions, self.end_action_id)
    has_finished_now = T.set_subtensor(has_finished_now[-1], 1)
    end_tick = has_finished_now.nonzero()[0][0]

    action_is_categorical = in1d(session_actions, self.category_action_ids)

    response = self.joint_data[batch_id, session_actions].ravel()

    at_least_one_category_guessed = T.any(
        action_is_categorical[:end_tick] & (response[:end_tick] > 0))

    # categorical and attributes
    reward_for_intermediate_action = T.switch(
        action_is_categorical,
        response * (self.rw["category_positive"] - self.rw["category_negative"]) + self.rw["category_negative"],
        response * (self.rw["attribute_positive"] - self.rw["attribute_negative"]) + self.rw["attribute_negative"])

    reward_for_intermediate_action_first_time = T.switch(
        has_tried_already,
        self.rw["repeated_poll"],
        reward_for_intermediate_action,
    )

    # ending session
    reward_for_end_action = T.switch(
        at_least_one_category_guessed,  # if at least 1 category was guessed,
        self.rw["end_action"],          # do not penalize
        self.rw["end_action_if_no_category_predicted"])  # else punish

    # include end action
    reward_for_action = T.switch(
        has_finished_now,
        reward_for_end_action,
        reward_for_intermediate_action_first_time,
    )

    final_reward = T.switch(
        session_is_active,
        reward_for_action,
        0,
    )

    return final_reward.astype(theano.config.floatX)
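# Standalone illustration (hypothetical ids): forcing the last tick to count as
# "finished" guarantees that nonzero()[0][0] always finds an end tick, even if
# the agent never chose end_action.
import numpy as np
import theano
import theano.tensor as T


def _demo_end_tick():
    actions = T.ivector('actions')
    end_action_id = 5
    has_finished_now = T.eq(actions, end_action_id)
    has_finished_now = T.set_subtensor(has_finished_now[-1], 1)
    end_tick = has_finished_now.nonzero()[0][0]

    f = theano.function([actions], end_tick)
    print(f(np.array([1, 2, 3], dtype='int32')))     # -> 2 (forced at last tick)
    print(f(np.array([1, 5, 3, 4], dtype='int32')))  # -> 1 (agent ended at tick 1)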