def __init__(self, input, We):
    """
    Build a three-token context window over embedded word indexes.

    Input = a list (minibatch) of lists of indexes (pre-processed sentences)
    We = a word embedding matrix (vocabulary * dimensions)
    """
    # Shared embedding table, stored in theano's configured float type.
    embedding_values = numpy.asarray(We, dtype=theano.config.floatX)
    self.We = theano.shared(embedding_values, name='We', borrow=True)

    # (batch_size * indices) -> (batch_size * indices * dimensions)
    embedded = self.We[input]

    # Neighbour views along the indices axis. roll() wraps around, so the
    # slot that received the wrapped vector is overwritten with zeros; a
    # 0-valued vector therefore represents a sentence boundary.
    rolled_forward = T.roll(embedded, 1, axis=1)
    rolled_backward = T.roll(embedded, -1, axis=1)
    previous_tokens = T.set_subtensor(rolled_forward[:, 0], 0.)
    next_tokens = T.set_subtensor(rolled_backward[:, -1], 0.)

    # I/O
    self.input = input
    # [previous, current, next] joined along the vector-dimension axis
    self.output = T.concatenate(
        [previous_tokens, embedded, next_tokens], axis=2)

    # parameters of the model
    self.params = [self.We]
def get_output_for(self, input, **kwargs):
    """Return minus (path score - log partition) for ``self.label_input``.

    ``input`` holds the per-position class scores (batch * time * class).
    The path score is the sum of the selected unary scores (``f_pot``) and
    of the ``self.W_sim`` transition scores between consecutive labels
    (``g_pot``); ``norm`` is the log-sum-exp over all label paths computed
    by a forward recursion.  The result is divided by the batch size when
    ``self.normalize`` is set.
    """
    def norm_fn(f, mask, label, previous, W_sim):
        # f: batch * class, mask: batch, label: batch,
        # previous: batch * class, W_sim: class * class
        # ``label`` is unused here but is supplied by scan as a sequence.
        scores = (previous.dimshuffle(0, 1, 'x')
                  + f.dimshuffle(0, 'x', 1)
                  + W_sim.dimshuffle('x', 0, 1))  # batch * class * class
        scores = theano_logsumexp(scores, axis=1)  # batch * class
        mask = mask.dimshuffle(0, 'x')
        # masked-out steps carry the previous value forward unchanged
        return previous * (1.0 - mask) + scores * mask

    f = input  # batch * time * class
    if self.end_points:
        for i in range(self.num_classes):
            f = T.inc_subtensor(f[:, 0, i], self.W_end_points[0, i])
            f = T.inc_subtensor(f[:, -1, i], self.W_end_points[1, i])

    initial = f[:, 0, :]
    outputs, _ = theano.scan(
        fn=norm_fn,
        sequences=[f.dimshuffle(1, 0, 2)[1:],
                   self.mask_input.dimshuffle(1, 0)[1:],
                   self.label_input.dimshuffle(1, 0)[1:]],
        outputs_info=initial,
        non_sequences=[self.W_sim],
        strict=True)
    norm = T.sum(theano_logsumexp(outputs[-1], axis=1))

    flat_scores = f.reshape((-1, f.shape[-1]))
    f_pot = (flat_scores[T.arange(f.shape[0] * f.shape[1]),
                         self.label_input.flatten()]
             * self.mask_input.flatten()).sum()

    labels = self.label_input  # batch * time
    shift_labels = T.roll(labels, -1, axis=1)
    mask = self.mask_input  # batch * time
    # roll() wraps the first column round to the last position.  Without
    # clearing it, a sequence that fills the whole time axis would add a
    # bogus "last label -> first label" transition to g_pot, which the
    # forward recursion above (time - 1 transitions) never accounts for.
    shift_mask = T.set_subtensor(T.roll(mask, -1, axis=1)[:, -1], 0)
    g_pot = (self.W_sim[labels.flatten(), shift_labels.flatten()]
             * mask.flatten() * shift_mask.flatten()).sum()

    neg_log_likelihood = -(f_pot + g_pot - norm)
    if self.normalize:
        return neg_log_likelihood / f.shape[0]
    return neg_log_likelihood
def WeeklyRandomWalk(name, n, initial,
                     flt=np.array([.05, .1, .7, .1, .05], dtype=np.float64),
                     sigma=.05, offset=0):
    """Return ``initial`` plus a smoothed, day-resolved weekly random walk.

    A PyMC3 ``GaussianRandomWalk`` with one value per week is repeated
    seven times per week, rolled by ``offset``, convolved with the
    normalised filter ``flt`` and cut to ``n`` entries.

    Parameters
    ----------
    name : str
        Prefix for the two PyMC3 variables created here
        (``<name>_sigma_random_walk`` and ``<name>_random_walk``).
    n : int
        Number of entries in the returned series.
    initial :
        Value added to every entry of the walk.
    flt : array-like
        Smoothing kernel; it is normalised to sum to one.  The default
        array is never modified in place.
    sigma : float
        Scale of the HalfNormal prior on the step size and of the
        initial-value distribution of the walk.
    offset : int
        Shift applied to the series; ``offset % 7 != 0`` adds one extra
        week to the walk.
    """
    additional_week = np.array([0, 1, 1, 1, 1, 1, 1])[offset % 7]
    offset = tt.cast(offset, "int64")
    delay_list_length = n // 7 + additional_week

    sigma_random_walk = pm.HalfNormal(
        name=name + "_sigma_random_walk", sigma=sigma)
    delay_ratio_random_walk = pm.distributions.timeseries.GaussianRandomWalk(
        name=name + "_random_walk",
        mu=0,
        sigma=sigma_random_walk,
        shape=delay_list_length,
        init=pm.Normal.dist(sigma=sigma),
    )

    # tt.cast expects the dtype as a string name, not the numpy type object.
    flt = tt.cast(flt, "float64")
    flt = flt / tt.sum(flt)

    val = delay_ratio_random_walk
    # Pad with two copies of the first value in front and two copies of
    # the last value behind the weekly values.
    lval = tt.alloc(0., val.shape[0] + 4)
    lval = tt.cast(lval, "float64")
    lval = tt.set_subtensor(lval[2:-2], val)
    lval = tt.set_subtensor(lval[:2], val[0])
    lval = tt.set_subtensor(lval[-2:], val[-1])

    # Repeat every weekly value seven times, then flatten to one series.
    m = tt.alloc(lval, 7, lval.shape[0])
    mf = tt.flatten(m.T, ndim=1)
    mf = tt.roll(mf, offset)

    # conv2d needs 2-D operands: wrap signal and kernel in a leading axis.
    mf2 = tt.alloc(mf, 1, mf.shape[0])
    kern2 = tt.alloc(flt, 1, flt.shape[0])
    r = tt.signal.conv.conv2d(mf2, kern2, border_mode='full')
    r = tt.roll(r[0], offset)

    # Skip the 14 padding entries plus half the kernel width added by the
    # 'full' convolution, then keep n entries.
    start = 14 + flt.shape[0] // 2
    rs = r[start:(7 * val.shape[0] + start)][:n]
    return initial + rs
def ShiftConv(w_t_g, s_t, N):
    """Circularly shift ``w_t_g`` by a fractional amount derived from ``s_t``.

    The shift ``2 * s_t - 1`` is wrapped into ``[0, N)``.  The result is a
    linear blend of ``w_t_g`` rolled by the integer part of that shift and
    by the integer part plus one, weighted by the fractional part.
    """
    shift = 2. * s_t - 1.
    Z = T.mod(shift + N, N)
    # Integer part of the shift, built once and reused for both rolls.
    whole = T.iround(T.floor(Z))
    # Weight of the lower neighbour: one minus the fractional part.
    simj = 1 - (Z - T.floor(Z))
    w_t_g_roll_1 = T.roll(w_t_g, -whole)
    w_t_g_roll_2 = T.roll(w_t_g, -(whole + 1))
    return w_t_g_roll_1 * simj + w_t_g_roll_2 * (1 - simj)
def TransferWeekendReported(r_t, f, mask):
    """
    Move the share ``f`` of ``r_t`` selected by ``mask[5]`` two positions
    later and the share selected by ``mask[6]`` one position later
    (Saturdays and Sundays, per the original description).
    """
    # Shares taken out of the selected entries
    moved_sat = r_t * mask[5] * f
    moved_sun = r_t * mask[6] * f
    remaining = r_t - moved_sat - moved_sun

    # roll() wraps around, so the entries that wrapped to the front are
    # cleared after shifting.
    delayed_sat = tt.set_subtensor(tt.roll(moved_sat, 2)[:2], 0)
    delayed_sun = tt.set_subtensor(tt.roll(moved_sun, 1)[:1], 0)

    # Add the shifted shares back in
    return remaining + delayed_sat + delayed_sun
def manhatten_corr(self, a, b):
    """Sum the L1 distance between ``b`` and every circular shift of ``a``
    along axes 2 and 3."""
    n_rows = a.shape[2]
    n_cols = a.shape[3]
    # shifts along axis 2, each repeated n_cols times: [0,0,0,1,1,1,...]
    row_shifts = T.arange(n_rows).repeat(n_cols)
    # shifts along axis 3, cycled n_rows times: [0,1,2,0,1,2,...]
    col_shifts = T.tile(T.arange(n_cols), (n_rows, ))

    def shifted_distance(i, j):
        moved = T.roll(T.roll(a, shift=j, axis=3), shift=i, axis=2)
        return T.sum(T.abs_(moved - b))

    manhatten, _ = theano.scan(shifted_distance,
                               sequences=[row_shifts, col_shifts])
    return T.sum(manhatten)
def get_output_for(self, input, **kwargs):
    """Return minus (path score - log partition), divided by the batch size.

    The unary scores are ``T.dot(input, self.W)`` (inst * time * class).
    The path score sums the selected unary scores, the ``self.W_sim``
    transition scores between consecutive labels and, when ``CRF_INIT`` is
    set, the ``self.W_init[0]`` score of the first label.  When ``COST`` is
    set, ``COST_CONST`` is added inside the forward recursion to every
    class other than the labelled one.
    """
    def norm_fn(f, mask, label, previous, W_sim):
        # f: inst * class, mask: inst, previous: inst * class,
        # W_sim: class * class
        scores = (previous.dimshuffle(0, 1, 'x')
                  + f.dimshuffle(0, 'x', 1)
                  + W_sim.dimshuffle('x', 0, 1))
        if COST:
            scores = scores + COST_CONST * (
                1.0 - T.extra_ops.to_one_hot(
                    label, self.num_classes).dimshuffle(0, 'x', 1))
        # scores: inst * prev * cur
        scores = theano_logsumexp(scores, axis=1)
        # scores: inst * class
        mask = mask.dimshuffle(0, 'x')
        # masked-out steps carry the previous value forward unchanged
        return previous * (1.0 - mask) + scores * mask

    f = T.dot(input, self.W)
    # f: inst * time * class

    initial = f[:, 0, :]
    if CRF_INIT:
        initial = initial + self.W_init[0].dimshuffle('x', 0)
    if COST:
        initial = initial + COST_CONST * (
            1.0 - T.extra_ops.to_one_hot(self.label_input[:, 0],
                                         self.num_classes))

    outputs, _ = theano.scan(
        fn=norm_fn,
        sequences=[f.dimshuffle(1, 0, 2)[1:],
                   self.mask_input.dimshuffle(1, 0)[1:],
                   self.label_input.dimshuffle(1, 0)[1:]],
        outputs_info=initial,
        non_sequences=[self.W_sim],
        strict=True)
    norm = T.sum(theano_logsumexp(outputs[-1], axis=1))

    flat_scores = f.reshape((-1, f.shape[-1]))
    f_pot = (flat_scores[T.arange(f.shape[0] * f.shape[1]),
                         self.label_input.flatten()]
             * self.mask_input.flatten()).sum()
    if CRF_INIT:
        f_pot += self.W_init[0][self.label_input[:, 0]].sum()

    labels = self.label_input
    # labels: inst * time
    shift_labels = T.roll(labels, -1, axis=1)
    mask = self.mask_input
    # mask : inst * time
    # roll() wraps the first column round to the last position.  Without
    # clearing it, a sequence that fills the whole time axis would add a
    # bogus "last label -> first label" transition to g_pot, which the
    # forward recursion above (time - 1 transitions) never accounts for.
    shift_mask = T.set_subtensor(T.roll(mask, -1, axis=1)[:, -1], 0)

    g_pot = (self.W_sim[labels.flatten(), shift_labels.flatten()]
             * mask.flatten() * shift_mask.flatten()).sum()

    return -(f_pot + g_pot - norm) / f.shape[0]
def infer(self, keys, key_mask, values, initial_state, target_embedding,
          target_bias, keep_prob):
    """Run ``self._infer_step`` under ``ops.scan`` and return
    ``(states, contexts, probs, mask)``.

    The scan runs for twice as many steps as ``key_mask`` has rows.  The
    returned mask is delayed by one step and ``probs`` is flattened to
    two dimensions.
    """
    def infer_step(y_prev, mask, state, keys, values, key_mask, embedding,
                   embedding_bias):
        # keep_prob comes from the enclosing call, not from scan
        return self._infer_step(y_prev, mask, state, keys, values, key_mask,
                                embedding, embedding_bias, keep_prob)

    n_steps, batch_size = key_mask.shape

    first_inputs = T.zeros((batch_size, target_embedding.shape[1]),
                           "float32")
    first_mask = T.ones((batch_size, 1), "float32")
    # the last two scan outputs are not fed back into the next step
    recurrent_init = [first_inputs, first_mask, initial_state, None, None]
    constants = [keys, values, key_mask, target_embedding, target_bias]

    # no input sequences; maximum decoded length is 2 * n_steps
    inputs, mask, states, contexts, probs = ops.scan(
        infer_step, None, recurrent_init, constants, n_steps=n_steps * 2)

    # Drop the trailing axis, shift the mask one step later and put the
    # initial mask in the freed first row.
    mask = T.reshape(mask, mask.shape[:-1])
    mask = T.set_subtensor(T.roll(mask, 1, 0)[0, :], first_mask[:, 0])

    # (step, batch, n_voc)->(step*batch, n_voc)
    flat_rows = probs.shape[0] * probs.shape[1]
    probs = T.reshape(probs, (flat_rows, probs.shape[2]))
    return states, contexts, probs, mask
def window_batch_timewise(t, b, w, full_index):
    """Roll row ``i`` of ``full_index`` by ``i`` for each of the first ``w``
    rows, then add ``i * t * b - i`` to its positive entries (rows after
    the first only)."""
    for row in range(w):
        rolled = T.roll(full_index[row], row)
        full_index = T.set_subtensor(full_index[row], rolled)
        if row == 0:
            # the first row gets no offset
            continue
        # entries that are not positive are left untouched
        bump = T.where(full_index[row] > 0, row * t * b - row, 0)
        full_index = T.inc_subtensor(full_index[row], bump)
    return full_index
def hybo_channel(x, p, shift, seed=None, unif=True, just_dropout=False):
    '''Theano hybrid bootstrap backend.

    Draws a binary mask of shape ``(x.shape[0], 1, 1, x.shape[3])``,
    repeats it along axes 1 and 2, and uses it either as plain dropout
    (``just_dropout=True``) or to replace the dropped entries of ``x``
    with the entries of ``x`` rolled by ``shift`` along axis 0.

    x: tensor indexed on four axes (``x.shape[3]`` is read).
    p: shared variable; its value must lie in [0, 1].
    shift: roll distance along axis 0 for the replacement values.
    seed: seed for the random stream; drawn with numpy when None.
    unif: when true, draw one drop probability per entry of axis 0
        uniformly from [0, p]; otherwise use p for all of them.
    just_dropout: when true, rescale kept entries by the retain
        probability instead of mixing in rolled entries.

    Raises ValueError if the value of p lies outside [0, 1].
    '''
    p_value = p.get_value()
    if p_value < 0. or p_value > 1:
        raise ValueError('Hybrid bootstrap p must be in interval [0, 1].')

    if seed is None:
        # integer bound; same value as the float literal 10e6
        seed = np.random.randint(1, 10 ** 7)
    rng = K.RandomStreams(seed=seed)

    if unif:
        retain_prob = 1. - rng.uniform((x.shape[0], ), 0, p, dtype=x.dtype)
        # add trailing singleton axes so it broadcasts against the mask
        for dim in range(x.ndim - 1):
            retain_prob = K.expand_dims(retain_prob, dim + 1)
    else:
        retain_prob = 1. - p

    mask = rng.binomial((x.shape[0], 1, 1, x.shape[3]), p=retain_prob,
                        dtype=x.dtype)
    mask = T.extra_ops.repeat(mask, x.shape[1], axis=1)
    mask = T.extra_ops.repeat(mask, x.shape[2], axis=2)

    if just_dropout:
        return x * mask / retain_prob
    return x * mask + (1 - mask) * T.roll(x, shift=shift, axis=0)
def getScoreOfPath(self, s, path):
    """Sum ``self.computeScore`` over the steps of ``path``.

    Each step receives the matching entry of ``s``, the current path
    element and the previous one; the first step has no predecessor and
    gets -1 instead.
    """
    shifted = T.roll(path, 1)
    predecessors = T.set_subtensor(shifted[0], -1)
    step_scores, _ = theano.scan(fn=self.computeScore,
                                 sequences=[s, path, predecessors],
                                 n_steps=path.shape[0])
    return T.sum(step_scores)
def WeeklyRandomWalkWeekend(name, n, initial, wfactor,
                            flt=np.array([.05, .1, .7, .1, .05],
                                         dtype=np.float64),
                            sigma=.05, offset=0):
    """Return ``(daily_walk, week_mask)``.

    ``daily_walk`` is ``initial`` plus a smoothed weekly random walk at
    daily resolution, with the entries selected by rows 5 and 6 of
    ``week_mask`` (Saturday and Sunday) multiplied by ``wfactor``.
    ``week_mask`` is a 7 x n matrix built from a rolled identity matrix.
    """
    # if offset % 7 == 0 no additional week is needed
    extra_week = np.array([0, 1, 1, 1, 1, 1, 1])[offset % 7]
    offset = tt.cast(offset, "int64")
    walk_len = n // 7 + extra_week

    # Step size of the walk, then the walk itself (one value per week)
    step_sigma = pm.HalfNormal(name=name + "_sigma_random_walk",
                               sigma=sigma)
    weekly = pm.distributions.timeseries.GaussianRandomWalk(
        name=name + "_random_walk",
        mu=0,
        sigma=step_sigma,
        shape=walk_len,
        init=pm.Normal.dist(sigma=sigma),
    )

    kernel = flt / tt.sum(flt)

    # Longer list: two extra entries in front and two at the back, holding
    # the first / last weekly value, to leave room for filtering and an
    # offset of up to one week on each side.
    padded = tt.cast(tt.alloc(0., weekly.shape[0] + 4), "float64")
    padded = tt.set_subtensor(padded[2:-2], weekly)
    padded = tt.set_subtensor(padded[:2], weekly[0])
    padded = tt.set_subtensor(padded[-2:], weekly[-1])

    # Duplicate every weekly value over 7 entries and flatten
    per_day = tt.flatten(tt.alloc(padded, 7, padded.shape[0]).T, ndim=1)

    # conv2d needs 2-D operands
    signal_2d = tt.alloc(per_day, 1, per_day.shape[0])
    kernel_2d = tt.alloc(kernel, 1, kernel.shape[0])
    smoothed = tt.signal.conv.conv2d(signal_2d, kernel_2d,
                                     border_mode='full')
    smoothed = tt.roll(smoothed[0], -offset)

    first = 14 + kernel.shape[0] // 2
    last = 7 * weekly.shape[0] + 14 + kernel.shape[0] // 2
    daily_values_ranged = smoothed[first:last][:n]

    # 7 x n matrix marking the day of week
    rolled_eye = tt.roll(tt.eye(7), -offset, axis=1)
    week_mask = tt.tile(rolled_eye, walk_len)[:, :n]

    daily_walk = initial + daily_values_ranged

    # wfactor at the weekends, 1 otherwise
    weekend = week_mask[5] + week_mask[6]
    weekend_scale = weekend * wfactor - weekend + tt.ones_like(weekend)

    return daily_walk * weekend_scale, week_mask
def evaluate(self, application_call, outputs, mask=None, **kwargs):
    """Return ``[costs]`` followed by the state and glimpse sequences.

    Runs the transition over the feedback of ``outputs``, computes the
    readout costs (multiplied by ``mask`` when given) and registers the
    glimpses, the states and their final values as auxiliary variables on
    ``application_call``.  When ``self.language_model`` is set, its states
    are passed to the readout under an ``lm_`` prefix.
    """
    # We assume the data has axes (time, batch, features, ...)
    batch_size = outputs.shape[1]

    # Prepare input for the iterative part
    states = dict_subset(kwargs, self._state_names, must_have=False)
    # masks in context are optional (e.g. `attended_mask`)
    contexts = dict_subset(kwargs, self._context_names, must_have=False)
    feedback = self.readout.feedback(outputs)
    inputs = self.fork.apply(feedback, as_dict=True)

    # Run the recurrent network
    results = self.transition.apply(
        mask=mask, return_initial_states=True, as_dict=True,
        **dict_union(inputs, states, contexts))

    # Separate the deliverables. The last states are discarded: they
    # are not used to predict any output symbol. The initial glimpses
    # are discarded because they are not used for prediction.
    # Remember, glimpses are computed _before_ output stage, states are
    # computed after.
    states = OrderedDict((name, results[name][:-1])
                         for name in self._state_names)
    glimpses = OrderedDict((name, results[name][1:])
                           for name in self._glimpse_names)

    # Compute the cost: delay the feedback by one step and put the
    # feedback of the initial outputs in the first slot.
    feedback = tensor.roll(feedback, 1, 0)
    feedback = tensor.set_subtensor(
        feedback[0],
        self.readout.feedback(self.readout.initial_outputs(batch_size)))

    # Run the language model
    if self.language_model:
        lm_states = self.language_model.evaluate(
            outputs=outputs, mask=mask, as_dict=True)
        lm_states = {'lm_' + name: value
                     for name, value in lm_states.items()}
    else:
        lm_states = {}

    readouts = self.readout.readout(
        feedback=feedback,
        **dict_union(lm_states, states, glimpses, contexts))
    costs = self.readout.cost(readouts, outputs)
    if mask is not None:
        costs *= mask

    for name, variable in list(glimpses.items()) + list(states.items()):
        application_call.add_auxiliary_variable(
            variable.copy(), name=name)

    # These variables can be used to initialize the initial states of the
    # next batch using the last states of the current batch.
    for name in self._state_names + self._glimpse_names:
        application_call.add_auxiliary_variable(
            results[name][-1].copy(), name=name + "_final_value")

    # dict views cannot be concatenated to a list on Python 3, so they are
    # materialised first.
    return [costs] + list(states.values()) + list(glimpses.values())
def mask_for_prediction(self, prediction):
    """Return a floatX mask that is 1 up to and including the first
    ``self.eos_label`` along axis 0 of ``prediction`` and 0 afterwards."""
    float_type = theano.config.floatX
    is_eos = tensor.eq(prediction, self.eos_label).astype(float_type)
    # 1 while no end-of-sequence label has been seen yet
    before_eos = tensor.lt(tensor.cumsum(is_eos, axis=0), 1)
    before_eos = before_eos.astype(float_type)
    # Delay by one step so the end-of-sequence position itself is kept;
    # the first row is always valid.
    delayed = tensor.roll(before_eos, 1, 0)
    return tensor.set_subtensor(delayed[0, :],
                                tensor.ones_like(delayed[0, :]))
def cost(self, application_call, outputs, mask=None, **kwargs):
    """Returns generation costs for output sequences.

    Parameters
    ----------
    outputs : :class:`~tensor.TensorVariable`
        The 3(2) dimensional tensor containing output sequences.
        The dimension 0 must stand for time, the dimension 1 for the
        position on the batch.
    mask : :class:`~tensor.TensorVariable`
        The binary matrix identifying fake outputs.

    Notes
    -----
    The contexts are expected as keyword arguments.  Every glimpse is
    also registered on `application_call` as an auxiliary variable.

    """
    batch_size = outputs.shape[-2]  # TODO Assumes only 1 features dim

    # Prepare input for the iterative part: states are optional keyword
    # arguments, contexts are mandatory.
    states = dict((name, kwargs[name])
                  for name in self.state_names if name in kwargs)
    contexts = dict((name, kwargs[name]) for name in self.context_names)
    feedback = self.readout.feedback(outputs)
    if self.fork:
        inputs = self.fork.apply(feedback, return_dict=True)
    else:
        inputs = {'feedback': feedback}

    # Run the recurrent network
    transition_inputs = dict_union(inputs, states, contexts)
    results = self.transition.apply(mask=mask,
                                    return_initial_states=True,
                                    return_dict=True,
                                    **transition_inputs)

    # Separate the deliverables (the last state of each sequence is dropped)
    states = dict((name, results[name][:-1]) for name in self.state_names)
    glimpses = dict((name, results[name]) for name in self.glimpse_names)

    # Compute the cost: delay the feedback by one step and put the
    # feedback of the initial outputs in the first slot.
    initial_feedback = self.readout.feedback(
        self.readout.initial_outputs(batch_size, **contexts))
    feedback = tensor.set_subtensor(tensor.roll(feedback, 1, 0)[0],
                                    initial_feedback)
    readouts = self.readout.readout(
        feedback=feedback, **dict_union(states, glimpses, contexts))
    costs = self.readout.cost(readouts, outputs)

    # In case the user needs some glimpses or states or smth else
    for name, variable in glimpses.items():
        application_call.add_auxiliary_variable(variable.copy(), name=name)
    return costs
def cost_matrix(self, application_call, outputs, mask=None, **kwargs):
    """Returns generation costs for output sequences.

    The keyword argument ``initial_state_context`` is required and is
    always forwarded as a context.

    See Also
    --------
    :meth:`cost` : Scalar cost.

    """
    # We assume the data has axes (time, batch, features, ...)
    batch_size = outputs.shape[1]

    # Prepare input for the iterative part
    states = dict_subset(kwargs, self._state_names, must_have=False)
    # masks in context are optional (e.g. `attended_mask`)
    contexts = dict_subset(kwargs, self._context_names, must_have=False)
    contexts['initial_state_context'] = kwargs['initial_state_context']
    feedback = self.readout.feedback(outputs)
    inputs = self.fork.apply(feedback, as_dict=True)

    # Run the recurrent network
    transition_inputs = dict_union(inputs, states, contexts)
    results = self.transition.apply(mask=mask,
                                    return_initial_states=True,
                                    as_dict=True,
                                    **transition_inputs)

    # Separate the deliverables. The last states are discarded: they
    # are not used to predict any output symbol. The initial glimpses
    # are discarded because they are not used for prediction.
    # Remember, glimpses are computed _before_ output stage, states are
    # computed after.
    states = dict((name, results[name][:-1]) for name in self._state_names)
    glimpses = dict((name, results[name][1:])
                    for name in self._glimpse_names)

    # Compute the cost: delay the feedback by one step and put the
    # feedback of the initial outputs in the first slot.
    initial_feedback = self.readout.feedback(
        self.readout.initial_outputs(batch_size))
    feedback = tensor.set_subtensor(tensor.roll(feedback, 1, 0)[0],
                                    initial_feedback)
    readouts = self.readout.readout(
        feedback=feedback, **dict_union(states, glimpses, contexts))
    costs = self.readout.cost(readouts, outputs)
    if mask is not None:
        costs *= mask

    for name, variable in list(glimpses.items()) + list(states.items()):
        application_call.add_auxiliary_variable(variable.copy(), name=name)

    # These variables can be used to initialize the initial states of the
    # next batch using the last states of the current batch.
    for name in self._state_names + self._glimpse_names:
        final_value = results[name][-1].copy()
        application_call.add_auxiliary_variable(
            final_value, name=name + "_final_value")

    return costs
def new_day(lambda_at_t, imported_at_t, infected, E_t, beta, N):
    """Advance one step and return ``(new, infected, E_t)``.

    ``new`` is ``imported_at_t`` plus ``dot(infected, beta)`` scaled by
    ``lambda_at_t`` and by ``E_t / N``, clipped to ``[0, N]``.  The
    ``infected`` history is shifted by one along axis 0 with ``new`` in
    its first slot, and ``E_t`` is reduced by ``new`` (clipped to
    ``[0, E_t]``).
    """
    fraction = E_t / N
    unclipped = (imported_at_t
                 + theano.dot(infected, beta) * lambda_at_t * fraction)
    new = tt.clip(unclipped, 0, N)

    # shift the history one slot later and store the newest value in front
    history = tt.roll(infected, 1, 0)
    history = tt.set_subtensor(history[:1], new, inplace=False)

    remaining = tt.clip(E_t - new, 0, E_t)
    return new, history, remaining
def roll_and_dot(wvec, xvec):
    """
    Return ``(wvec rolled by one, dot(xvec, wvec), xvec)``; the product
    uses ``wvec`` as it was before rolling.

    wvec.shape = (n_in, )
    xvec.shape = (timesteps, n_in)
    """
    product = T.dot(xvec, wvec)
    rolled_weights = T.roll(wvec, 1)
    return rolled_weights, product, xvec
def GenInit(l, a1, a2, t1=10, t2=27, offset=8):
    """Return a length-``l`` mixture of two scaled ``tt_lognormal`` curves,
    shifted ``offset`` entries towards the front with the last ``offset``
    entries set to zero."""
    grid = tt.arange(l)
    first_curve = tt_lognormal(grid, tt.log(t1), .8) * 2350
    second_curve = tt_lognormal(grid, tt.log(t2), .25) * 12500
    mixture = first_curve * a1 + second_curve * a2
    # roll() wraps the leading entries to the end; blank them afterwards
    shifted = tt.roll(mixture, -offset)
    return tt.set_subtensor(shifted[-offset:], 0.)
def mse2consist_err(self, y):
    """Return ``0.9 * mse + 0.1 * consistency`` error.

    ``mse`` is the mean squared difference between ``y`` and ``self.y_t``.
    The consistency term is the mean squared difference between
    ``self.y_t`` and itself rolled by one along axis 0.
    """
    # Function-call form: a syntax error-free print on Python 3 that
    # prints the same line on Python 2.
    print('=== using mse2consist error. ===')
    # mean square error
    mse = T.mean(T.pow(y - self.y_t, 2))
    # consistency error
    cst_err = T.mean(T.pow(self.y_t - T.roll(self.y_t, shift=1, axis=0), 2))
    return 0.9 * mse + 0.1 * cst_err
def ShiftConv(w_t_g, s_t, N, num_shifts):
    """Circularly shift ``w_t_g`` by a fractional amount derived from ``s_t``.

    The shift ``2 * s_t - 1`` is wrapped into ``[0, N)``.  The result is a
    linear blend of ``w_t_g`` rolled by the integer part of that shift and
    by the integer part plus one, weighted by the fractional part.

    ``num_shifts`` is not used by this implementation; it is kept so that
    existing callers keep working.
    """
    shift = 2. * s_t - 1.
    Z = T.mod(shift + N, N)
    # Integer part of the shift, built once and reused for both rolls.
    whole = T.iround(T.floor(Z))
    # Weight of the lower neighbour: one minus the fractional part.
    simj = 1 - (Z - T.floor(Z))
    w_t_g_roll_1 = T.roll(w_t_g, -whole)
    w_t_g_roll_2 = T.roll(w_t_g, -(whole + 1))
    return w_t_g_roll_1 * simj + w_t_g_roll_2 * (1 - simj)
def roll(x, shift, axis=None):
    """
    A numpy-theano agnostic version of the numpy.roll operator.

    Calls either numpy.roll or theano.tensor.roll depending on the class
    of ``x``.  See numpy.roll for usage; as there, ``axis`` may be omitted
    (None).

    Raises NotImplementedError when ``x`` is neither a numpy array nor a
    theano tensor variable.
    """
    if isinstance(x, np.ndarray):
        return np.roll(x, shift, axis)
    if isinstance(x, T.basic.TensorVariable):
        return T.roll(x, shift, axis)
    raise NotImplementedError(
        "roll() does not support objects of type %s" % type(x).__name__)
def _shift_step(c_mem, c_shift):
    """Shift ``c_mem`` along axis 0 by ``c_shift`` according to ``self.mode``.

    "roll" wraps around (shift taken modulo 12); "drop" discards rows that
    leave the window and fills the vacated rows with zeros.  Any other
    mode falls through and yields None.
    """
    # c_mem is (note, mem)
    # c_shift is an int
    if self.mode == "roll":
        return T.roll(c_mem, (-c_shift) % 12, axis=0)
    if self.mode == "drop":
        def _bounded(amount):
            # clamp to [0, window_size]
            return T.maximum(0, T.minimum(amount, self.window_size))

        zeros_front = T.zeros((_bounded(-c_shift), per_note))
        zeros_back = T.zeros((_bounded(c_shift), per_note))
        first_kept = _bounded(c_shift)
        end_kept = self.window_size - _bounded(-c_shift)
        kept_rows = c_mem[first_kept:end_kept, :]
        return T.concatenate([zeros_front, kept_rows, zeros_back], 0)
def evolve_system(self, x, n, k, gamma):
    """
    Time-derivative at the current state.

    Model: dx/dt = k^n / (x^n + k^n) - gamma*x, where the first term is
    taken from the neighbouring entry along axis 1.  That dependency is
    implemented by rolling the term by `shift=-1` along that axis.
    ``x`` is a matrix.
    """
    k_to_n = T.pow(k, n)
    hill_term = k_to_n / (T.pow(x, n) + k_to_n)
    neighbour_term = T.roll(hill_term, shift=-1, axis=1)
    return neighbour_term - gamma * x
def cost_matrix(self, application_call, outputs, mask=None, **kwargs):
    """Returns generation costs for output sequences.

    Glimpses, states and the final value of every state are registered on
    `application_call` as auxiliary variables.

    See Also
    --------
    :meth:`cost` : Scalar cost.

    """
    # We assume the data has axes (time, batch, features, ...)
    batch_size = outputs.shape[1]

    # Prepare input for the iterative part
    states = dict_subset(kwargs, self._state_names, must_have=False)
    # masks in context are optional (e.g. `attended_mask`)
    contexts = dict_subset(kwargs, self._context_names, must_have=False)
    feedback = self.readout.feedback(outputs)
    inputs = self.fork.apply(feedback, as_dict=True)

    # Run the recurrent network
    transition_inputs = dict_union(inputs, states, contexts)
    results = self.transition.apply(mask=mask,
                                    return_initial_states=True,
                                    as_dict=True,
                                    **transition_inputs)

    # Separate the deliverables. The last states are discarded: they
    # are not used to predict any output symbol. The initial glimpses
    # are discarded because they are not used for prediction.
    # Remember, glimpses are computed _before_ output stage, states are
    # computed after.
    states = dict((name, results[name][:-1]) for name in self._state_names)
    glimpses = dict((name, results[name][1:])
                    for name in self._glimpse_names)

    # Compute the cost: delay the feedback by one step and put the
    # feedback of the initial outputs in the first slot.
    initial_feedback = self.readout.feedback(
        self.readout.initial_outputs(batch_size))
    feedback = tensor.set_subtensor(tensor.roll(feedback, 1, 0)[0],
                                    initial_feedback)
    readouts = self.readout.readout(
        feedback=feedback, **dict_union(states, glimpses, contexts))
    costs = self.readout.cost(readouts, outputs)
    if mask is not None:
        costs *= mask

    for name, variable in list(glimpses.items()) + list(states.items()):
        application_call.add_auxiliary_variable(variable.copy(), name=name)

    # These variables can be used to initialize the initial states of the
    # next batch using the last states of the current batch.
    for name in self._state_names:
        final_value = results[name][-1].copy()
        application_call.add_auxiliary_variable(
            final_value, name=name + "_final_value")

    return costs
def __init__(self, input, We, features, longest):
    """
    Build windowed sentence embeddings with position columns, plus a
    lexical feature vector; ``self.output`` is ``[senpos, lex]``.

    Input = a list (minibatch) of lists of indexes (pre-processed sentences)
    We = a word embedding matrix (vocabulary * dimensions)
    features = per-example matrix: columns 0-1 and 2:8 are looked up in the
        embedding table, the remaining columns are split at ``8 + longest``
        into two position blocks
    """
    # Shared embedding table, stored in theano's configured float type.
    embedding_values = numpy.asarray(We, dtype=theano.config.floatX)
    self.We = theano.shared(embedding_values, name='We', borrow=True)

    # (batch_size * indices) -> (batch_size * indices * dimensions)
    embedded = self.We[input]

    # Neighbour views along the indices axis. roll() wraps around, so the
    # slot that received the wrapped vector is overwritten with zeros; a
    # 0-valued vector therefore represents a sentence boundary.
    previous_tokens = T.set_subtensor(T.roll(embedded, 1, axis=1)[:, 0], 0.)
    next_tokens = T.set_subtensor(T.roll(embedded, -1, axis=1)[:, -1], 0.)
    window = T.concatenate([previous_tokens, embedded, next_tokens], axis=2)

    # Lexical features n_examples * dimensions: two single lookups and two
    # element-wise maxima over three lookups each.
    first_event = self.We[features[:, 0]]
    second_event = self.We[features[:, 1]]
    first_participants = T.max(self.We[features[:, 2:5]], axis=1)
    second_participants = T.max(self.We[features[:, 5:8]], axis=1)
    lex = T.concatenate(
        [first_event, second_event, first_participants, second_participants],
        axis=1)

    # Position columns, scaled by 1/100 and given a trailing axis so they
    # can be appended to the window along axis 2.
    split = 8 + longest
    first_positions = features[:, 8:split, numpy.newaxis] / 100.
    second_positions = features[:, split:, numpy.newaxis] / 100.
    senpos = T.concatenate([window, first_positions, second_positions],
                           axis=2)

    # I/O
    self.input = input
    self.output = [senpos, lex]

    # parameters of the model
    self.params = [self.We]
def lanczos(linear_op, z, m, batch_size):
    """Run ``m`` Lanczos steps per batch row and return the stacked
    ``s * V . sqrtm(M)[:, 0]`` products.

    linear_op: callable applied to a batch of vectors.
    z: batch of start vectors (one per row); each is normalised by its
        2-norm ``s`` before iterating.
    m: number of Lanczos iterations (a Python int).
    batch_size: number of rows of ``z`` to process (a Python int).

    For every row a tridiagonal matrix ``M`` is assembled from the
    collected ``alpha`` (diagonal) and ``beta`` (off-diagonal) values and
    passed to ``theano_sqrtm``.
    """
    s = z.norm(2, axis=1)
    v = z / s.dimshuffle(0, 'x')

    alpha = []
    beta = []
    V = [v]
    v_curr = v
    b = None
    v_prev = None

    # range() instead of the Python-2-only xrange(); m is small enough
    # that the difference is irrelevant on Python 2.
    for j in range(m):
        if j == 0:
            r = linear_op(v_curr)
        else:
            r = linear_op(v_curr) - b.dimshuffle(0, 'x') * v_prev
        a = T.batched_dot(v_curr, r)
        r = r - a.dimshuffle(0, 'x') * v_curr
        b = r.norm(2, axis=1)
        v_prev = v_curr
        v_curr = r / b.dimshuffle(0, 'x')
        alpha.append(a)
        # the last iteration contributes no further basis vector
        if j < m - 1:
            V.append(v_curr)
            beta.append(b)

    Az_list = []
    for idx in range(batch_size):
        alpha_diag = T.diag(T.stacklists([a_[idx] for a_ in alpha]))
        # pad beta with a zero so both diagonals have m entries
        beta_diag = T.diag(T.stacklists([b_[idx] for b_ in beta] + [0]))
        # rolling the diagonal by one row / one column places beta on the
        # sub- and super-diagonal
        M = alpha_diag + T.roll(beta_diag, 1, 0) + T.roll(beta_diag, 1, 1)
        V_matrix = T.stacklists([v_[idx] for v_ in V]).T
        approx_sqrt = s[idx] * V_matrix.dot(theano_sqrtm(M)[:, 0])
        Az_list.append(approx_sqrt)

    return T.stacklists(Az_list)
def cost(self, outputs, mask=None, **kwargs):
    """Returns generation costs for output sequences.

    Parameters
    ----------
    outputs : Theano variable
        The 3(2) dimensional tensor containing output sequences.
        The dimension 0 must stand for time, the dimension 1 for the
        position on the batch.
    mask : The 0/1 matrix identifying fake outputs.

    Notes
    -----
    The contexts are expected as keyword arguments.  When the keyword
    argument ``also_return`` is given and non-empty, a pair
    ``(costs, others)`` is returned, where ``others`` maps each requested
    name to the corresponding transition result.

    """
    batch_size = outputs.shape[-2]  # TODO Assumes only 1 features dim

    # Prepare input for the iterative part: states are optional keyword
    # arguments, contexts are mandatory.
    states = dict((name, kwargs[name])
                  for name in self.state_names if name in kwargs)
    contexts = dict((name, kwargs[name]) for name in self.context_names)
    feedback = self.readout.feedback(outputs)
    if self.fork:
        inputs = self.fork.apply(feedback, return_dict=True)
    else:
        inputs = {'feedback': feedback}

    # Run the recurrent network
    transition_inputs = dict_union(inputs, states, contexts)
    results = self.transition.apply(mask=mask,
                                    return_initial_states=True,
                                    return_dict=True,
                                    **transition_inputs)

    # Separate the deliverables (the last state of each sequence is dropped)
    states = dict((name, results[name][:-1]) for name in self.state_names)
    glimpses = dict((name, results[name]) for name in self.glimpse_names)

    # Compute the cost: delay the feedback by one step and put the
    # feedback of the initial outputs in the first slot.
    initial_feedback = self.readout.feedback(
        self.readout.initial_outputs(batch_size, **contexts))
    feedback = tensor.set_subtensor(tensor.roll(feedback, 1, 0)[0],
                                    initial_feedback)
    readouts = self.readout.readout(
        feedback=feedback, **dict_union(states, glimpses, contexts))
    costs = self.readout.cost(readouts, outputs)

    # In case the user needs some glimpses or states or smth else
    also_return = kwargs.get("also_return")
    if not also_return:
        return costs
    others = dict((name, results[name]) for name in also_return)
    return (costs, others)
def get_probs(self):
    """Return ``self.energy_(self.pps)``.

    NOTE(review): the probabilities, sampled actions and index offsets
    built below are not part of the return value; they are kept because
    drawing from ``self.t_rng`` may register state on the random stream.
    Confirm whether they can be removed.
    """
    temps = self.temperatures
    # difference of inverse temperatures between neighbours; the wrapped
    # last entry is cleared
    inv_temp_gap = (1. / temps - T.roll(1. / temps, shift=-1))
    inv_temp_gap = T.set_subtensor(inv_temp_gap[-1], 0)
    # matching difference of energies
    energy_gap = (self.energy_(self.pps)
                  - T.roll(self.energy_(self.pps), shift=-1))
    energy_gap = T.set_subtensor(energy_gap[-1], 0.)

    probs = T.exp(inv_temp_gap * energy_gap)
    draws = self.t_rng.uniform((probs.shape))
    actions = T.cast(T.gt(probs, draws), fx)

    # rising edges of the action sequence
    padded = T.concatenate([[np.cast[fx](0.)], actions])
    edges = (T.roll(padded, shift=-1) - padded)[:-1]
    edges = T.switch(T.gt(edges, 0), 1., 0.)
    edges = T.set_subtensor(edges[-1], 0.)
    offsets = edges - T.roll(edges, shift=1)

    idx = T.arange(actions.shape[0], dtype=fx)
    idx = idx + offsets
    return self.energy_(self.pps)
def mask_for_prediction(self, prediction, groundtruth_mask=None,
                        extra_generation_steps=None):
    """Return a floatX mask for ``prediction``.

    The mask is 1 up to and including the first ``self.eos_label`` along
    axis 0 and 0 afterwards.  When ``groundtruth_mask`` is given, each
    column is additionally cut off after the column sum of
    ``groundtruth_mask`` plus ``extra_generation_steps`` steps (no extra
    steps when that argument is None).
    """
    float_type = theano.config.floatX
    is_eos = tensor.eq(prediction, self.eos_label).astype(float_type)
    prediction_mask = tensor.lt(
        tensor.cumsum(is_eos, axis=0), 1).astype(float_type)
    # Delay by one step so the end-of-sequence position itself is kept;
    # the first row is always valid.
    prediction_mask = tensor.roll(prediction_mask, 1, 0)
    prediction_mask = tensor.set_subtensor(
        prediction_mask[0, :],
        tensor.ones_like(prediction_mask[0, :]))

    # Identity test: asking a symbolic variable for its truth value can
    # raise, and says nothing about whether the argument was supplied.
    if groundtruth_mask is not None:
        max_lengths = groundtruth_mask.sum(axis=0)
        if extra_generation_steps is not None:
            max_lengths = max_lengths + extra_generation_steps
        prediction_mask *= tensor.lt(
            tensor.arange(prediction.shape[0])[:, None],
            max_lengths[None, :])
    return prediction_mask
def chunk_grad(i):
    ''' operates on a subset of the gradient variables

    Tiles ``wrt`` ``chunk_size`` times, evaluates the expression on every
    copy (through ``func`` when available, otherwise by cloning ``expr``
    under scan) and returns the gradient w.r.t. the tiled input, seeded
    with an identity-like matrix rolled by ``i * chunk_size`` columns.
    '''
    wrt_rep = tt.tile(wrt, (chunk_size, 1))
    if func is None:
        expr_rep, _ = theano.scan(
            fn=lambda wrt_: theano.clone(expr, {wrt: wrt_}),
            sequences=wrt_rep)
    else:
        expr_rep = func(wrt_rep)
    seed_grad = tt.roll(tt.identity_like(expr_rep), i * chunk_size, axis=1)
    return tt.grad(cost=None, wrt=wrt_rep,
                   known_grads={expr_rep: seed_grad})
def _shift_step(c_mem, c_shift):
    """Shift ``c_mem`` along axis 0 by ``c_shift`` according to ``self.mode``.

    "roll" wraps around (shift taken modulo 12); "drop" discards rows that
    leave the window and fills the vacated rows with zeros.  Any other
    mode falls through and yields None.
    """
    # c_mem is (note, mem)
    # c_shift is an int
    if self.mode == "roll":
        return T.roll(c_mem, (-c_shift) % 12, axis=0)
    if self.mode == "drop":
        def _bounded(amount):
            # clamp to [0, window_size]
            return T.maximum(0, T.minimum(amount, self.window_size))

        zeros_front = T.zeros((_bounded(-c_shift), per_note))
        zeros_back = T.zeros((_bounded(c_shift), per_note))
        first_kept = _bounded(c_shift)
        end_kept = self.window_size - _bounded(-c_shift)
        kept_rows = c_mem[first_kept:end_kept, :]
        return T.concatenate([zeros_front, kept_rows, zeros_back], 0)
def _scan_fn(cprobs, cpos):
    """Expand one step's probabilities to the ``[low_bound, high_bound)``
    range.

    With ``self.with_artic`` the first two entries of ``cprobs`` are kept
    as they are and the rest is treated as relative probabilities;
    otherwise everything is relative and two ones are used in front.  The
    relative part is rolled by ``(cpos - low_bound) % 12``, tiled and cut
    to ``high_bound - low_bound`` entries.
    """
    if self.with_artic:
        abs_probs = cprobs[:2]
        rel_probs = cprobs[2:]
    else:
        rel_probs = cprobs
        abs_probs = T.ones((2,))

    aligned = T.roll(rel_probs, (cpos - low_bound) % 12)
    span = high_bound - low_bound
    # Force true division: with Python 2 integer operands "/" floors
    # before ceil() is applied, giving one tile too few whenever the span
    # is not a multiple of WINDOW_SIZE.
    num_tile = int(math.ceil(span / float(self.WINDOW_SIZE)))
    tiled = T.tile(aligned, (num_tile,))[:span]
    return T.concatenate([abs_probs, tiled], 0)
def l2_paired(x):
    """Spectral smoothing

    Sum of squared differences between each entry of ``x`` and its
    successor along the last axis; the last entry, which has no
    successor, is compared against zero.

    Parameters
    ----------
    x : theano tensor
        The input tensor.  Its shape is evaluated eagerly, so it must be
        computable without further inputs.

    Returns
    -------
    theano tensor
        The output tensor

    """
    dims = x.shape.eval()
    # identity with the last diagonal entry cleared: multiplying by it
    # zeroes the last column, i.e. the value that roll() wrapped around
    keep = np.eye(dims[-1])
    keep[-1, -1] = 0
    last_axis = len(dims) - 1
    successors = T.dot(T.roll(x, -1, axis=last_axis), keep)
    return T.sum((x - successors) ** 2)
def get_output_for(self, inputs, **kwargs):
    '''
    Fill the unsampled entries of the data from accumulated neighbouring
    frames, once per entry of ``self.n_samples``.

    Parameters
    ------------------------------
    inputs: two 5d tensors, [kspace_data, mask], each of shape (n, 2, nx,
    ny, nt)

    Returns
    ------------------------------
    output: 5d tensor, missing lines of k-space are filled using
    neighbouring frames.  The original data and one filled copy per entry
    of ``self.n_samples`` are concatenated along axis 1.
    '''
    x, mask = inputs[0], inputs[1]

    def accumulate(signal):
        # running result of roll_and_sum for every step count up to the
        # largest requested one
        history, _ = theano.scan(
            fn=roll_and_sum,
            outputs_info=T.zeros_like(x),
            non_sequences=(signal),
            n_steps=T.constant(np.max(self.n_samples)))
        return history

    result = accumulate(x)
    mask_result = accumulate(mask)

    results = [x]
    for i, t in enumerate(self.n_samples):
        # divide unbiasedly
        c = float(t) if self.divide_by_n else 1.0
        acc = result[t - 1]
        mask_acc = mask_result[t - 1]
        # when rolling back, need extra 1 because roll_and_sum rolls
        # after adding a val.
        back_shift = -self.frame_dist[i] - 1
        avg = T.roll(acc / T.maximum(c, mask_acc), back_shift, axis=-1)
        # keep sampled entries, use the average everywhere else
        results.append(avg * (1 - mask) + x * mask)

    # concatenate along channels
    return T.concatenate(results, axis=1)
def __init__(self, input, n_in):
    """Build the path-scoring function and the best-path decoding graph.

    ``self.A`` is an ``(n_in + 1, n_in)`` shared matrix initialised
    uniformly in ``[-0.01, 0.01)``; its row ``n_in`` is used for the first
    step, which has no predecessor.  ``self.f_score`` is a compiled
    function of ``(input, indices)``; ``self.path`` / ``self.predict``
    hold the symbolic decoded path and ``self.output`` the maximum of the
    decoding scores over axis 0.
    """
    delta = 0.01
    self.n_in = n_in
    self.input = input
    self.name = "VL" + str(n_in)

    initial_A = np.random.uniform(-1, 1, (n_in + 1, n_in)) * delta
    self.A = theano.shared(initial_A.astype(T.config.floatX))

    # Score of a given path: every step sees its own and the previous
    # index; the first step gets n_in as its "previous" index.
    indices = T.ivector('indices')
    prev_indices = T.set_subtensor(T.roll(indices, 1, axis=0)[0], n_in)

    self.params = [self.A]

    step_scores, _ = theano.scan(fn=self.score,
                                 sequences=[input, indices, prev_indices],
                                 n_steps=input.shape[0])
    path_score = T.sum(step_scores)
    self.f_score = theano.function([self.input, indices], path_score)

    # Forward pass over the remaining steps
    start_score = self.A[n_in] + input[0]
    [best_score, best_index], _ = theano.scan(
        fn=self.viterbi,
        outputs_info=[start_score, None],
        sequences=input[1:],
        n_steps=input.shape[0] - 1)

    # Backward pass: follow the stored indices from the best final entry
    last = T.argmax(best_score[-1])
    back_path, _ = theano.scan(fn=self.findPath,
                               outputs_info=[last],
                               sequences=best_index,
                               go_backwards=True,
                               n_steps=best_index.shape[0])

    self.path = T.concatenate(([last], back_path))
    self.output = T.max(best_score, axis=0)
    self.predict = self.path
    self.updates = None
def get_output_for(self, inputs, **kwargs):
    '''
    Fill the unsampled entries of the data from accumulated neighbouring
    frames, once per entry of ``self.n_samples``.

    Parameters
    ------------------------------
    inputs: two 5d tensors, [kspace_data, mask], each of shape (n, 2, nx,
    ny, nt)

    Returns
    ------------------------------
    output: 5d tensor, missing lines of k-space are filled using
    neighbouring frames.  The original data and one filled copy per entry
    of ``self.n_samples`` are concatenated along axis 1.
    '''
    x, mask = inputs[0], inputs[1]

    def accumulate(signal):
        # running result of roll_and_sum for every step count up to the
        # largest requested one
        history, _ = theano.scan(
            fn=roll_and_sum,
            outputs_info=T.zeros_like(x),
            non_sequences=(signal),
            n_steps=T.constant(np.max(self.n_samples)))
        return history

    result = accumulate(x)
    mask_result = accumulate(mask)

    results = [x]
    for i, t in enumerate(self.n_samples):
        # divide unbiasedly
        c = float(t) if self.divide_by_n else 1.0
        acc = result[t - 1]
        mask_acc = mask_result[t - 1]
        # when rolling back, need extra 1 because roll_and_sum rolls
        # after adding a val.
        back_shift = -self.frame_dist[i] - 1
        avg = T.roll(acc / T.maximum(c, mask_acc), back_shift, axis=-1)
        # keep sampled entries, use the average everywhere else
        results.append(avg * (1 - mask) + x * mask)

    # concatenate along channels
    return T.concatenate(results, axis=1)
def add_exploration(recognizer, data, train_conf):
    """Build the prediction and mask graphs for the configured exploration mode.

    :param recognizer: object providing ``labels``, ``labels_mask`` and
        ``get_generate_graph``; ``labels.shape[0]`` is used as the number of
        steps and ``labels.shape[1]`` as the batch size
    :param data: object providing ``eos_label``
    :param train_conf: mapping; its ``'exploration'`` entry selects the mode
        and defaults to ``'imitative'``
    :returns: ``(prediction, prediction_mask)``; both are ``None`` in
        ``'imitative'`` mode
    :raises ValueError: if the mode is not ``'imitative'``, ``'greedy'`` or
        ``'mixed'``
    """
    explore_conf = train_conf.get('exploration', 'imitative')
    if explore_conf == 'imitative':
        return None, None
    if explore_conf not in ('greedy', 'mixed'):
        raise ValueError(
            "unknown exploration mode {!r}; expected 'imitative', 'greedy' "
            "or 'mixed'".format(explore_conf))

    # Generate for a few more steps than the ground truth is long.
    length_expand = 10
    prediction = recognizer.get_generate_graph(
        n_steps=recognizer.labels.shape[0] + length_expand)['outputs']

    # 1 while no EOS has been produced yet, 0 from the first EOS onwards ...
    prediction_mask = tensor.lt(
        tensor.cumsum(tensor.eq(prediction, data.eos_label), axis=0),
        1).astype(floatX)
    # ... then shifted one step down with the first row forced to 1, so the
    # first EOS itself stays inside the mask.
    prediction_mask = tensor.roll(prediction_mask, 1, 0)
    prediction_mask = tensor.set_subtensor(
        prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))

    if explore_conf == 'mixed':
        batch_size = recognizer.labels.shape[1]
        # Pad ground truth and its mask with zeros to the generated length.
        targets = tensor.concatenate([
            recognizer.labels,
            tensor.zeros((length_expand, batch_size), dtype='int64')])
        targets_mask = tensor.concatenate([
            recognizer.labels_mask,
            tensor.zeros((length_expand, batch_size), dtype=floatX)])

        # One coin flip per batch column: 1 -> generated, 0 -> ground truth.
        rng = MRG_RandomStreams()
        generate = rng.binomial((batch_size,), p=0.5, dtype='int64')
        prediction = (generate[None, :] * prediction +
                      (1 - generate[None, :]) * targets)
        prediction_mask = (
            tensor.cast(generate[None, :] * prediction_mask, floatX) +
            tensor.cast((1 - generate[None, :]) * targets_mask, floatX))

    prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    return prediction, prediction_mask
def add_exploration(recognizer, data, train_conf):
    """Return ``(prediction, prediction_mask)`` for the exploration mode.

    The mode is read from ``train_conf['exploration']`` and defaults to
    ``'imitative'``:

    * ``'imitative'`` -- nothing is built; ``(None, None)`` is returned.
    * ``'greedy'`` -- the outputs of ``recognizer.get_generate_graph`` are
      used, run for ``recognizer.labels.shape[0] + 10`` steps.
    * ``'mixed'`` -- per batch column, either the generated outputs or the
      zero-padded ``recognizer.labels`` are chosen with probability 0.5.

    :raises ValueError: for any other mode; the message names the mode.
    """
    prediction = None
    prediction_mask = None
    explore_conf = train_conf.get('exploration', 'imitative')

    if explore_conf in ['greedy', 'mixed']:
        length_expand = 10
        n_steps = recognizer.labels.shape[0] + length_expand
        prediction = recognizer.get_generate_graph(n_steps=n_steps)['outputs']

        # Mask is 1 up to and including the first EOS of each column: count
        # EOS symbols so far, keep positions with none, then shift by one
        # step and force the first row to 1.
        eos_seen = tensor.cumsum(
            tensor.eq(prediction, data.eos_label), axis=0)
        prediction_mask = tensor.lt(eos_seen, 1).astype(floatX)
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :],
            tensor.ones_like(prediction_mask[0, :]))

        if explore_conf == 'mixed':
            batch_size = recognizer.labels.shape[1]
            targets = tensor.concatenate([
                recognizer.labels,
                tensor.zeros((length_expand, batch_size), dtype='int64')])
            targets_mask = tensor.concatenate([
                recognizer.labels_mask,
                tensor.zeros((length_expand, batch_size), dtype=floatX)])

            rng = MRG_RandomStreams()
            generate = rng.binomial((batch_size,), p=0.5, dtype='int64')
            use_generated = generate[None, :]
            use_targets = 1 - generate[None, :]
            prediction = (use_generated * prediction +
                          use_targets * targets)
            prediction_mask = (
                tensor.cast(use_generated * prediction_mask, floatX) +
                tensor.cast(use_targets * targets_mask, floatX))

        prediction_mask = theano.gradient.disconnected_grad(prediction_mask)
    elif explore_conf != 'imitative':
        # Name the offending value so a misconfiguration is easy to spot.
        raise ValueError(
            "unknown exploration mode {!r}; expected 'imitative', 'greedy' "
            "or 'mixed'".format(explore_conf))

    return prediction, prediction_mask
def get_output(self, train):
    """Return the layer input rolled by ``self.shift`` along ``self.axis``."""
    offset, roll_axis = self.shift, self.axis
    data = self.get_input(train)
    return T.roll(data, offset, axis=roll_axis)
def __init__(self,
             n_out = None,
             n_units = None,
             direction = 1,
             truncation = -1,
             sampling = 1,
             encoder = None,
             unit = 'lstm',
             n_dec = 0,
             attention = "none",
             recurrent_transform = "none",
             recurrent_transform_attribs = "{}",
             attention_template = 128,
             attention_distance = 'l2',
             attention_step = "linear",
             attention_beam = 0,
             attention_norm = "exp",
             attention_momentum = "none",
             attention_sharpening = 1.0,
             attention_nbest = 0,
             attention_store = False,
             attention_smooth = False,
             attention_glimpse = 1,
             attention_filters = 1,
             attention_accumulator = 'sum',
             attention_loss = 0,
             attention_bn = 0,
             attention_lm = 'none',
             attention_ndec = 1,
             attention_memory = 0,
             attention_alnpts = 0,
             attention_epoch = 1,
             attention_segstep=0.01,
             attention_offset=0.95,
             attention_method="epoch",
             attention_scale=10,
             context=-1,
             base = None,
             aligner = None,
             lm = False,
             force_lm = False,
             droplm = 1.0,
             forward_weights_init=None,
             bias_random_init_forget_shift=0.0,
             copy_weights_from_base=False,
             segment_input=False,
             join_states=False,
             sample_segment=None,
             **kwargs):
    """
    Set up a recurrent layer: select the unit implementation, register the
    layer attributes, create the weights, build the input ``z`` and run the
    unit's scan over the sequence.

    :param n_out: number of cells; if None it is the sum of the encoders' n_out
    :param n_units: used when initialized via Network.from_hdf_model_topology;
      must equal n_out when given
    :param direction: process sequence in forward (1) or backward (-1) direction
    :param truncation: gradient truncation
    :param sampling: scan every nth frame only
    :param encoder: list of encoder layers used as initalization for the hidden state
    :param unit: cell type (one of 'lstm', 'vanilla', 'gru', 'sru'); 'lstm'
      is mapped to a concrete implementation depending on GPU availability
    :param n_dec: absolute number of steps to unfold the network if integer,
      else relative number of steps from encoder
    :param attention: not referenced in this constructor
    :param recurrent_transform: name of recurrent transform
    :param recurrent_transform_attribs: dictionary containing parameters for a
      recurrent transform; a str is parsed with json.loads
    :param attention_template: stored as layer attribute unless None
    :param attention_distance: stored as layer attribute
    :param attention_step: stored as layer attribute
    :param attention_beam: stored as layer attribute
    :param attention_norm: stored as layer attribute
    :param attention_momentum: stored as layer attribute; any value other
      than 'none' also enables attention_store
    :param attention_sharpening: stored as layer attribute
    :param attention_nbest: stored as layer attribute
    :param attention_store: if set (or implied by attention_smooth /
      attention_momentum), the 'att_' state variables are kept in self.attention
    :param attention_smooth: stored as layer attribute; enables attention_store
    :param attention_glimpse: stored as layer attribute
    :param attention_lm: stored as layer attribute; any value other than
      'none' also triggers creation of the LM weights
    :param context: if > 0, the input is expanded into windows of this many frames
    :param base: list of layers which outputs are considered as based during
      attention mechanisms; defaults to encoder
    :param aligner: stored as self.aligner when given
    :param lm: activate RNNLM
    :param force_lm: expect previous labels to be given during testing
    :param droplm: probability to take the expected output as predecessor
      instead of the real one when LM=true
    :param forward_weights_init: forward weight initialisation spec; defaults
      to "random_uniform(p_add=<unit.n_re>)"
    :param bias_random_init_forget_shift: initialize forget gate bias of lstm
      networks with this value
    :param copy_weights_from_base: reuse W_re, W_in and b (and the LM weights)
      of base[0] instead of creating new parameters
    :param segment_input: use the segment-wise unit and scan (scan_seg)
    :param join_states: with segment_input, subsample the source attention by
      the source's nstates
    :param sample_segment: not referenced in this constructor

    The remaining attention_* arguments (filters, accumulator, loss, bn, ndec,
    memory, alnpts, epoch, segstep, offset, method, scale) are only stored as
    layer attributes here.

    NOTE(review): the block structure below was reconstructed from a source
    whose indentation was lost -- confirm nesting against the original file.
    """
    source_index = None
    # A single 'length' source is only used for its index, not as an input.
    if len(kwargs['sources']) == 1 and (kwargs['sources'][0].layer_class.endswith('length') or kwargs['sources'][0].layer_class.startswith('length')):
        kwargs['sources'] = []
        source_index = kwargs['index']
    # Remember the requested unit name; the attribute stores this, not the
    # auto-selected implementation.
    unit_given = unit
    from Device import is_using_gpu
    if unit == 'lstm':  # auto selection
        if not is_using_gpu():
            unit = 'lstme'
        elif recurrent_transform == 'none' and (not lm or droplm == 0.0):
            unit = 'lstmp'
        else:
            unit = 'lstmc'
    elif unit in ("lstmc", "lstmp") and not is_using_gpu():
        unit = "lstme"
    # Segment-wise input overrides any unit selected above.
    if segment_input:
        if is_using_gpu():
            unit = "lstmps"
        else:
            unit = "lstms"
    if n_out is None:
        assert encoder
        n_out = sum([enc.attrs['n_out'] for enc in encoder])
    kwargs.setdefault("n_out", n_out)
    if n_units is not None:
        assert n_units == n_out
    self.attention_weight = T.constant(1.,'float32')
    if len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('length'):
        kwargs['sources'] = []
    elif len(kwargs['sources']) == 1 and kwargs['sources'][0].layer_class.startswith('signal'):
        kwargs['sources'] = []
    super(RecurrentUnitLayer, self).__init__(**kwargs)

    # Register the configuration as layer attributes.
    self.set_attr('from', ",".join([s.name for s in self.sources]) if self.sources else "null")
    self.set_attr('n_out', n_out)
    self.set_attr('unit', unit_given.encode("utf8"))
    self.set_attr('truncation', truncation)
    self.set_attr('sampling', sampling)
    self.set_attr('direction', direction)
    self.set_attr('lm', lm)
    self.set_attr('force_lm', force_lm)
    self.set_attr('droplm', droplm)
    if bias_random_init_forget_shift:
        self.set_attr("bias_random_init_forget_shift", bias_random_init_forget_shift)
    self.set_attr('attention_beam', attention_beam)
    self.set_attr('recurrent_transform', recurrent_transform.encode("utf8"))
    if isinstance(recurrent_transform_attribs, str):
        recurrent_transform_attribs = json.loads(recurrent_transform_attribs)
    if attention_template is not None:
        self.set_attr('attention_template', attention_template)
    self.set_attr('recurrent_transform_attribs', recurrent_transform_attribs)
    self.set_attr('attention_distance', attention_distance.encode("utf8"))
    self.set_attr('attention_step', attention_step.encode("utf8"))
    self.set_attr('attention_norm', attention_norm.encode("utf8"))
    self.set_attr('attention_sharpening', attention_sharpening)
    self.set_attr('attention_nbest', attention_nbest)
    # Smoothing and momentum both imply that the attention is stored.
    attention_store = attention_store or attention_smooth or attention_momentum != 'none'
    self.set_attr('attention_store', attention_store)
    self.set_attr('attention_smooth', attention_smooth)
    self.set_attr('attention_momentum', attention_momentum.encode('utf8'))
    self.set_attr('attention_glimpse', attention_glimpse)
    self.set_attr('attention_filters', attention_filters)
    self.set_attr('attention_lm', attention_lm)
    self.set_attr('attention_bn', attention_bn)
    self.set_attr('attention_accumulator', attention_accumulator)
    self.set_attr('attention_ndec', attention_ndec)
    self.set_attr('attention_memory', attention_memory)
    self.set_attr('attention_loss', attention_loss)
    self.set_attr('n_dec', n_dec)
    self.set_attr('segment_input', segment_input)
    self.set_attr('attention_alnpts', attention_alnpts)
    self.set_attr('attention_epoch', attention_epoch)
    self.set_attr('attention_segstep', attention_segstep)
    self.set_attr('attention_offset', attention_offset)
    self.set_attr('attention_method', attention_method)
    self.set_attr('attention_scale', attention_scale)

    # For segment-wise input, derive the segment indicator `inv_att` from the
    # source layer's attention (all zeros when eval_flag is set).
    if segment_input:
        if not self.eval_flag:
        #if self.eval_flag:
            if isinstance(self.sources[0],RecurrentUnitLayer):
                self.inv_att = self.sources[0].inv_att #NBT
            else:
                if not join_states:
                    self.inv_att = self.sources[0].attention #NBT
                else:
                    assert hasattr(self.sources[0], "nstates"), "source does not have number of states!"
                    ns = self.sources[0].nstates
                    self.inv_att = self.sources[0].attention[(ns-1)::ns]
            # Shift by one step along the first axis, zero the first entry,
            # then take the max over the last axis.
            inv_att = T.roll(self.inv_att.dimshuffle(2, 1, 0),1,axis=0)#TBN
            inv_att = T.set_subtensor(inv_att[0],T.zeros((inv_att.shape[1],inv_att.shape[2])))
            inv_att = T.max(inv_att,axis=-1)
        else:
            inv_att = T.zeros((self.sources[0].output.shape[0],self.sources[0].output.shape[1]))
    if encoder and hasattr(encoder[0],'act'):
        self.set_attr('encoder', ",".join([e.name for e in encoder]))
    if base:
        self.set_attr('base', ",".join([b.name for b in base]))
    else:
        base = encoder
    self.base = base
    self.encoder = encoder
    if aligner:
        self.aligner = aligner
    self.set_attr('n_units', n_out)
    # Instantiate the unit class named by the upper-cased unit string.
    # NOTE(review): eval() on a configuration string -- confirm `unit` can
    # never come from untrusted input.
    unit = eval(unit.upper())(**self.attrs)
    assert isinstance(unit, Unit)
    self.unit = unit
    kwargs.setdefault("n_out", unit.n_out)
    n_out = unit.n_out
    self.set_attr('n_out', unit.n_out)

    # Decoder length: a negative n_dec takes the source index from this
    # layer's own index and is then used with its sign flipped.
    if n_dec < 0:
        source_index = self.index
        n_dec *= -1
    if n_dec != 0:
        self.target_index = self.index
        if isinstance(n_dec,float):
            # Relative length: per-sequence length is
            # ceil(source length * n_dec); build a matching 0/1 index.
            if not source_index:
                source_index = encoder[0].index if encoder else base[0].index
            lengths = T.cast(T.ceil(T.sum(T.cast(source_index,'float32'),axis=0) * n_dec), 'int32')
            idx, _ = theano.map(lambda l_i, l_m:T.concatenate([T.ones((l_i,),'int8'),T.zeros((l_m-l_i,),'int8')]), [lengths], [T.max(lengths)+1])
            self.index = idx.dimshuffle(1,0)[:-1]
            n_dec = T.cast(T.ceil(T.cast(source_index.shape[0],'float32') * numpy.float32(n_dec)),'int32')
        else:
            # Absolute length: an all-ones index of n_dec steps.
            if encoder:
                self.index = encoder[0].index
            self.index = T.ones((n_dec,self.index.shape[1]),'int8')
    else:
        n_dec = self.index.shape[0]

    # initialize recurrent weights
    self.W_re = None
    if unit.n_re > 0:
        self.W_re = self.add_param(self.create_recurrent_weights(unit.n_units, unit.n_re, name="W_re_%s" % self.name))
    # initialize forward weights
    bias_init_value = self.create_bias(unit.n_in).get_value()
    if bias_random_init_forget_shift:
        assert unit.n_units * 4 == unit.n_in  # (input gate, forget gate, output gate, net input)
        bias_init_value[unit.n_units:2 * unit.n_units] += bias_random_init_forget_shift
    self.b.set_value(bias_init_value)
    if not forward_weights_init:
        forward_weights_init = "random_uniform(p_add=%i)" % unit.n_re
    else:
        self.set_attr('forward_weights_init', forward_weights_init)
    self.forward_weights_init = forward_weights_init
    self.W_in = []
    sample_mean, gamma = None, None
    if copy_weights_from_base:
        # Share the weights of base[0]; they are assigned directly and not
        # registered through add_param.
        self.params = {}
        #self.W_re = self.add_param(base[0].W_re)
        #self.W_in = [ self.add_param(W) for W in base[0].W_in ]
        #self.b = self.add_param(base[0].b)
        self.W_re = base[0].W_re
        self.W_in = base[0].W_in
        self.b = base[0].b
        if self.attrs.get('batch_norm', False):
            sample_mean = base[0].sample_mean
            gamma = base[0].gamma
        #self.masks = base[0].masks
        #self.mass = base[0].mass
    else:
        for s in self.sources:
            W = self.create_forward_weights(s.attrs['n_out'], unit.n_in, name="W_in_%s_%s" % (s.name, self.name))
            self.W_in.append(self.add_param(W))

    # make input
    z = self.b
    for x_t, m, W in zip(self.sources, self.masks, self.W_in):
        if x_t.attrs['sparse']:
            # NOTE(review): out_dim is assigned but not read anywhere in this
            # constructor.
            if x_t.output.ndim == 3:
                out_dim = x_t.output.shape[2]
            elif x_t.output.ndim == 2:
                out_dim = 1
            else:
                assert False, x_t.output.ndim
            # Sparse sources index rows of W instead of multiplying by it.
            if x_t.output.ndim == 3:
                z += W[T.cast(x_t.output[:,:,0], 'int32')]
            elif x_t.output.ndim == 2:
                z += W[T.cast(x_t.output, 'int32')]
            else:
                assert False, x_t.output.ndim
        elif m is None:
            z += T.dot(x_t.output, W)
        else:
            z += self.dot(self.mass * m * x_t.output, W)
    #if self.attrs['batch_norm']:
    #  z = self.batch_norm(z, unit.n_in)
    num_batches = self.index.shape[1]
    self.num_batches = num_batches
    non_sequences = []

    # Language-model feedback weights and the mask deciding per frame whether
    # the given target is fed back.
    if self.attrs['lm'] or attention_lm != 'none':
        if not 'target' in self.attrs:
            self.attrs['target'] = 'classes'
        if self.attrs['droplm'] > 0.0 or not (self.train_flag or force_lm):
            if copy_weights_from_base:
                self.W_lm_in = base[0].W_lm_in
                self.b_lm_in = base[0].b_lm_in
            else:
                l = sqrt(6.) / sqrt(unit.n_out + self.y_in[self.attrs['target']].n_out)
                values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(unit.n_out, self.y_in[self.attrs['target']].n_out)), dtype=theano.config.floatX)
                self.W_lm_in = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_in_"+self.name))
                self.b_lm_in = self.create_bias(self.y_in[self.attrs['target']].n_out, 'b_lm_in')
        l = sqrt(6.) / sqrt(unit.n_in + self.y_in[self.attrs['target']].n_out)
        values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.y_in[self.attrs['target']].n_out, unit.n_in)), dtype=theano.config.floatX)
        if copy_weights_from_base:
            self.W_lm_out = base[0].W_lm_out
        else:
            self.W_lm_out = self.add_param(self.shared(value=values, borrow=True, name = "W_lm_out_"+self.name))
        if self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
            self.lmmask = 1
            #if recurrent_transform != 'none':
            #  recurrent_transform = recurrent_transform[:-3]
        elif self.attrs['droplm'] < 1.0 and (self.train_flag or force_lm):
            # Random 0/1 mask, drawn as 1 with probability 1 - droplm.
            from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
            srng = RandomStreams(self.rng.randint(1234) + 1)
            self.lmmask = T.cast(srng.binomial(n=1, p=1.0 - self.attrs['droplm'], size=self.index.shape), theano.config.floatX).dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)
        else:
            self.lmmask = T.zeros_like(self.index, dtype='float32').dimshuffle(0,1,'x').repeat(unit.n_in,axis=2)

    if recurrent_transform == 'input': # attention is just a sequence dependent bias (lstmp compatible)
        src = []
        src_names = []
        n_in = 0
        for e in base:
            #src_base = [ s for s in e.sources if s.name not in src_names ]
            #src_names += [ s.name for s in e.sources ]
            src_base = [ e ]
            src_names += [e.name]
            src += [s.output for s in src_base]
            n_in += sum([s.attrs['n_out'] for s in src_base])
        self.xc = T.concatenate(src, axis=2)
        l = sqrt(6.) / sqrt(self.attrs['n_out'] + n_in)
        values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, 1)), dtype=theano.config.floatX)
        self.W_att_xc = self.add_param(self.shared(value=values, borrow=True, name = "W_att_xc"))
        values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(n_in, self.attrs['n_out'] * 4)), dtype=theano.config.floatX)
        self.W_att_in = self.add_param(self.shared(value=values, borrow=True, name = "W_att_in"))
        # Weights normalised over axis 0, applied to xc, summed over axis 0
        # and projected with W_att_in.
        zz = T.exp(T.tanh(T.dot(self.xc, self.W_att_xc))) # TB1
        self.zc = T.dot(T.sum(self.xc * (zz / T.sum(zz, axis=0, keepdims=True)).repeat(self.xc.shape[2],axis=2), axis=0, keepdims=True), self.W_att_in)
        # From here on the layer behaves as if no recurrent transform was set;
        # the 'recurrent_transform' attribute still holds 'input'.
        recurrent_transform = 'none'
    elif recurrent_transform == 'attention_align':
        max_skip = base[0].attrs['max_skip']
        values = numpy.zeros((max_skip,), dtype=theano.config.floatX)
        self.T_b = self.add_param(self.shared(value=values, borrow=True, name="T_b"), name="T_b")
        l = sqrt(6.) / sqrt(self.attrs['n_out'] + max_skip)
        values = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(self.attrs['n_out'], max_skip)), dtype=theano.config.floatX)
        self.T_W = self.add_param(self.shared(value=values, borrow=True, name="T_W"), name="T_W")
        y_t = T.dot(self.base[0].attention, T.arange(self.base[0].output.shape[0], dtype='float32'))  # NB
        y_t = T.concatenate([T.zeros_like(y_t[:1]), y_t], axis=0)  # (N+1)B
        y_t = y_t[1:] - y_t[:-1]  # NB
        self.y_t = y_t # T.clip(y_t,numpy.float32(0),numpy.float32(max_skip - 1))
        # NOTE(review): this overwrites the self.y_t assigned on the line
        # above, so the y_t computation has no effect on self.y_t.
        self.y_t = T.cast(self.base[0].backtrace,'float32')
    elif recurrent_transform == 'attention_segment':
        assert aligner.attention, "Segment-wise attention requires attention points!"
    recurrent_transform_inst = RecurrentTransform.transform_classes[recurrent_transform](layer=self)
    assert isinstance(recurrent_transform_inst, RecurrentTransform.RecurrentTransformBase)
    unit.recurrent_transform = recurrent_transform_inst
    self.recurrent_transform = recurrent_transform_inst

    # scan over sequence
    for s in range(self.attrs['sampling']):
        index = self.index[s::self.attrs['sampling']]
        if context > 0:
            # Expand every frame into a window of `context` frames (the
            # sequence is zero-padded with context - 1 frames in front) and
            # fold the time axis into the batch axis.
            from TheanoUtil import context_batched
            n_batches = z.shape[1]
            time, batch, dim = z.shape[0], z.shape[1], z.shape[2]
            #z = context_batched(z[::direction or 1], window=context)[::direction or 1] # TB(CD)
            from theano.ifelse import ifelse
            def context_window(idx, x_in, i_in):
                x_out = x_in[idx:idx + context]
                x_out = x_out.dimshuffle('x',1,0,2).reshape((1, batch, dim * context))
                i_out = i_in[idx:idx+1].repeat(context, axis=0)
                i_out = ifelse(T.lt(idx,context),T.set_subtensor(i_out[:context - idx],numpy.int8(0)),i_out).reshape((1, batch * context))
                return x_out, i_out
            z = z[::direction or 1]
            i = index[::direction or 1]
            out, _ = theano.map(context_window, sequences = [T.arange(z.shape[0])], non_sequences = [T.concatenate([T.zeros((context - 1,z.shape[1],z.shape[2]),dtype='float32'),z],axis=0), i])
            z = out[0][::direction or 1]
            i = out[1][::direction or 1] # T(BC)
            direction = 1
            z = z.reshape((time * batch, context * dim)) # (TB)(CD)
            z = z.reshape((time * batch, context, dim)).dimshuffle(1,0,2) # C(TB)D
            i = i.reshape((time, context, batch)).dimshuffle(1,0,2).reshape((context, time * batch))
            index = i
            num_batches = time * batch

        sequences = z
        sources = self.sources
        # Initial states: taken from the encoders when given, zeros otherwise.
        if encoder:
            if recurrent_transform == "attention_segment":
                if hasattr(encoder[0],'act'):
                    outputs_info = [T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act)]
                else:
                    # outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
                    # NOTE(review): outputs_info is not assigned earlier on
                    # this path within this constructor -- verify.
                    outputs_info[0] = self.aligner.output[-1]
            elif hasattr(encoder[0],'act'):
                outputs_info = [ T.concatenate([e.act[i][-1] for e in encoder], axis=1) for i in range(unit.n_act) ]
            else:
                outputs_info = [ T.concatenate([e[i] for e in encoder], axis=1) for i in range(unit.n_act) ]
            # Broadcast the input to n_dec steps.
            sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))
        else:
            outputs_info = [ T.alloc(numpy.cast[theano.config.floatX](0), num_batches, unit.n_units) for a in range(unit.n_act) ]

        # With droplm == 0 the LM contribution of the targets is added to the
        # input directly.
        if self.attrs['lm'] and self.attrs['droplm'] == 0.0 and (self.train_flag or force_lm):
            if self.network.y[self.attrs['target']].ndim == 3:
                sequences += T.dot(self.network.y[self.attrs['target']],self.W_lm_out)
            else:
                y = self.y_in[self.attrs['target']].flatten()
                sequences += self.W_lm_out[y].reshape((index.shape[0],index.shape[1],unit.n_in))

        # Nothing but the bias so far: broadcast it to n_dec steps.
        if sequences == self.b:
            sequences += T.alloc(numpy.cast[theano.config.floatX](0), n_dec, num_batches, unit.n_in) + (self.zc if self.attrs['recurrent_transform'] == 'input' else numpy.float32(0))

        if unit.recurrent_transform:
            outputs_info += unit.recurrent_transform.get_sorted_state_vars_initial()

        index_f = T.cast(index, theano.config.floatX)
        unit.set_parent(self)

        if segment_input:
            outputs = unit.scan_seg(x=sources,
                                    z=sequences[s::self.attrs['sampling']],
                                    att = inv_att,
                                    non_sequences=non_sequences,
                                    i=index_f,
                                    outputs_info=outputs_info,
                                    W_re=self.W_re,
                                    W_in=self.W_in,
                                    b=self.b,
                                    go_backwards=direction == -1,
                                    truncate_gradient=self.attrs['truncation'])
        else:
            outputs = unit.scan(x=sources,
                                z=sequences[s::self.attrs['sampling']],
                                non_sequences=non_sequences,
                                i=index_f,
                                outputs_info=outputs_info,
                                W_re=self.W_re,
                                W_in=self.W_in,
                                b=self.b,
                                go_backwards=direction == -1,
                                truncate_gradient=self.attrs['truncation'])

        if not isinstance(outputs, list):
            outputs = [outputs]
        if outputs:
            outputs[0].name = "%s.act[0]" % self.name
            if context > 0:
                # Keep the last window step and unfold time from the batch axis.
                for i in range(len(outputs)):
                    outputs[i] = outputs[i][-1].reshape((outputs[i].shape[1]//n_batches,n_batches,outputs[i].shape[2]))

        if unit.recurrent_transform:
            unit.recurrent_transform_state_var_seqs = outputs[-len(unit.recurrent_transform.state_vars):]

        if self.attrs['sampling'] > 1:
            # Interleave the sub-sampled results into full-length tensors.
            if s == 0:
                self.act = [ T.alloc(numpy.cast['float32'](0), self.index.shape[0], self.index.shape[1], n_out) for act in outputs ]
            self.act = [ T.set_subtensor(tot[s::self.attrs['sampling']], act) for tot,act in zip(self.act, outputs) ]
        else:
            # The first n_act outputs are activations; the rest are the
            # transform's state variables.
            self.act = outputs[:unit.n_act]
            if len(outputs) > unit.n_act:
                self.aux = outputs[unit.n_act:]

    if self.attrs['attention_store']:
        self.attention = [ self.aux[i].dimshuffle(0,2,1) for i,v in enumerate(sorted(unit.recurrent_transform.state_vars.keys())) if v.startswith('att_') ] # NBT
        for i in range(len(self.attention)):
            # Drop the first stored step and append a one-hot vector as the
            # final step.
            vec = T.eye(self.attention[i].shape[2], 1, -direction * (self.attention[i].shape[2] - 1))
            last = vec.dimshuffle(1, 'x', 0).repeat(self.index.shape[1], axis=1)
            self.attention[i] = T.concatenate([self.attention[i][1:],last],axis=0)[::direction]

    self.cost_val = numpy.float32(0)
    if recurrent_transform == 'attention_align':
        back = T.ceil(self.aux[sorted(unit.recurrent_transform.state_vars.keys()).index('t')])
        def make_output(base, yout, trace, length):
            # Gather rows of `base` along the reversed trace and zero-pad to
            # index length + 1.
            length = T.cast(length, 'int32')
            idx = T.cast(trace[:length][::-1],'int32')
            x_out = T.concatenate([base[idx],T.zeros((self.index.shape[0] + 1 - length, base.shape[1]), 'float32')],axis=0)
            y_out = T.concatenate([yout[idx,T.arange(length)],T.zeros((self.index.shape[0] + 1 - length, ), 'float32')],axis=0)
            return x_out, y_out

        output, _ = theano.map(make_output,
                               sequences = [base[0].output.dimshuffle(1,0,2),
                                            self.y_t.dimshuffle(1,2,0),
                                            back.dimshuffle(1,0),
                                            T.sum(self.index,axis=0,dtype='float32')])
        self.attrs['n_out'] = base[0].attrs['n_out']
        self.params.update(unit.params)
        self.output = output[0].dimshuffle(1,0,2)[:-1]

        # Cross-entropy cost of the values projected with T_W / T_b against
        # the gathered y values.
        z = T.dot(self.act[0], self.T_W)[:-1] + self.T_b
        z = z.reshape((z.shape[0] * z.shape[1], z.shape[2]))
        idx = (self.index[1:].flatten() > 0).nonzero()
        idy = (self.index[1:][::-1].flatten() > 0).nonzero()
        y_out = T.cast(output[1],'int32').dimshuffle(1, 0)[:-1].flatten()
        nll, _ = T.nnet.crossentropy_softmax_1hot(x=z[idx], y_idx=y_out[idy])
        self.cost_val = T.sum(nll)
        recog = T.argmax(z[idx], axis=1)
        real = y_out[idy]
        self.errors = lambda: T.sum(T.neq(recog, real))

        return

        # NOTE(review): everything from here to the end of this branch follows
        # an unconditional return and looks unreachable (given the nesting as
        # reconstructed) -- confirm before relying on or removing it.
        back += T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
        idx = (self.index[:-1].flatten() > 0).nonzero()
        idx = T.cast(back[::-1].flatten()[idx],'int32')
        x_out = base[0].output
        #x_out = x_out.dimshuffle(1,0,2).reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
        #x_out = x_out.reshape((self.index.shape[1], self.index.shape[0] - 1, x_out.shape[1])).dimshuffle(1,0,2)
        x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
        x_out = x_out.reshape((self.index.shape[0] - 1, self.index.shape[1], x_out.shape[1]))
        self.output = T.concatenate([x_out, base[0].output[1:]],axis=0)
        self.attrs['n_out'] = base[0].attrs['n_out']
        self.params.update(unit.params)
        return

        skips = T.dot(T.nnet.softmax(z), T.arange(z.shape[1], dtype='float32')).reshape(self.index[1:].shape)
        shift = T.arange(self.index.shape[1], dtype='float32') * T.cast(self.base[0].index.shape[0], 'float32')
        skips = T.concatenate([T.zeros_like(self.y_t[:1]),self.y_t[:-1]],axis=0)
        idx = shift + T.cumsum(skips, axis=0)
        idx = T.cast(idx[:-1].flatten(),'int32')
        #idx = (idx.flatten() > 0).nonzero()
        #idx = base[0].attention.flatten()
        x_out = base[0].output[::-1]
        x_out = x_out.reshape((x_out.shape[0] * x_out.shape[1], x_out.shape[2]))[idx]
        x_out = x_out.reshape((self.index.shape[0], self.index.shape[1], x_out.shape[1]))
        self.output = T.concatenate([base[0].output[-1:], x_out], axis=0)[::-1]
        self.attrs['n_out'] = base[0].attrs['n_out']
        self.params.update(unit.params)
        return

    if recurrent_transform == 'batch_norm':
        self.params['sample_mean_batch_norm'].custom_update = T.dot(T.mean(self.act[0],axis=[0,1]),self.W_re)
        self.params['sample_mean_batch_norm'].custom_update_normalized = True

    # The layer output is built from the first activation, reversed along
    # its first axis when direction is -1.
    self.make_output(self.act[0][::direction or 1], sample_mean=sample_mean, gamma=gamma)
    self.params.update(unit.params)
def add_fun(A, B, max_int, mem):
    """Return the distribution of a sum of two integers, plus ``mem`` unchanged.

    Each row of ``B`` is reversed along axis 1 and rolled by 1..max_int
    positions; the rolled copies are stacked and transposed into ``B_prime``,
    which is then combined with ``A`` through ``batched_dot``.
    """
    reversed_b = B[:, ::-1]
    shifted = []
    for offset in range(1, max_int + 1):
        shifted.append(roll(reversed_b, offset, axis=1))
    B_prime = stack(shifted, axis=1).transpose(0, 2, 1)
    combined = batched_dot(A, B_prime)
    return combined, mem
def negate_fun(A, max_int, mem):
    """Return the negated distribution over integers, plus ``mem`` unchanged.

    The distribution is reversed along axis 1 and rolled by one position;
    ``max_int`` is accepted but not used.
    """
    reversed_a = A[:, ::-1]
    negated = roll(reversed_a, 1, axis=1)
    return negated, mem
def build_model(self):
    """Build the symbolic cost graph and compile ``self.f_cost``.

    Returns ``(use_noise, x, mask, cost)``.

    NOTE(review): as written this function reads ``mask`` and ``proj``, which
    are never assigned inside it (only ``mask_x`` / ``mask_y`` are defined).
    Unless they exist at module level it raises NameError -- the intended
    variables need to be confirmed before this can be fixed.
    """
    trng = RandomStreams(self.random_seed)
    use_noise = theano.shared(numpy_floatX(0.))

    # Simply encode this
    x = T.matrix('x', dtype='int64')
    y = T.matrix('y', dtype='int64')
    # y shifted by one position along axis 0 (time, assuming TxN).
    # NOTE(review): y_prime is not used below; the cost indexes with y.
    y_prime = T.roll(y, -1, 0)

    mask_x = T.matrix('mask_x', dtype=theano.config.floatX)
    mask_y = T.matrix('mask_y', dtype=theano.config.floatX)

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # Convert word indices to their embeddings
    # Resulting dims are (T x N x dim_proj)
    emb = self.tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, self.dim_proj])

    # Compute the hidden states
    # Note that these contain hidden states for elements which were
    # padded in input. The cost for these time steps are removed
    # before the calculation of the cost.
    enc_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask)
    # Use dropout on non-recurrent connections (Zaremba et al.)
    # NOTE(review): the dropout result is stored in proj_1, which is never
    # read; the next layer consumes enc_proj_1 without dropout.
    if self.use_dropout:
        proj_1 = dropout_layer(enc_proj_1, use_noise, trng)

    enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1, self.dim_proj, mask=mask)
    if self.use_dropout:
        enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

    # Use the final state of the encoder as the initial hidden state of the decoder
    # NOTE(review): src_embedding is not used below.
    src_embedding = enc_proj_2[-1]

    # Run decoder LSTM
    # NOTE(review): this section repeats the encoder layers ('enc_lstm_1',
    # 'enc_lstm_2') on the same embeddings; dec_proj_1 is never read.
    dec_proj_1 = self.layers['enc_lstm_1'].lstm_layer(emb, self.dim_proj, mask=mask)
    # Use dropout on non-recurrent connections (Zaremba et al.)
    if self.use_dropout:
        proj_1 = dropout_layer(enc_proj_1, use_noise, trng)

    enc_proj_2 = self.layers['enc_lstm_2'].lstm_layer(enc_proj_1, self.dim_proj, mask=mask)
    if self.use_dropout:
        enc_proj_2 = dropout_layer(enc_proj_2, use_noise, trng)

    pre_s = T.dot(proj, self.tparams['U']) + self.tparams['b']
    # Softmax works for 2-tensors (matrices) only. We have a 3-tensor
    # TxNxV. So we reshape it to (T*N)xV, apply softmax and reshape again
    # -1 is a proxy for infer dim based on input (numpy style)
    pre_s_r = T.reshape(pre_s, (pre_s.shape[0] * pre_s.shape[1], -1))
    pred_r = T.nnet.softmax(pre_s_r)

    # Small offset added inside the log; larger for float16.
    off = 1e-8
    if pred_r.dtype == 'float16':
        off = 1e-6

    # Note the use of flatten here. We can't directly index a 3-tensor
    # and hence we use the (T*N)xV view which is indexed by the flattened
    # label matrix, dim = (T*N)x1
    # Also, the cost (before calculating the mean) is multiplied (element-wise)
    # with the mask to eliminate the cost of elements that do not really exist.
    # i.e. Do not include the cost for elements which are padded
    cost = -T.sum(T.log(pred_r[T.arange(pred_r.shape[0]), y.flatten()] + off) * mask.flatten()) / T.sum(mask)

    # NOTE(review): y is part of the cost graph but is not listed among the
    # compiled function's inputs -- verify.
    self.f_cost = theano.function([x, mask], cost, name='f_cost')

    return use_noise, x, mask, cost
def _step_state(v_h_, x_h_, v_t_, x_t_, a_t_, a, is_aggressive):
    """Advance host and target states by one step of length ``self.dt``.

    ``self`` is taken from the enclosing scope. Arguments with a trailing
    underscore are the previous host (``_h_``) and target (``_t_``) speeds,
    positions and accelerations; ``a`` is the host acceleration to apply and
    ``is_aggressive`` selects the targets' behaviour when the host approaches.

    Returns ``(v_h, x_h, v_t, x_t, a_t, cost_transition)``.
    """
    ring = self.two_pi_r
    step = self.dt

    def _wrap(pos):
        # apply [-pi,pi] discontinuity: positions at or past half the ring
        # are shifted back by one full ring length
        return (pos>=(ring/2)) * (pos - ring) + (pos < (ring/2)) * pos

    # Gap from each target to its successor (positions rolled by -1).
    next_x_t_ = tt.roll(x_t_, -1)
    gap = next_x_t_ - x_t_
    # fix the jump between -pi and +pi
    gap = (gap>=0) * gap + (gap<0) * (ring + gap)

    gap_to_host = x_h_ - x_t_
    host_is_lead = ((x_h_ > -0.5*self.host_length)
                    * (x_h_ > x_t_)
                    * (gap_to_host < gap))
    # If host CIPV - Change relx to him
    gap = (host_is_lead * ((x_h_ > 0)*x_h_ - x_t_)) + ((1-host_is_lead) * gap)

    host_approaching = ((x_h_ > -1.5*self.host_length)
                        * (x_h_ <= -0.5 *self.host_length)
                        * (x_t_ < 0)
                        * (x_t_ > -0.25*ring)
                        * ((next_x_t_ > 0) + (next_x_t_ < x_t_)))

    # Target acceleration: one rule when the host approaches (aggressive or
    # not), a default gap/speed rule otherwise.
    accel_default = 5*(gap - 2*v_t_)
    accel_aggressive = tt.maximum(3, 3*(gap_to_host - 1.5*v_t_))
    accel_passive = (0.5 - v_t_)/step
    accel_near_host = (is_aggressive * accel_aggressive
                       + (1 - is_aggressive) * accel_passive)
    accel = (host_approaching * accel_near_host
             + (1- host_approaching) * accel_default)

    #1. exact next state
    #1.1 host: integrate, clipping the speed to [0, 3*v_0]
    v_h = tt.clip(v_h_ + step * a, 0, 3*self.v_0)
    x_h = _wrap(x_h_ + step * v_h)

    #1.2 targets
    v_t_e = tt.maximum(0, v_t_ + step * accel)
    x_t_e = x_t_ + step * v_t_e
    a_t_e = v_t_e - v_t_

    #2. learn the transition model between states
    state_in = common.disconnected_grad(tt.concatenate(
        [v_h_ , x_h_, tt.flatten(v_t_), tt.flatten(x_t_), tt.flatten(a_t_), a]))
    state_exact = common.disconnected_grad(tt.concatenate(
        [tt.flatten(v_t_e), tt.flatten(x_t_e), tt.flatten(a_t_e)]))

    # Three ReLU layers followed by a linear read-out without bias.
    hidden = state_in
    for weights, bias in ((self.W_t_0, self.b_t_0),
                          (self.W_t_1, self.b_t_1),
                          (self.W_t_2, self.b_t_2)):
        hidden = tt.nnet.relu(tt.dot(hidden, weights) + bias)
    state_t_hat = tt.dot(hidden, self.W_t_c)

    cost_transition = tt.mean(tt.abs_(state_t_hat - state_exact))

    # Split the prediction into speed / position / acceleration columns.
    v_t_a = (state_t_hat[0 : self.n_t]).dimshuffle(0,'x')
    x_t_a = (state_t_hat[self.n_t : 2 * self.n_t]).dimshuffle(0,'x')
    a_t_a = (state_t_hat[2 * self.n_t : ]).dimshuffle(0,'x')

    #3.-5. add the gradient-disconnected prediction error back onto the
    # approximation
    v_t = v_t_a + common.disconnected_grad(v_t_e - v_t_a)
    x_t = x_t_a + common.disconnected_grad(x_t_e - x_t_a)
    a_t = a_t_a + common.disconnected_grad(a_t_e - a_t_a)

    x_t = _wrap(x_t)

    return v_h, x_h, v_t, x_t, a_t, cost_transition
# Scratch script: compiles a few small Theano functions (a linear expression,
# its gradients, a roll) and draws random matrices U1 / V1.
import os
import time
from RNN_theano import RNN_theano
import numpy as np
import theano as theano
import theano.tensor as T
from RNN_theano import train_with_sgd

# Symbolic inputs.
li = T.matrix('list')
m = T.scalar("m")
n = T.scalar("n")

# Compiled function for 2*m + 3*n and the gradients of that expression
# with respect to m and n.
outf = theano.function([m,n], 2*m + 3*n)
cost = 2*m + 3*n
dm = T.grad(cost, m)
dn = T.grad(cost, n)

# Roll a matrix by -1 along axis 1 and apply it to a 3x3 example.
rollf = theano.function([li], T.roll(li, -1, 1))
entry = [[0.5,1.1,3], [0.8,0.1,3] , [0.3,1.5,3] ]
res = rollf(entry)

# Evaluate both gradients at m=4, n=5; each function returns a one-element list.
oll = theano.function([m,n], [dm])
oll2 = theano.function([m,n], [dn])
value = oll(4,5)
value2 = oll2(4,5)

# Dimensions and uniformly initialised matrices, each drawn from
# [-sqrt(1/d), sqrt(1/d)] for the dimension d named in its bounds.
inputd = 2
outputd = 1
hiddend = 20
U1 = np.random.uniform(-np.sqrt(1./inputd), np.sqrt(1./inputd), (hiddend, inputd))
V1 = np.random.uniform(-np.sqrt(1./outputd), np.sqrt(1./outputd), (outputd, hiddend))
def __init__(self, rng, x, n_in, n_h, n_out, p, training, y=None, rnn_batch_training=False):
    """ This is to initialise a standard RNN hidden unit

    :param rng: random state, fixed value for random state for reproducible objective results
    :param x: input data to current layer
    :param n_in: dimension of input data
    :param n_h: number of hidden units/blocks
    :param n_out: dimension of output data
    :param p: the probability of dropout
    :param training: a binary value to indicate training or testing (for dropout training)
    :param y: ground-truth output; stored as ``self.groundtruth`` when given. The
        graph built below reads ``self.groundtruth``, so it is effectively required.
    :param rnn_batch_training: when true, the initial states are built with a
        leading axis of size 1 and repeated ``x.shape[1]`` times along axis 0
    """
    self.input = x
    if y is not None:
        self.groundtruth = y

    if p > 0.0:
        if training == 1:
            srng = RandomStreams(seed=123456)
            # Keep each unit with probability (1 - p) so that p really is the
            # drop probability and the (1 - p) test-time scaling below matches
            # the training-time expectation.
            self.input = T.switch(srng.binomial(size=x.shape, p=1 - p), x, 0)
        else:
            self.input = (1 - p) * x

    # Cast once, before the sizes are used to build array shapes.
    n_in = int(n_in)
    n_h = int(n_h)
    n_out = int(n_out)
    self.n_in = n_in
    self.n_h = n_h
    self.n_out = n_out

    self.rnn_batch_training = rnn_batch_training

    # random initialisation (input-side weights only)
    Wx_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_h)), dtype=config.floatX)
    Ux_value = np.asarray(rng.normal(0.0, 1.0/np.sqrt(n_in), size=(n_in, n_out)), dtype=config.floatX)

    # identity / zero initialisation (recurrent and output-feedback weights)
    Wh_value = np.asarray(np.eye(n_h, n_h), dtype=config.floatX)
    Wy_value = np.asarray(np.eye(n_out, n_h), dtype=config.floatX)
    Uh_value = np.asarray(np.eye(n_in, n_out), dtype=config.floatX)
    # np.zeros takes the shape as ONE tuple; the second positional argument is dtype.
    Uy_value = np.asarray(np.zeros((n_out, n_out)), dtype=config.floatX)

    # Input gate weights
    self.W_xi = theano.shared(value=Wx_value, name='W_xi')
    self.W_hi = theano.shared(value=Wh_value, name='W_hi')
    self.W_yi = theano.shared(value=Wy_value, name='W_yi')

    # Output gate weights
    self.U_xi = theano.shared(value=Ux_value, name='U_xi')
    self.U_hi = theano.shared(value=Uh_value, name='U_hi')
    self.U_yi = theano.shared(value=Uy_value, name='U_yi')

    # bias
    self.b_i = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='b_i')
    self.b = theano.shared(value=np.zeros((n_out, ), dtype=config.floatX), name='b')

    # initial value of hidden and cell state and output
    if self.rnn_batch_training:
        self.h0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((1, n_h), dtype=config.floatX), name='c0')
        self.y0 = theano.shared(value=np.zeros((1, n_out), dtype=config.floatX), name='y0')

        self.h0 = T.repeat(self.h0, x.shape[1], 0)
        self.c0 = T.repeat(self.c0, x.shape[1], 0)
        # Repeat y0 itself (previously c0 was repeated here by mistake).
        self.y0 = T.repeat(self.y0, x.shape[1], 0)
    else:
        self.h0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='h0')
        self.c0 = theano.shared(value=np.zeros((n_h, ), dtype=config.floatX), name='c0')
        self.y0 = theano.shared(value=np.zeros((n_out, ), dtype=config.floatX), name='y0')

    # hard coded to remove coarse coding features: the initial hidden state is
    # the last row of the input without its final 4 columns.
    # NOTE(review): this overrides h0 regardless of rnn_batch_training - confirm
    # that is intended for batched input.
    self.h0 = self.input[-1, 0:-4]

    # Ground truth rolled by one step along axis 0.
    self.outytm1 = T.roll(self.groundtruth, 1, 0)

    self.Wix = T.dot(self.input, self.W_xi)
    self.Uix = T.dot(self.input, self.U_xi)
    # NOTE(review): Wiy / Uiy were read below but never assigned in this method.
    # They are defined here by analogy with Wix / Uix, projecting the rolled
    # ground truth through W_yi / U_yi - confirm against the intended model.
    self.Wiy = T.dot(self.outytm1, self.W_yi)
    self.Uiy = T.dot(self.outytm1, self.U_yi)

    [self.h, self.c], _ = theano.scan(self.recurrent_as_activation_function,
                                      sequences=[self.Wix, self.Wiy],
                                      outputs_info=[self.h0, self.c0])

    self.y = self.Uix + self.Uiy + T.dot(self.h, self.U_hi) + self.b
    self.output = T.nnet.softmax(self.y)

    # recurrent output params and additional input params
    self.params = [self.W_xi, self.W_hi, self.W_yi, self.U_xi, self.U_hi, self.U_yi, self.b_i, self.b]

    self.L2_cost = (self.W_xi ** 2).sum() + (self.W_hi ** 2).sum() + (self.W_yi ** 2).sum() + (self.U_hi ** 2).sum()
def get_interpolated_hiddens(old_hidden, n_timesteps, n_samples, interpolation_mask, number_cons_hiddens, hidden_size=1024, batch_size=32):
    '''
    Interpolate between consecutive hidden states.

    For example, given hiddens h1, h2, h3, ..., h_n-1 the consecutive pairs
    [h1, h2], [h2, h3], ..., [h_n-2, h_n-1] are formed, and for every weight
    ``a`` in ``interpolation_mask`` each pair [first, second] yields
    ``a * first + (1 - a) * second``.

    old_hidden: hidden matrix to interpolate, laid out as
        number_of_hiddens * batch_size * hidden_size (per the original notes).
    n_timesteps: used only to reshape each per-sample result to
        [1, n_timesteps - 2, hidden_size].
    n_samples: leading size used when reshaping the pairs tensor.
    interpolation_mask: interpolation weights, e.g. [1, 0.8, 0.6, 0.4, 0.2];
        must support ``len()``.
    number_cons_hiddens: number of consecutive pairs.
    hidden_size: size of one hidden vector (default 1024, the value that was
        previously hard-coded).
    batch_size: number of samples iterated over in the Python-level loop
        (default 32, the value that was previously hard-coded); must be a
        Python int.

    Returns the per-sample interpolated hiddens stacked along axis 0, with
    ``len(interpolation_mask)`` zero rows prepended along axis 1.
    '''
    alpha = interpolation_mask

    num_cons_hiddens = number_cons_hiddens
    num_reduced_hiddens = num_cons_hiddens + 1
    number_interp = len(interpolation_mask)

    # Build the consecutive pairs: duplicate every hidden along axis 1, roll by
    # -1 so the duplicates straddle neighbours (h1,h2,h2,h3,...), drop the
    # trailing wrap-around entries, then fold into (..., pair, 2, hidden).
    X = old_hidden.dimshuffle(1, 0, 2)
    new_matrix2 = repeat(X, 2, axis=1)
    new_matrix2 = tensor.roll(new_matrix2, -1, axis=1)
    new_matrix2 = new_matrix2[:, 0:2*num_reduced_hiddens-2, :]
    new_matrix2 = new_matrix2.reshape([n_samples, num_cons_hiddens, 2, hidden_size])

    def _step_slice(m_, interp_mask):
        # m_ holds one pair; emit one interpolated vector per mask weight.
        interp_ret = []
        for i in range(number_interp):
            interp_ret.append(interp_mask[i] * m_[0] + (1-interp_mask[i]) * m_[1])
        return interp_ret

    _step = _step_slice

    def step_batch(m_, alpha):
        # Inner scan: iterate over the leading axis of one outer-scan slice.
        seqs = m_
        rval, _ = theano.scan(_step, sequences=seqs, non_sequences=[alpha])
        return rval

    _batch_step = step_batch

    # Outer scan: iterate over the leading axis of the pairs tensor.
    seqs = new_matrix2
    rval, _ = theano.scan(_batch_step, sequences=seqs, non_sequences=[alpha])

    # rval is indexed as rval[weight][sample][pair]; regroup it per sample.
    out = []
    out_batch = []
    for batch_index in range(batch_size):
        for i in range(num_cons_hiddens):
            something = [rval[j][batch_index][i] for j in range(number_interp)]
            if i == 0:
                out = something
            if i >= 1:
                out = tensor.concatenate([out, something], axis=0)
        if batch_index == 0:
            out_batch = out
        if batch_index == 1:
            out_batch = tensor.stacklists([out_batch, out])
        if batch_index > 1:
            out = tensor.reshape(out, [1, n_timesteps-2, hidden_size])
            out_batch = tensor.concatenate([out_batch, out])

    # Prepend number_interp all-zero rows along axis 1.
    zero_pad = tensor.zeros([out_batch.shape[0], number_interp, out_batch.shape[2]])
    out_batch = tensor.concatenate([zero_pad, out_batch], axis=1)
    return out_batch
def mycost(y_true, y_pred):
    """Squared-error cost plus a penalty on differences between neighbours along axis 1.

    Both terms are means over the last axis. The penalty compares ``y_pred``
    with itself rolled by one along axis 1; index 0 of the rolled copy is
    replaced by ``y_pred[:, 0, :]`` so that position contributes zero instead of
    comparing against the wrapped-around last entry. The penalty has weight 10.
    """
    # Rolled copy with its first slice along axis 1 reset to the unrolled values.
    shifted = T.set_subtensor(T.roll(y_pred, 1, axis=1)[:, 0, :], y_pred[:, 0, :])

    fit_term = T.mean(T.square(y_pred - y_true), axis=-1)
    smooth_term = T.mean(T.square(y_pred - shifted), axis=-1)
    return fit_term + 10 * smooth_term
def roll_and_sum(prior_result, orig):
    """Add ``orig`` to ``prior_result`` and roll the sum by one along the last axis."""
    return T.roll(prior_result + orig, 1, axis=-1)
def build_encoder(self, x, xmask=None, **kwargs):
    """Encode ``x`` with the sentence-level step, then the hierarchical step.

    Passing any keyword argument switches to one-step (sampling) mode; in that
    mode ``x`` must not be 2-D and ``kwargs`` must contain ``prev_h`` and
    ``prev_hs``, which are used as the initial states. Otherwise both initial
    states are zeros and the whole sequence is scanned.

    :param x: word indices; when ``x.ndim == 2`` it is (n_steps, batch_size),
        otherwise a batch size of 1 is assumed
    :param xmask: optional mask; defaults to ``T.neq(x, self.eos_sym)``
    :return: ``(h, hs)`` - the first output of the sentence-level step and the
        first (or only) output of the hierarchical step
    """
    one_step = bool(kwargs)

    # if x.ndim == 2 then x = (n_steps, batch_size);
    # else x = (word_1, word_2, word_3, ...)
    # or x = (last_word_1, last_word_2, last_word_3, ..) and batch_size is 1
    if x.ndim == 2:
        batch_size = x.shape[1]
    else:
        batch_size = 1

    if not one_step:
        # full-sequence mode: initialize everything to 0
        h_0 = T.alloc(np.float32(0), batch_size, self.qdim)
        hs_0 = T.alloc(np.float32(0), batch_size, self.sdim)
    else:
        # sampling mode (i.e. one step): the previous states must be supplied
        assert x.ndim != 2
        assert 'prev_h' in kwargs
        assert 'prev_hs' in kwargs
        h_0 = kwargs['prev_h']
        hs_0 = kwargs['prev_hs']

    xe = self.approx_embedder(x)
    # Identity test: ``== None`` on a tensor/array object is not a reliable
    # None check.
    if xmask is None:
        xmask = T.neq(x, self.eos_sym)

    # Here we roll the mask so we avoid the need for separate
    # hr and h. The trick is simple: if the original mask is
    # 0 1 1 0 1 1 1 0 0 0 0 0 -- batch is filled with eos_sym
    # the rolled mask will be
    # 0 0 1 1 0 1 1 1 0 0 0 0 -- roll to the right
    # ^     ^
    # two resets </s> <s>
    # the first reset will reset h_init = 0
    # the second will reset </s> and update given x_t = <s>
    if xmask.ndim == 2:
        rolled_xmask = T.roll(xmask, 1, axis=0)
    else:
        rolled_xmask = T.roll(xmask, 1)

    # Select the sentence-level step; the gated variant has three extra
    # outputs that are not fed back (None entries in outputs_info).
    if self.sent_step_type == "gated":
        f_enc = self.gated_sent_step
        o_enc_info = [h_0, None, None, None]
    else:
        f_enc = self.plain_sent_step
        o_enc_info = [h_0]

    # Same selection for the hierarchical step.
    if self.triple_step_type == "gated":
        f_hier = self.gated_triple_step
        o_hier_info = [hs_0, None, None, None]
    else:
        f_hier = self.plain_triple_step
        o_hier_info = [hs_0]

    if not one_step:
        # Run through all the sentence (encode everything)
        _res, _ = theano.scan(f_enc,
                              sequences=[xe, rolled_xmask],
                              outputs_info=o_enc_info)
    else:
        # Make just one step further
        _res = f_enc(xe, rolled_xmask, h_0)

    # Get the hidden state sequence
    h = _res[0]

    # All hierarchical sentence
    # The hs sequence is based on the original (un-rolled) mask
    if not one_step:
        _res, _ = theano.scan(f_hier,
                              sequences=[h, xmask],
                              outputs_info=o_hier_info)
    else:
        # Just one step further
        _res = f_hier(h, xmask, hs_0)

    # The step may return a single variable or a list/tuple of outputs.
    if isinstance(_res, (list, tuple)):
        hs = _res[0]
    else:
        hs = _res

    return h, hs
def roll(x, shift, axis=-1):
    """Roll ``x`` by ``shift`` via ``T.roll``; ``axis`` defaults to the last axis."""
    rolled = T.roll(x, shift, axis=axis)
    return rolled
def get_output(self, train):
    """Stack copies of the layer input, one per entry of ``self.offsets``.

    Each copy is the input rolled by that offset along ``self.axis``; the
    copies are stacked along ``self.offset_axis``.
    """
    layer_input = self.get_input(train)
    shifted = []
    for offset in self.offsets:
        shifted.append(T.roll(layer_input, offset, axis=self.axis))
    return T.stack(shifted, axis=self.offset_axis)
def get_output_mask(self, train=False):
    """Stack rolled copies of the input mask, or return None when there is no mask.

    One copy is produced per entry of ``self.offsets``, rolled along
    ``self.axis``; the copies are stacked along ``self.offset_axis``.
    """
    input_mask = self.get_input_mask(train)
    if input_mask is None:
        return None

    shifted = []
    for offset in self.offsets:
        shifted.append(T.roll(input_mask, offset, axis=self.axis))
    return T.stack(shifted, axis=self.offset_axis)
def activation(self, network, in_vw):
    """Multiply the input variable by a copy of itself rolled by one along axis 1."""
    var = in_vw.variable
    shifted = T.roll(var, shift=1, axis=1)
    return var * shifted