class MemoryNetworkBase(Initializable):
    def __init__(self, config, prefix_encoder, candidate_encoder, **kwargs):
        super(MemoryNetworkBase, self).__init__(**kwargs)

        self.prefix_encoder = prefix_encoder
        self.candidate_encoder = candidate_encoder
        self.config = config

        self.softmax = Softmax()
        self.children = [self.softmax, prefix_encoder, candidate_encoder]

        self.inputs = self.prefix_encoder.apply.inputs \
            + ['candidate_%s' % x for x in self.candidate_encoder.apply.inputs] \
            + ['candidate_destination_latitude', 'candidate_destination_longitude']

    def candidate_destination(self, **kwargs):
        return tensor.concatenate(
            (tensor.shape_padright(kwargs['candidate_destination_latitude']),
             tensor.shape_padright(kwargs['candidate_destination_longitude'])),
            axis=1)

    @application(outputs=['cost'])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate((kwargs['destination_latitude'][:, None],
                                kwargs['destination_longitude'][:, None]),
                               axis=1)
        return error.erdist(y_hat, y).mean()

    @application(outputs=['destination'])
    def predict(self, **kwargs):
        prefix_representation = self.prefix_encoder.apply(
            **{x: kwargs[x] for x in self.prefix_encoder.apply.inputs})
        candidate_representation = self.candidate_encoder.apply(
            **{x: kwargs['candidate_' + x]
               for x in self.candidate_encoder.apply.inputs})

        if self.config.normalize_representation:
            prefix_representation = prefix_representation \
                / tensor.sqrt((prefix_representation ** 2).sum(axis=1, keepdims=True))
            candidate_representation = candidate_representation \
                / tensor.sqrt((candidate_representation ** 2).sum(axis=1, keepdims=True))

        similarity_score = tensor.dot(prefix_representation,
                                      candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        return tensor.dot(similarity, self.candidate_destination(**kwargs))

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
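# A minimal, self-contained sketch (not from the original code) of the
# soft-attention readout that predict() implements: a softmax over
# prefix/candidate similarity scores, then a weighted average of the
# candidate destinations. Shapes and values are illustrative only.
import numpy
import theano
from theano import tensor

prefix = tensor.matrix('prefix')          # batch x dim
candidates = tensor.matrix('candidates')  # n_candidates x dim
dests = tensor.matrix('dests')            # n_candidates x 2 (lat, lon)

weights = tensor.nnet.softmax(tensor.dot(prefix, candidates.T))
prediction = tensor.dot(weights, dests)   # batch x 2

f = theano.function([prefix, candidates, dests], prediction)
print f(numpy.ones((2, 4), dtype='float32'),
        numpy.ones((3, 4), dtype='float32'),
        numpy.arange(6, dtype='float32').reshape(3, 2))
# equal scores -> uniform weights -> the mean destination [2., 3.]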
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config, rec_input_len=4,
                                    output_dim=config.tgtcls.shape[0],
                                    **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls,
                                                 dtype=theano.config.floatX),
                                     name='classes')
        self.softmax = Softmax()
        self.sequences.extend(['latitude_lag', 'longitude_lag'])
        self.children.append(self.softmax)

    def before_predict_all(self, kwargs):
        super(Model, self).before_predict_all(kwargs)
        kwargs['latitude_lag'] = tensor.extra_ops.repeat(
            kwargs['latitude'], 2, axis=0)
        kwargs['longitude_lag'] = tensor.extra_ops.repeat(
            kwargs['longitude'], 2, axis=0)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)

    def rec_input(self, latitude, longitude, latitude_lag, longitude_lag,
                  **kwargs):
        return (tensor.shape_padright(latitude),
                tensor.shape_padright(longitude),
                tensor.shape_padright(latitude_lag),
                tensor.shape_padright(longitude_lag))
class rewatching:
    def __init__(self, batch_size, output_length, visual_dim, word_dim,
                 visual_feature_dim, question_feature_dim, joint_dim,
                 memory_dim, output_dim, fc1_dim, fc2_dim, voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(visual_dim, visual_feature_dim)
        self.sentence_encoder = questionEncoder(word_dim, question_feature_dim)
        self.toJoint = embeddingLayer(2 * question_feature_dim,
                                      2 * visual_feature_dim, joint_dim)
        self.rewatcher = impatientLayer(joint_dim, memory_dim, output_dim)
        self.seq_gen = seqDecoder(joint_dim, output_dim, fc1_dim, fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual_dim -> visual_feature_dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question_feature_dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(
            q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(
            words=question_embedding, video=video_embedding, u1=u1, u2=u2)
        # bs x joint_dim, bs x output_dim
        question = questionJoint[:, -1, :]
        # video = videoJoint[:, -1, :]
        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint,
                                            mask, bs)
        fc_r = self.seq_gen.apply(self.output_length, r_q, question, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: batch_size x output_length
    # this mask is a 0-1 matrix where 0 marks the padding area of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()
        self.p = self.softmax_result[T.arange(self.bs * self.output_length), gt]
        self.cost_ = T.log(self.p + 1e-20)
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    def error(self, groundtruth, mask_01):
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1),
                     0).sum() / self.bs

    def predict(self):
        return self.pred
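# A minimal sketch (not from the original code) of the masked negative
# log-likelihood used in loss() above: gather each target token's
# probability, take its log, zero out padding with the 0-1 mask, and
# normalize by the batch size. Toy shapes and values are illustrative.
import numpy
import theano
import theano.tensor as T

probs = T.matrix('probs')   # (bs * length) x voc_size, rows sum to 1
gt = T.ivector('gt')        # flattened ground-truth token ids
mask = T.vector('mask')     # flattened 0-1 mask, 0 on padding
bs = 2

p = probs[T.arange(gt.shape[0]), gt]
cost = -T.sum(T.log(p + 1e-20) * mask) / bs

f = theano.function([probs, gt, mask], cost)
toy_probs = numpy.full((4, 3), 1.0 / 3, dtype='float32')
print f(toy_probs,
        numpy.array([0, 1, 2, 0], dtype='int32'),
        numpy.array([1., 1., 1., 0.], dtype='float32'))  # last token masked out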
def onestepContextAttn(hContextAttn):
    preContextatt = attentionmlpContext.apply(hContextAttn)
    attContextsoft = Softmax()
    attContextpyx = attContextsoft.apply(preContextatt.flatten())
    attContextpred = attContextpyx.flatten()
    attcontext = T.mul(hContextAttn.dimshuffle(1, 0),
                       attContextpred).dimshuffle(1, 0)
    return attcontext
def onestepEncAttn(hEncAttn):
    preEncattn = attentionmlpEnc.apply(hEncAttn)
    attEncsoft = Softmax()
    attEncpyx = attEncsoft.apply(preEncattn.flatten())
    attEncpred = attEncpyx.flatten()
    attenc = T.mul(hEncAttn.dimshuffle(1, 0), attEncpred).dimshuffle(1, 0)
    return attenc
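# A minimal sketch (not from the original code) of the attention-pooling
# pattern both one-step functions above follow: score each timestep with a
# small MLP, softmax the scores into weights, and rescale the hidden states
# by those weights. 'scores' stands in for the MLP output; shapes are toy.
import numpy
import theano
import theano.tensor as T

h = T.matrix('h')            # timesteps x dim
scores = T.vector('scores')  # one unnormalized score per timestep

weights = T.nnet.softmax(scores.dimshuffle('x', 0))[0]  # timesteps
weighted = h * weights.dimshuffle(0, 'x')               # rescaled states
pooled = weighted.sum(axis=0)                           # dim

f = theano.function([h, scores], pooled)
print f(numpy.ones((3, 4), dtype='float32'),
        numpy.zeros(3, dtype='float32'))  # equal scores -> plain mean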
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config,
                                    output_dim=config.tgtcls.shape[0],
                                    **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls,
                                                 dtype=theano.config.floatX),
                                     name='classes')
        self.softmax = Softmax()
        self.children.append(self.softmax)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)
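# A minimal sketch (not from the original code) of what process_rto()
# computes in both Model variants above: a softmax over target clusters,
# then a weighted average of the cluster centroids, so the output is always
# a convex combination of known destinations. Toy centroids are illustrative.
import numpy
import theano
from theano import tensor

tgtcls = numpy.array([[0., 0.], [10., 10.]], dtype='float32')  # 2 centroids
classes = theano.shared(tgtcls, name='classes')

rto = tensor.matrix('rto')  # batch x n_clusters, unnormalized scores
output = tensor.dot(tensor.nnet.softmax(rto), classes)

f = theano.function([rto], output)
print f(numpy.zeros((1, 2), dtype='float32'))  # equal scores -> midpoint [5., 5.]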
class SoftmaxLinear(Initializable):
    def __init__(self, input_dim, output_dim, **kwargs):
        super(SoftmaxLinear, self).__init__(**kwargs)
        self.linear = Linear(input_dim=input_dim, output_dim=output_dim)
        self.softmax = Softmax()
        self.children = [self.linear, self.softmax]

    def apply(self, input_):
        output = self.softmax.apply(self.linear.apply(input_))
        return output
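# A hedged usage sketch (not from the original code) for SoftmaxLinear: as
# an Initializable brick it pushes the initialization schemes down to its
# Linear child, so one initialize() call suffices. Dims are toy values.
import numpy
import theano
from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
brick = SoftmaxLinear(input_dim=10, output_dim=3,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
brick.initialize()
probs = brick.apply(x)

f = theano.function([x], probs)
print f(numpy.random.randn(4, 10).astype('float32')).sum(axis=1)  # rows ~1.0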
def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):
    import os
    from collections import OrderedDict
    from fuel.datasets import IndexableDataset
    from blocks.model import Model
    from blocks.bricks import Linear, Softmax
    from blocks.bricks.conv import MaxPooling
    from blocks.initialization import Uniform
    from deepthought.bricks.cost import HingeLoss
    import numpy as np
    import theano
    from theano import tensor

    assert model_prefix is not None
    fold_weights_filename = '{}_weights.npy'.format(model_prefix)

    # convert Y to one-hot encoding
    n_classes = len(set(Y))
    Y = np.eye(n_classes, dtype=int)[Y]

    features = tensor.matrix('features', dtype=theano.config.floatX)
    targets = tensor.lmatrix('targets')

    input_ = features
    dim = X.shape[-1]

    # optional additional layers
    if self.pipeline_factory is not None:
        # need to re-shape the flattened input to restore bc01 format
        input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape']  # tuple, uses actual batch size
        input_ = input_.reshape(input_shape)

        pipeline = self.pipeline_factory.build_pipeline(input_shape,
                                                        hyper_params)
        input_ = pipeline.apply(input_)
        input_ = input_.flatten(ndim=2)

        # this is very hacky, but there seems to be no elegant way
        # to obtain a value for dim
        dummy_fn = theano.function(inputs=[features], outputs=input_)
        dummy_out = dummy_fn(X[:1])
        dim = dummy_out.shape[-1]

    if hyper_params['classifier_pool_width'] > 1:
        # FIXME: this is probably broken!
        # c = hyper_params['num_components']
        # input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1))  # restore bc01

        # need to re-shape the flattened input to restore bc01 format
        input_shape = hyper_params['classifier_pool_input_shape']  # tuple
        input_ = input_.reshape(input_shape)

        pool = MaxPooling(name='pool',
                          input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1)
                          pooling_size=(hyper_params['classifier_pool_width'], 1),
                          step=(hyper_params['classifier_pool_stride'], 1))
        input_ = pool.apply(input_)
        input_ = input_.reshape((input_.shape[0],
                                 tensor.prod(input_.shape[1:])))
        dim = np.prod(pool.get_dim('output'))

    linear = Linear(name='linear', input_dim=dim, output_dim=n_classes,
                    weights_init=Uniform(mean=0, std=0.01), use_bias=False)
    linear.initialize()
    softmax = Softmax('softmax')
    probs = softmax.apply(linear.apply(input_))
    prediction = tensor.argmax(probs, axis=1)

    model = Model(probs)  # classifier with raw probability outputs
    predict = theano.function([features], prediction)  # ready-to-use predict function

    if os.path.isfile(fold_weights_filename):
        # load filter weights from existing file
        fold_weights = np.load(fold_weights_filename)
        print 'loaded filter weights from', fold_weights_filename
    else:
        # train model
        from blocks.bricks.cost import MisclassificationRate
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.algorithms import GradientDescent, Adam
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
        from blocks.extensions.predicates import OnLogRecord
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams
        # from deepthought.datasets.selection import DatasetMetaDB

        init_param_values = model.get_parameter_values()

        cost = HingeLoss().apply(targets, probs)
        # Note: MisclassificationRate requires the class labels,
        # not a one-hot encoding, hence the argmax
        error_rate = MisclassificationRate().apply(targets.argmax(axis=1),
                                                   probs)
        error_rate.name = 'error_rate'

        cg = ComputationGraph([cost])

        # L1 regularization
        if hyper_params['classifier_l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['classifier_l1wdecay'] * sum(
                [abs(W).sum() for W in weights])
        cost.name = 'cost'

        # iterate over trial folds
        fold_weights = []
        fold_errors = []
        # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
        #     train_selectors = fold_generator.get_fold_selectors(
        #         outer_fold=outer_fold, inner_fold=ifold['train'])
        #     valid_selectors = fold_generator.get_fold_selectors(
        #         outer_fold=outer_fold, inner_fold=ifold['valid'])
        #     metadb = DatasetMetaDB(meta, train_selectors.keys())
        #     # get selected trial IDs
        #     train_idx = metadb.select(train_selectors)
        #     valid_idx = metadb.select(valid_selectors)
        for train_idx, valid_idx in idx_folds:
            # print train_idx
            # print valid_idx

            trainset = IndexableDataset(indexables=OrderedDict(
                [('features', X[train_idx]), ('targets', Y[train_idx])]))
            validset = IndexableDataset(indexables=OrderedDict(
                [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

            model.set_parameter_values(init_param_values)

            best_params = BestParams()
            best_params.add_condition(
                ['after_epoch'],
                predicate=OnLogRecord('error_rate_valid_best_so_far'))

            algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                        step_rule=Adam())

            extensions = [Timing(),
                          FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']),
                          DataStreamMonitoring(
                              [cost, error_rate],
                              DataStream.default_stream(
                                  validset,
                                  iteration_scheme=SequentialScheme(
                                      validset.num_examples,
                                      hyper_params['classifier_batch_size'])),
                              suffix="valid"),
                          TrainingDataMonitoring(
                              [cost, error_rate,
                               aggregation.mean(algorithm.total_gradient_norm)],
                              suffix="train", after_epoch=True),
                          TrackTheBest('error_rate_valid'),
                          best_params]  # after TrackTheBest!

            if verbose:
                extensions.append(Printing())  # optional
                extensions.append(ProgressBar())

            main_loop = MainLoop(
                algorithm,
                DataStream.default_stream(
                    trainset,
                    iteration_scheme=ShuffledScheme(
                        trainset.num_examples,
                        hyper_params['classifier_batch_size'])),
                model=model,
                extensions=extensions)
            main_loop.run()

            fold_weights.append(best_params.values['/linear.W'])
            fold_errors.append(main_loop.status['best_error_rate_valid'])
            # break  # FIXME

        fold_errors = np.asarray(fold_errors).squeeze()
        print 'simple NN fold classification errors:', fold_errors

        fold_weights = np.asarray(fold_weights)
        # store filter weights for later analysis
        np.save(fold_weights_filename, fold_weights)

    weights = fold_weights.mean(axis=0)
    linear.parameters[0].set_value(weights)

    return model, predict
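# A small aside (not from the original code): the one-hot conversion above
# works by numpy advanced indexing into an identity matrix. Each label
# selects one row of np.eye(n_classes), which is exactly its one-hot vector.
import numpy as np

Y = np.array([0, 2, 1])
n_classes = len(set(Y))
print np.eye(n_classes, dtype=int)[Y]
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]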
class videoAttentionLayer:
    # both visual and word features live in the joint space of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document-query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence-to-sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major, batch_size x doc_length x feature_dim
    # query: row major, batch_size x feature_dim
    # mask_: mask of the query, batch_size (length of a sentence - 1)
    def apply(self, doc, query, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x doc_length x hidden_dim
        att1 = self.image_embed.apply(doc)
        # y_d: document, batch_size x doc_length x feature_dim
        y_d = doc
        # batch_size x hidden_dim
        att3 = self.word_embed.apply(query)
        att = att1 + att3.dimshuffle(0, 'x', 1)
        # batch_size x doc_length x hidden_dim
        m = T.tanh(att)
        # batch_size x doc_length x 1
        s = self.m_to_s.apply(m)
        # batch_size x doc_length
        s = s.reshape((s.shape[0], s.shape[1]))
        s = self.attention_dist.apply(s)
        y_d_s = y_d.swapaxes(1, 2)
        # batch_size x feature_dim
        r = T.batched_dot(y_d_s, s)
        # batch_size x output_dim
        return r
class iwLayer:
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='iw_image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='iw_word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='iw_r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_m_to_s')
        self.attention_dist = Softmax(name='iw_attention')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='iw_r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        self.seq = LSTM(feature_dim,
                        name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # video: batch_size x video_length x feature_dim
    # query: batch_size x q x feature_dim
    # mask: this mask is different from the other masks: batch_size x q,
    # with -10000 standing in for -np.inf, e.g.
    #   1: 0, 0, 0, 0, 0, -10000, -10000, -10000
    #   2: 0, 0, 0, 0, -10000, -10000, -10000
    #   3: 0, 0, 0, 0, 0, 0, 0, -10000
    def apply(self, video, query, mask, batch_size):
        # batch_size x q x hidden_dim
        att1 = self.word_embed.apply(query)

        def one_step(y_d_i, r_1, y_q, y_q_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            att3 = self.image_embed.apply(y_d_i)
            att = y_q_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x q x hidden_dim
            m = T.tanh(att)
            # batch_size x q
            s = self.m_to_s.apply(m)
            s = s.reshape((s.shape[0], s.shape[1]))
            # ignore the question padding: add the large negative mask values
            s = s + mask
            s = self.attention_dist.apply(s)
            y_q_s = y_q.swapaxes(1, 2)
            return T.batched_dot(y_q_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # r: video_length x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[video.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(video[:, 0, :]),
                                 non_sequences=[query, att1],
                                 n_steps=video.shape[1],
                                 name='iw layer')
        # video_length x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_V = r[-1, :, :]
        # batch_size x output_dim
        seq_r_V = seq_r[-1, :, :]
        return r_V, seq_r_V
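# A minimal sketch (not from the original code) of the additive masking
# trick described in the comment above: adding a large negative number to
# padded positions before the softmax drives their attention weights to ~0.
import numpy
import theano
import theano.tensor as T

scores = T.matrix('scores')  # batch_size x q, unnormalized
mask = T.matrix('mask')      # 0 on real tokens, -10000 on padding

weights = T.nnet.softmax(scores + mask)

f = theano.function([scores, mask], weights)
print f(numpy.zeros((1, 4), dtype='float32'),
        numpy.array([[0., 0., -10000., -10000.]], dtype='float32'))
# -> [[0.5, 0.5, ~0, ~0]]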
class EncoderDecoder(Initializable, Random):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows how to
    combine the bottom, bidirectional and recognizer networks, (b) it has
    the input variables and can build whole computation graphs starting
    with them, (c) it hides compilation of Theano functions and
    initialization of beam search. I find it simpler to have it all in one
    place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """
    def __init__(self, input_dims,
                 input_num_chars,
                 bos_label, eos_label,
                 num_labels,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None,
                 token_map=None,
                 bidir=True,
                 window_size=None,
                 max_length=None,
                 subsample=None,
                 dims_top=None,
                 extra_input_dim=None,
                 prior=None,
                 conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 reuse_bottom_lookup_table=False,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 # for criterions involving generation of outputs, whether
                 # or not they should be generated by the recognizer itself
                 generate_predictions=True,
                 compute_targets=True,
                 extra_generation_steps=3,
                 **kwargs):
        all_arguments = copy.deepcopy(locals())
        all_arguments.update(copy.deepcopy(kwargs))
        del all_arguments['kwargs']
        del all_arguments['self']

        if post_merge_activation is None:
            post_merge_activation = Tanh()

        super(EncoderDecoder, self).__init__(**kwargs)

        self.bos_label = bos_label
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion
        self.generate_predictions = generate_predictions
        self.extra_generation_steps = extra_generation_steps
        self.compute_targets = compute_targets

        self.max_decoded_length_scale = max_decoded_length_scale
        self.post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before the BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom', **bottom)

        # BiRNN
        if dims_bidir:
            if not subsample:
                subsample = [1] * len(dims_bidir)
            encoder = Encoder(self.enc_transition, dims_bidir,
                              bottom.get_dim(bottom.apply.outputs[0]),
                              subsample, bidir=bidir)
        elif window_size:
            encoder = ConvEncoder(
                max_length, bottom.get_dim(bottom.apply.outputs[0]),
                window_size)
        else:
            raise ValueError("Don't know which Encoder to use")
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of the BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            assert not extra_input_dim
            transitions = [self.dec_transition(
                               dim=dim_dec, activation=Tanh(),
                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose the attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))

        if not embed_outputs:
            raise ValueError("embed_outputs=False is not supported any more")
        if not reuse_bottom_lookup_table:
            embedding = LookupTable(num_labels + 1,
                                    dim_dec
                                    if dim_output_embedding is None
                                    else dim_output_embedding)
        else:
            embedding = bottom.children[0]
        feedback = Feedback(
            embedding=embedding,
            output_names=[s for s in transition.apply.sequences
                          if s != 'mask'])

        # Create a readout
        readout_config = dict(
            num_tokens=num_labels,
            input_names=(transition.apply.states
                         if use_states_for_readout else [])
                        + [attention.take_glimpses.outputs[0]],
            name="readout")
        if post_merge_dims:
            readout_config['merge_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                    + [Identity()],
                    # MLP was designed to support Maxout as activation
                    # (because Maxout in a way is not one). However,
                    # a single-layer Maxout network works with the trick
                    # below. For a deeper Maxout network one has to use
                    # the Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_labels]).apply,
                ], name='post_merge')
        if 'reward' in criterion and criterion['name'] != 'log_likelihood':
            if criterion['reward'] == 'edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label)
            elif criterion['reward'] == 'delta_edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label, deltas=True)
            elif criterion['reward'] == 'bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=False)
            elif criterion['reward'] == 'delta_bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=True)
            else:
                raise ValueError("Unknown reward type")
        if criterion['name'] == 'log_likelihood':
            readout_class = SoftmaxReadout
        elif criterion['name'] == 'critic':
            readout_class = CriticReadout
            criterion_copy = dict(criterion)
            del criterion_copy['name']
            readout_config.update(**criterion_copy)
        elif criterion['name'] == 'reinforce':
            readout_class = ReinforceReadout
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['entropy'] = criterion.get('entropy')
            readout_config['input_names'] += ['attended', 'attended_mask']
        elif criterion['name'] in ['sarsa', 'actor_critic']:
            readout_class = ActorCriticReadout
            if criterion['name'] == 'actor_critic':
                critic_arguments = dict(all_arguments)
                # No worries, the critic will not compute log-likelihood
                # values.
                critic_arguments['criterion'] = {
                    'name': 'critic',
                    'value_softmax': criterion.get('value_softmax'),
                    'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                    'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                    'dueling_outputs': criterion.get('dueling_outputs')}
                critic_arguments['name'] = 'critic'
                if criterion.get('critic_uses_actor_states'):
                    critic_arguments['extra_input_dim'] = dim_dec
                if (criterion.get('value_softmax')
                        or criterion.get('same_value_for_wrong')
                        or criterion.get('dueling_outputs')):
                    # Add an extra output for the critic
                    critic_arguments['num_labels'] = num_labels + 1
                if criterion.get('force_bidir'):
                    critic_arguments['dims_bidir'] = [dim_dec]
                critic_arguments['reuse_bottom_lookup_table'] = True
                critic_arguments['input_num_chars'] = {'inputs': num_labels}
                if criterion.get('downsize_critic'):
                    critic_arguments = _downsize_config(
                        critic_arguments, criterion['downsize_critic'])
                critic = EncoderDecoder(**critic_arguments)
                readout_config['critic'] = critic
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['freeze_actor'] = criterion.get('freeze_actor')
            readout_config['freeze_critic'] = criterion.get('freeze_critic')
            readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
            readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
            readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
            readout_config['critic_loss'] = criterion.get('critic_loss')
            readout_config['discount'] = criterion.get('discount')
            readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
            readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
            readout_config['value_penalty'] = criterion.get('value_penalty')
            readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
            readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
            readout_config['bos_token'] = bos_label
            readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
            readout_config['use_value_biases'] = criterion.get('use_value_biases')
            readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
            readout_config['input_names'] += ['attended', 'attended_mask']
            # Note that the settings below are for the "clean" mode.
            # When get_cost_graph() is run with training=True, they are
            # temporarily overridden with the "real" settings from
            # "criterion".
            readout_config['compute_targets'] = True
            readout_config['trpo_coef'] = 0.0
            readout_config['solve_bellman'] = True
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout = readout_class(**readout_config)

        if lm:
            raise ValueError("LM is currently not supported")

        recurrent = AttentionRecurrent(transition, attention)
        if extra_input_dim:
            recurrent = RecurrentWithExtraInput(
                recurrent, "extra_inputs", extra_input_dim,
                name="with_extra_inputs")
        generator = SequenceGenerator(
            recurrent=recurrent, readout=readout, feedback=feedback,
            name="generator")

        # Remember the child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.softmax = Softmax()
        self.children = [encoder, top, bottom, generator, self.softmax]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.predicted_labels = tensor.lmatrix('predicted_labels')
        self.predicted_mask = tensor.matrix('predicted_mask')
        self.prefix_labels = tensor.lmatrix('prefix_labels')
        self.prefix_steps = tensor.lscalar('prefix_steps')

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.single_predicted_labels = tensor.lvector('predicted_labels')
        self.n_steps = tensor.lscalar('n_steps')

        # Configure mixed_generate
        if criterion['name'] == 'actor_critic':
            critic = self.generator.readout.critic
            self.mixed_generate.sequences = []
            self.mixed_generate.states = (
                ['step']
                + self.generator.recurrent.apply.states
                + ['critic_' + name
                   for name in critic.generator.recurrent.apply.states])
            self.mixed_generate.outputs = (
                ['samples', 'step']
                + self.generator.recurrent.apply.outputs
                + ['critic_' + name
                   for name in critic.generator.recurrent.apply.outputs])
            self.mixed_generate.contexts = (
                self.generator.recurrent.apply.contexts
                + ['critic_' + name
                   for name in critic.generator.recurrent.apply.contexts]
                + ['groundtruth', 'groundtruth_mask'])
            self.initial_states.outputs = self.mixed_generate.states

        self.prefix_generate.sequences = []
        self.prefix_generate.states = (
            ['step'] + self.generator.recurrent.apply.states)
        self.prefix_generate.outputs = (
            ['samples', 'step'] + self.generator.recurrent.apply.outputs)
        self.prefix_generate.contexts = self.generator.recurrent.apply.contexts

    def push_initialization_config(self):
        super(EncoderDecoder, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(
                self, rec_weights_config, BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def costs(self, **kwargs):
        # pop the inputs we know about
        prediction = kwargs.pop('prediction')
        prediction_mask = kwargs.pop('prediction_mask')
        groundtruth = kwargs.pop('groundtruth', None)
        groundtruth_mask = kwargs.pop('groundtruth_mask', None)
        inputs_mask = kwargs.pop('inputs_mask')
        extra_inputs = kwargs.pop('extra_inputs', None)

        # the rest is for the bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        costs_kwargs = dict(
            prediction=prediction, prediction_mask=prediction_mask,
            groundtruth=groundtruth, groundtruth_mask=groundtruth_mask,
            attended=encoded, attended_mask=encoded_mask)
        if extra_inputs:
            costs_kwargs['extra_inputs'] = extra_inputs
        return self.generator.costs(**costs_kwargs)

    @application
    def generate(self, return_initial_states=False, **kwargs):
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs),
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            return_initial_states=return_initial_states,
            as_dict=True)

    @recurrent
    def prefix_generate(self, return_initial_states=True, **kwargs):
        step = kwargs.pop('step')
        sampling_inputs = dict_subset(
            kwargs, self.generator.readout.sample.inputs)
        samples, scores = self.generator.readout.sample(**sampling_inputs)
        prefix_mask = tensor.lt(step, self.prefix_steps)
        samples = (prefix_mask * self.prefix_labels[step[0]]
                   + (1 - prefix_mask) * samples)
        feedback = self.generator.feedback.apply(samples, as_dict=True)
        states_contexts = dict_subset(
            kwargs,
            self.generator.recurrent.apply.states
            + self.generator.recurrent.apply.contexts)
        states_outputs = self.generator.recurrent.apply(
            as_dict=True, iterate=False,
            **dict_union(feedback, states_contexts))
        return [samples, step + 1] + states_outputs.values()

    @recurrent
    def mixed_generate(self, return_initial_states=True, **kwargs):
        critic = self.generator.readout.critic
        groundtruth = kwargs.pop('groundtruth')
        groundtruth_mask = kwargs.pop('groundtruth_mask')
        step = kwargs.pop('step')

        sampling_inputs = dict_subset(
            kwargs, self.generator.readout.sample.inputs)
        actor_scores = self.generator.readout.scores(**sampling_inputs)

        critic_inputs = {
            name: kwargs['critic_' + name]
            for name in critic.generator.readout.merge_names}
        critic_outputs = critic.generator.readout.outputs(
            groundtruth, groundtruth_mask, **critic_inputs)

        epsilon = numpy.array(self.generator.readout.epsilon,
                              dtype=theano.config.floatX)
        actor_probs = tensor.exp(actor_scores)
        # This is a poor man's 1-hot argmax
        critic_probs = self.softmax.apply(critic_outputs * 1000)
        probs = (actor_probs * (tensor.constant(1) - epsilon)
                 + critic_probs * epsilon)

        x = self.theano_rng.uniform(size=(probs.shape[0],))
        samples = (tensor.gt(x[:, None], tensor.cumsum(probs, axis=1))
                   .astype(theano.config.floatX)
                   .sum(axis=1)
                   .astype('int64'))
        samples = tensor.minimum(samples, probs.shape[1] - 1)

        actor_feedback = self.generator.feedback.apply(samples, as_dict=True)
        actor_states_contexts = dict_subset(
            kwargs,
            self.generator.recurrent.apply.states
            + self.generator.recurrent.apply.contexts)
        actor_states_outputs = self.generator.recurrent.apply(
            as_dict=True, iterate=False,
            **dict_union(actor_feedback, actor_states_contexts))

        critic_feedback = critic.generator.feedback.apply(
            samples, as_dict=True)
        critic_states_contexts = {
            name: kwargs['critic_' + name]
            for name in critic.generator.recurrent.apply.states
            + critic.generator.recurrent.apply.contexts}
        critic_apply_kwargs = dict(
            as_dict=True, iterate=False,
            **dict_union(critic_feedback, critic_states_contexts))
        if self.generator.readout.critic_uses_actor_states:
            critic_apply_kwargs['extra_inputs'] = actor_states_outputs['states']
        critic_states_outputs = critic.generator.recurrent.apply(
            **critic_apply_kwargs)
        return ([samples, step + 1]
                + actor_states_outputs.values()
                + critic_states_outputs.values())

    @application
    def initial_states(self, batch_size, *args, **kwargs):
        critic = self.generator.readout.critic
        result = ([tensor.zeros((batch_size,), dtype='int64')]
                  + self.generator.initial_states(batch_size,
                                                  *args, **kwargs))
        critic_kwargs = {name[7:]: kwargs[name]
                         for name in kwargs if name.startswith('critic_')}
        # This method can be called for two different recurrent application
        # methods, "mixed_generate" and "prefix_generate". That's why this
        # dirty hack is needed.
        if critic_kwargs:
            result += critic.generator.initial_states(batch_size,
                                                      **critic_kwargs)
        return result

    def get_dim(self, name):
        critic = self.generator.readout.critic
        if name.startswith('critic_'):
            return critic.generator.get_dim(name[7:])
        elif name == 'step':
            return 0
        else:
            return self.generator.get_dim(name)

    @application
    def mask_for_prediction(self, prediction, groundtruth_mask=None,
                            extra_generation_steps=None):
        prediction_mask = tensor.lt(
            tensor.cumsum(tensor.eq(prediction, self.eos_label)
                          .astype(theano.config.floatX),
                          axis=0),
            1).astype(theano.config.floatX)
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :],
            tensor.ones_like(prediction_mask[0, :]))
        if groundtruth_mask:
            max_lengths = groundtruth_mask.sum(axis=0) + extra_generation_steps
            prediction_mask *= tensor.lt(
                tensor.arange(prediction.shape[0])[:, None],
                max_lengths[None, :])
        return prediction_mask

    def load_params(self, path):
        cg = self.get_cost_graph()
        with open(path, 'r') as src:
            param_values = load_parameters(src)
        Model(cg.outputs).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None,
                           return_initial_states=False, use_softmax_t=False):
        if use_softmax_t:
            self.generator.readout.softmax_t = self.criterion.get(
                'softmax_t', 1.0)
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        result = self.generate(n_steps=n_steps,
                               inputs_mask=inputs_mask,
                               return_initial_states=return_initial_states,
                               **self.inputs)
        self.generator.readout.softmax_t = 1.
        return result

    def get_mixed_generate_graph(self, n_steps=None,
                                 return_initial_states=False):
        critic = self.generator.readout.critic
        attended, attended_mask = self.encoder.apply(
            input_=self.bottom.apply(**self.inputs),
            mask=self.inputs_mask)
        attended = self.top.apply(attended)
        critic_attended, critic_attended_mask = critic.encoder.apply(
            input_=critic.bottom.apply(inputs=self.labels),
            mask=self.labels_mask)
        critic_attended = critic.top.apply(critic_attended)
        return self.mixed_generate(
            n_steps=n_steps,
            batch_size=attended.shape[1],
            return_initial_states=return_initial_states,
            as_dict=True,
            attended=attended,
            attended_mask=attended_mask,
            critic_attended=critic_attended,
            critic_attended_mask=critic_attended_mask,
            groundtruth=self.labels,
            groundtruth_mask=self.labels_mask)

    def get_prefix_generate_graph(self, n_steps=None,
                                  return_initial_states=False):
        attended, attended_mask = self.encoder.apply(
            input_=self.bottom.apply(**self.inputs),
            mask=self.inputs_mask)
        attended = self.top.apply(attended)
        return self.prefix_generate(
            n_steps=n_steps,
            batch_size=attended.shape[1],
            return_initial_states=return_initial_states,
            as_dict=True,
            attended=attended,
            attended_mask=attended_mask)

    def get_cost_graph(self, batch=True,
                       use_prediction=False,
                       training=False,
                       groundtruth_as_predictions=False,
                       with_mixed_generation=False):
        # "use_prediction" means: use the Theano input variable
        # for the predictions.
        readout = self.generator.readout
        if training and self.criterion['name'] == 'actor_critic':
            logger.debug("Switching to training mode")
            readout.compute_targets = self.compute_targets
            readout.trpo_coef = self.criterion.get('trpo_coef', 0.0)
            if 'solve_bellman' in self.criterion:
                readout.solve_bellman = self.criterion['solve_bellman']
        if with_mixed_generation and 'epsilon' in self.criterion:
            readout.epsilon = self.criterion['epsilon']

        if batch:
            inputs, inputs_mask = self.inputs, self.inputs_mask
            groundtruth, groundtruth_mask = self.labels, self.labels_mask
            prediction, prediction_mask = (self.predicted_labels,
                                           self.predicted_mask)
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = self.mask_for_prediction(groundtruth)
            prediction = self.single_predicted_labels[:, None]
            prediction_mask = self.mask_for_prediction(prediction)

        if self.cost_involves_generation() and not groundtruth_as_predictions:
            if ((training and self.generate_predictions)
                    or (not training and not use_prediction)):
                generation_routine = (self.get_mixed_generate_graph
                                      if with_mixed_generation
                                      else self.get_generate_graph)
                generated = generation_routine(
                    n_steps=self.labels.shape[0] + self.extra_generation_steps)
                prediction = disconnected_grad(generated['samples'])
                prediction_mask = self.mask_for_prediction(
                    prediction, groundtruth_mask,
                    self.extra_generation_steps)
            else:
                logger.debug("Using provided predictions")
            cost = self.costs(inputs_mask=inputs_mask,
                              prediction=prediction,
                              prediction_mask=prediction_mask,
                              groundtruth=groundtruth,
                              groundtruth_mask=groundtruth_mask,
                              **inputs)
        else:
            if use_prediction:
                cost = self.costs(inputs_mask=inputs_mask,
                                  prediction=prediction,
                                  prediction_mask=prediction_mask,
                                  **inputs)
            else:
                cost = self.costs(inputs_mask=inputs_mask,
                                  prediction=groundtruth,
                                  prediction_mask=groundtruth_mask,
                                  groundtruth=groundtruth,
                                  groundtruth_mask=groundtruth_mask,
                                  **inputs)
        cost_cg = ComputationGraph(cost)

        # This *has to* be done only when "training" or
        # "with_mixed_generation" is True, but it does not hurt
        # to do it every time.
        logger.debug("Switching back to the normal mode")
        readout = self.generator.readout
        readout.compute_targets = True
        readout.trpo_coef = 0.0
        readout.solve_bellman = True
        readout.epsilon = 0.

        return cost_cg

    def analyze(self, inputs, groundtruth, prediction):
        """Compute cost and alignment."""
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels)
            input_variables.append(self.single_predicted_labels)

            cg = self.get_cost_graph(batch=False, use_prediction=True)
            costs = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)

            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            self._analyze = theano.function(
                input_variables,
                [costs[0], weights[:, 0, :]] + energies_output,
                on_unused_input='warn')

        input_values_dict = dict(inputs)
        input_values_dict['labels'] = groundtruth
        input_values_dict['predicted_labels'] = prediction
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="samples")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(self.bottom.num_time_steps(**inputs)
                         / self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise Exception(
                'Unknown inputs passed to beam search: {}'.format(
                    inputs.keys()))
        outputs, search_costs = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['samples'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        batch['n_steps'] = n_steps if n_steps is not None \
            else int(self.bottom.num_time_steps(**batch)
                     / self.max_decoded_length_scale)
        sample = self._do_generate(**batch)[0]
        sample = list(sample[:, 0])
        if self.eos_label in sample:
            sample = sample[:sample.index(self.eos_label) + 1]
        return sample

    def __getstate__(self):
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks that were used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except:
            pass

    def cost_involves_generation(self):
        return self.criterion['name'] in ['reinforce', 'sarsa',
                                          'actor_critic']
        key=lambda (i, v): -v)
    print("Top Similarities for %10s @ %4d:" % (token_target, token_i, ),
          map(lambda (i, v): "%s %.1f%%" % (code2word[i], v * 100.),
              sorted_similarities[1:4]))  # Element [0] is the token itself
    # exit(0)

if not run_test:  # i.e. do the training phase
    label_probs = p_labels.apply(labels_raw)  # a list of label probabilities
    print("label_probs shape", label_probs.shape.tag.test_value)
    # array([464, 5]) -- so :: this is an in-place rescaling

    # This is a symbolic matrix of ints
    # (implies one-hot in categorical_crossentropy)
    y = tensor.matrix('labels', dtype="int32")
    y.tag.test_value = np.random.randint(
        labels_size, size=batch_of_sentences).astype(np.int32)
    print("y shape", y.shape.tag.test_value)  # array([29, 16])
    print("y.flatten() shape", y.flatten().shape.tag.test_value)  # array([464])
    print("y.flatten() dtype", y.flatten().dtype)  # int32
    rnn = DropLSTM(dim=h_dim, model_type=model_type,
                   update_prob=update_prob, name="rnn")
    h1, c1 = rnn.apply(pre_rnn, drops, is_for_test)
else:
    rnn = DropGRU(dim=h_dim, model_type=model_type,
                  update_prob=update_prob, name="rnn")
    h1, sd = rnn.apply(pre_rnn[:, :, :h_dim], pre_rnn[:, :, h_dim:],
                       drops, is_for_test)

h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=y_dim)
pre_softmax = h1_to_o.apply(h1)

softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, y_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only the last time-step
cost = CategoricalCrossEntropy().apply(y, softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y, softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o, rnn):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()
class impatientLayer:
    # both visual and word features live in the joint space of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document-query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence-to-sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim, output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major, batch_size x doc_length x feature_dim
    # query: row major, batch_size x q x feature_dim
    # mask_: mask of the query, batch_size (length of a sentence - 1)
    def apply(self, doc, query, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x doc_length x hidden_dim
        att1 = self.image_embed.apply(doc)

        # y_q_i: the i-th token of the question, batch_size x feature_dim
        # r_1: r_{m-1}, batch_size x feature_dim
        # y_d: document, batch_size x doc_length x feature_dim
        # y_d_m: d-to-m, batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x doc_length x hidden_dim
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
            # return batch_size x feature_dim
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')
        # for the sequence encoder
        # q x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        # batch_size x output_dim
        return r_q, seq_r_q
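# A minimal sketch (not from the original code) of the batched attention
# readout that one_step() ends with: for each batch element, weight the
# doc_length axis of the features by the attention distribution via
# batched_dot. Toy shapes; uniform weights recover the mean.
import numpy
import theano
import theano.tensor as T

y_d = T.tensor3('y_d')  # batch_size x doc_length x feature_dim
s = T.matrix('s')       # batch_size x doc_length, rows sum to 1

# (batch, feature, length) batched-dot (batch, length) -> (batch, feature)
r = T.batched_dot(y_d.swapaxes(1, 2), s)

f = theano.function([y_d, s], r)
print f(numpy.ones((2, 3, 4), dtype='float32'),
        numpy.full((2, 3), 1.0 / 3, dtype='float32'))  # -> all ones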
class CostObject(Initializable):
    @lazy()
    def __init__(self, cost_type='original', **kwargs):
        super(CostObject, self).__init__(**kwargs)
        self.cost_type = cost_type
        self.softmax = Softmax()
        self.children = [self.softmax]

    @application(inputs=['input_'], outputs=['output'])
    def log_probabilities(self, input_):
        """Normalize log-probabilities.

        Converts unnormalized log-probabilities (exponents of which do not
        sum to one) into actual log-probabilities (exponents of which sum
        to one).

        Parameters
        ----------
        input_ : :class:`~theano.Variable`
            A matrix, each row contains unnormalized log-probabilities of a
            distribution.

        Returns
        -------
        output : :class:`~theano.Variable`
            A matrix with normalized log-probabilities in each row for each
            distribution from `input_`.

        """
        shifted = input_ - input_.max(axis=1, keepdims=True)
        return shifted - tensor.log(
            tensor.exp(shifted).sum(axis=1, keepdims=True))

    @application(inputs=['x', 'y'], outputs=['output'])
    def original_cost(self, x, y):
        x = self.log_probabilities(x)
        if y.ndim == x.ndim - 1:
            indices = tensor.arange(y.shape[0]) * x.shape[1] + y
            cost = -x.flatten()[indices]
        elif y.ndim == x.ndim:
            cost = -(x * y).sum(axis=1)
        else:
            raise TypeError('rank mismatch between x and y')
        return cost

    @application(inputs=['x', 'y'], outputs=['output'])
    def simple_cost(self, x, y):
        if y.ndim == x.ndim - 1:
            # Get probs:
            newX = self.softmax.apply(x)
            indices = tensor.arange(y.shape[0]) * x.shape[1] + y
            newY = tensor.ones_like(newX)
            cost = (newY - newX).flatten()[indices]
        elif y.ndim == x.ndim:
            raise TypeError('\nExpected either x or y to be of another rank\n')
        else:
            raise TypeError('rank mismatch between x and y')
        return cost

    # x are the gold labels.
    @application(inputs=['x', 'y'], outputs=['output'])
    def cost(self, application_call, x, y):
        if self.cost_type == 'original':
            return self.original_cost(x, y)
        if self.cost_type == 'simple':
            return self.simple_cost(x, y)
        return 0
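# A minimal sketch (not from the original code) of the shift-by-max trick
# that log_probabilities() uses: subtracting the row maximum before
# exponentiating avoids overflow while leaving the result unchanged, giving
# a numerically stable log-softmax.
import numpy
import theano
from theano import tensor

x = tensor.matrix('x')  # unnormalized log-probabilities
shifted = x - x.max(axis=1, keepdims=True)
log_probs = shifted - tensor.log(
    tensor.exp(shifted).sum(axis=1, keepdims=True))

f = theano.function([x], tensor.exp(log_probs).sum(axis=1))
print f(numpy.array([[1000., 1001.]], dtype='float32'))  # ~1.0, no overflow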
###################
#### Softmax
###################
from blocks.bricks import Softmax
from blocks.bricks.cost import MisclassificationRate

W2 = theano.shared(
    numpy.random.normal(size=(n_out, num_protos)).astype('float32'))
b = theano.shared(numpy.zeros((num_protos,)).astype('float32'))
y = tensor.ivector('y')

h = tensor.dot(h3, W2) + b
h = tensor.switch(h < 0, -h, h)

sm = Softmax()
pred = sm.apply(h)
misclass = MisclassificationRate().apply(y, pred)
c = sm.categorical_cross_entropy(y, h).mean()

s_params = [W2, b]
s_grad = theano.grad(c, s_params)
s_updates = [p - numpy.float32(0.05) * g for p, g in zip(s_params, s_grad)]
s_f = theano.function([h3, y], [c, misclass],
                      updates=zip(s_params, s_updates))
s_pred = theano.function([h3], pred)

for j in range(200):
    for i in range(n_batches):
        if i == 0:
            print s_f(data[i * batch_size:(i + 1) * batch_size, :],
                      labels[i * batch_size:(i + 1) * batch_size])
        else:
            s_f(data[i * batch_size:(i + 1) * batch_size, :],
                labels[i * batch_size:(i + 1) * batch_size])
linear1 = Linear(name='linear1', input_dim=300, output_dim=128)
recurrent = SimpleRecurrent(name='recurrent', activation=Tanh(), dim=128)
linear2 = Linear(name='linear2', input_dim=128, output_dim=9)
softmax = Softmax()

bricks = [linear1, recurrent, linear2]
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()

linear1_output = linear1.apply(input)
recurrent_output = recurrent.apply(linear1_output, mask=mask)
linear2_output = linear2.apply(recurrent_output)
shape = linear2_output.shape  # 100 x 29 x 9
# flatten all the dimensions except the last one, which is 9
output = softmax.apply(linear2_output.reshape((-1, 9))).reshape(shape)

# Cost and Functions
cost = T.nnet.categorical_crossentropy(output, target)  # 100 x 29
cost = cost * mask
cost = cost.mean()

params = Model(cost).parameters
updates = sgd(cost, params)

f_train = theano.function(inputs=[input, mask, target], outputs=cost,
                          updates=updates, allow_input_downcast=True)
f_valid = theano.function(inputs=[input, mask, target], outputs=cost,
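# A minimal sketch (not from the original code) of the reshape trick used
# above (and in the DropLSTM snippet earlier): the softmax expects a 2D
# input, so a (time x batch x classes) tensor is flattened to 2D,
# normalized, and reshaped back. Toy shapes are illustrative.
import numpy
import theano
import theano.tensor as T

seq = T.tensor3('seq')  # time x batch x classes
shape = seq.shape
out = T.nnet.softmax(seq.reshape((-1, seq.shape[-1]))).reshape(shape)

f = theano.function([seq], out.sum(axis=-1))
print f(numpy.zeros((2, 3, 9), dtype='float32'))  # every entry ~1.0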
def training(runname, rnnType, maxPackets, packetTimeSteps, packetReverse, padOldTimeSteps, wtstd,
             lr, decay, clippings, dimIn, dim, attentionEnc, attentionContext, numClasses,
             batch_size, epochs, trainPercent, dataPath, loadPrepedData, channel):  # pragma: no cover

    print locals()
    print

    X = T.tensor4('inputs')
    Y = T.matrix('targets')
    linewt_init = IsotropicGaussian(wtstd)
    line_bias = Constant(1.0)
    rnnwt_init = IsotropicGaussian(wtstd)
    rnnbias_init = Constant(0.0)
    classifierWts = IsotropicGaussian(wtstd)
    attnWts = IsotropicGaussian(wtstd)  # was undefined below; assumed to share wtstd like the other inits

    learning_rateClass = theano.shared(np.array(lr, dtype=theano.config.floatX))
    learning_decay = np.array(decay, dtype=theano.config.floatX)

    ### DATA PREP
    print 'loading data'
    if loadPrepedData:
        hexSessions = loadFile(dataPath)
    else:
        sessioner = sessionizer.HexSessionizer(dataPath)
        hexSessions = sessioner.read_pcap()
        hexSessions = removeBadSessionizer(hexSessions)

    numSessions = len(hexSessions)
    print str(numSessions) + ' sessions found'
    hexSessionsKeys = order_keys(hexSessions)
    hexDict = hexTokenizer()

    print 'creating dictionary of ip communications'
    comsDict, uniqIPs = srcIpDict(hexSessions)
    comsDict = dictUniquerizer(comsDict)

    print 'initializing network graph'

    ### ENCODER
    if rnnType == 'gru':
        rnn = GatedRecurrent(dim=dim, weights_init=rnnwt_init, biases_init=rnnbias_init, name='gru')
        dimMultiplier = 2
    else:
        rnn = LSTM(dim=dim, weights_init=rnnwt_init, biases_init=rnnbias_init, name='lstm')
        dimMultiplier = 4

    fork = Fork(output_names=['linear', 'gates'], name='fork', input_dim=dimIn,
                output_dims=[dim, dim * dimMultiplier],
                weights_init=linewt_init, biases_init=line_bias)

    ### CONTEXT
    if rnnType == 'gru':
        rnnContext = GatedRecurrent(dim=dim, weights_init=rnnwt_init,
                                    biases_init=rnnbias_init, name='gruContext')
    else:
        rnnContext = LSTM(dim=dim, weights_init=rnnwt_init,
                          biases_init=rnnbias_init, name='lstmContext')

    forkContext = Fork(output_names=['linearContext', 'gatesContext'], name='forkContext',
                       input_dim=dim, output_dims=[dim, dim * dimMultiplier],
                       weights_init=linewt_init, biases_init=line_bias)

    forkDec = Fork(output_names=['linear', 'gates'], name='forkDec', input_dim=dim,
                   output_dims=[dim, dim * dimMultiplier],
                   weights_init=linewt_init, biases_init=line_bias)

    ### CLASSIFIER
    bmlp = BatchNormalizedMLP(activations=[Tanh(), Tanh()],
                              dims=[dim, dim, numClasses],
                              weights_init=classifierWts,
                              biases_init=Constant(0.0001))

    # initialize the weights in all the bricks
    fork.initialize()
    rnn.initialize()
    forkContext.initialize()
    rnnContext.initialize()
    forkDec.initialize()
    bmlp.initialize()

    def onestepEnc(X):
        data1, data2 = fork.apply(X)
        if rnnType == 'gru':
            hEnc = rnn.apply(data1, data2)
        else:
            hEnc, _ = rnn.apply(data2)
        return hEnc

    hEnc, _ = theano.scan(onestepEnc, X)  # (mini*numPackets, packetLen, 1, hexdictLen)

    if attentionEnc:
        attentionmlpEnc = MLP(activations=[Tanh()], dims=[dim, 1],
                              weights_init=attnWts, biases_init=Constant(1.0))
        attentionmlpEnc.initialize()

        hEncAttn = T.reshape(hEnc, (-1, packetTimeSteps, dim))

        def onestepEncAttn(hEncAttn):
            preEncattn = attentionmlpEnc.apply(hEncAttn)
            attEncsoft = Softmax()
            attEncpyx = attEncsoft.apply(preEncattn.flatten())
            attEncpred = attEncpyx.flatten()
            attenc = T.mul(hEncAttn.dimshuffle(1, 0), attEncpred).dimshuffle(1, 0)
            return attenc

        attenc, _ = theano.scan(onestepEncAttn, hEncAttn)
        hEncReshape = T.reshape(T.sum(attenc, axis=1), (-1, maxPackets, 1, dim))
    else:
        # [:, -1] takes the last representation for each packet
        # -> (mini, numPackets, 1, dimReduced)
        hEncReshape = T.reshape(hEnc[:, -1], (-1, maxPackets, 1, dim))

    def onestepContext(hEncReshape):
        data3, data4 = forkContext.apply(hEncReshape)
        if rnnType == 'gru':
            hContext = rnnContext.apply(data3, data4)
        else:
            hContext, _ = rnnContext.apply(data4)
        return hContext

    hContext, _ = theano.scan(onestepContext, hEncReshape)

    if attentionContext:
        attentionmlpContext = MLP(activations=[Tanh()], dims=[dim, 1],
                                  weights_init=attnWts, biases_init=Constant(1.0))
        attentionmlpContext.initialize()

        hContextAttn = T.reshape(hContext, (-1, maxPackets, dim))

        def onestepContextAttn(hContextAttn):
            preContextatt = attentionmlpContext.apply(hContextAttn)
            attContextsoft = Softmax()
            attContextpyx = attContextsoft.apply(preContextatt.flatten())
            attContextpred = attContextpyx.flatten()
            attcontext = T.mul(hContextAttn.dimshuffle(1, 0), attContextpred).dimshuffle(1, 0)
            return attcontext

        attcontext, _ = theano.scan(onestepContextAttn, hContextAttn)
        hContextReshape = T.sum(attcontext, axis=1)
    else:
        hContextReshape = T.reshape(hContext[:, -1], (-1, dim))

    data5, _ = forkDec.apply(hContextReshape)
    pyx = bmlp.apply(data5)
    softmax = Softmax()
    softoutClass = softmax.apply(pyx)
    costClass = T.mean(CategoricalCrossEntropy().apply(Y, softoutClass))

    ### CREATE GRAPH
    cgClass = ComputationGraph([costClass])
    paramsClass = VariableFilter(roles=[PARAMETER])(cgClass.variables)
    learning = learningfunctions.Learning(costClass, paramsClass, learning_rateClass,
                                          l1=0., l2=0., maxnorm=0., c=clippings)
    updatesClass = learning.Adam()

    module_logger.info('starting graph compilation')
    classifierTrain = theano.function([X, Y],
                                      [costClass, hEnc, hContext, pyx, softoutClass],
                                      updates=updatesClass,
                                      allow_input_downcast=True)
    classifierPredict = theano.function([X], softoutClass, allow_input_downcast=True)
    module_logger.info('graph compilation finished')
    print 'finished graph compilation'

    trainIndex = int(len(hexSessionsKeys) * trainPercent)

    epochCost = []
    gradNorms = []
    trainAcc = []
    testAcc = []

    costCollect = []
    trainCollect = []

    module_logger.info('beginning training')
    iteration = 0
    # epoch
    for epoch in xrange(epochs):
        # iteration/minibatch
        for start, end in zip(range(0, trainIndex, batch_size),
                              range(batch_size, trainIndex, batch_size)):
            trainingTargets = []
            trainingSessions = []

            # create one minibatch with 0.5 normal and 0.5 abby normal traffic
            for trainKey in range(start, end):
                sessionForEncoding = list(hexSessions[hexSessions.keys()[trainKey]][0])
                adfun = adversarialfunctions.Adversary(sessionForEncoding)
                adversaryList = [sessionForEncoding,
                                 adfun.dstIpSwapOut(comsDict, uniqIPs),
                                 adfun.portDirSwitcher(),
                                 adfun.ipDirSwitcher()]
                abbyIndex = random.sample(range(len(adversaryList)), 1)[0]

                targetClasses = [0] * numClasses
                targetClasses[abbyIndex] = 1
                abbyTarget = np.array(targetClasses, dtype=theano.config.floatX)
                # abbyOneHotSes was used but never assigned; an encoding step is
                # assumed here. oneSessionEncoder is an assumed data-prep helper,
                # which would also explain the otherwise unused packetReverse and
                # padOldTimeSteps arguments.
                abbyOneHotSes = oneSessionEncoder(adversaryList[abbyIndex],
                                                  hexDict=hexDict,
                                                  packetReverse=packetReverse,
                                                  padOldTimeSteps=padOldTimeSteps,
                                                  maxPackets=maxPackets,
                                                  packetTimeSteps=packetTimeSteps)
                trainingSessions.append(abbyOneHotSes[0])
                trainingTargets.append(abbyTarget)

            sessionsMinibatch = np.asarray(trainingSessions).reshape((-1, packetTimeSteps, 1, dimIn))
            targetsMinibatch = np.asarray(trainingTargets)

            costfun = classifierTrain(sessionsMinibatch, targetsMinibatch)

            if iteration % max(1, numSessions / (10 * batch_size)) == 0:
                costCollect.append(costfun[0])
                trainCollect.append(np.mean(np.argmax(costfun[-1], axis=1) ==
                                            np.argmax(targetsMinibatch, axis=1)))
                # logging calls take %-style format strings, not print-style argument lists
                module_logger.info(' Iteration: %s', iteration)
                module_logger.info(' Cost: %s', np.mean(costCollect))
                module_logger.info(' TRAIN accuracy: %s', np.mean(trainCollect))
                print ' Iteration: ', iteration
                print ' Cost: ', np.mean(costCollect)
                print ' TRAIN accuracy: ', np.mean(trainCollect)

            iteration += 1

            # testing accuracy
            if iteration % max(1, numSessions / (2 * batch_size)) == 0:
                predtar, acttar, testCollect = predictClass(classifierPredict, hexSessions,
                                                            comsDict, uniqIPs, hexDict,
                                                            hexSessionsKeys, numClasses,
                                                            trainPercent, dimIn, maxPackets,
                                                            packetTimeSteps, padOldTimeSteps)
                binaryPrecisionRecall(predtar, acttar, numClasses)
                module_logger.info(str(testCollect))

            # save the models
            if iteration % max(1, numSessions / (5 * batch_size)) == 0:
                save_model(classifierPredict)

        epochCost.append(np.mean(costCollect))
        trainAcc.append(np.mean(trainCollect))

        module_logger.info('Epoch: %s', epoch)
        module_logger.info('Epoch cost average: %s', epochCost[-1])
        module_logger.info('Epoch TRAIN accuracy: %s', trainAcc[-1])
        print 'Epoch: ', epoch
        print 'Epoch cost average: ', epochCost[-1]
        print 'Epoch TRAIN accuracy: ', trainAcc[-1]

    return classifierTrain, classifierPredict
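# For reference, a minimal invocation sketch of the function above. All values
# here are hypothetical placeholders (the dataset path, dimensions, and channel
# are not specified by the source); numClasses=4 matches the four entries of
# adversaryList.
if __name__ == '__main__':
    classifierTrain, classifierPredict = training(
        runname='hexdemo', rnnType='lstm', maxPackets=8, packetTimeSteps=80,
        packetReverse=False, padOldTimeSteps=True, wtstd=0.2, lr=0.0001,
        decay=0.9, clippings=1.0, dimIn=257, dim=100,
        attentionEnc=False, attentionContext=False, numClasses=4,
        batch_size=20, epochs=10, trainPercent=0.9,
        dataPath='data/hexSessions.pkl', loadPrepedData=True, channel=None)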
#print("self-cosine similarity %f" % (np.dot(token_v,token_v))) all_similarities = np.dot(e, token_v) #print("overall similarity shape: ", all_similarities.shape) # a 1-d array sorted_similarities = sorted( enumerate(all_similarities), key=lambda (i,v): -v) print("Top Similarities for %10s @ %4d:" % (token_target,token_i, ), map(lambda (i,v): "%s %.1f%%" % (code2word[i],v*100.), sorted_similarities[1:4] # Element [0] is token itself ) ) #exit(0) if not run_test: # i.e. do training phase label_probs = p_labels.apply(labels_raw) # This is a list of label probabilities print("label_probs shape", label_probs.shape.tag.test_value) # array([ 464, 5])) # -- so :: this is an in-place rescaling y = tensor.matrix('labels', dtype="int32") # This is a symbolic vector of ints (implies one-hot in categorical_crossentropy) y.tag.test_value = np.random.randint( labels_size, size=batch_of_sentences).astype(np.int32) print("y shape", y.shape.tag.test_value) # array([ 29, 16])) print("y.flatten() shape", y.flatten().shape.tag.test_value) # array([464])) print("y.flatten() dtype", y.flatten().dtype) # int32 examine_embedding(lookup.W.get_value()) """ class CategoricalCrossEntropy(Cost): @application(outputs=["cost"])
class CCHLSTM(BaseRecurrent, Initializable):
    def __init__(self, io_dim, hidden_dims, cond_cert, activation=None, **kwargs):
        super(CCHLSTM, self).__init__(**kwargs)

        self.cond_cert = cond_cert
        self.io_dim = io_dim
        self.hidden_dims = hidden_dims

        self.children = []
        self.layers = []

        self.softmax = Softmax()
        self.children.append(self.softmax)

        for i, d in enumerate(hidden_dims):
            i0 = LookupTable(length=io_dim, dim=4 * d, name='i0-%d' % i)
            self.children.append(i0)

            if i > 0:
                i1 = Linear(input_dim=hidden_dims[i - 1], output_dim=4 * d, name='i1-%d' % i)
                self.children.append(i1)
            else:
                i1 = None

            lstm = LSTM(dim=d, activation=activation, name='LSTM-%d' % i)
            self.children.append(lstm)

            o = Linear(input_dim=d, output_dim=io_dim, name='o-%d' % i)
            self.children.append(o)

            self.layers.append((i0, i1, lstm, o))

    @recurrent(contexts=[])
    def apply(self, inputs, **kwargs):
        l0i, _, l0l, l0o = self.layers[0]
        l0iv = l0i.apply(inputs)
        new_states0, new_cells0 = l0l.apply(states=kwargs['states0'],
                                            cells=kwargs['cells0'],
                                            inputs=l0iv,
                                            iterate=False)
        l0ov = l0o.apply(new_states0)

        pos = l0ov
        ps = new_states0

        passnext = tensor.ones((inputs.shape[0],))
        out_sc = [new_states0, new_cells0, passnext]

        for i, (cch, (i0, i1, l, o)) in enumerate(zip(self.cond_cert, self.layers[1:])):
            # only continue to the next layer when the certainty so far
            # stays below the threshold (and the external gate allows it)
            pop = self.softmax.apply(pos)
            best = pop.max(axis=1)
            passnext = passnext * tensor.le(best, cch) * kwargs['pass%d' % i]

            i0v = i0.apply(inputs)
            i1v = i1.apply(ps)

            # this iteration handles layer i+1, so it must read its own
            # recurrent state; the original indexed kwargs['states%d' % i],
            # which re-read layer i's state (an off-by-one)
            prev_states = kwargs['states%d' % (i + 1)]
            prev_cells = kwargs['cells%d' % (i + 1)]
            new_states, new_cells = l.apply(inputs=i0v + i1v,
                                            states=prev_states,
                                            cells=prev_cells,
                                            iterate=False)
            new_states = tensor.switch(passnext[:, None], new_states, prev_states)
            new_cells = tensor.switch(passnext[:, None], new_cells, prev_cells)
            out_sc += [new_states, new_cells, passnext]

            ov = o.apply(new_states)
            pos = tensor.switch(passnext[:, None], pos + ov, pos)
            ps = new_states

        return [pos] + out_sc

    def get_dim(self, name):
        dims = {'pred': self.io_dim}
        for i, d in enumerate(self.hidden_dims):
            dims['states%d' % i] = dims['cells%d' % i] = d
        if name in dims:
            return dims[name]
        return super(CCHLSTM, self).get_dim(name)

    @apply.property('sequences')
    def apply_sequences(self):
        return ['inputs'] + ['pass%d' % i for i in range(len(self.hidden_dims) - 1)]

    @apply.property('states')
    def apply_states(self):
        ret = []
        for i in range(len(self.hidden_dims)):
            ret += ['states%d' % i, 'cells%d' % i]
        return ret

    @apply.property('outputs')
    def apply_outputs(self):
        ret = ['pred']
        for i in range(len(self.hidden_dims)):
            ret += ['states%d' % i, 'cells%d' % i, 'active%d' % i]
        return ret
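# A minimal usage sketch for the brick above, following Blocks' usual
# initialization flow. The dimensions, certainty threshold, and variable names
# are hypothetical, not from the source; two hidden layers imply exactly one
# external 'pass0' gate sequence.
cchlstm = CCHLSTM(io_dim=256, hidden_dims=[512, 512], cond_cert=[0.9],
                  weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
cchlstm.initialize()

inputs = tensor.lmatrix('inputs')  # (time, batch) token ids for the LookupTables
pass0 = tensor.matrix('pass0')     # (time, batch) external gate for layer 1
outputs = cchlstm.apply(inputs=inputs, pass0=pass0)
pred = outputs[0]                  # (time, batch, io_dim) accumulated logits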
batch_size = 50

print 'Building model ...'

# T x B x F
x = tensor.tensor3('x', dtype=floatX)
y = tensor.tensor3('y', dtype='int32')

x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim)
pre_rnn = x_to_h1.apply(x)
rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn")
h1 = rnn.apply(pre_rnn)
h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim)
pre_softmax = h1_to_o.apply(h1)

softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only the last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()
rnn.weights_init = Identity()
rnn.initialize()  # was missing: without this the recurrent weights are never set
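# The snippet stops after initialization. A sketch of how such a cost is
# typically wired into a Blocks training algorithm; the step rule and learning
# rate are assumptions (and older Blocks versions spell `parameters` as
# `params`).
from blocks.algorithms import GradientDescent, Scale
from blocks.graph import ComputationGraph

cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))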
class questionAttentionLayer:
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='iw_image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='iw_word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='iw_r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_m_to_s')
        self.attention_dist = Softmax(name='iw_attention')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='iw_r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        self.seq = LSTM(feature_dim,
                        name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)
        self.seq.initialize()
        self.seq_embed.initialize()

    # video: batch_size x feature_dim (a single video representation; the
    #        broadcast below requires a 2-d input, not batch x length x feature)
    # query: batch_size x q x feature_dim
    # mask: this mask is different from other masks: batch_size x q, an
    #       additive log-space mask where -10000 stands in for -np.Inf, e.g.
    #       1: 0, 0, 0, 0, 0, -10000, -10000, -10000
    #       2: 0, 0, 0, 0, -10000, -10000, -10000, -10000
    #       3: 0, 0, 0, 0, 0, 0, 0, -10000
    def apply(self, video, query, mask, batch_size):
        # batch_size x q x hidden_dim
        att1 = self.word_embed.apply(query)
        y_q = query
        # batch_size x hidden_dim
        att3 = self.image_embed.apply(video)
        att = att1 + att3.dimshuffle(0, 'x', 1)
        # batch_size x q x hidden_dim
        m = T.tanh(att)
        # batch_size x q
        s = self.m_to_s.apply(m)
        s = s.reshape((s.shape[0], s.shape[1]))
        # ignore the question padding via the additive mask
        s = s + mask
        s = self.attention_dist.apply(s)
        y_q_s = y_q.swapaxes(1, 2)
        # batch_size x feature_dim
        r = T.batched_dot(y_q_s, s)
        return r
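# A usage sketch with hypothetical shapes, showing how the additive log-space
# mask is meant to be fed in. The dimensions are assumptions, not from the
# source.
layer = questionAttentionLayer(feature_dim=4096, hidden_dim=512, output_dim=512)

video = T.matrix('video')   # batch_size x feature_dim
query = T.tensor3('query')  # batch_size x q x feature_dim
mask = T.matrix('mask')     # batch_size x q; 0 for real words, -10000 for padding
r = layer.apply(video, query, mask, batch_size=32)  # batch_size x feature_dim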
class Model(Initializable):
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.config = config

        self.context_embedder = ContextEmbedder(config)

        self.prefix_encoder = MLP(
            activations=[Rectifier() for _ in config.prefix_encoder.dim_hidden]
                        + [config.representation_activation()],
            dims=[config.prefix_encoder.dim_input]
                 + config.prefix_encoder.dim_hidden
                 + [config.representation_size],
            name="prefix_encoder",
        )
        self.candidate_encoder = MLP(
            activations=[Rectifier() for _ in config.candidate_encoder.dim_hidden]
                        + [config.representation_activation()],
            dims=[config.candidate_encoder.dim_input]
                 + config.candidate_encoder.dim_hidden
                 + [config.representation_size],
            name="candidate_encoder",
        )
        self.softmax = Softmax()

        self.prefix_extremities = {
            "%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis
            for side in ["first", "last"] for axis in [0, 1]
        }
        self.candidate_extremities = {
            "candidate_%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis
            for side in ["first", "last"] for axis in [0, 1]
        }

        self.inputs = (
            self.context_embedder.inputs
            + ["candidate_%s" % k for k in self.context_embedder.inputs]
            + self.prefix_extremities.keys()
            + self.candidate_extremities.keys()
        )
        self.children = [self.context_embedder, self.prefix_encoder,
                         self.candidate_encoder, self.softmax]

    def _push_initialization_config(self):
        for (mlp, config) in [
            [self.prefix_encoder, self.config.prefix_encoder],
            [self.candidate_encoder, self.config.candidate_encoder],
        ]:
            mlp.weights_init = config.weights_init
            mlp.biases_init = config.biases_init

    @application(outputs=["destination"])
    def predict(self, **kwargs):
        prefix_embeddings = tuple(self.context_embedder.apply(
            **{k: kwargs[k] for k in self.context_embedder.inputs}))
        prefix_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v]
            for k, v in self.prefix_extremities.items())
        prefix_inputs = tensor.concatenate(prefix_extremities + prefix_embeddings, axis=1)
        prefix_representation = self.prefix_encoder.apply(prefix_inputs)
        if self.config.normalize_representation:
            prefix_representation = prefix_representation / tensor.sqrt(
                (prefix_representation ** 2).sum(axis=1, keepdims=True))

        candidate_embeddings = tuple(self.context_embedder.apply(
            **{k: kwargs["candidate_%s" % k] for k in self.context_embedder.inputs}))
        candidate_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v]
            for k, v in self.candidate_extremities.items())
        candidate_inputs = tensor.concatenate(candidate_extremities + candidate_embeddings, axis=1)
        candidate_representation = self.candidate_encoder.apply(candidate_inputs)
        if self.config.normalize_representation:
            candidate_representation = candidate_representation / tensor.sqrt(
                (candidate_representation ** 2).sum(axis=1, keepdims=True))

        similarity_score = tensor.dot(prefix_representation, candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        candidate_destination = tensor.concatenate(
            (tensor.shape_padright(kwargs["candidate_last_k_latitude"][:, -1]),
             tensor.shape_padright(kwargs["candidate_last_k_longitude"][:, -1])),
            axis=1)

        return tensor.dot(similarity, candidate_destination)

    @predict.property("inputs")
    def predict_inputs(self):
        return self.inputs

    @application(outputs=["cost"])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate(
            (kwargs["destination_latitude"][:, None],
             kwargs["destination_longitude"][:, None]), axis=1)
        return error.erdist(y_hat, y).mean()

    @cost.property("inputs")
    def cost_inputs(self):
        return self.inputs + ["destination_latitude", "destination_longitude"]