Example #1
class MemoryNetworkBase(Initializable):
    def __init__(self, config, prefix_encoder, candidate_encoder, **kwargs):
        super(MemoryNetworkBase, self).__init__(**kwargs)

        self.prefix_encoder = prefix_encoder
        self.candidate_encoder = candidate_encoder
        self.config = config

        self.softmax = Softmax()
        self.children = [self.softmax, prefix_encoder, candidate_encoder]

        self.inputs = self.prefix_encoder.apply.inputs \
                      + ['candidate_%s'%x for x in self.candidate_encoder.apply.inputs] \
                      + ['candidate_destination_latitude', 'candidate_destination_longitude']

    def candidate_destination(self, **kwargs):
        return tensor.concatenate(
            (tensor.shape_padright(kwargs['candidate_destination_latitude']),
             tensor.shape_padright(kwargs['candidate_destination_longitude'])),
            axis=1)

    @application(outputs=['cost'])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate((kwargs['destination_latitude'][:, None],
                                kwargs['destination_longitude'][:, None]),
                               axis=1)

        return error.erdist(y_hat, y).mean()

    @application(outputs=['destination'])
    def predict(self, **kwargs):
        prefix_representation = self.prefix_encoder.apply(
            **{x: kwargs[x]
               for x in self.prefix_encoder.apply.inputs})
        candidate_representation = self.candidate_encoder.apply(
            **{
                x: kwargs['candidate_' + x]
                for x in self.candidate_encoder.apply.inputs
            })

        if self.config.normalize_representation:
            prefix_representation = prefix_representation \
                    / tensor.sqrt((prefix_representation ** 2).sum(axis=1, keepdims=True))
            candidate_representation = candidate_representation \
                    / tensor.sqrt((candidate_representation ** 2).sum(axis=1, keepdims=True))

        similarity_score = tensor.dot(prefix_representation,
                                      candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        return tensor.dot(similarity, self.candidate_destination(**kwargs))

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
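
For intuition, here is a minimal NumPy sketch (illustrative only, not part of the original code) of what predict computes: a softmax over prefix/candidate similarity scores, followed by a weighted average of the candidate destinations.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

prefix = np.random.randn(4, 8)         # batch of 4 prefix representations
candidates = np.random.randn(10, 8)    # 10 candidate representations
destinations = np.random.randn(10, 2)  # (latitude, longitude) per candidate

weights = softmax(np.dot(prefix, candidates.T))  # 4 x 10, rows sum to 1
predicted = np.dot(weights, destinations)        # 4 x 2 weighted average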
Example #2
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config,
                                    rec_input_len=4,
                                    output_dim=config.tgtcls.shape[0],
                                    **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls,
                                                 dtype=theano.config.floatX),
                                     name='classes')
        self.softmax = Softmax()
        self.sequences.extend(['latitude_lag', 'longitude_lag'])
        self.children.append(self.softmax)

    def before_predict_all(self, kwargs):
        super(Model, self).before_predict_all(kwargs)
        kwargs['latitude_lag'] = tensor.extra_ops.repeat(kwargs['latitude'],
                                                         2,
                                                         axis=0)
        kwargs['longitude_lag'] = tensor.extra_ops.repeat(kwargs['longitude'],
                                                          2,
                                                          axis=0)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)

    def rec_input(self, latitude, longitude, latitude_lag, longitude_lag,
                  **kwargs):
        return (tensor.shape_padright(latitude),
                tensor.shape_padright(longitude),
                tensor.shape_padright(latitude_lag),
                tensor.shape_padright(longitude_lag))
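
In process_rto, the raw network output is turned into a softmax over a fixed set of target clusters (config.tgtcls), and the prediction is their weighted mean. A minimal NumPy sketch of that step (illustrative only, shapes assumed):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

tgtcls = np.random.randn(100, 2)  # 100 cluster centroids (lat, lon)
rto = np.random.randn(4, 100)     # raw outputs for a batch of 4

prediction = np.dot(softmax(rto), tgtcls)  # expected centroid, 4 x 2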
Example #3
class rewatching:
    def __init__(self, batch_size, output_length, visual_dim, word_dim,
                 visual_feature_dim, question_feature_dim, joint_dim,
                 memory_dim, output_dim, fc1_dim, fc2_dim, voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(visual_dim, visual_feature_dim)
        self.sentence_encoder = questionEncoder(word_dim, question_feature_dim)
        self.toJoint = embeddingLayer(2 * question_feature_dim,
                                      2 * visual_feature_dim, joint_dim)
        self.rewatcher = impatientLayer(joint_dim, memory_dim, output_dim)

        self.seq_gen = seqDecoder(joint_dim, output_dim, fc1_dim, fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual dim -> visual feature dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question feature dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(
            q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(
            words=question_embedding, video=video_embedding, u1=u1, u2=u2)
        # bs x joint_dim, bs x output_dim
        question = questionJoint[:, -1, :]
        #video = videoJoint[:, -1, :]

        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint, mask,
                                            bs)
        fc_r = self.seq_gen.apply(self.output_length, r_q, question, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: (batch_size x output_length)
    # this mask is a 0-1 matrix where 0 indicates padding area of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()

        self.p = self.softmax_result[T.arange(self.bs * self.output_length),
                                     gt]
        self.cost_ = T.log(self.p + 1e-20)
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    def error(self, groundtruth, mask_01):
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1),
                     0).sum() / self.bs

    def predict(self):
        return self.pred
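
The loss above is a masked negative log-likelihood. A minimal NumPy sketch (illustrative only, with made-up shapes):

import numpy as np

bs, length, voc = 2, 3, 5
probs = np.full((bs * length, voc), 0.2)     # softmax output, rows sum to 1
gt = np.random.randint(0, voc, bs * length)  # flattened groundtruth tokens
mask = np.array([1, 1, 0, 1, 0, 0])          # 0 marks answer padding

p = probs[np.arange(bs * length), gt]  # probability of each true token
cost = -np.sum(np.log(p + 1e-20) * mask) / bs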
Example #4
        def onestepContextAttn(hContextAttn):

            preContextatt = attentionmlpContext.apply(hContextAttn)
            attContextsoft = Softmax()
            attContextpyx = attContextsoft.apply(preContextatt.flatten())
            attContextpred = attContextpyx.flatten()
            attcontext = T.mul(hContextAttn.dimshuffle(1,0), attContextpred).dimshuffle(1,0)

            return attcontext
Example #5
        def onestepEncAttn(hEncAttn):

            preEncattn = attentionmlpEnc.apply(hEncAttn)
            attEncsoft = Softmax()
            attEncpyx = attEncsoft.apply(preEncattn.flatten())
            attEncpred = attEncpyx.flatten()
            attenc = T.mul(hEncAttn.dimshuffle(1,0), attEncpred).dimshuffle(1,0)

            return attenc
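
Examples #4 and #5 are the same pattern under different names: an MLP scores each hidden state, a softmax turns the scores into attention weights, and each state is rescaled by its weight. A minimal NumPy sketch of the pattern (illustrative only):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

h = np.random.randn(7, 16)       # 7 hidden states of dimension 16
scores = np.random.randn(7)      # stand-in for attentionmlp.apply(h)
weights = softmax(scores)        # one weight per state, sums to 1
weighted = h * weights[:, None]  # equivalent to the dimshuffle/T.mul dance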
Example #6
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config, output_dim=config.tgtcls.shape[0], **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX), name='classes')
        self.softmax = Softmax()
        self.children.append(self.softmax)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)
Example #7
class SoftmaxLinear(Initializable):
    def __init__(self, input_dim, output_dim, **kwargs):
        super(SoftmaxLinear, self).__init__(**kwargs)
        self.linear = Linear(input_dim=input_dim, output_dim=output_dim)
        self.softmax = Softmax()

        self.children = [self.linear, self.softmax]

    def apply(self, input_):
        output = self.softmax.apply(self.linear.apply(input_))
        return output
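
A hypothetical usage sketch for SoftmaxLinear (the dimensions and initializers here are assumptions, not from the original):

from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
brick = SoftmaxLinear(input_dim=100, output_dim=10,
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
brick.initialize()
probs = brick.apply(x)  # each row of probs sums to 1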
Example #8
    def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):

        import os
        from collections import OrderedDict
        from fuel.datasets import IndexableDataset
        from blocks.model import Model
        from blocks.bricks import Linear, Softmax
        from blocks.bricks.conv import MaxPooling
        from blocks.initialization import Uniform
        from deepthought.bricks.cost import HingeLoss
        import numpy as np
        import theano
        from theano import tensor

        assert model_prefix is not None

        fold_weights_filename = '{}_weights.npy'.format(model_prefix)

        # convert Y to one-hot encoding
        n_classes = len(set(Y))
        Y = np.eye(n_classes, dtype=int)[Y]

        features = tensor.matrix('features', dtype=theano.config.floatX)
        targets = tensor.lmatrix('targets')

        input_ = features

        dim = X.shape[-1]
        
        # optional additional layers
        if self.pipeline_factory is not None:
            # need to re-shape flattened input to restore bc01 format
            input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape']  # tuple, uses actual batch size
            input_ = input_.reshape(input_shape)

            pipeline = self.pipeline_factory.build_pipeline(input_shape, hyper_params)
            input_ = pipeline.apply(input_)                        
            input_ = input_.flatten(ndim=2)
            
            # this is very hacky, but there seems to be no elegant way to obtain a value for dim
            dummy_fn = theano.function(inputs=[features], outputs=input_)
            dummy_out = dummy_fn(X[:1])
            dim = dummy_out.shape[-1]
            
            
        if hyper_params['classifier_pool_width'] > 1:
            # FIXME: this is probably broken!
            
    #        c = hyper_params['num_components']
    #        input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1))  # restore bc01
            # need to re-shape flattened input to restore bc01 format
            input_shape = hyper_params['classifier_pool_input_shape']  # tuple
            input_ = input_.reshape(input_shape)

            pool = MaxPooling(name='pool',
                              input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1),
                              pooling_size=(hyper_params['classifier_pool_width'], 1),
                              step=(hyper_params['classifier_pool_stride'], 1))
            input_ = pool.apply(input_)
            input_ = input_.reshape((input_.shape[0], tensor.prod(input_.shape[1:])))

            dim = np.prod(pool.get_dim('output'))


        linear = Linear(name='linear',
                        input_dim=dim,
                        output_dim=n_classes,
                        weights_init=Uniform(mean=0, std=0.01),
                        use_bias=False)
        linear.initialize()

        softmax = Softmax('softmax')

        probs = softmax.apply(linear.apply(input_))
        prediction = tensor.argmax(probs, axis=1)

        model = Model(probs)  # classifier with raw probability outputs
        predict = theano.function([features], prediction)  # ready-to-use predict function

        if os.path.isfile(fold_weights_filename):
            # load filter weights from existing file
            fold_weights = np.load(fold_weights_filename)
            print 'loaded filter weights from', fold_weights_filename
        else:
            # train model

            from blocks.bricks.cost import MisclassificationRate
            from blocks.filter import VariableFilter
            from blocks.graph import ComputationGraph
            from blocks.roles import WEIGHT
            from blocks.bricks import Softmax
            from blocks.model import Model
            from blocks.algorithms import GradientDescent, Adam
            from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
            from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
            from blocks.extensions.predicates import OnLogRecord
            from fuel.streams import DataStream
            from fuel.schemes import SequentialScheme, ShuffledScheme
            from blocks.monitoring import aggregation
            from blocks.main_loop import MainLoop
            from blocks.extensions.training import TrackTheBest
            from deepthought.extensions.parameters import BestParams
            # from deepthought.datasets.selection import DatasetMetaDB

            init_param_values = model.get_parameter_values()

            cost = HingeLoss().apply(targets, probs)
            # Note: this requires just the class labels, not in a one-hot encoding
            error_rate = MisclassificationRate().apply(targets.argmax(axis=1), probs)
            error_rate.name = 'error_rate'

            cg = ComputationGraph([cost])

            # L1 regularization
            if hyper_params['classifier_l1wdecay'] > 0:
                weights = VariableFilter(roles=[WEIGHT])(cg.variables)
                cost = cost + hyper_params['classifier_l1wdecay'] * sum([abs(W).sum() for W in weights])

            cost.name = 'cost'

            # iterate over trial folds
            fold_weights = []
            fold_errors = []

            # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
            #
            #     train_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['train'])
            #     valid_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['valid'])
            #
            #     metadb = DatasetMetaDB(meta, train_selectors.keys())
            #
            #     # get selected trial IDs
            #     train_idx = metadb.select(train_selectors)
            #     valid_idx = metadb.select(valid_selectors)

            for train_idx, valid_idx in idx_folds:

                # print train_idx
                # print valid_idx

                trainset = IndexableDataset(indexables=OrderedDict(
                    [('features', X[train_idx]), ('targets', Y[train_idx])]))

                validset = IndexableDataset(indexables=OrderedDict(
                    [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

                model.set_parameter_values(init_param_values)

                best_params = BestParams()
                best_params.add_condition(['after_epoch'],
                                          predicate=OnLogRecord('error_rate_valid_best_so_far'))

                algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam())

                extensions = [Timing(),
                              FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']),
                              DataStreamMonitoring(
                                  [cost, error_rate],
                                  DataStream.default_stream(
                                      validset,
                                      iteration_scheme=SequentialScheme(
                                          validset.num_examples, hyper_params['classifier_batch_size'])),
                                  suffix="valid"),
                              TrainingDataMonitoring(
                                  [cost, error_rate,
                                   aggregation.mean(algorithm.total_gradient_norm)],
                                  suffix="train",
                                  after_epoch=True),
                              TrackTheBest('error_rate_valid'),
                              best_params  # after TrackTheBest!
                              ]

                if verbose:
                    extensions.append(Printing())  # optional
                    extensions.append(ProgressBar())

                main_loop = MainLoop(
                    algorithm,
                    DataStream.default_stream(
                        trainset,
                        iteration_scheme=ShuffledScheme(trainset.num_examples, hyper_params['classifier_batch_size'])),
                    model=model,
                    extensions=extensions)

                main_loop.run()

                fold_weights.append(best_params.values['/linear.W'])
                fold_errors.append(main_loop.status['best_error_rate_valid'])
                # break # FIXME

            fold_errors = np.asarray(fold_errors).squeeze()
            print 'simple NN fold classification errors:', fold_errors

            fold_weights = np.asarray(fold_weights)

            # store filter weights for later analysis
            np.save(fold_weights_filename, fold_weights)

        weights = fold_weights.mean(axis=0)

        linear.parameters[0].set_value(weights)

        return model, predict
Example #9
class videoAttentionLayer:
    # both visual and word features are in the joint space
    # of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document-query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major, batch_size x doc_length x feature_dim
    # query: row major, batch_size x feature_dim
    # mask_: mask of the query, batch_size
    #        (length of each sentence - 1)
    def apply(self, doc, query, mask_, batch_size):
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #        batch_size x feature_dim
        # y_d: document
        #        batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim

        y_d = doc
        att3 = self.word_embed.apply(query)
        att = att1 + att3.dimshuffle(0, 'x', 1)
        # batch_size x doc_length x hidden_dim
        m = T.tanh(att)
        # batch_size x doc_length x 1
        s = self.m_to_s.apply(m)
        # batch_size x doc_length
        s = s.reshape((s.shape[0], s.shape[1]))
        s = self.attention_dist.apply(s)
        y_d_s = y_d.swapaxes(1, 2)
        # batch_size x feature_dim
        r = T.batched_dot(y_d_s, s)

        return r
Example #10
class iwLayer:
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='iw_image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='iw_word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='iw_r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_m_to_s')
        self.attention_dist = Softmax(name='iw_attention')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='iw_r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()
        self.seq = LSTM(feature_dim,
                        name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()


    # video: batch_size x video_length x feature_dim
    # query: batch_size x q x feature_dim
    # mask: this mask is different from other masks
    # batch_size x q
    # e.g. (-10000 stands in for -np.inf):
    # 1:   0, 0, 0, 0, 0, -10000, -10000, -10000
    # 2:   0, 0, 0, 0, -10000, -10000, -10000
    # 3:   0, 0, 0, 0, 0, 0, 0, -10000
    def apply(self, video, query, mask, batch_size):
        # batch_size x q x hidden_dim
        att1 = self.word_embed.apply(query)

        def one_step(y_d_i, r_1, y_q, y_q_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            att3 = self.image_embed.apply(y_d_i)
            att = y_q_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x q x hidden_dim
            m = T.tanh(att)
            # batch_size x q
            s = self.m_to_s.apply(m)
            s = s.reshape((s.shape[0], s.shape[1]))
            # ignore the question padding 0s
            s = s + mask
            s = self.attention_dist.apply(s)
            y_q_s = y_q.swapaxes(1, 2)
            return T.batched_dot(y_q_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # r: video_length x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[video.swapaxes(0, 1)],
                                 outputs_info=T.zeros_like(video[:, 0, :]),
                                 non_sequences=[query, att1],
                                 n_steps=video.shape[1],
                                 name='iw layer')

        # video_length x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        seq_r, garbage = self.seq.apply(Wr)
        
        # batch_size x feature_dim
        r_V = r[-1, :, :]
        # batch_size x output_dim
        seq_r_V = seq_r[-1, :, :]
        return r_V, seq_r_V
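
The additive mask used above is a common trick: adding a large negative number to padded positions before the softmax drives their attention weights to (nearly) zero. A minimal NumPy sketch (illustrative only):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

scores = np.array([1.2, 0.3, -0.5, 0.8])
mask = np.array([0., 0., -10000., -10000.])  # last two positions are padding

weights = softmax(scores + mask)  # weights[2:] are ~0, real tokens share ~1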
Example #11
###################
#### Softmax
###################

from blocks.bricks import Softmax
from blocks.bricks.cost import MisclassificationRate

W2 = theano.shared(
    numpy.random.normal(size=(n_out, num_protos)).astype('float32'))
b = theano.shared(numpy.zeros((num_protos, )).astype('float32'))
y = tensor.ivector('y')

h = tensor.dot(h3, W2) + b
h = tensor.switch(h < 0, -h, h)
sm = Softmax()
pred = sm.apply(h)
misclass = MisclassificationRate().apply(y, pred)
c = sm.categorical_cross_entropy(y, h).mean()

s_params = [W2, b]
s_grad = theano.grad(c, s_params)
s_updates = [p - numpy.float32(0.05) * g for p, g in zip(s_params, s_grad)]
s_f = theano.function([h3, y], [c, misclass], updates=zip(s_params, s_updates))
s_pred = theano.function([h3], pred)

for j in range(200):
    for i in range(n_batches):
        if i == 0:
            print s_f(data[i * batch_size:(i + 1) * batch_size, :],
                      labels[i * batch_size:(i + 1) * batch_size])
        else:
            # presumably the same update step, without printing
            # (the original snippet breaks off here)
            s_f(data[i * batch_size:(i + 1) * batch_size, :],
                labels[i * batch_size:(i + 1) * batch_size])
Example #12
class EncoderDecoder(Initializable, Random):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it is a top brick that knows
    how to combine the bottom, bidirectional and recognizer networks, (b)
    it owns the input variables and can build whole computation graphs
    starting from them, (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All parameters define the structure and the dimensions of the model.
    Typically everything comes from the "net" section of the config.

    """

    def __init__(self,
                 input_dims,
                 input_num_chars,
                 bos_label, eos_label,
                 num_labels,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, token_map=None,
                 bidir=True, window_size=None,
                 max_length=None, subsample=None,
                 dims_top=None, extra_input_dim=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 reuse_bottom_lookup_table=False,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 # for criterions involving generation of outputs, whether
                 # or not they should be generated by the recognizer itself
                 generate_predictions=True,
                 compute_targets=True,
                 extra_generation_steps=3,
                 **kwargs):
        all_arguments = copy.deepcopy(locals())
        all_arguments.update(copy.deepcopy(kwargs))
        del all_arguments['kwargs']
        del all_arguments['self']

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(EncoderDecoder, self).__init__(**kwargs)
        self.bos_label = bos_label
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion
        self.generate_predictions = generate_predictions
        self.extra_generation_steps = extra_generation_steps
        self.compute_targets = compute_targets

        self.max_decoded_length_scale = max_decoded_length_scale

        self.post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if dims_bidir:
            if not subsample:
                subsample = [1] * len(dims_bidir)
            encoder = Encoder(self.enc_transition, dims_bidir,
                              bottom.get_dim(bottom.apply.outputs[0]),
                              subsample, bidir=bidir)
        elif window_size:
            encoder = ConvEncoder(
                max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
        else:
            raise ValueError("Don't know which Encoder to use")
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            assert not extra_input_dim
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if not embed_outputs:
            raise ValueError("embed_outputs=False is not supported any more")
        if not reuse_bottom_lookup_table:
            embedding = LookupTable(
                num_labels + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            embedding = bottom.children[0]
        feedback = Feedback(
            embedding=embedding,
            output_names=[s for s in transition.apply.sequences
                           if s != 'mask'])

        # Create a readout
        readout_config = dict(
            num_tokens=num_labels,
            input_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            name="readout")
        if post_merge_dims:
            readout_config['merge_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout as an activation
                    # (because Maxout in a way is not one). However,
                    # a single-layer Maxout network works with the trick
                    # below. For a deeper Maxout network one has to use
                    # the Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_labels]).apply,
            ], name='post_merge')
        if 'reward' in criterion and criterion['name'] != 'log_likelihood':
            if criterion['reward'] == 'edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label)
            elif criterion['reward'] == 'delta_edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label, deltas=True)
            elif criterion['reward'] == 'bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=False)
            elif criterion['reward'] == 'delta_bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=True)
            else:
                raise ValueError("Unknown reward type")
        if criterion['name'] == 'log_likelihood':
            readout_class = SoftmaxReadout
        elif criterion['name'] == 'critic':
            readout_class = CriticReadout
            criterion_copy = dict(criterion)
            del criterion_copy['name']
            readout_config.update(**criterion_copy)
        elif criterion['name'] == 'reinforce':
            readout_class = ReinforceReadout
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['entropy'] = criterion.get('entropy')
            readout_config['input_names'] += ['attended', 'attended_mask']
        elif criterion['name'] in ['sarsa', 'actor_critic']:
            readout_class = ActorCriticReadout
            if criterion['name'] == 'actor_critic':
                critic_arguments = dict(all_arguments)
                # No worries, the critic will not compute log likelihood
                # values.
                critic_arguments['criterion'] = {
                    'name': 'critic',
                    'value_softmax': criterion.get('value_softmax'),
                    'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                    'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                    'dueling_outputs':  criterion.get('dueling_outputs')}
                critic_arguments['name'] = 'critic'
                if criterion.get('critic_uses_actor_states'):
                    critic_arguments['extra_input_dim'] = dim_dec
                if (criterion.get('value_softmax')
                        or criterion.get('same_value_for_wrong')
                        or criterion.get('dueling_outputs')):
                    # Add an extra output for the critic
                    critic_arguments['num_labels'] = num_labels + 1
                if criterion.get('force_bidir'):
                    critic_arguments['dims_bidir'] = [dim_dec]
                critic_arguments['reuse_bottom_lookup_table'] = True
                critic_arguments['input_num_chars'] = {'inputs': num_labels}
                if criterion.get('downsize_critic'):
                    critic_arguments = _downsize_config(
                        critic_arguments, criterion['downsize_critic'])
                critic = EncoderDecoder(**critic_arguments)
                readout_config['critic'] = critic
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['freeze_actor'] = criterion.get('freeze_actor')
            readout_config['freeze_critic'] = criterion.get('freeze_critic')
            readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
            readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
            readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
            readout_config['critic_loss'] = criterion.get('critic_loss')
            readout_config['discount'] = criterion.get('discount')
            readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
            readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
            readout_config['value_penalty'] = criterion.get('value_penalty')
            readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
            readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
            readout_config['bos_token'] = bos_label
            readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
            readout_config['use_value_biases'] = criterion.get('use_value_biases')
            readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
            readout_config['input_names'] += ['attended', 'attended_mask']
            # Note, that settings below are for the "clean" mode.
            # When get_cost_graph() is run with training=True, they
            # are temporarily overridden with the "real" settings from
            # "criterion"
            readout_config['compute_targets'] = True
            readout_config['trpo_coef'] = 0.0
            readout_config['solve_bellman'] = True
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout = readout_class(**readout_config)

        if lm:
            raise ValueError("LM is currently not supported")

        recurrent = AttentionRecurrent(transition, attention)
        if extra_input_dim:
            recurrent = RecurrentWithExtraInput(
                recurrent, "extra_inputs", extra_input_dim, name="with_extra_inputs")
        generator = SequenceGenerator(
            recurrent=recurrent, readout=readout, feedback=feedback,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.softmax = Softmax()
        self.children = [encoder, top, bottom, generator, self.softmax]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.predicted_labels = tensor.lmatrix('predicted_labels')
        self.predicted_mask = tensor.matrix('predicted_mask')
        self.prefix_labels = tensor.lmatrix('prefix_labels')
        self.prefix_steps = tensor.lscalar('prefix_steps')

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.single_predicted_labels = tensor.lvector('predicted_labels')
        self.n_steps = tensor.lscalar('n_steps')

        # Configure mixed_generate
        if criterion['name'] == 'actor_critic':
            critic = self.generator.readout.critic
            self.mixed_generate.sequences = []
            self.mixed_generate.states = (
                ['step'] +
                self.generator.recurrent.apply.states +
                ['critic_' + name for name in critic.generator.recurrent.apply.states])
            self.mixed_generate.outputs = (
                ['samples', 'step'] +
                self.generator.recurrent.apply.outputs +
                ['critic_' + name for name in critic.generator.recurrent.apply.outputs])
            self.mixed_generate.contexts = (
                self.generator.recurrent.apply.contexts +
                ['critic_' + name for name in critic.generator.recurrent.apply.contexts]
                + ['groundtruth', 'groundtruth_mask'])
            self.initial_states.outputs = self.mixed_generate.states

        self.prefix_generate.sequences = []
        self.prefix_generate.states = ['step'] + self.generator.recurrent.apply.states
        self.prefix_generate.outputs = ['samples', 'step'] + self.generator.recurrent.apply.outputs
        self.prefix_generate.contexts = self.generator.recurrent.apply.contexts


    def push_initialization_config(self):
        super(EncoderDecoder, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {'weights_init': self.rec_weights_init,
                                  'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(self,
                                              {'initial_states_init': self.initial_states_init})

    @application
    def costs(self, **kwargs):
        # pop inputs we know about
        prediction = kwargs.pop('prediction')
        prediction_mask = kwargs.pop('prediction_mask')
        groundtruth = kwargs.pop('groundtruth', None)
        groundtruth_mask = kwargs.pop('groundtruth_mask', None)
        inputs_mask = kwargs.pop('inputs_mask')
        extra_inputs = kwargs.pop('extra_inputs', None)

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed, mask=inputs_mask)
        encoded = self.top.apply(encoded)
        costs_kwargs = dict(
            prediction=prediction, prediction_mask=prediction_mask,
            groundtruth=groundtruth, groundtruth_mask=groundtruth_mask,
            attended=encoded, attended_mask=encoded_mask)
        if extra_inputs is not None:
            costs_kwargs['extra_inputs'] = extra_inputs
        return self.generator.costs(**costs_kwargs)

    @application
    def generate(self, return_initial_states=False, **kwargs):
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs),
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            return_initial_states=return_initial_states,
            as_dict=True)

    @recurrent
    def prefix_generate(self, return_initial_states=True, **kwargs):
        step = kwargs.pop('step')

        sampling_inputs = dict_subset(
            kwargs, self.generator.readout.sample.inputs)
        samples, scores = self.generator.readout.sample(**sampling_inputs)
        prefix_mask = tensor.lt(step, self.prefix_steps)
        samples = (prefix_mask * self.prefix_labels[step[0]]
                   + (1 - prefix_mask) * samples)

        feedback = self.generator.feedback.apply(samples, as_dict=True)
        states_contexts = dict_subset(
            kwargs,
            self.generator.recurrent.apply.states
            + self.generator.recurrent.apply.contexts)
        states_outputs = self.generator.recurrent.apply(
            as_dict=True, iterate=False,
            **dict_union(feedback, states_contexts))

        return ([samples, step + 1]
                + states_outputs.values())

    @recurrent
    def mixed_generate(self, return_initial_states=True, **kwargs):
        critic = self.generator.readout.critic
        groundtruth = kwargs.pop('groundtruth')
        groundtruth_mask = kwargs.pop('groundtruth_mask')
        step = kwargs.pop('step')

        sampling_inputs = dict_subset(
            kwargs, self.generator.readout.sample.inputs)
        actor_scores = self.generator.readout.scores(**sampling_inputs)

        critic_inputs = {
            name: kwargs['critic_' + name]
            for name in critic.generator.readout.merge_names}
        critic_outputs = critic.generator.readout.outputs(
            groundtruth, groundtruth_mask, **critic_inputs)

        epsilon = numpy.array(self.generator.readout.epsilon,
                              dtype=theano.config.floatX)
        actor_probs = tensor.exp(actor_scores)
        # This is a poor man's 1-hot argmax
        critic_probs = self.softmax.apply(critic_outputs * 1000)
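        # (scaling by 1000 saturates the softmax, so critic_probs is
        # approximately one-hot at the critic's argmax)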
        probs = (actor_probs * (tensor.constant(1) - epsilon)
                 + critic_probs * epsilon)

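        # Inverse-CDF sampling: count how many entries of the cumulative
        # distribution fall below a uniform draw; that count is the index.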
        x = self.theano_rng.uniform(size=(probs.shape[0],))
        samples = (tensor.gt(x[:, None], tensor.cumsum(probs, axis=1))
                   .astype(theano.config.floatX)
                   .sum(axis=1)
                   .astype('int64'))
        samples = tensor.minimum(samples, probs.shape[1] - 1)

        actor_feedback = self.generator.feedback.apply(samples, as_dict=True)
        actor_states_contexts = dict_subset(
            kwargs,
            self.generator.recurrent.apply.states
            + self.generator.recurrent.apply.contexts)
        actor_states_outputs = self.generator.recurrent.apply(
            as_dict=True, iterate=False,
            **dict_union(actor_feedback, actor_states_contexts))

        critic_feedback = critic.generator.feedback.apply(samples, as_dict=True)
        critic_states_contexts = {
            name: kwargs['critic_' + name]
            for name in
            critic.generator.recurrent.apply.states
            + critic.generator.recurrent.apply.contexts}
        critic_apply_kwargs = dict(
            as_dict=True, iterate=False,
            **dict_union(critic_feedback, critic_states_contexts))
        if self.generator.readout.critic_uses_actor_states:
            critic_apply_kwargs['extra_inputs'] = actor_states_outputs['states']
        critic_states_outputs = critic.generator.recurrent.apply(**critic_apply_kwargs)
        return ([samples, step + 1]
                + actor_states_outputs.values()
                + critic_states_outputs.values())

    @application
    def initial_states(self, batch_size, *args, **kwargs):
        critic = self.generator.readout.critic
        result = ([tensor.zeros((batch_size,), dtype='int64')]
                  + self.generator.initial_states(batch_size, *args, **kwargs))
        critic_kwargs = {name[7:]: kwargs[name] for name in kwargs if name.startswith('critic_')}
        # This method can be called for two different recurrent application
        # methods, "mixed_generate" and "prefix_generate". That's why this
        # dirty hack is needed.
        if critic_kwargs:
            result += critic.generator.initial_states(batch_size, **critic_kwargs)
        return result

    def get_dim(self, name):
        critic = self.generator.readout.critic
        if name.startswith('critic_'):
            return critic.generator.get_dim(name[7:])
        elif name == 'step':
            return 0
        else:
            return self.generator.get_dim(name)

    @application
    def mask_for_prediction(self, prediction, groundtruth_mask=None,
                            extra_generation_steps=None):
        prediction_mask = tensor.lt(
            tensor.cumsum(tensor.eq(prediction, self.eos_label)
                          .astype(theano.config.floatX), axis=0),
            1).astype(theano.config.floatX)
        prediction_mask = tensor.roll(prediction_mask, 1, 0)
        prediction_mask = tensor.set_subtensor(
            prediction_mask[0, :], tensor.ones_like(prediction_mask[0, :]))
        if groundtruth_mask is not None:
            max_lengths = groundtruth_mask.sum(axis=0) + extra_generation_steps
            prediction_mask *= tensor.lt(
                tensor.arange(prediction.shape[0])[:, None], max_lengths[None, :])
        return prediction_mask

    def load_params(self, path):
        cg = self.get_cost_graph()
        with open(path, 'r') as src:
            param_values = load_parameters(src)
        Model(cg.outputs).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None,
                           return_initial_states=False,
                           use_softmax_t=False):
        if use_softmax_t:
            self.generator.readout.softmax_t = self.criterion.get('softmax_t', 1.0)
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        result = self.generate(
            n_steps=n_steps, inputs_mask=inputs_mask,
            return_initial_states=return_initial_states,
            **self.inputs)

        self.generator.readout.softmax_t = 1.
        return result

    def get_mixed_generate_graph(self, n_steps=None,
                                 return_initial_states=False):
        critic = self.generator.readout.critic

        attended, attended_mask = self.encoder.apply(
            input_=self.bottom.apply(**self.inputs),
            mask=self.inputs_mask)
        attended = self.top.apply(attended)

        critic_attended, critic_attended_mask = critic.encoder.apply(
            input_=critic.bottom.apply(inputs=self.labels),
            mask=self.labels_mask)
        critic_attended = critic.top.apply(critic_attended)

        return self.mixed_generate(
            n_steps=n_steps, batch_size=attended.shape[1],
            return_initial_states=return_initial_states, as_dict=True,
            attended=attended, attended_mask=attended_mask,
            critic_attended=critic_attended, critic_attended_mask=critic_attended_mask,
            groundtruth=self.labels, groundtruth_mask=self.labels_mask)

    def get_prefix_generate_graph(self, n_steps=None,
                                  return_initial_states=False):
        attended, attended_mask = self.encoder.apply(
            input_=self.bottom.apply(**self.inputs),
            mask=self.inputs_mask)
        attended = self.top.apply(attended)

        return self.prefix_generate(
            n_steps=n_steps, batch_size=attended.shape[1],
            return_initial_states=return_initial_states, as_dict=True,
            attended=attended, attended_mask=attended_mask)


    def get_cost_graph(self, batch=True, use_prediction=False,
                       training=False, groundtruth_as_predictions=False,
                       with_mixed_generation=False):
        # "use_predictions" means use the Theano input variable
        # for predictions.
        readout = self.generator.readout
        if training and self.criterion['name'] == 'actor_critic':
            logger.debug("Switching to training mode")
            readout.compute_targets = self.compute_targets
            readout.trpo_coef = self.criterion.get('trpo_coef', 0.0)
            if 'solve_bellman' in self.criterion:
                readout.solve_bellman = self.criterion['solve_bellman']
        if with_mixed_generation and 'epsilon' in self.criterion:
            readout.epsilon = self.criterion['epsilon']

        if batch:
            inputs, inputs_mask = self.inputs, self.inputs_mask
            groundtruth, groundtruth_mask = self.labels, self.labels_mask
            prediction, prediction_mask = self.predicted_labels, self.predicted_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = self.mask_for_prediction(groundtruth)
            prediction = self.single_predicted_labels[:, None]
            prediction_mask = self.mask_for_prediction(prediction)
        if self.cost_involves_generation() and not groundtruth_as_predictions:
            if ((training and self.generate_predictions) or
                    (not training and not use_prediction)):
                generation_routine = (self.get_mixed_generate_graph
                                      if with_mixed_generation
                                      else self.get_generate_graph)
                generated = generation_routine(
                    n_steps=self.labels.shape[0] + self.extra_generation_steps)
                prediction = disconnected_grad(generated['samples'])
                prediction_mask = self.mask_for_prediction(
                    prediction, groundtruth_mask, self.extra_generation_steps)
            else:
                logger.debug("Using provided predictions")
            cost = self.costs(inputs_mask=inputs_mask,
                              prediction=prediction, prediction_mask=prediction_mask,
                              groundtruth=groundtruth, groundtruth_mask=groundtruth_mask,
                              **inputs)
        else:
            if use_prediction:
                cost = self.costs(inputs_mask=inputs_mask,
                    prediction=prediction, prediction_mask=prediction_mask,
                    **inputs)
            else:
                cost = self.costs(inputs_mask=inputs_mask,
                    prediction=groundtruth, prediction_mask=groundtruth_mask,
                    groundtruth=groundtruth, groundtruth_mask=groundtruth_mask,
                    **inputs)
        cost_cg = ComputationGraph(cost)

        # This *has to* be done only when
        # "training" or "with_mixed_generation" is True,
        # but it does not hurt to do it every time.
        logger.debug("Switching back to the normal mode")
        readout = self.generator.readout
        readout.compute_targets = True
        readout.trpo_coef = 0.0
        readout.solve_bellman = True
        readout.epsilon = 0.

        return cost_cg

    def analyze(self, inputs, groundtruth, prediction):
        """Compute cost and aligment."""
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels)
            input_variables.append(self.single_predicted_labels)

            cg = self.get_cost_graph(batch=False, use_prediction=True)
            costs = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            self._analyze = theano.function(
                input_variables,
                [costs[0], weights[:, 0, :]] + energies_output,
                on_unused_input='warn')

        input_values_dict = dict(inputs)
        input_values_dict['labels'] = groundtruth
        input_values_dict['predicted_labels'] = prediction
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="samples")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        # When a recognizer is unpickled, self.beam_size is available,
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(self.bottom.num_time_steps(**inputs) /
                         self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise Exception(
                'Unknown inputs passed to beam search: {}'.format(
                    inputs.keys()))
        outputs, search_costs = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['samples'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        batch['n_steps'] = n_steps if n_steps is not None \
            else int(self.bottom.num_time_steps(**batch) /
                     self.max_decoded_length_scale)
        sample = self._do_generate(**batch)[0]
        sample = list(sample[:, 0])
        if self.eos_label in sample:
            sample = sample[:sample.index(self.eos_label) + 1]
        return sample

    def __getstate__(self):
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Allow a model pickled on a GPU to be used later on a CPU:
        # drop the device-specific RNG so that it is recreated on demand.
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            pass

    def cost_involves_generation(self):
        return self.criterion['name'] in ['reinforce', 'sarsa', 'actor_critic']
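A note on mask_for_prediction above: the cumsum/roll/set_subtensor chain keeps every step up to and including the first EOS token and zeroes everything after it. The following self-contained NumPy sketch (toy data, not part of the recognizer) reproduces that logic:

import numpy as np

def eos_mask(prediction, eos_label):
    # prediction: (time, batch) integer array of sampled labels
    seen_eos = np.cumsum(prediction == eos_label, axis=0)  # EOS count so far, inclusive
    mask = (seen_eos < 1).astype('float32')                # 1 strictly before the first EOS
    mask = np.roll(mask, 1, axis=0)                        # shift down: include the EOS step
    mask[0, :] = 1.0                                       # the first step is always kept
    return mask

pred = np.array([[3, 3],
                 [0, 3],   # batch 0 emits EOS (= 0) at t = 1
                 [3, 0],
                 [3, 3]])
print(eos_mask(pred, eos_label=0))
# column 0 -> [1, 1, 0, 0]; column 1 -> [1, 1, 1, 0]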
Beispiel #16
0
                                     key=lambda (i, v): -v)
        print(
            "Top Similarities for %10s @ %4d:" % (
                token_target,
                token_i,
            ),
            map(
                lambda (i, v): "%s %.1f%%" % (code2word[i], v * 100.),
                sorted_similarities[1:4]  # Element [0] is token itself
            ))

    #exit(0)


if not run_test:  # i.e. do training phase
    label_probs = p_labels.apply(
        labels_raw)  # This is a list of label probabilities
    print("label_probs shape",
          label_probs.shape.tag.test_value)  # array([ 464, 5]))
    # -- so :: this is an in-place rescaling

    y = tensor.matrix(
        'labels', dtype="int32"
    )  # This is a symbolic vector of ints (implies one-hot in categorical_crossentropy)
    y.tag.test_value = np.random.randint(
        labels_size, size=batch_of_sentences).astype(np.int32)

    print("y shape", y.shape.tag.test_value)  # array([ 29, 16]))
    print("y.flatten() shape",
          y.flatten().shape.tag.test_value)  # array([464]))
    print("y.flatten() dtype", y.flatten().dtype)  # int32
Beispiel #17
0
    rnn = DropLSTM(dim=h_dim, model_type=model_type,
                   update_prob=update_prob, name="rnn")
    h1, c1 = rnn.apply(pre_rnn, drops, is_for_test)
else:
    rnn = DropGRU(dim=h_dim, model_type=model_type,
                  update_prob=update_prob, name="rnn")
    h1, sd = rnn.apply(pre_rnn[:, :, :h_dim],
                       pre_rnn[:, :, h_dim:],
                       drops, is_for_test)
h1_to_o = Linear(name='h1_to_o',
                 input_dim=h_dim,
                 output_dim=y_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, y_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y, softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y, softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o, rnn):
    brick.weights_init = Glorot()
    brick.biases_init = Constant(0)
    brick.initialize()
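The reshape dance around Softmax above is the standard way to normalize a 3D (time, batch, classes) tensor with a brick that expects 2D input. A minimal standalone sketch with assumed toy shapes:

import numpy as np
import theano
import theano.tensor as tensor
from blocks.bricks import Softmax

y_dim = 4
pre_softmax = tensor.tensor3('pre_softmax')      # (time, batch, y_dim)
shape = pre_softmax.shape
flat = pre_softmax.reshape((-1, y_dim))          # (time * batch, y_dim)
probs = Softmax().apply(flat).reshape(shape)     # back to (time, batch, y_dim)

f = theano.function([pre_softmax], probs)
out = f(np.random.randn(5, 2, y_dim).astype(theano.config.floatX))
print(out.sum(axis=-1))                          # every row sums to 1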
Beispiel #18
0
class impatientLayer:
    # both visual and word feature are in the joint space
    # of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major batch_size x doc_length x feature_dim
    # query: row major batch_size x q x feature_dim
    # mask_: for each example, the index of its last real query token
    #        (i.e. sentence length - 1), shape (batch_size,)
    def apply(self, doc, query, mask_, batch_size):
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #        batch_size x feature_dim
        # y_d: document
        #        batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x doc_length x hidden_dim
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
            # return batch_size x feature_dim
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0,1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')

        # for the sequence encoder
        # q x batch_size x 4 * output_dim (LSTM gate pre-activations)
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        # batch_size x output_dim
        return r_q, seq_r_q
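For reference, the pooling step at the end of one_step above, T.batched_dot(y_d_s, s), contracts the doc_length axis. The NumPy sketch below (made-up sizes) shows the shapes involved:

import numpy as np

batch_size, doc_length, feature_dim = 3, 7, 5
y_d = np.random.randn(batch_size, doc_length, feature_dim)
s = np.random.rand(batch_size, doc_length)
s /= s.sum(axis=1, keepdims=True)          # softmax-like: each row sums to 1

# T.batched_dot(y_d.swapaxes(1, 2), s) is this contraction over doc_length:
pooled = np.einsum('bdf,bd->bf', y_d, s)   # (batch_size, feature_dim)
print(pooled.shape)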
Beispiel #19
0
class CostObject(Initializable):
    @lazy()
    def __init__(self, cost_type='original', **kwargs):
        super(CostObject, self).__init__(**kwargs)
        self.cost_type = cost_type
        self.softmax = Softmax()
        self.children = [self.softmax]

    @application(inputs=['input_'], outputs=['output'])
    def log_probabilities(self, input_):
        """Normalize log-probabilities.
      Converts unnormalized log-probabilities (exponents of which do not
      sum to one) into actual log-probabilities (exponents of which sum
      to one).
      Parameters
      ----------
      input_ : :class:`~theano.Variable`
          A matrix, each row contains unnormalized log-probabilities of a
          distribution.
      Returns
      -------
      output : :class:`~theano.Variable`
          A matrix with normalized log-probabilities in each row for each
          distribution from `input_`.
      """
        shifted = input_ - input_.max(axis=1, keepdims=True)
        return shifted - tensor.log(
            tensor.exp(shifted).sum(axis=1, keepdims=True))

    @application(inputs=['x', 'y'], outputs=['output'])
    def original_cost(self, x, y):
        x = self.log_probabilities(x)
        if y.ndim == x.ndim - 1:
            indices = tensor.arange(y.shape[0]) * x.shape[1] + y
            cost = -x.flatten()[indices]
        elif y.ndim == x.ndim:
            cost = -(x * y).sum(axis=1)
        else:
            raise TypeError('rank mismatch between x and y')
        return cost

    @application(inputs=['x', 'y'], outputs=['output'])
    def simple_cost(self, x, y):
        if y.ndim == x.ndim - 1:
            # Get probs:
            newX = self.softmax.apply(x)
            indices = tensor.arange(y.shape[0]) * x.shape[1] + y
            newY = tensor.ones_like(newX)
            cost = ((newY - newX).flatten()[indices])
        elif y.ndim == x.ndim:
            raise TypeError('\nExpected either x or y to be of another rank\n')
        else:
            raise TypeError('rank mismatch between x and y')
        return cost

    # y holds the gold labels; x the unnormalized scores.
    @application(inputs=['x', 'y'], outputs=['output'])
    def cost(self, application_call, x, y):
        if self.cost_type == 'original':
            return self.original_cost(x, y)
        if self.cost_type == 'simple':
            return self.simple_cost(x, y)
        raise ValueError('unknown cost_type: %s' % self.cost_type)
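As a sanity check on original_cost above, the same negative log-likelihood can be computed directly in NumPy from a numerically stabilized log-softmax (illustrative values only):

import numpy as np

def log_softmax(x):
    shifted = x - x.max(axis=1, keepdims=True)   # stabilize before exponentiating
    return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

x = np.random.randn(4, 3)                        # unnormalized scores
y = np.array([0, 2, 1, 2])                       # integer gold labels
nll = -log_softmax(x)[np.arange(4), y]           # pick the gold log-probabilities
print(nll)                                       # one cost per example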
Beispiel #20
0
###################
#### Softmax
###################

from blocks.bricks import Softmax
from blocks.bricks.cost import MisclassificationRate

W2 = theano.shared(numpy.random.normal(size=(n_out, num_protos)).astype('float32'))
b = theano.shared(numpy.zeros((num_protos,)).astype('float32'))
y = tensor.ivector('y')

h = tensor.dot(h3, W2) + b
h = tensor.switch(h < 0, -h , h)
sm = Softmax()
pred = sm.apply(h)
misclass = MisclassificationRate().apply(y, pred)
c = sm.categorical_cross_entropy(y, h).mean()

s_params = [W2, b]
s_grad = theano.grad(c, s_params)
s_updates = [p - numpy.float32(0.05)*g for p, g in zip(s_params, s_grad)]
s_f = theano.function([h3, y], [c, misclass], updates=zip(s_params, s_updates))
s_pred = theano.function([h3], pred)

for j in range(200):
    for i in range(n_batches):
        if i == 0:
            print s_f(data[i*batch_size:(i+1)*batch_size, :], labels[i*batch_size:(i+1)*batch_size])
        else:
            s_f(data[i*batch_size:(i+1)*batch_size, :], labels[i*batch_size:(i+1)*batch_size])
Beispiel #21
0
linear1 = Linear(name='linear1', input_dim=300, output_dim=128)
recurrent = SimpleRecurrent(name='recurrent', activation=Tanh(), dim=128)
linear2 = Linear(name='linear2', input_dim=128, output_dim=9)
softmax = Softmax()
bricks = [linear1, recurrent, linear2]
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()

linear1_output = linear1.apply(input)
recurrent_output = recurrent.apply(linear1_output, mask=mask)
linear2_output = linear2.apply(recurrent_output)
shape = linear2_output.shape  # 100 * 29*9
output = softmax.apply(linear2_output.reshape(
    (-1,
     9))).reshape(shape)  # flatten every dimension except the last one, which is 9

# Cost and Functions
cost = T.nnet.categorical_crossentropy(output, target)  # 100 x 29
cost = cost * mask
cost = cost.mean()

params = Model(cost).parameters
updates = sgd(cost, params)
f_train = theano.function(inputs=[input, mask, target],
                          outputs=cost,
                          updates=updates,
                          allow_input_downcast=True)
f_valid = theano.function(inputs=[input, mask, target],
                          outputs=cost,
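One detail of the masked cost above: (cost * mask).mean() averages over all positions, padded ones included. The sketch below (toy data) contrasts that with normalizing by the number of real time-steps, which is the other common convention:

import numpy as np

cost = np.random.rand(100, 29)                   # per-position cross-entropy (batch x time)
mask = (np.random.rand(100, 29) > 0.3).astype('float32')

mean_over_all = (cost * mask).mean()             # what the code above computes
mean_over_valid = (cost * mask).sum() / mask.sum()
print(mean_over_all, mean_over_valid)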
Beispiel #22
0
def training(runname, rnnType, maxPackets, packetTimeSteps, packetReverse, padOldTimeSteps, wtstd, 
             lr, decay, clippings, dimIn, dim, attentionEnc, attentionContext, numClasses, batch_size, epochs, 
             trainPercent, dataPath, loadPrepedData, channel):  # pragma: no cover
    print locals()
    print
    
    X = T.tensor4('inputs')
    Y = T.matrix('targets')
    linewt_init = IsotropicGaussian(wtstd)
    line_bias = Constant(1.0)
    rnnwt_init = IsotropicGaussian(wtstd)
    rnnbias_init = Constant(0.0)
    classifierWts = IsotropicGaussian(wtstd)
    attnWts = IsotropicGaussian(wtstd)  # weight init for the attention MLPs below

    learning_rateClass = theano.shared(np.array(lr, dtype=theano.config.floatX))
    learning_decay = np.array(decay, dtype=theano.config.floatX)
    
    ###DATA PREP
    print 'loading data'
    if loadPrepedData:
        hexSessions = loadFile(dataPath)

    else:
        sessioner = sessionizer.HexSessionizer(dataPath)
        hexSessions = sessioner.read_pcap()
        hexSessions = removeBadSessionizer(hexSessions)

    numSessions = len(hexSessions)
    print str(numSessions) + ' sessions found'
    hexSessionsKeys = order_keys(hexSessions)
    hexDict = hexTokenizer()
    
    print 'creating dictionary of ip communications'
    comsDict, uniqIPs = srcIpDict(hexSessions)
    comsDict = dictUniquerizer(comsDict)
     
    print 'initializing network graph'
    ###ENCODER
    if rnnType == 'gru':
        rnn = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
        dimMultiplier = 2
    else:
        rnn = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
        dimMultiplier = 4

    fork = Fork(output_names=['linear', 'gates'],
                name='fork', input_dim=dimIn, output_dims=[dim, dim * dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    ###CONTEXT
    if rnnType == 'gru':
        rnnContext = GatedRecurrent(dim=dim, weights_init = rnnwt_init, 
                                    biases_init = rnnbias_init, name = 'gruContext')
    else:
        rnnContext = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, 
                          name = 'lstmContext')

    forkContext = Fork(output_names=['linearContext', 'gatesContext'],
                name='forkContext', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    forkDec = Fork(output_names=['linear', 'gates'],
                name='forkDec', input_dim=dim, output_dims=[dim, dim*dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    #CLASSIFIER
    bmlp = BatchNormalizedMLP( activations=[Tanh(),Tanh()], 
               dims=[dim, dim, numClasses],
               weights_init=classifierWts,
               biases_init=Constant(0.0001) )

    #initialize the weights in all the functions
    fork.initialize()
    rnn.initialize()
    forkContext.initialize()
    rnnContext.initialize()
    forkDec.initialize()
    bmlp.initialize()

    def onestepEnc(X):
        data1, data2 = fork.apply(X) 

        if rnnType == 'gru':
            hEnc = rnn.apply(data1, data2) 
        else:
            hEnc, _ = rnn.apply(data2)

        return hEnc

    hEnc, _ = theano.scan(onestepEnc, X) #(mini*numPackets, packetLen, 1, hexdictLen)

    if attentionEnc:
        attentionmlpEnc = MLP(activations=[Tanh()], dims = [dim, 1], weights_init=attnWts,
               biases_init=Constant(1.0))
        attentionmlpEnc.initialize()

        hEncAttn = T.reshape(hEnc, (-1, packetTimeSteps, dim))
        def onestepEncAttn(hEncAttn):

            preEncattn = attentionmlpEnc.apply(hEncAttn)
            attEncsoft = Softmax()
            attEncpyx = attEncsoft.apply(preEncattn.flatten())
            attEncpred = attEncpyx.flatten()
            attenc = T.mul(hEncAttn.dimshuffle(1,0), attEncpred).dimshuffle(1,0)

            return attenc

        attenc, _ = theano.scan(onestepEncAttn, hEncAttn)

        hEncReshape = T.reshape(T.sum(attenc, axis = 1), (-1, maxPackets, 1, dim))

    else:
        hEncReshape = T.reshape(hEnc[:,-1], (-1, maxPackets, 1, dim)) #[:,-1] takes the last rep for each packet
                                                                      #(mini, numPackets, 1, dimReduced)
    def onestepContext(hEncReshape):

        data3, data4 = forkContext.apply(hEncReshape)

        if rnnType == 'gru':
            hContext = rnnContext.apply(data3, data4)
        else:
            hContext, _ = rnnContext.apply(data4)

        return hContext

    hContext, _ = theano.scan(onestepContext, hEncReshape)
    
    if attentionContext:
        attentionmlpContext = MLP(activations=[Tanh()], dims = [dim, 1], weights_init=attnWts,
               biases_init=Constant(1.0))
        attentionmlpContext.initialize()

        hContextAttn = T.reshape(hContext, (-1,maxPackets,dim))
        def onestepContextAttn(hContextAttn):

            preContextatt = attentionmlpContext.apply(hContextAttn)
            attContextsoft = Softmax()
            attContextpyx = attContextsoft.apply(preContextatt.flatten())
            attContextpred = attContextpyx.flatten()
            attcontext = T.mul(hContextAttn.dimshuffle(1,0), attContextpred).dimshuffle(1,0)

            return attcontext

        attcontext, _ = theano.scan(onestepContextAttn, hContextAttn)
        hContextReshape = T.sum(attcontext, axis = 1)

    else:
        hContextReshape = T.reshape(hContext[:,-1], (-1,dim))

    data5, _ = forkDec.apply(hContextReshape)
    pyx = bmlp.apply(data5)
    softmax = Softmax()
    softoutClass = softmax.apply(pyx)
    costClass = T.mean(CategoricalCrossEntropy().apply(Y, softoutClass))

    #CREATE GRAPH
    cgClass = ComputationGraph([costClass])
    paramsClass = VariableFilter(roles = [PARAMETER])(cgClass.variables)
    learning = learningfunctions.Learning(costClass,paramsClass,learning_rateClass,l1=0.,l2=0.,maxnorm=0.,c=clippings)
    updatesClass = learning.Adam() 

    module_logger.info('starting graph compilation')
    classifierTrain = theano.function([X,Y], [costClass, hEnc, hContext, pyx, softoutClass], 
                                      updates=updatesClass, allow_input_downcast=True)
    classifierPredict = theano.function([X], softoutClass, allow_input_downcast=True)
    module_logger.info('graph compilation finished')
    print 'finished graph compilation'

    trainIndex = int(len(hexSessionsKeys)*trainPercent)

    epochCost = []
    gradNorms = []
    trainAcc = []
    testAcc = []

    costCollect = []
    trainCollect = []

    module_logger.info('beginning training')
    iteration = 0
    #epoch
    for epoch in xrange(epochs):

        #iteration/minibatch
        for start, end in zip(range(0, trainIndex,batch_size),
                              range(batch_size, trainIndex, batch_size)):

            trainingTargets = []
            trainingSessions = []

            #create one minibatch with 0.5 normal and 0.5 abby normal traffic
            for trainKey in range(start, end):
                sessionForEncoding = list(hexSessions[hexSessions.keys()[trainKey]][0])
    
                adfun = adversarialfunctions.Adversary(sessionForEncoding)
                adversaryList = [sessionForEncoding, 
                                 adfun.dstIpSwapOut(comsDict, uniqIPs),
                                 adfun.portDirSwitcher(),
                                 adfun.ipDirSwitcher()]
                abbyIndex = random.sample(range(len(adversaryList)), 1)[0]

                targetClasses = [0]*numClasses
                targetClasses[abbyIndex] = 1
                abbyTarget = np.array(targetClasses, dtype=theano.config.floatX)
                trainingSessions.append(abbyOneHotSes[0])  # abbyOneHotSes: one-hot session encoding (its construction is omitted from this snippet)
                trainingTargets.append(abbyTarget)

            sessionsMinibatch = np.asarray(trainingSessions).reshape((-1, packetTimeSteps, 1, dimIn))
            targetsMinibatch = np.asarray(trainingTargets)

            costfun = classifierTrain(sessionsMinibatch, targetsMinibatch)

            if iteration % (numSessions / (10 * batch_size)) == 0:
                costCollect.append(costfun[0])
                trainCollect.append(np.mean(np.argmax(costfun[-1],axis=1) == np.argmax(targetsMinibatch, axis=1)))
                module_logger.info('   Iteration: %s', iteration)
                module_logger.info('   Cost: %s', np.mean(costCollect))
                module_logger.info('   TRAIN accuracy: %s', np.mean(trainCollect))
                print '   Iteration: ', iteration
                print '   Cost: ', np.mean(costCollect)
                print '   TRAIN accuracy: ', np.mean(trainCollect)

            iteration+=1

            #testing accuracy
            if iteration % (numSessions / (2 * batch_size)) == 0:
                predtar, acttar, testCollect = predictClass(classifierPredict, hexSessions, comsDict, uniqIPs, hexDict,
                                                            hexSessionsKeys,
                                                            numClasses, trainPercent, dimIn, maxPackets, packetTimeSteps,
                                                            padOldTimeSteps)
                binaryPrecisionRecall(predtar, acttar, numClasses)
                module_logger.info(str(testCollect))

            #save the models
            if iteration % (numSessions / (5 * batch_size)) == 0:
                save_model(classifierPredict)

        epochCost.append(np.mean(costCollect))
        trainAcc.append(np.mean(trainCollect))
        
        module_logger.info('Epoch: %s', epoch)
        module_logger.info('Epoch cost average: %s', epochCost[-1])
        module_logger.info('Epoch TRAIN accuracy: %s', trainAcc[-1])
        print 'Epoch: ', epoch
        print 'Epoch cost average: ', epochCost[-1]
        print 'Epoch TRAIN accuracy: ', trainAcc[-1]

    return classifierTrain, classifierPredict
    #print("self-cosine similarity %f" % (np.dot(token_v,token_v)))
    
    all_similarities = np.dot(e, token_v)
    #print("overall similarity shape: ", all_similarities.shape)  # a 1-d array
    
    sorted_similarities = sorted( enumerate(all_similarities), key=lambda (i,v): -v)
    print("Top Similarities for %10s @ %4d:" % (token_target,token_i, ), 
      map(lambda (i,v): "%s %.1f%%" % (code2word[i],v*100.), 
        sorted_similarities[1:4] # Element [0] is token itself
      )  
    )
  
  #exit(0)

if not run_test:  # i.e. do training phase
  label_probs = p_labels.apply(labels_raw)               # This is a list of label probabilities
  print("label_probs shape", label_probs.shape.tag.test_value)          # array([ 464, 5]))
  # -- so :: this is an in-place rescaling

  y = tensor.matrix('labels', dtype="int32")   # This is a symbolic vector of ints (implies one-hot in categorical_crossentropy)
  y.tag.test_value = np.random.randint( labels_size, size=batch_of_sentences).astype(np.int32)

  print("y shape", y.shape.tag.test_value)                              # array([ 29, 16]))
  print("y.flatten() shape", y.flatten().shape.tag.test_value)          # array([464]))
  print("y.flatten() dtype", y.flatten().dtype)                         # int32

  examine_embedding(lookup.W.get_value())

  """
  class CategoricalCrossEntropy(Cost):
      @application(outputs=["cost"])
Beispiel #24
0
class CCHLSTM(BaseRecurrent, Initializable):
    def __init__(self, io_dim, hidden_dims, cond_cert, activation=None, **kwargs):
        super(CCHLSTM, self).__init__(**kwargs)

        self.cond_cert = cond_cert

        self.io_dim = io_dim
        self.hidden_dims = hidden_dims

        self.children = []
        self.layers = []

        self.softmax = Softmax()
        self.children.append(self.softmax)

        for i, d in enumerate(hidden_dims):
            i0 = LookupTable(length=io_dim,
                             dim=4*d,
                             name='i0-%d'%i)
            self.children.append(i0)

            if i > 0:
                i1 = Linear(input_dim=hidden_dims[i-1],
                            output_dim=4*d,
                            name='i1-%d'%i)
                self.children.append(i1)
            else:
                i1 = None

            lstm = LSTM(dim=d, activation=activation,
                        name='LSTM-%d'%i)
            self.children.append(lstm)

            o = Linear(input_dim=d,
                       output_dim=io_dim,
                       name='o-%d'%i)
            self.children.append(o)

            self.layers.append((i0, i1, lstm, o))
    @recurrent(contexts=[])
    def apply(self, inputs, **kwargs):

        l0i, _, l0l, l0o = self.layers[0]
        l0iv = l0i.apply(inputs)
        new_states0, new_cells0 = l0l.apply(states=kwargs['states0'],
                                            cells=kwargs['cells0'],
                                            inputs=l0iv,
                                            iterate=False)
        l0ov = l0o.apply(new_states0)

        pos = l0ov
        ps = new_states0

        passnext = tensor.ones((inputs.shape[0],))
        out_sc = [new_states0, new_cells0, passnext]

        for i, (cch, (i0, i1, l, o)) in enumerate(zip(self.cond_cert, self.layers[1:])):
            pop = self.softmax.apply(pos)
            best = pop.max(axis=1)
            passnext = passnext * tensor.le(best, cch) * kwargs['pass%d'%i]

            i0v = i0.apply(inputs)
            i1v = i1.apply(ps)

            prev_states = kwargs['states%d'%i]
            prev_cells = kwargs['cells%d'%i]
            new_states, new_cells = l.apply(inputs=i0v + i1v,
                                            states=prev_states,
                                            cells=prev_cells,
                                            iterate=False)
            new_states = tensor.switch(passnext[:, None], new_states, prev_states)
            new_cells = tensor.switch(passnext[:, None], new_cells, prev_cells)
            out_sc += [new_states, new_cells, passnext]

            ov = o.apply(new_states)
            pos = tensor.switch(passnext[:, None], pos + ov, pos)
            ps = new_states

        return [pos] + out_sc

    def get_dim(self, name):
        dims = {'pred': self.io_dim}
        for i, d in enumerate(self.hidden_dims):
            dims['states%d'%i] = dims['cells%d'%i] = d
        if name in dims:
            return dims[name]
        return super(CCHLSTM, self).get_dim(name)

    @apply.property('sequences')
    def apply_sequences(self):
        return ['inputs'] + ['pass%d'%i for i in range(len(self.hidden_dims)-1)]

    @apply.property('states')
    def apply_states(self):
        ret = []
        for i in range(len(self.hidden_dims)):
            ret += ['states%d'%i, 'cells%d'%i]
        return ret

    @apply.property('outputs')
    def apply_outputs(self):
        ret = ['pred']
        for i in range(len(self.hidden_dims)):
            ret += ['states%d'%i, 'cells%d'%i, 'active%d'%i]
        return ret
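The per-example gating in CCHLSTM.apply hinges on broadcasting a (batch,) flag over the feature axis inside tensor.switch. A NumPy equivalent with toy sizes:

import numpy as np

batch, dim = 4, 3
passnext = np.array([1., 0., 1., 0.])        # which examples keep computing
new_states = np.ones((batch, dim))
prev_states = np.zeros((batch, dim))

# tensor.switch(passnext[:, None], new, prev) is this elementwise selection:
states = np.where(passnext[:, None] > 0, new_states, prev_states)
print(states)                                 # only rows 0 and 2 are updated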
Beispiel #25
0
batch_size = 50

print 'Building model ...'
# T x B x F
x = tensor.tensor3('x', dtype=floatX)
y = tensor.tensor3('y', dtype='int32')

x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim)
pre_rnn = x_to_h1.apply(x)
rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn")
h1 = rnn.apply(pre_rnn)
h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim)
pre_softmax = h1_to_o.apply(h1)
softmax = Softmax()
shape = pre_softmax.shape
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()
rnn.weights_init = Identity()
Beispiel #26
0
class questionAttentionLayer:
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='iw_image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='iw_word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='iw_r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_m_to_s')
        self.attention_dist = Softmax(name='iw_attention')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='iw_r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='iw_r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()
        self.seq = LSTM(feature_dim,
                        name='rereader_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rereader_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()
    # video: batch_size x video_length x feature_dim
    # query: batch_size x q x feature_dim
    # mask: this mask is different from other masks
    # batch_size x q
    # eg.
    # -10000 == -np.Inf
    # 1:   0, 0, 0, 0, 0, -10000, -10000, -10000
    # 2:   0, 0, 0, 0, -10000, -10000, -10000
    # 3:   0, 0, 0, 0, 0, 0, 0, -10000
    def apply(self, video, query, mask, batch_size):
        # batch_size x q x hidden_dim
        att1 = self.word_embed.apply(query)

        # batch_size x q x feature_dim
        y_q = query
        att3 = self.image_embed.apply(video)
        att = att1 + att3.dimshuffle(0, 'x', 1)
        # batch_size x q x hidden_dim
        m = T.tanh(att)
        # batch_size x q
        s = self.m_to_s.apply(m)
        s = s.reshape((s.shape[0], s.shape[1]))
        # ignore the question padding 0s
        s = s + mask
        s = self.attention_dist.apply(s)
        y_q_s = y_q.swapaxes(1, 2)
        r = T.batched_dot(y_q_s, s) 

        # batch_size x feature_dim
        return r
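The additive mask convention documented above (0 for real tokens, -10000 for padding) makes the softmax weights of padded positions vanish. A standalone NumPy sketch:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

scores = np.random.randn(2, 5)
mask = np.array([[0., 0., 0., -10000., -10000.],   # example 1: 3 real tokens
                 [0., 0., 0., 0., -10000.]])       # example 2: 4 real tokens
weights = softmax(scores + mask)
print(weights.round(3))                            # padded columns are ~0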
Beispiel #27
0
class Model(Initializable):
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.config = config

        self.context_embedder = ContextEmbedder(config)

        self.prefix_encoder = MLP(
            activations=[Rectifier() for _ in config.prefix_encoder.dim_hidden] + [config.representation_activation()],
            dims=[config.prefix_encoder.dim_input] + config.prefix_encoder.dim_hidden + [config.representation_size],
            name="prefix_encoder",
        )
        self.candidate_encoder = MLP(
            activations=[Rectifier() for _ in config.candidate_encoder.dim_hidden]
            + [config.representation_activation()],
            dims=[config.candidate_encoder.dim_input]
            + config.candidate_encoder.dim_hidden
            + [config.representation_size],
            name="candidate_encoder",
        )
        self.softmax = Softmax()

        self.prefix_extremities = {
            "%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis for side in ["first", "last"] for axis in [0, 1]
        }
        self.candidate_extremities = {
            "candidate_%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis
            for side in ["first", "last"]
            for axis in [0, 1]
        }

        self.inputs = (
            self.context_embedder.inputs
            + ["candidate_%s" % k for k in self.context_embedder.inputs]
            + self.prefix_extremities.keys()
            + self.candidate_extremities.keys()
        )
        self.children = [self.context_embedder, self.prefix_encoder, self.candidate_encoder, self.softmax]

    def _push_initialization_config(self):
        for (mlp, config) in [
            [self.prefix_encoder, self.config.prefix_encoder],
            [self.candidate_encoder, self.config.candidate_encoder],
        ]:
            mlp.weights_init = config.weights_init
            mlp.biases_init = config.biases_init

    @application(outputs=["destination"])
    def predict(self, **kwargs):
        prefix_embeddings = tuple(self.context_embedder.apply(**{k: kwargs[k] for k in self.context_embedder.inputs}))
        prefix_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v] for k, v in self.prefix_extremities.items()
        )
        prefix_inputs = tensor.concatenate(prefix_extremities + prefix_embeddings, axis=1)
        prefix_representation = self.prefix_encoder.apply(prefix_inputs)
        if self.config.normalize_representation:
            prefix_representation = prefix_representation / tensor.sqrt(
                (prefix_representation ** 2).sum(axis=1, keepdims=True)
            )

        candidate_embeddings = tuple(
            self.context_embedder.apply(**{k: kwargs["candidate_%s" % k] for k in self.context_embedder.inputs})
        )
        candidate_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v] for k, v in self.candidate_extremities.items()
        )
        candidate_inputs = tensor.concatenate(candidate_extremities + candidate_embeddings, axis=1)
        candidate_representation = self.candidate_encoder.apply(candidate_inputs)
        if self.config.normalize_representation:
            candidate_representation = candidate_representation / tensor.sqrt(
                (candidate_representation ** 2).sum(axis=1, keepdims=True)
            )

        similarity_score = tensor.dot(prefix_representation, candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        candidate_destination = tensor.concatenate(
            (
                tensor.shape_padright(kwargs["candidate_last_k_latitude"][:, -1]),
                tensor.shape_padright(kwargs["candidate_last_k_longitude"][:, -1]),
            ),
            axis=1,
        )

        return tensor.dot(similarity, candidate_destination)

    @predict.property("inputs")
    def predict_inputs(self):
        return self.inputs

    @application(outputs=["cost"])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate(
            (kwargs["destination_latitude"][:, None], kwargs["destination_longitude"][:, None]), axis=1
        )

        return error.erdist(y_hat, y).mean()

    @cost.property("inputs")
    def cost_inputs(self):
        return self.inputs + ["destination_latitude", "destination_longitude"]
Beispiel #28
0
class Model(Initializable):
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.config = config

        self.context_embedder = ContextEmbedder(config)

        self.prefix_encoder = MLP(activations=[
            Rectifier() for _ in config.prefix_encoder.dim_hidden
        ] + [config.representation_activation()],
                                  dims=[config.prefix_encoder.dim_input] +
                                  config.prefix_encoder.dim_hidden +
                                  [config.representation_size],
                                  name='prefix_encoder')
        self.candidate_encoder = MLP(
            activations=[
                Rectifier() for _ in config.candidate_encoder.dim_hidden
            ] + [config.representation_activation()],
            dims=[config.candidate_encoder.dim_input] +
            config.candidate_encoder.dim_hidden + [config.representation_size],
            name='candidate_encoder')
        self.softmax = Softmax()

        self.prefix_extremities = {
            '%s_k_%s' % (side, ['latitude', 'longitude'][axis]): axis
            for side in ['first', 'last'] for axis in [0, 1]
        }
        self.candidate_extremities = {
            'candidate_%s_k_%s' % (side, ['latitude', 'longitude'][axis]): axis
            for side in ['first', 'last'] for axis in [0, 1]
        }

        self.inputs = self.context_embedder.inputs + [
            'candidate_%s' % k for k in self.context_embedder.inputs
        ] + self.prefix_extremities.keys() + self.candidate_extremities.keys()
        self.children = [
            self.context_embedder, self.prefix_encoder, self.candidate_encoder,
            self.softmax
        ]

    def _push_initialization_config(self):
        for (mlp, config) in [[
                self.prefix_encoder, self.config.prefix_encoder
        ], [self.candidate_encoder, self.config.candidate_encoder]]:
            mlp.weights_init = config.weights_init
            mlp.biases_init = config.biases_init

    @application(outputs=['destination'])
    def predict(self, **kwargs):
        prefix_embeddings = tuple(
            self.context_embedder.apply(
                **{k: kwargs[k]
                   for k in self.context_embedder.inputs}))
        prefix_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v]
            for k, v in self.prefix_extremities.items())
        prefix_inputs = tensor.concatenate(prefix_extremities +
                                           prefix_embeddings,
                                           axis=1)
        prefix_representation = self.prefix_encoder.apply(prefix_inputs)
        if self.config.normalize_representation:
            prefix_representation = prefix_representation / tensor.sqrt(
                (prefix_representation**2).sum(axis=1, keepdims=True))

        candidate_embeddings = tuple(
            self.context_embedder.apply(
                **{
                    k: kwargs['candidate_%s' % k]
                    for k in self.context_embedder.inputs
                }))
        candidate_extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v]
            for k, v in self.candidate_extremities.items())
        candidate_inputs = tensor.concatenate(candidate_extremities +
                                              candidate_embeddings,
                                              axis=1)
        candidate_representation = self.candidate_encoder.apply(
            candidate_inputs)
        if self.config.normalize_representation:
            candidate_representation = candidate_representation / tensor.sqrt(
                (candidate_representation**2).sum(axis=1, keepdims=True))

        similarity_score = tensor.dot(prefix_representation,
                                      candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        candidate_destination = tensor.concatenate(
            (tensor.shape_padright(kwargs['candidate_last_k_latitude'][:, -1]),
             tensor.shape_padright(kwargs['candidate_last_k_longitude'][:,
                                                                        -1])),
            axis=1)

        return tensor.dot(similarity, candidate_destination)

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @application(outputs=['cost'])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate((kwargs['destination_latitude'][:, None],
                                kwargs['destination_longitude'][:, None]),
                               axis=1)

        return error.erdist(y_hat, y).mean()

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
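The prediction rule shared by the two models above boils down to a softmax-weighted average of candidate destinations. A NumPy sketch with made-up sizes and random data:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

prefix_repr = np.random.randn(2, 8)           # (batch, representation_size)
candidate_repr = np.random.randn(10, 8)       # (candidates, representation_size)
candidate_dest = np.random.randn(10, 2)       # (candidates, [lat, lon])

similarity = softmax(prefix_repr.dot(candidate_repr.T))   # (batch, candidates)
destination = similarity.dot(candidate_dest)              # (batch, 2)
print(destination)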