Example 1
def test_batch_normalized_mlp_learn_scale_propagated_at_alloc():
    """Test that setting learn_scale on a BatchNormalizedMLP works."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], learn_scale=False)
    assert not mlp.learn_scale
    assert all(act.children[0].learn_scale for act in mlp.activations)
    mlp.allocate()
    assert not any(act.children[0].learn_scale for act in mlp.activations)
Example 2
def test_batch_normalized_mlp_transformed():
    """Smoke test that a graph involving a BatchNormalizedMLP transforms."""
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    with batch_normalization(mlp):
        y = mlp.apply(x)
    assert len(get_batch_normalization_updates(ComputationGraph([y]))) == 4
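The pairs returned by get_batch_normalization_updates are (population statistic, batch statistic). As a rough illustration only (not part of the test), assuming a blocks.algorithms.GradientDescent instance named algorithm and a decay constant alpha in the surrounding training script, the collected updates are typically folded into training as exponential moving averages:

# Sketch: turn each (population, batch) pair into a moving-average update
# and attach it to the training algorithm; `algorithm` and `alpha` are
# assumed to exist in the surrounding training script.
pop_updates = get_batch_normalization_updates(ComputationGraph([y]))
extra_updates = [(pop, (1 - alpha) * pop + alpha * batch)
                 for pop, batch in pop_updates]
algorithm.add_updates(extra_updates)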
Example 3
def test_batch_normalized_mlp_allocation():
    """Test that BatchNormalizedMLP performs allocation correctly."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    mlp.allocate()
    assert mlp.activations[0].children[0].input_dim == 7
    assert mlp.activations[1].children[0].input_dim == 9
    assert not any(l.use_bias for l in mlp.linear_transformations)
Example 4
def test_batch_normalized_mlp_mean_only_propagated_at_alloc():
    """Test that setting mean_only on a BatchNormalizedMLP works."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], mean_only=True)
    assert mlp.mean_only
    assert not any(act.children[0].mean_only for act in mlp.activations)
    mlp.allocate()
    assert all(act.children[0].mean_only for act in mlp.activations)
Example 5
    def test_get_batch_normalization_updates_mean_only(self):
        """Test get_batch_normalization_updates with mean_only bricks."""
        mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], mean_only=True)
        with batch_normalization(mlp):
            y_bn = mlp.apply(self.x)
        graph = ComputationGraph([y_bn])
        updates = get_batch_normalization_updates(graph)
        self.simple_assertions(updates, num_updates=2, mean_only=True)
Example 6
def test_batch_normalized_mlp_conserve_memory_propagated():
    """Test that setting conserve_memory on a BatchNormalizedMLP works."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9],
                             conserve_memory=False)
    assert not any(act.children[0].conserve_memory for act in mlp.activations)
    mlp.conserve_memory = True
    assert mlp.conserve_memory
    assert all(act.children[0].conserve_memory for act in mlp.activations)
Example 7
class TestSimpleGetBatchNormalizationUpdates(object):
    def setUp(self):
        self.mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
        self.x = tensor.matrix()

    def simple_assertions(self, updates, num_bricks=2, num_updates=4):
        """Shared assertions for simple tests."""
        assert len(updates) == num_updates
        assert all(is_shared_variable(u[0]) for u in updates)
        # This order is somewhat arbitrary and implementation-dependent
        means = set(u[0] for u in updates
                    if has_roles(u[0], [BATCH_NORM_POPULATION_MEAN]))
        stdevs = set(u[0] for u in updates
                     if has_roles(u[0], [BATCH_NORM_POPULATION_STDEV]))
        assert means.isdisjoint(stdevs)
        assert len(set(get_brick(v) for v in means)) == num_bricks
        assert len(set(get_brick(v) for v in stdevs)) == num_bricks

    def test_get_batch_normalization_updates(self):
        """Test that get_batch_normalization_updates works as expected."""
        with batch_normalization(self.mlp):
            y_bn = self.mlp.apply(self.x)
        graph = ComputationGraph([y_bn])
        updates = get_batch_normalization_updates(graph)
        self.simple_assertions(updates)

    def test_get_batch_normalization_updates_non_training_applications(self):
        """Test updates extracton in graph with non-training apply."""
        y = self.mlp.apply(self.x)
        with batch_normalization(self.mlp):
            y_bn = self.mlp.apply(self.x)
        graph = ComputationGraph([y_bn, y])
        updates = get_batch_normalization_updates(graph)
        self.simple_assertions(updates)

    def test_get_batch_normalization_updates_no_training(self):
        """Test for exception if there are no training-mode nodes."""
        y = self.mlp.apply(self.x)
        graph = ComputationGraph([y])
        numpy.testing.assert_raises(ValueError,
                                    get_batch_normalization_updates, graph)

    def test_get_batch_normalization_updates_duplicates_error(self):
        """Test that we get an error by default on multiple apply."""
        with batch_normalization(self.mlp):
            y = self.mlp.apply(self.x)
            y2 = self.mlp.apply(self.x)
        graph = ComputationGraph([y, y2])
        numpy.testing.assert_raises(ValueError,
                                    get_batch_normalization_updates, graph)

    def test_get_batch_normalization_updates_allow_duplicates(self):
        """Test get_batch_normalization_updates(allow_duplicates=True)."""
        with batch_normalization(self.mlp):
            y = self.mlp.apply(self.x)
            y2 = self.mlp.apply(self.x)
        graph = ComputationGraph([y, y2])
        updates = get_batch_normalization_updates(graph, allow_duplicates=True)
        self.simple_assertions(updates, num_bricks=2, num_updates=8)
Example 8
def test_batch_normalized_mlp_construction():
    """Test that BatchNormalizedMLP performs construction correctly."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    assert all(isinstance(a, Sequence) for a in mlp.activations)
    assert all(
        isinstance(a.children[0], BatchNormalization) for a in mlp.activations)
    assert all(isinstance(a.children[1], Tanh) for a in mlp.activations)
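For reference, a minimal end-to-end sketch (assuming the usual Blocks and Theano imports used by these tests; the initializer choices are illustrative, not prescribed) that constructs, initializes, and applies a BatchNormalizedMLP outside of the test suite:

# Minimal sketch: build, initialize and compile a BatchNormalizedMLP.
from theano import tensor
import theano
from blocks.bricks import BatchNormalizedMLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9],
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0))
mlp.initialize()
y = mlp.apply(x)          # inference mode: uses population statistics
f = theano.function([x], y)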
Example 9
def build_model(images, labels):
    
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3,3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()
    
    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    #top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10], weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP(
        [Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')],
        [conv_out_dim, 1024, 10],
        weights_init=IsotropicGaussian(), biases_init=Constant(0))
    
    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply, flattener.apply, top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()
    
    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]

    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()


    return cost
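As a loose illustration only (the names stream, num_epochs, and the step rule below are assumptions, not taken from the original), the cost returned by build_model might be wired into a Blocks training loop along these lines:

# Sketch: train on the cost returned above; `stream` is assumed to be a Fuel
# data stream whose sources match the 'images' and 'labels' variables.
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Scale
from blocks.main_loop import MainLoop
from blocks.extensions import FinishAfter, Printing

cost = build_model(images, labels)
cg = ComputationGraph([cost])
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Scale(0.01))
main_loop = MainLoop(algorithm=algorithm, data_stream=stream,
                     extensions=[FinishAfter(after_n_epochs=num_epochs),
                                 Printing()])
main_loop.run()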
Example 10
    def __init__(self, image_dimension, **kwargs):

        layers = []
        
        #############################################
        # a first block with 2 convolutions of 32 (3, 3) filters
        layers.append(Convolutional((3, 3), 32, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 32, border_mode='half'))
        layers.append(Rectifier())

        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        #############################################
        # a 2nd block with 3 convolutions of 64 (3, 3) filters
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 64, border_mode='half'))
        layers.append(Rectifier())
        
        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        #############################################
        # a 3rd block with 4 convolutions of 128 (3, 3) filters
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        layers.append(Convolutional((3, 3), 128, border_mode='half'))
        layers.append(Rectifier())
        
        # maxpool with size=(2, 2)
        layers.append(MaxPooling((2, 2)))

        self.conv_sequence = ConvolutionalSequence(layers, 3, image_size=image_dimension)

        flattener = Flattener()

        self.top_mlp = BatchNormalizedMLP(activations=[Rectifier(), Logistic()], dims=[500, 1])

        application_methods = [self.conv_sequence.apply, flattener.apply, self.top_mlp.apply]

        super(VGGNet, self).__init__(application_methods, biases_init=Constant(0), weights_init=Uniform(width=.1), **kwargs)
Example 11
    def setUp(self):
        self.mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
        self.x = tensor.matrix()
Example 12
def build_model(images, labels):

    vgg = VGG(layer='conv3_4')
    vgg.push_initialization_config()
    vgg.initialize()

    sb = SubstractBatch()

    # Construct a bottom convolutional sequence
    layers = [
        Convolutional(filter_size=(3, 3),
                      num_filters=100,
                      use_bias=True,
                      tied_biases=True,
                      name='final_conv0'),
        BatchNormalization(name='batchnorm_1'),
        Rectifier(name='final_conv0_act'),
        Convolutional(filter_size=(3, 3),
                      num_filters=100,
                      use_bias=True,
                      tied_biases=True,
                      name='final_conv1'),
        BatchNormalization(name='batchnorm_2'),
        Rectifier(name='final_conv1_act'),
        MaxPooling(pooling_size=(2, 2), name='maxpool_final')
    ]
    bottom_conv_sequence = ConvolutionalSequence(
        layers,
        num_channels=256,
        image_size=(40, 40),
        biases_init=Constant(0.),
        weights_init=IsotropicGaussian(0.01))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    print 'dim output conv:', bottom_conv_sequence.get_dim('output')
    # conv_out_dim = 20 * 40 * 40
    top_mlp = BatchNormalizedMLP(
        [Rectifier(name='non_linear_9'),
         Softmax(name='non_linear_11')], [conv_out_dim, 1024, 10],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([
        vgg.apply, bottom_conv_sequence.apply, flattener.apply, top_mlp.apply
    ])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]
    cost = cost_noreg + .0001 * (W0**2).sum() + .001 * (W1**2).sum()

    # define learned parameters
    selector = Selector([ss_seq])
    Ws = selector.get_parameters('W')
    bs = selector.get_parameters('b')
    BNSCs = selector.get_parameters('batch_norm_scale')
    BNSHs = selector.get_parameters('batch_norm_shift')

    parameters_top = []
    parameters_top += [v for k, v in Ws.items()]
    parameters_top += [v for k, v in bs.items()]
    parameters_top += [v for k, v in BNSCs.items()]
    parameters_top += [v for k, v in BNSHs.items()]

    selector = Selector([vgg])
    convs = selector.get_parameters()

    parameters_all = []
    parameters_all += parameters_top
    parameters_all += [v for k, v in convs.items()]

    return cost, [parameters_top, parameters_all]
Example 13
    def __init__(
            self,
            dim,
            emb_dim,
            vocab,
            def_emb_translate_dim=-1,
            def_dim=-1,
            encoder='bilstm',
            bn=True,
            def_reader=None,
            def_combiner=None,
            dropout=0.5,
            num_input_words=-1,
            # Others
            **kwargs):

        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(input_dim=emb_dim,
                                             output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words,
                                   emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim,
            4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])

        ## BiLSTM no. 2 (encoded attentioned embeddings)
        self._hyp_bidir_fork2 = Linear(8 * dim,
                                       4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim,
                                        4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])

        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False,
                                           name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")

        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)
Example 14
def training(runname, rnnType, maxPackets, packetTimeSteps, packetReverse, padOldTimeSteps, wtstd, 
             lr, decay, clippings, dimIn, dim, attentionEnc, attentionContext, numClasses, batch_size, epochs, 
             trainPercent, dataPath, loadPrepedData, channel):  # pragma: no cover
    print locals()
    print
    
    X = T.tensor4('inputs')
    Y = T.matrix('targets')
    linewt_init = IsotropicGaussian(wtstd)
    line_bias = Constant(1.0)
    rnnwt_init = IsotropicGaussian(wtstd)
    rnnbias_init = Constant(0.0)
    classifierWts = IsotropicGaussian(wtstd)
    attnWts = IsotropicGaussian(wtstd)  # weight init for the attention MLPs below

    learning_rateClass = theano.shared(np.array(lr, dtype=theano.config.floatX))
    learning_decay = np.array(decay, dtype=theano.config.floatX)
    
    ###DATA PREP
    print 'loading data'
    if loadPrepedData:
        hexSessions = loadFile(dataPath)

    else:
        sessioner = sessionizer.HexSessionizer(dataPath)
        hexSessions = sessioner.read_pcap()
        hexSessions = removeBadSessionizer(hexSessions)

    numSessions = len(hexSessions)
    print str(numSessions) + ' sessions found'
    hexSessionsKeys = order_keys(hexSessions)
    hexDict = hexTokenizer()
    
    print 'creating dictionary of ip communications'
    comsDict, uniqIPs = srcIpDict(hexSessions)
    comsDict = dictUniquerizer(comsDict)
     
    print 'initializing network graph'
    ###ENCODER
    if rnnType == 'gru':
        rnn = GatedRecurrent(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'gru')
        dimMultiplier = 2
    else:
        rnn = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, name = 'lstm')
        dimMultiplier = 4

    fork = Fork(output_names=['linear', 'gates'],
                name='fork', input_dim=dimIn, output_dims=[dim, dim * dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    ###CONTEXT
    if rnnType == 'gru':
        rnnContext = GatedRecurrent(dim=dim, weights_init = rnnwt_init, 
                                    biases_init = rnnbias_init, name = 'gruContext')
    else:
        rnnContext = LSTM(dim=dim, weights_init = rnnwt_init, biases_init = rnnbias_init, 
                          name = 'lstmContext')

    forkContext = Fork(output_names=['linearContext', 'gatesContext'],
                name='forkContext', input_dim=dim, output_dims=[dim, dim * dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    forkDec = Fork(output_names=['linear', 'gates'],
                name='forkDec', input_dim=dim, output_dims=[dim, dim*dimMultiplier], 
                weights_init = linewt_init, biases_init = line_bias)

    #CLASSIFIER
    bmlp = BatchNormalizedMLP(activations=[Tanh(), Tanh()],
                              dims=[dim, dim, numClasses],
                              weights_init=classifierWts,
                              biases_init=Constant(0.0001))

    #initialize the weights in all the functions
    fork.initialize()
    rnn.initialize()
    forkContext.initialize()
    rnnContext.initialize()
    forkDec.initialize()
    bmlp.initialize()

    def onestepEnc(X):
        data1, data2 = fork.apply(X) 

        if rnnType == 'gru':
            hEnc = rnn.apply(data1, data2) 
        else:
            hEnc, _ = rnn.apply(data2)

        return hEnc

    hEnc, _ = theano.scan(onestepEnc, X) #(mini*numPackets, packetLen, 1, hexdictLen)
    if attentionEnc:

        attentionmlpEnc = MLP(activations=[Tanh()], dims = [dim, 1], weights_init=attnWts,
               biases_init=Constant(1.0))
        attentionmlpEnc.initialize()

        hEncAttn = T.reshape(hEnc, (-1, packetTimeSteps, dim))
        def onestepEncAttn(hEncAttn):

            preEncattn = attentionmlpEnc.apply(hEncAttn)
            attEncsoft = Softmax()
            attEncpyx = attEncsoft.apply(preEncattn.flatten())
            attEncpred = attEncpyx.flatten()
            attenc = T.mul(hEncAttn.dimshuffle(1,0), attEncpred).dimshuffle(1,0)

            return attenc

        attenc, _ = theano.scan(onestepEncAttn, hEncAttn)

        hEncReshape = T.reshape(T.sum(attenc, axis = 1), (-1, maxPackets, 1, dim))

    else:
        # [:,-1] takes the last representation of each packet -> (mini, numPackets, 1, dimReduced)
        hEncReshape = T.reshape(hEnc[:,-1], (-1, maxPackets, 1, dim))
    def onestepContext(hEncReshape):

        data3, data4 = forkContext.apply(hEncReshape)

        if rnnType == 'gru':
            hContext = rnnContext.apply(data3, data4)
        else:
            hContext, _ = rnnContext.apply(data4)

        return hContext

    hContext, _ = theano.scan(onestepContext, hEncReshape)
    
    if attentionContext:
        attentionmlpContext = MLP(activations=[Tanh()], dims = [dim, 1], weights_init=attnWts,
               biases_init=Constant(1.0))
        attentionmlpContext.initialize()

        hContextAttn = T.reshape(hContext, (-1,maxPackets,dim))
        def onestepContextAttn(hContextAttn):

            preContextatt = attentionmlpContext.apply(hContextAttn)
            attContextsoft = Softmax()
            attContextpyx = attContextsoft.apply(preContextatt.flatten())
            attContextpred = attContextpyx.flatten()
            attcontext = T.mul(hContextAttn.dimshuffle(1,0), attContextpred).dimshuffle(1,0)

            return attcontext

        attcontext, _ = theano.scan(onestepContextAttn, hContextAttn)
        hContextReshape = T.sum(attcontext, axis = 1)

    else:
        hContextReshape = T.reshape(hContext[:,-1], (-1,dim))

    data5, _ = forkDec.apply(hContextReshape)
    pyx = bmlp.apply(data5)
    softmax = Softmax()
    softoutClass = softmax.apply(pyx)
    costClass = T.mean(CategoricalCrossEntropy().apply(Y, softoutClass))

    #CREATE GRAPH
    cgClass = ComputationGraph([costClass])
    paramsClass = VariableFilter(roles = [PARAMETER])(cgClass.variables)
    learning = learningfunctions.Learning(costClass, paramsClass, learning_rateClass,
                                          l1=0., l2=0., maxnorm=0., c=clippings)
    updatesClass = learning.Adam() 

    module_logger.info('starting graph compilation')
    classifierTrain = theano.function([X,Y], [costClass, hEnc, hContext, pyx, softoutClass], 
                                      updates=updatesClass, allow_input_downcast=True)
    classifierPredict = theano.function([X], softoutClass, allow_input_downcast=True)
    module_logger.info('graph compilation finished')
    print 'finished graph compilation'

    trainIndex = int(len(hexSessionsKeys)*trainPercent)

    epochCost = []
    gradNorms = []
    trainAcc = []
    testAcc = []

    costCollect = []
    trainCollect = []

    module_logger.info('beginning training')
    iteration = 0
    #epoch
    for epoch in xrange(epochs):

        #iteration/minibatch
        for start, end in zip(range(0, trainIndex,batch_size),
                              range(batch_size, trainIndex, batch_size)):

            trainingTargets = []
            trainingSessions = []

            #create one minibatch with 0.5 normal and 0.5 abby normal traffic
            for trainKey in range(start, end):
                sessionForEncoding = list(hexSessions[hexSessions.keys()[trainKey]][0])
    
                adfun = adversarialfunctions.Adversary(sessionForEncoding)
                adversaryList = [sessionForEncoding, 
                                 adfun.dstIpSwapOut(comsDict, uniqIPs),
                                 adfun.portDirSwitcher(),
                                 adfun.ipDirSwitcher()]
                abbyIndex = random.sample(range(len(adversaryList)), 1)[0]

                targetClasses = [0]*numClasses
                targetClasses[abbyIndex] = 1
                abbyTarget = np.array(targetClasses, dtype=theano.config.floatX)
                trainingSessions.append(abbyOneHotSes[0])
                trainingTargets.append(abbyTarget)

            sessionsMinibatch = np.asarray(trainingSessions).reshape((-1, packetTimeSteps, 1, dimIn))
            targetsMinibatch = np.asarray(trainingTargets)

            costfun = classifierTrain(sessionsMinibatch, targetsMinibatch)

            if iteration % (numSessions / (10 * batch_size)) == 0:
                costCollect.append(costfun[0])
                trainCollect.append(np.mean(np.argmax(costfun[-1],axis=1) == np.argmax(targetsMinibatch, axis=1)))
                module_logger.info('   Iteration: %s', iteration)
                module_logger.info('   Cost: %s', np.mean(costCollect))
                module_logger.info('   TRAIN accuracy: %s', np.mean(trainCollect))
                print '   Iteration: ', iteration
                print '   Cost: ', np.mean(costCollect)
                print '   TRAIN accuracy: ', np.mean(trainCollect)

            iteration+=1

            #testing accuracy
            if iteration % (numSessions / (2 * batch_size)) == 0:
                predtar, acttar, testCollect = predictClass(classifierPredict, hexSessions, comsDict, uniqIPs, hexDict,
                                                            hexSessionsKeys,
                                                            numClasses, trainPercent, dimIn, maxPackets, packetTimeSteps,
                                                            padOldTimeSteps)
                binaryPrecisionRecall(predtar, acttar, numClasses)
                module_logger.info(str(testCollect))

            #save the models
            if iteration % (numSessions / (5 * batch_size)) == 0:
                save_model(classifierPredict)

        epochCost.append(np.mean(costCollect))
        trainAcc.append(np.mean(trainCollect))
        
        module_logger.info('Epoch: %s', epoch)
        module_logger.info('Epoch cost average: %s', epochCost[-1])
        module_logger.info('Epoch TRAIN accuracy: %s', trainAcc[-1])
        print 'Epoch: ', epoch
        print 'Epoch cost average: ', epochCost[-1]
        print 'Epoch TRAIN accuracy: ', trainAcc[-1]

    return classifierTrain, classifierPredict