            # connect layer to previous layer
            layer.connect(prev_layer)
            prev_layer = layer
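            # toggle the shared delta buffer selector (likely a ping-pong scheme
            # so a layer's bprop output doesn't overwrite the deltas it reads)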
            delta = not delta

        # delta buffers are only needed above the data and first hidden layers;
        # they are shared across layers to save device memory
        for i, layer in enumerate(layers):
            if i > 1:
                layer.init_deltas(shared=shared_deltas)

        remain, total = drv.mem_get_info()
        print("%.3fGB of %.3fGB Allocated (%.3fGB Remaining)" %
              ((total-remain)/1024.**3, total/1024.**3, remain/1024.**3))

        if zeros:
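            # zeros mode: leave the input data zeroed and skip the
            # weight-scaling pass below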
            layers[0].init_data()
        else:
            # give the first layer some data
            layers[0].init_data(np.random.uniform(0.0, 1.0, layers[0].dimO))

            # Scale the initial weights so activations stay bounded around 1.0.
            # We do this by running the data through the forward pass and
            # collecting mean stats; each layer's weights are rescaled by
            # scale/mean (the scale here being .5).
            ng.bench = False
            propagation = None
            for layer in layers:
                propagation = layer.fprop(propagation, scale_weights=.5)
            ng.bench = layer_bench

        # CUDA events record timestamps directly on the GPU stream, so the
        # measured kernel times are independent of host-side overhead
        start = drv.Event()
        end   = drv.Event()

        # running totals across layers and loops: pass times and flop counts
        fprop_time  = 0
        bprop_time  = 0
        fprop_flops = 0
        bprop_flops = 0

        # We throw away the first two runs as they include pycuda kernel loading
        # times and clock warmup, so add 2 to our loop count.