    # connect layer to previous layer
    layer.connect(prev_layer)
    prev_layer = layer
    delta = not delta

remain, total = drv.mem_get_info()
print "%.3fGB of %.3fGB Allocated (%.3fGB Remaining)" % (
    (total - remain) / 1024.**3, total / 1024.**3, remain / 1024.**3)

# give the first layer some data
layers[0].init_data(np.random.uniform(0.0, 1.0, layers[0].dimO2))

# Scale the initial weights so activations are bound around 1.0
# We do this by running it through the forward pass and collecting mean stats
ng.bench = False
prev_layer = None
for layer in layers:
    layer.fprop()
    if layer.weights is not None:
        mean = layer.get_activation_mean()
        scale = .5  # if prev_layer is None else prev_layer.reduction_factor()
        print "Scale weights: %.3f (%.3f) %s" % (scale / mean, scale, layer)
        layer.weights *= scale / mean
        layer.fprop()
    prev_layer = layer
ng.bench = layer_bench
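# --------------------------------------------------------------------------
# Illustrative sketch (not from the original script): the loop above rescales
# each layer's weights so the mean activation after fprop lands near a fixed
# target. A minimal standalone numpy version of that idea, with a hypothetical
# helper name and layer shape:
import numpy as np

def _scale_weights_to_target(x, w, target=0.5):
    a = np.maximum(x.dot(w), 0.0)        # forward pass (ReLU activations)
    mean = a.mean()                      # collect the mean statistic
    if mean > 0.0:
        w *= target / mean               # rescale so the mean activation ~= target
    return w

# usage (hypothetical shapes):
#   w = _scale_weights_to_target(np.random.uniform(0, 1, (128, 256)),
#                                np.random.normal(0, 1, (256, 512)))
# --------------------------------------------------------------------------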
    if i > 1:
        layer.init_deltas(shared=shared_deltas)

remain, total = drv.mem_get_info()
print("%.3fGB of %.3fGB Allocated (%.3fGB Remaining)" % (
    (total - remain) / 1024.**3, total / 1024.**3, remain / 1024.**3))

if zeros:
    layers[0].init_data()
else:
    # give the first layer some data
    layers[0].init_data(np.random.uniform(0.0, 1.0, layers[0].dimO))

# Scale the initial weights so activations are bound around 1.0
# We do this by running it through the forward pass and collecting mean stats
ng.bench = False
propagation = None
for layer in layers:
    propagation = layer.fprop(propagation, scale_weights=.5)
ng.bench = layer_bench

start = drv.Event()
end = drv.Event()

fprop_time = 0
bprop_time = 0
fprop_flops = 0
bprop_flops = 0

# We throw away the first two runs as they include pycuda kernel loading times and clock warmup.
# So add 1 to our loop count.
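# --------------------------------------------------------------------------
# Illustrative sketch (not from the original script): the Events above are
# used to time GPU work, and the first iterations are discarded because they
# also pay for kernel loading and clock warmup. A minimal standalone version
# of that timing pattern, assuming `drv` is pycuda.driver with an initialized
# CUDA context and `run_once` is any callable that launches GPU work:
def _time_gpu_ms(run_once, loops=10, warmup=2):
    s, e = drv.Event(), drv.Event()
    total = 0.0
    for i in range(loops + warmup):
        s.record()
        run_once()
        e.record()
        e.synchronize()
        if i >= warmup:                  # throw away the warmup iterations
            total += e.time_since(s)     # milliseconds between the two events
    return total / loops                 # average ms per timed run
# --------------------------------------------------------------------------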