                          rng=rng, W_scale=net.W_scale, b_scale=net.b_scale, activation=activation))
out = T.concatenate([layers[-1].output, layers[-2].output], axis=1)
# out = normalize(out)
if use.drop: out = DropoutLayer(out, rng=rng, p=drop.p_hidden).output

# softmax layer
layers.append(LogRegr(out, rng=rng, activation=activation, n_in=net.hidden,
                      W_scale=net.W_scale, b_scale=net.b_scale, n_out=20))

"""
layers[-1] : softmax layer
layers[-2] : hidden layer (video if late fusion)
layers[-3] : hidden layer (trajectory, only if late fusion)
"""

# cost function
cost = layers[-1].negative_log_likelihood(y)

if reg.L1_vid > 0 or reg.L2_vid > 0:
    # L1 and L2 regularization
    L1 = T.abs_(layers[-2].W).sum() + T.abs_(layers[-1].W).sum()
                          rng=tr.rng, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1],
                          activation=net.activation))
out = layers[-1].output
if tr.inspect: insp.append(T.mean(out))
if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
insp = T.stack(insp)

# softmax layer
layers.append(LogRegr(out, rng=tr.rng, n_in=net.hidden,
                      W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class))

# number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size
print 'MLP:', video_cnn.n_in_MLP, "->", net.hidden_penultimate, "+", net.hidden_traj, '->', \
    net.hidden, '->', net.hidden, '->', net.n_class, ""

# cost function
cost = layers[-1].negative_log_likelihood(y)

# function computing the number of errors
errors = layers[-1].errors(y)

# gradient descent
# parameter list
for layer in video_cnn.layers:
                          rng=rng, activation=activation))
n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
layers.append(HiddenLayer(vid_, n_in=n_in_MLP, n_out=net.hidden_vid, rng=rng,
                          activation=activation))
out = T.concatenate([layers[-1].output, layers[-2].output], axis=1)
if use.drop: out = DropoutLayer(out, rng=rng, p=drop.p_hidden).output

# softmax layer
layers.append(LogRegr(out, rng=rng, activation=activation, n_in=net.hidden, n_out=20))

"""
layers[-1] : softmax layer
layers[-2] : hidden layer (video if late fusion)
layers[-3] : hidden layer (trajectory, only if late fusion)
"""

# cost function
cost = layers[-1].negative_log_likelihood(y)

if reg.L1_vid > 0 or reg.L2_vid > 0:
    # L1 and L2 regularization
    L1 = T.abs_(layers[-2].W).sum() + T.abs_(layers[-1].W).sum()
    L2 = (layers[-2].W ** 2).sum() + (layers[-1].W ** 2).sum()
    cost += reg.L1_vid * L1 + reg.L2_vid * L2
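# Illustrative note (restating the objective built above, not additional code):
# with hidden-layer weights W_h = layers[-2].W and softmax weights W_s = layers[-1].W,
# the regularized cost is
#   cost = NLL(y) + L1_vid * (sum|W_h| + sum|W_s|) + L2_vid * (sum W_h**2 + sum W_s**2)
# Both coefficients default to 0.0 in the `reg` class, so by default only the
# plain negative log-likelihood is minimized.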
                          n_in=n_in_MLP, n_out=net.hidden_vid, rng=rng,
                          W_scale=net.W_scale[-2], b_scale=net.b_scale, activation=activation))
out = T.concatenate([layers[-1].output, layers[-2].output], axis=1)
# out = normalize(out)
if use.drop: out = DropoutLayer(out, rng=rng, p=drop.p_hidden).output

# softmax layer
layers.append(LogRegr(out, rng=rng, activation=activation, n_in=net.hidden,
                      W_scale=net.W_scale[-1], b_scale=net.b_scale, n_out=net.n_class))

#-------------------LATE LATE FUSION---------------------------------------------
# vid_out = layers[-1].p_y_given_x
# n_hidden = 200
# traj_ = t.flatten(2)
# layers.append(HiddenLayer(traj_, n_in=traj_size, n_out=n_hidden, rng=rng,
#                           W_scale=0.01, b_scale=net.b_scale, activation=activation))
# traj_out = layers[-1].output
# traj_out = DropoutLayer(traj_out, rng=rng, p=drop.p_hidden).output
out = layers[-1].output

if tr.inspect:
    insp = T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5], T.mean(out))
else:
    insp = T.stack(0, 0)
# out = normalize(out)
if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output

# maxout
if use.maxout:
    out = maxout(out, (batch.micro, net.hidden))
    net.hidden /= 2

# softmax layer
if use.load:
    Ws, bs = load_params(use)  # This is test, wudi added this!
    layers.append(LogRegr(out, W=Ws, b=bs, rng=tr.rng, activation=lin, n_in=net.hidden,
                          W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class))
else:
    layers.append(LogRegr(out, rng=tr.rng, activation=lin, n_in=net.hidden,
                          W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class))

"""
layers[-1] : softmax layer
layers[-2] : hidden layer (video if late fusion)
layers[-3] : hidden layer (trajectory, only if late fusion)
"""

# cost function
cost = layers[-1].negative_log_likelihood(y)

if reg.L1_vid > 0 or reg.L2_vid > 0:
    # L1 and L2 regularization
    out = layers[-1].output
else:  # late fusion
    n_in_MLP -= net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
    layers.append(HiddenLayer(traj_, n_in=n_in_MLP, n_out=net.hidden_traj, rng=rng,
                              W_scale=net.W_scale[-2], b_scale=net.b_scale, activation=activation))
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])
    layers.append(HiddenLayer(vid_, n_in=n_in_MLP, n_out=net.hidden_vid, rng=rng,
                              W_scale=net.W_scale[-2], b_scale=net.b_scale, activation=activation))
    out = T.concatenate([layers[-1].output, layers[-2].output], axis=1)

# out = normalize(out)
if use.drop: out = DropoutLayer(out, rng=rng, p=drop.p_hidden).output

# softmax layer
layers.append(LogRegr(out, rng=rng, activation=activation, n_in=net.hidden,
                      W_scale=net.W_scale[-1], b_scale=net.b_scale, n_out=20))

#-------------------LATE LATE FUSION---------------------------------------------
# vid_out = layers[-1].p_y_given_x
# n_hidden = 200
# traj_ = t.flatten(2)
# layers.append(HiddenLayer(traj_, n_in=traj_size, n_out=n_hidden, rng=rng,
#                           W_scale=0.01, b_scale=net.b_scale, activation=activation))
# traj_out = layers[-1].output
# traj_out = DropoutLayer(traj_out, rng=rng, p=drop.p_hidden).output
# layers.append(LogRegr(traj_out, rng=rng, activation=activation, n_in=n_hidden,
def build(cpu):
    # constants
    floatX = config.floatX
    enum = enumerate

    # load trained parameters: W[stage][convnet], b[stage][convnet] for the conv
    # stages, (Wh, bh) for the hidden layer, (Ws, bs) for the softmax layer
    file = gzip.GzipFile("params.zip", 'rb')
    params = load(file)
    file.close()
    print params

    W = [[None, None], [None, None], [None, None]]
    b = [[None, None], [None, None], [None, None]]
    W[0][0], b[0][0], W[0][1], b[0][1], \
        W[1][0], b[1][0], W[1][1], b[1][1], \
        W[2][0], b[2][0], W[2][1], b[2][1], Wh, bh, Ws, bs = params

    #-----------------------------FLIP KERNEL------------------------------------------
    if cpu:
        W = array(W)
        W_new = [[None, None], [None, None], [None, None]]
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                w = W[i, j].get_value()
                print w.shape, w.dtype
                for k in range(w.shape[0]):
                    for l in range(w.shape[1]):
                        for m in range(w.shape[2]):
                            w[k, l, m] = cv2.flip(w[k, l, m], -1)
                W_new[i][j] = shared(array(w, dtype=floatX), borrow=True)
        W = W_new
    #-----------------------------FLIP KERNEL------------------------------------------

    rng = random.RandomState(1337)  # this will make sure results are always the same
    batch_size = 1
    in_shape = (1, 2, 2, 32, 64, 64)   # (batchsize, convnets, maps, frames, h, w) input video shapes
    traj_shape = (batch_size, 3, 32)   # (batchsize, input shape of the trajectory)

    # hyper parameters
    # ------------------------------------------------------------------------------

    # use techniques/methods
    class use:
        drop = True        # dropout
        depth = True       # use depth map as input
        aug = False        # data augmentation
        load = False       # load params.p file
        traj = False       # trajectory
        trajconv = False   # convolutions on trajectory
        valid2 = False
        fast_conv = not cpu
        norm_div = False
        norm = True        # normalization layer
        mom = True         # momentum

    # regularization
    class reg:
        L1_traj = .0   # degree/amount of regularization
        L2_traj = .0   # 1: only L1, 0: only L2
        L1_vid = .0    # degree/amount of regularization
        L2_vid = .0    # 1: only L1, 0: only L2

    class trajconv:
        append = False   # append convolution results to the original trajectory
        filter_size = 5
        layers = 3       # number of convolution layers
        res_shape = traj_shape[-1] - layers * (filter_size - 1)

    class net:
        shared_stages = []    # stages where weights are shared
        shared_convnets = []  # convnets that share weights with the neighbouring convnet
        n_convnets = 2        # number of convolutional networks in the architecture
        maps = [2, 16, 32, 64]   # feature maps in each convolutional network
        # maps = [2,5,25,25]     # feature maps in each convolutional network
        kernels = [(1, 7, 7), (1, 8, 8), (1, 6, 6)]   # convolution kernel shapes
        pools = [(2, 2, 2), (2, 2, 2), (2, 2, 2)]     # pool/subsampling shapes
        hidden_traj = 200   # hidden units in MLP
        hidden_vid = 300    # hidden units in MLP
        W_scale = 0.01
        b_scale = 0.1
        norm_method = "lcn"   # normalisation method: lcn = local contrast normalisation
        pool_method = "max"   # maxpool
        fusion = "early"      # early or late fusion
        hidden = hidden_traj + hidden_vid if fusion == "late" else 500   # hidden units in MLP
        n_class = 21

    activation = relu
    n_stages = len(net.kernels)
    video_shapes = [in_shape[-3:]]

    def _shared(val, borrow=True):
        return shared(array(val, dtype=floatX), borrow=borrow)

    def ndtensor(n):
        return TensorType(floatX, (False,) * n)  # n-dimensional tensor

    # video shape after each conv+pool stage
    for i in xrange(n_stages):
        k, p, v = array(net.kernels[i]), array(net.pools[i]), array(video_shapes[i])
        conv_s = tuple(v - k + 1)
        video_shapes.append(tuple((v - k + 1) / p))
    n_in_MLP = net.maps[-1] * net.n_convnets * prod(video_shapes[-1])

    def conv_args(stage, i):
        """ ConvLayer arguments, i: stage index """
        args = {
            'batch_size': 1,
            'activation': activation,
            'rng': rng,
            'n_in_maps': net.maps[stage],
            'n_out_maps': net.maps[stage + 1],
            'kernel_shape': net.kernels[stage],
            'video_shape': video_shapes[stage],
            "fast_conv": use.fast_conv,
"layer_name":"Conv"+str(stage), "W_scale":net.W_scale, "b_scale":net.b_scale, "stride":1, "W":W[stage][i], "b":b[stage][i] } return args # print conv_args(0,0) x = ndtensor(len(in_shape))(name = 'x') # video input def var_norm(_x,imgs=True,axis=[-3,-2,-1]): if imgs: return (_x-T.mean(_x,axis=axis,keepdims=True))/T.maximum(1e-4,T.std(_x,axis=axis,keepdims=True)) return (_x-T.mean(_x))/T.maximum(1e-4,T.std(_x)) def std_norm(_x,axis=[-3,-2,-1]): return _x/T.maximum(1e-4,T.std(_x,axis=axis,keepdims=True)) out = [x[:,0], x[:,1]] for stage in xrange(n_stages): for i in xrange(len(out)): # for each convnet of the stage if stage==0: gray_norm = NormLayer(out[i][:,0:1], method="lcn",use_divisor=False).output gray_norm = std_norm(gray_norm) depth_norm = var_norm(out[i][:,1:]) out[i] = T.concatenate([gray_norm,depth_norm],axis=1) else: out[i] = NormLayer(out[i], method="lcn",use_divisor=False).output out[i] = std_norm(out[i]) out[i] = ConvLayer(out[i], **conv_args(stage, i)).output out[i] = PoolLayer(out[i], net.pools[stage], method=net.pool_method).output out = [out[i].flatten(2) for i in range(len(out))] out = T.concatenate(out, axis=1) #hidden layer out = HiddenLayer(out, W = Wh, b = bh, n_in=n_in_MLP, n_out=net.hidden, rng=rng, activation=activation).output logreg = LogRegr(out, W = Ws, b = bs, rng=rng, activation=activation, n_in=net.hidden, n_out=net.n_class) pred = logreg.p_y_given_x x_ = _shared(empty(in_shape)) print "compiling..." eval_model = function([], [pred], givens={x:x_}, on_unused_input='ignore') print "compiling done" return eval_model, x_
#     net.hidden /= 2
#----EXTRA LAYER----------------------------------------------------------------

# softmax layer
args = {
    'activation': lin,
    'rng': rng,
    "W_scale": net.W_scale[-1],
    "b_scale": net.b_scale[-1],
    "n_in": net.hidden,
    "n_out": net.n_class
}
if use.load:
    args["W"], args["b"] = load_params()
layers.append(LogRegr(out, **args))

"""
layers[-1] : softmax layer
layers[-2] : hidden layer (video if late fusion)
layers[-3] : hidden layer (trajectory, only if late fusion)
"""

# cost function
cost = layers[-1].negative_log_likelihood(y)

if reg.L1_vid > 0 or reg.L2_vid > 0:
    # L1 and L2 regularization
    L1 = T.abs_(layers[-2].W).sum() + T.abs_(layers[-1].W).sum()
    L2 = (layers[-2].W ** 2).sum() + (layers[-1].W ** 2).sum()