elif pc == "lio": src = "/mnt/wd/chalearn/preproc" res_dir_ = "/home/lpigou/chalearn_wudi/try" loader = DataLoader(src, tr.batch_size) # Lio changed it to read from HDF5 files #################################################################### #################################################################### print "\n%s\n\tbuilding\n%s" % (('-' * 30, ) * 2) #################################################################### #################################################################### idx_mini = T.lscalar(name="idx_mini") # minibatch index idx_micro = T.lscalar(name="idx_micro") # microbatch index x = ndtensor(len(tr.in_shape))(name='x') # video input x_ = _shared(empty(tr.in_shape)) y_ = _shared(empty((tr.batch_size, ))) y_int32 = T.cast(y_, 'int32') y = T.ivector(name='y') # labels conv_shapes = [] for i in xrange(net.n_stages): k, p, v = array(net.kernels[i]), array(net.pools[i]), array( tr.video_shapes[i]) conv_s = tuple(v - k + 1) conv_shapes.append(conv_s) tr.video_shapes.append(tuple((v - k + 1) / p)) print "stage", i print " conv", tr.video_shapes[i], "->", conv_s print " pool", conv_s, "->", tr.video_shapes[i + 1], "x", net.maps[i + 1]
# Script-level setup of bookkeeping lists, shared variables and symbolic inputs.
# The whitespace of this span has been collapsed onto a single line, so the
# original line breaks and nesting are not recoverable from here.
# What the statements below do, in order:
#   * create the empty lists layers, mini_updates, micro_updates, last_upd, update;
#   * wrap lr.init in shared(float32(...)) as learning_rate and, under
#     `if use.mom:`, mom.momentum as momentum;
#   * rebind drop.p_vid / drop.p_hidden to shared(float32(...)) of their *_val fields;
#   * declare the symbolic scalars idx_mini / idx_micro, the symbolic inputs x, y and
#     x_skeleton, the `_shared(empty(...))` placeholders x_, y_, x_skeleton_, an
#     int32 cast of y_ (y_int32), and L1 / L2 as `_shared(0)`;
#   * import cPickle.
# NOTE(review): it cannot be told from this view whether the two `drop.p_*`
# assignments belong to the `if use.mom:` body or follow it -- confirm against the
# original file before re-indenting.
# NOTE(review): `_skeleon_in_shape` looks like a misspelling of "skeleton", but it is
# the attribute name read from `tr`; rename it only together with its definition.
layers = [] # all architecture layers mini_updates = [] micro_updates = [] last_upd = [] update = [] # shared variables learning_rate = shared(float32(lr.init)) if use.mom: momentum = shared(float32(mom.momentum)) drop.p_vid = shared(float32(drop.p_vid_val)) drop.p_hidden = shared(float32(drop.p_hidden_val)) idx_mini = T.lscalar(name="idx_mini") # minibatch index idx_micro = T.lscalar(name="idx_micro") # microbatch index x = ndtensor(len(tr.in_shape))(name='x') # video input y = T.ivector(name='y') # labels x_ = _shared(empty(tr.in_shape)) y_ = _shared(empty(tr.batch_size)) y_int32 = T.cast(y_, 'int32') L1 = _shared(0) L2 = _shared(0) ### useless fake, but DataLoader_with_skeleton_normalisation would require that x_skeleton = ndtensor(len(tr._skeleon_in_shape))( name='x_skeleton') # video input x_skeleton_ = _shared(empty(tr._skeleon_in_shape)) # load the skeleton normalisation --Lio didn't normalise video input, but should we? import cPickle
# Script-level setup of bookkeeping lists, shared variables and symbolic inputs.
# The whitespace of this span has been collapsed onto a single line, so the
# original line breaks and nesting are not recoverable from here.
# What the statements below do, in order:
#   * create the empty lists micro_updates, last_upd, update;
#   * wrap lr.init in shared(float32(...)) as learning_rate and, under
#     `if use.mom:`, mom.momentum as momentum;
#   * rebind drop.p_vid / drop.p_hidden to shared(float32(...)) of their *_val fields;
#   * declare the symbolic scalars idx_mini / idx_micro, the symbolic inputs x, y and
#     x_skeleton, the `_shared(empty(...))` placeholders x_, y_, x_skeleton_, an
#     int32 cast of y_ (y_int32), and L1 / L2 as `_shared(0)`.
# NOTE(review): it cannot be told from this view whether the two `drop.p_*`
# assignments belong to the `if use.mom:` body or follow it -- confirm against the
# original file before re-indenting.
# NOTE(review): `_skeleon_in_shape` looks like a misspelling of "skeleton", but it is
# the attribute name read from `tr`; rename it only together with its definition.
micro_updates = [] last_upd = [] update = [] # shared variables learning_rate = shared(float32(lr.init)) if use.mom: momentum = shared(float32(mom.momentum)) drop.p_vid = shared(float32(drop.p_vid_val) ) drop.p_hidden = shared(float32(drop.p_hidden_val)) idx_mini = T.lscalar(name="idx_mini") # minibatch index idx_micro = T.lscalar(name="idx_micro") # microbatch index x = ndtensor(len(tr.in_shape))(name = 'x') # video input y = T.ivector(name = 'y') # labels x_ = _shared(empty(tr.in_shape)) y_ = _shared(empty(tr.batch_size)) y_int32 = T.cast(y_,'int32') L1 = _shared(0) L2 = _shared(0) ### useless fake, but DataLoader_with_skeleton_normalisation would require that x_skeleton = ndtensor(len(tr._skeleon_in_shape))(name = 'x_skeleton') # video input x_skeleton_ = _shared(empty(tr._skeleon_in_shape)) # load the skeleton normalisation --Lio didn't normalise video input, but should we?
def build(): use.load = True # we load the CNN parameteres here x = ndtensor(len(tr.in_shape))(name='x') # video input x_ = _shared(empty(tr.in_shape)) conv_shapes = [] for i in xrange(net.n_stages): k, p, v = array(net.kernels[i]), array(net.pools[i]), array( tr.video_shapes[i]) conv_s = tuple(v - k + 1) conv_shapes.append(conv_s) tr.video_shapes.append(tuple((v - k + 1) / p)) print "stage", i print " conv", tr.video_shapes[i], "->", conv_s print " pool", conv_s, "->", tr.video_shapes[i + 1], "x", net.maps[i + 1] # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size n_in_MLP = net.maps[-1] * net.n_convnets * prod(tr.video_shapes[-1]) print 'MLP:', n_in_MLP, "->", net.hidden, "->", net.n_class, "" if use.depth: if net.n_convnets == 2: out = [x[:, :, 0, :, :, :], x[:, :, 1, :, :, :]] # 2 nets: body and hand # build 3D ConvNet layers = [] # all architecture layers insp = [] for stage in xrange(net.n_stages): for i in xrange(len(out)): # for body and hand # normalization if use.norm and stage == 0: gray_norm = NormLayer(out[i][:, 0:1], method="lcn", use_divisor=use.norm_div).output gray_norm = std_norm(gray_norm, axis=[-3, -2, -1]) depth_norm = var_norm(out[i][:, 1:]) out[i] = T.concatenate([gray_norm, depth_norm], axis=1) elif use.norm: out[i] = NormLayer(out[i], method="lcn", use_divisor=use.norm_div).output out[i] = std_norm(out[i], axis=[-3, -2, -1]) # convolutions out[i] *= net.scaler[stage][i] layers.append( ConvLayer( out[i], **conv_args(stage, i, batch, net, use, tr.rng, tr.video_shapes))) out[i] = layers[-1].output out[i] = PoolLayer(out[i], net.pools[stage], method=net.pool_method).output if tr.inspect: insp.append(T.mean(out[i])) # flatten all convnets outputs for i in xrange(len(out)): out[i] = std_norm(out[i], axis=[-3, -2, -1]) out = [out[i].flatten(2) for i in range(len(out))] vid_ = T.concatenate(out, axis=1) # dropout if use.drop: drop.p_vid = shared(float32(drop.p_vid_val)) drop.p_hidden = 
shared(float32(drop.p_hidden_val)) drop.p_vid.set_value(float32(0.)) # dont use dropout when testing drop.p_hidden.set_value(float32(0.)) # dont use dropout when testing vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output # MLP # ------------------------------------------------------------------------------ # fusion if net.fusion == "early": out = vid_ # hidden layer Wh, bh = load_params(use) # This is test, wudi added this! layers.append( HiddenLayer(out, W=Wh, b=bh, n_in=n_in_MLP, n_out=net.hidden, rng=tr.rng, W_scale=net.W_scale[-2], b_scale=net.b_scale[-2], activation=relu)) out = layers[-1].output if tr.inspect: insp = T.stack(insp[0], insp[1], insp[2], insp[3], insp[4], insp[5], T.mean(out)) else: insp = T.stack(0, 0) if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output #maxout # softmax layer Ws, bs = load_params(use) # This is test, wudi added this! layers.append( LogRegr(out, W=Ws, b=bs, rng=tr.rng, activation=lin, n_in=net.hidden, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class)) """ layers[-1] : softmax layer layers[-2] : hidden layer (video if late fusion) layers[-3] : hidden layer (trajectory, only if late fusion) """ # prediction y_pred = layers[-1].y_pred p_y_given_x = layers[-1].p_y_given_x #################################################################### #################################################################### print "\n%s\n\tcompiling\n%s" % (('-' * 30, ) * 2) #################################################################### #################################################################### # compile functions # ------------------------------------------------------------------------------ print 'compiling test_model' eval_model = function([], [y_pred, p_y_given_x], givens={x: x_}, on_unused_input='ignore') return eval_model, x_
def __init__(self, res_dir, load_path): self.layers = [] # only contain the layers from fusion self.insp_mean = [] # inspection for each layer mean activation self.insp_std = [] # inspection for each layer std activation self.params = [] # parameter list self.idx_mini = T.lscalar(name="idx_mini") # minibatch index self.idx_micro = T.lscalar(name="idx_micro") # microbatch index # symbolic variables self.x = ndtensor(len(tr.in_shape))(name='x') # video input self.y = T.ivector(name='y') # labels # symbolic variables self.x_skeleton = ndtensor(len(tr._skeleon_in_shape))( name='x_skeleton') # video input if use.drop: drop.p_vid = shared(float32(drop.p_vid_val)) drop.p_hidden = shared(float32(drop.p_hidden_val)) video_cnn = conv3d_chalearn(self.x, use, lr, batch, net, reg, drop, mom, \ tr, res_dir, load_path) dbn = GRBM_DBN(numpy_rng=random.RandomState(123), n_ins=891, \ hidden_layers_sizes=[2000, 2000, 1000], n_outs=101, input_x=self.x_skeleton, label=self.y ) # we load the pretrained DBN skeleton parameteres here if use.load == True: dbn.load(os.path.join(load_path, 'dbn_2015-06-19-11-34-24.npy')) ##################################################################### # fuse the ConvNet output with skeleton output -- need to change here ###################################################################### out = T.concatenate([video_cnn.out, dbn.sigmoid_layers[-1].output], axis=1) ##################################################################### # wudi add the mean and standard deviation of the activation values to exam the neural net # Reference: Understanding the difficulty of training deep feedforward neural networks, Xavier Glorot, Yoshua Bengio ##################################################################### insp_mean_list = [] insp_std_list = [] insp_mean_list.extend(dbn.out_mean) insp_mean_list.extend(video_cnn.insp_mean) insp_std_list.extend(dbn.out_std) insp_std_list.extend(video_cnn.insp_std) 
###################################################################### #MLP layer self.layers.append( HiddenLayer(out, n_in=net.hidden, n_out=net.hidden, rng=tr.rng, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], activation=net.activation)) out = self.layers[-1].output if tr.inspect: insp_mean_list.extend([T.mean(out)]) insp_std_list.extend([T.std(out)]) self.insp_mean = T.stacklists(insp_mean_list) self.insp_std = T.stacklists(insp_std_list) if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output ###################################################################### # softmax layer self.layers.append( LogRegr(out, rng=tr.rng, n_in=net.hidden, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class)) self.p_y_given_x = self.layers[-1].p_y_given_x ###################################################################### # cost function self.cost = self.layers[-1].negative_log_likelihood(self.y) # function computing the number of errors self.errors = self.layers[-1].errors(self.y) # parameter list for layer in video_cnn.layers: self.params.extend(layer.params) # pre-trained dbn parameter last layer (W, b) doesn't need to incorporate into the params # for calculating the gradient self.params.extend(dbn.params[:-2]) # MLP hidden layer params self.params.extend(self.layers[-2].params) # softmax layer params self.params.extend(self.layers[-1].params) # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size print 'MLP:', video_cnn.n_in_MLP, "->", net.hidden_penultimate, "+", net.hidden_traj, '->', \ net.hidden, '->', net.hidden, '->', net.n_class, "" return
def build(): use.load = True # we load the CNN parameteres here x = ndtensor(len(tr.in_shape))(name = 'x') # video input x_ = _shared(empty(tr.in_shape)) conv_shapes = [] for i in xrange(net.n_stages): k,p,v = array(net.kernels[i]), array(net.pools[i]), array(tr.video_shapes[i]) conv_s = tuple(v-k+1) conv_shapes.append(conv_s) tr.video_shapes.append(tuple((v-k+1)/p)) print "stage", i print " conv",tr.video_shapes[i],"->",conv_s print " pool",conv_s,"->",tr.video_shapes[i+1],"x",net.maps[i+1] # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size n_in_MLP = net.maps[-1]*net.n_convnets*prod(tr.video_shapes[-1]) print 'MLP:', n_in_MLP, "->", net.hidden, "->", net.n_class, "" if use.depth: if net.n_convnets==2: out = [x[:,:,0,:,:,:], x[:,:,1,:,:,:]] # 2 nets: body and hand # build 3D ConvNet layers = [] # all architecture layers insp = [] for stage in xrange(net.n_stages): for i in xrange(len(out)): # for body and hand # normalization if use.norm and stage==0: gray_norm = NormLayer(out[i][:,0:1], method="lcn", use_divisor=use.norm_div).output gray_norm = std_norm(gray_norm,axis=[-3,-2,-1]) depth_norm = var_norm(out[i][:,1:]) out[i] = T.concatenate([gray_norm,depth_norm],axis=1) elif use.norm: out[i] = NormLayer(out[i], method="lcn",use_divisor=use.norm_div).output out[i] = std_norm(out[i],axis=[-3,-2,-1]) # convolutions out[i] *= net.scaler[stage][i] layers.append(ConvLayer(out[i], **conv_args(stage, i, batch, net, use, tr.rng, tr.video_shapes))) out[i] = layers[-1].output out[i] = PoolLayer(out[i], net.pools[stage], method=net.pool_method).output if tr.inspect: insp.append(T.mean(out[i])) # flatten all convnets outputs for i in xrange(len(out)): out[i] = std_norm(out[i],axis=[-3,-2,-1]) out = [out[i].flatten(2) for i in range(len(out))] vid_ = T.concatenate(out, axis=1) # dropout if use.drop: drop.p_vid = shared(float32(drop.p_vid_val) ) drop.p_hidden = shared(float32(drop.p_hidden_val)) 
drop.p_vid.set_value(float32(0.)) # dont use dropout when testing drop.p_hidden.set_value(float32(0.)) # dont use dropout when testing vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_vid).output # MLP # ------------------------------------------------------------------------------ # fusion if net.fusion == "early": out = vid_ # hidden layer Wh, bh = load_params(use) # This is test, wudi added this! layers.append(HiddenLayer(out, W = Wh, b =bh, n_in=n_in_MLP, n_out=net.hidden, rng=tr.rng, W_scale=net.W_scale[-2], b_scale=net.b_scale[-2], activation=relu)) out = layers[-1].output if tr.inspect: insp = T.stack(insp[0],insp[1],insp[2],insp[3],insp[4],insp[5], T.mean(out)) else: insp = T.stack(0,0) if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output #maxout # softmax layer Ws, bs = load_params(use) # This is test, wudi added this! layers.append(LogRegr(out, W = Ws, b = bs, rng=tr.rng, activation=lin, n_in=net.hidden, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class)) """ layers[-1] : softmax layer layers[-2] : hidden layer (video if late fusion) layers[-3] : hidden layer (trajectory, only if late fusion) """ # prediction y_pred = layers[-1].y_pred p_y_given_x = layers[-1].p_y_given_x #################################################################### #################################################################### print "\n%s\n\tcompiling\n%s"%(('-'*30,)*2) #################################################################### #################################################################### # compile functions # ------------------------------------------------------------------------------ print 'compiling test_model' eval_model = function([], [y_pred, p_y_given_x], givens={x:x_}, on_unused_input='ignore') return eval_model, x_
def __init__(self, res_dir, load_path): self.layers = [] # only contain the layers from fusion self.insp_mean = [] # inspection for each layer mean activation self.insp_std = [] # inspection for each layer std activation self.params = [] # parameter list self.idx_mini = T.lscalar(name="idx_mini") # minibatch index self.idx_micro = T.lscalar(name="idx_micro") # microbatch index # symbolic variables self.x = ndtensor(len(tr.in_shape))(name = 'x') # video input self.y = T.ivector(name = 'y') # labels # symbolic variables self.x_skeleton = ndtensor(len(tr._skeleon_in_shape))(name = 'x_skeleton') # video input if use.drop: drop.p_vid = shared(float32(drop.p_vid_val) ) drop.p_hidden = shared(float32(drop.p_hidden_val)) video_cnn = conv3d_chalearn(self.x, use, lr, batch, net, reg, drop, mom, \ tr, res_dir, load_path) dbn = GRBM_DBN(numpy_rng=random.RandomState(123), n_ins=891, \ hidden_layers_sizes=[2000, 2000, 1000], n_outs=101, input_x=self.x_skeleton, label=self.y ) # we load the pretrained DBN skeleton parameteres here if use.load == True: dbn.load(os.path.join(load_path,'dbn_2015-06-19-11-34-24.npy')) ##################################################################### # fuse the ConvNet output with skeleton output -- need to change here ###################################################################### out = T.concatenate([video_cnn.out, dbn.sigmoid_layers[-1].output], axis=1) ##################################################################### # wudi add the mean and standard deviation of the activation values to exam the neural net # Reference: Understanding the difficulty of training deep feedforward neural networks, Xavier Glorot, Yoshua Bengio ##################################################################### insp_mean_list = [] insp_std_list = [] insp_mean_list.extend(dbn.out_mean) insp_mean_list.extend(video_cnn.insp_mean) insp_std_list.extend(dbn.out_std) insp_std_list.extend(video_cnn.insp_std) 
###################################################################### #MLP layer self.layers.append(HiddenLayer(out, n_in=net.hidden, n_out=net.hidden, rng=tr.rng, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], activation=net.activation)) out = self.layers[-1].output if tr.inspect: insp_mean_list.extend([T.mean(out)]) insp_std_list.extend([T.std(out)]) self.insp_mean = T.stacklists(insp_mean_list) self.insp_std = T.stacklists(insp_std_list) if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output ###################################################################### # softmax layer self.layers.append(LogRegr(out, rng=tr.rng, n_in=net.hidden, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class)) self.p_y_given_x = self.layers[-1].p_y_given_x ###################################################################### # cost function self.cost = self.layers[-1].negative_log_likelihood(self.y) # function computing the number of errors self.errors = self.layers[-1].errors(self.y) # parameter list for layer in video_cnn.layers: self.params.extend(layer.params) # pre-trained dbn parameter last layer (W, b) doesn't need to incorporate into the params # for calculating the gradient self.params.extend(dbn.params[:-2]) # MLP hidden layer params self.params.extend(self.layers[-2].params) # softmax layer params self.params.extend(self.layers[-1].params) # number of inputs for MLP = (# maps last stage)*(# convnets)*(resulting video shape) + trajectory size print 'MLP:', video_cnn.n_in_MLP, "->", net.hidden_penultimate, "+", net.hidden_traj, '->', \ net.hidden, '->', net.hidden, '->', net.n_class, "" return
#parser.add_argument('path')# the path to load best parameters #args = parser.parse_args() #load_path = args.path load_path='/remote/idiap.svm/user.active/dwu/chalearn/result/try/CNN_normalisation_53.0% 2015.06.23.12.17.31/' ###################################################################### import cPickle f = open('CNN_normalization.pkl','rb') CNN_normalization = cPickle.load(f) Mean_CNN = CNN_normalization ['Mean_CNN'] Std_CNN = CNN_normalization['Std_CNN'] # customized data loader for both video module and skeleton module #loader = DataLoader_with_skeleton_normalisation(src, tr.batch_size, Mean_CNN, Std_CNN) # Lio changed it to read from HDF5 files # we load the CNN parameteres here x = ndtensor(len(tr.in_shape))(name = 'x') # video input x_ = _shared(empty(tr.in_shape)) use.load=True use.fast_conv=True video_cnn = conv3d_chalearn(x, use, lr, batch, net, reg, drop, mom, tr, res_dir, load_path) out = video_cnn.out layers = [] # all architecture layers # softmax layer if use.load: W, b = load_params(use, load_path) print W.shape, b.shape layers.append(LogRegr(out, rng=tr.rng, n_in=net.hidden_vid, W=W, b=b, W_scale=net.W_scale[-1], b_scale=net.b_scale[-1], n_out=net.n_class))