# Snippet of a model-class constructor extracted from a larger project; it
# assumes numpy (array, prod), theano (T, shared, float32) and the project's
# layer helpers (ConvLayer, PoolLayer, NormLayer, DropoutLayer, ...) are in scope.
def __init__(self, x, use, lr, batch, net, reg, drop, mom, tr, res_dir, load_path=""):

        self.out = []
        self.layers = []
        self.insp_mean = []
        self.insp_std = []

        # log every hyper-parameter container to the results directory,
        # resolving static methods to their names and skipping CUDA/tensor objects
        for c in (use, lr, batch, net, reg, drop, mom, tr):
            write(c.__name__+":", res_dir)
            _s = dict(c.__dict__)  # copy so the class dict itself is not mutated
            _s.pop('__module__', None)
            _s.pop('__doc__', None)
            for key in _s.keys():
                val = str(_s[key])
                if val.startswith("<static"): val = str(_s[key].__func__.__name__)
                if val.startswith("<Cuda"): continue
                if val.startswith("<Tensor"): continue
                write("  "+key+": "+val, res_dir)


        ####################################################################
        ####################################################################
        print "\n%s\n\tbuilding\n%s"%(('-'*30,)*2)
        ####################################################################
        #################################################################### 
        # ConvNet
        # ------------------------------------------------------------------------------
        # calculate resulting video shapes for all stages
        conv_shapes = []
        for i in xrange(net.n_stages):
            k,p,v = array(net.kernels[i]), array(net.pools[i]), array(tr.video_shapes[i])
            conv_s = tuple(v-k+1)
            conv_shapes.append(conv_s)
            tr.video_shapes.append(tuple((v-k+1)/p))
            print "stage", i
            if use.depth and i==0:
                print "  conv",tr.video_shapes[i],"x 2 ->",conv_s #for body and hand
            else:
                print "  conv",tr.video_shapes[i],"->",conv_s
            print "  pool",conv_s,"->",tr.video_shapes[i+1],"x",net.maps[i+1]

        # number of MLP inputs = (# maps in last stage) * (# convnets) * prod(final video shape)
        n_in_MLP = net.maps[-1]*net.n_convnets*prod(tr.video_shapes[-1])
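
        # Worked example of the shape arithmetic above (hypothetical numbers):
        # with video_shapes[0] = (32, 64, 64), kernel (5, 5, 5), pool (2, 2, 2):
        #   conv: (32, 64, 64) -> (28, 60, 60)    # v - k + 1
        #   pool: (28, 60, 60) -> (14, 30, 30)    # (v - k + 1) / p
        # and with maps[-1] = 32 and n_convnets = 2 the MLP input size would be
        #   n_in_MLP = 32 * 2 * 14*30*30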

        if use.depth and net.n_convnets==2:
            out = [x[:,:,0,:,:,:], x[:,:,1,:,:,:]] # 2 nets: body and hand
        else:
            # `out` must be bound before the ConvNet loop below; only the
            # depth + two-convnet configuration is handled in this snippet
            raise NotImplementedError("expected use.depth and net.n_convnets==2")

        # build 3D ConvNet
        
        for stage in xrange(net.n_stages):
            for i in xrange(len(out)): # for body and hand
                # normalization: at stage 0, local contrast normalization on the
                # gray channel and variance normalization on the depth channel
                if use.norm and stage==0:
                    gray_norm = NormLayer(out[i][:,0:1], method="lcn",
                        use_divisor=use.norm_div).output
                    gray_norm = std_norm(gray_norm,axis=[-3,-2,-1])
                    depth_norm = var_norm(out[i][:,1:])
                    out[i] = T.concatenate([gray_norm,depth_norm],axis=1)
                elif use.norm:
                    out[i] = NormLayer(out[i], method="lcn",use_divisor=use.norm_div).output
                    out[i] = std_norm(out[i],axis=[-3,-2,-1])
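                # std_norm/var_norm are project helpers; a minimal sketch of the
                # assumed semantics (zero mean / unit variance over the
                # time/height/width axes), not the actual implementation:
                #   def std_norm(t, axis):
                #       m = T.mean(t, axis=axis, keepdims=True)
                #       s = T.std(t, axis=axis, keepdims=True)
                #       return (t - m) / (s + 1e-8)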
                # convolutions: scale the input per stage/net, convolve, then pool
                out[i] *= net.scaler[stage][i]
                self.layers.append(ConvLayer(out[i], **conv_args(stage, i, batch, net, use, tr.rng, tr.video_shapes, load_path)))
                out[i] = self.layers[-1].output
                out[i] = PoolLayer(out[i], net.pools[stage], method=net.pool_method).output
                if tr.inspect: 
                    self.insp_mean.append(T.mean(out[i]))
                    self.insp_std.append(T.std(out[i]))

        # flatten all convnet outputs and concatenate them into one feature vector
        for i in xrange(len(out)): out[i] = std_norm(out[i],axis=[-3,-2,-1])
        out = [out[i].flatten(2) for i in xrange(len(out))]
        vid_ = T.concatenate(out, axis=1)
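        # vid_ now has shape (batch.micro, n_in_MLP): one row of concatenated
        # convnet features per sample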

        # dropout
        if use.drop: 
            drop.p_vid = shared(float32(drop.p_vid_val) )
            drop.p_hidden = shared(float32(drop.p_hidden_val))
            vid_ = DropoutLayer(vid_, rng=tr.rng, p=drop.p_hidden).output

        # maxout
        if use.maxout:
            vid_ = maxout(vid_, (batch.micro,n_in_MLP))
            net.activation = lin
            n_in_MLP /= 2
            # net.hidden *= 2
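            # maxout keeps the elementwise max over pairs of units, which is why
            # n_in_MLP is halved above; a sketch of the assumed helper:
            #   def maxout(z, shape):
            #       z = z.reshape((shape[0], shape[1] // 2, 2))
            #       return T.max(z, axis=2)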

        # MLP
        # ------------------------------------------------------------------------------
        # fusion
        if net.fusion == "early":
            out = vid_
            # hidden layer
            if use.load:
                W,b = load_params(use, load_path)
                self.layers.append(HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden_vid, rng=tr.rng, W=W, b=b,
                    W_scale=net.W_scale[-2], b_scale=net.b_scale[-2], activation=net.activation))
            else:
                self.layers.append(HiddenLayer(out, n_in=n_in_MLP, n_out=net.hidden_vid, rng=tr.rng, 
                    W_scale=net.W_scale[-2], b_scale=net.b_scale[-2], activation=net.activation))
            out = self.layers[-1].output
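            # ("early" fusion above feeds the concatenated convnet features into
            # one shared hidden layer; other fusion modes are presumably handled
            # outside this snippet)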


        #if tr.inspect: 
            #self.insp_mean = T.stack(self.insp_mean)
            #self.insp_std = T.stack(self.insp_std)
            #self.insp = T.stack(self.insp[0],self.insp[1],self.insp[2],self.insp[3],self.insp[4],self.insp[5], T.mean(out))
        #else: self.insp =  T.stack(0,0)
        # out = normalize(out)
        if use.drop: out = DropoutLayer(out, rng=tr.rng, p=drop.p_hidden).output
        # maxout
        if use.maxout:
            out = maxout(out, (batch.micro,net.hidden))
            net.hidden /= 2

        
        # now assemble all the outputs
        self.out = out
        self.n_in_MLP = n_in_MLP
# symbolic variables
# in shape: #frames * gray/depth * body/hand * 4 maps
x = ndtensor(len(tr.in_shape))(name = 'x') # video input
# x = T.TensorVariable(CudaNdarrayType([False] * len(in_shape))) # video input
y = T.ivector(name = 'y') # labels
idx_mini = T.lscalar(name="idx_mini") # minibatch index
idx_micro = T.lscalar(name="idx_micro") # microbatch index
x_ = _shared(empty(tr.in_shape))
y_ = _shared(empty((tr.batch_size,)))
y_int32 = T.cast(y_,'int32')
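
# idx_mini/idx_micro are typically used in a Theano function's `givens` to
# slice the shared buffers; a sketch (slicing scheme assumed, names from above):
#   givens = {x: x_[idx_micro*batch.micro:(idx_micro+1)*batch.micro],
#             y: y_int32[idx_micro*batch.micro:(idx_micro+1)*batch.micro]}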

# print parameters
# ------------------------------------------------------------------------------
for c in (use, lr, batch, net, reg, drop, mom, tr):
    write(c.__name__+":", res_dir)
    _s = dict(c.__dict__)  # copy so the class dict itself is not mutated
    _s.pop('__module__', None)
    _s.pop('__doc__', None)
    for key in _s.keys():
        val = str(_s[key])
        if val.startswith("<static"): val = str(_s[key].__func__.__name__)
        if val.startswith("<Cuda"): continue
        if val.startswith("<Tensor"): continue
        write("  "+key+": "+val, res_dir)

####################################################################
####################################################################
print "\n%s\n\t preparing data \n%s"%(('-'*30,)*2)
####################################################################
####################################################################