def downhill_models(M,
                    P,
                    FE,
                    z,
                    K=20,
                    hh=.001,
                    ep=5000,
                    dp=0,
                    wsp=.001,
                    plt=False):
    from paris.signal import bss_eval

    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Symbolic input and shared variables
    x = Th.matrix('x')
    y = theano.shared(M.astype(theano.config.floatX))
    d = theano.shared(float32(dp))

    # Network weights
    W0 = theano.shared(
        sqrt(2. / (K + M.shape[0])) *
        random.randn(K, M.shape[0]).astype(theano.config.floatX))
    W1 = theano.shared(
        sqrt(2. / (K + M.shape[0])) *
        random.randn(M.shape[0], K).astype(theano.config.floatX))

    # First layer is the transform to a non-negative subspace
    h = psoftplus(W0.dot(x), 3.)

    # Dropout
    if dp > 0:
        h *= (1. / (1. - d) * (rng.uniform(size=h.shape) > d).astype(
            theano.config.floatX)).astype(theano.config.floatX)

    # Second layer reconstructs the input
    r = psoftplus(W1.dot(h), 3.)

    # Approximate input using KL-like distance
    cost = Th.mean(y * (Th.log(y + eps) - Th.log(r + eps)) - y +
                   r) + wsp * Th.mean(abs(W1))

    # Make an optimizer and define the training input
    opt = downhill.build('rprop', loss=cost, inputs=[x], params=[W0, W1])
    train = downhill.Dataset(M.astype(theano.config.floatX), batch_size=0)

    # Train it
    downhill_train(opt, train, hh, ep, None)

    # Get approximation (turn dropout off for the forward pass)
    d.set_value(float32(0))
    _, _r = theano.function(inputs=[x], outputs=[h, r],
                            updates=[])(M.astype(theano.config.floatX))
    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))

    return W1.get_value(), sxr
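# For reference, a minimal NumPy sketch of the KL-like (I-divergence) cost that
# the network above minimizes, including the L1 sparsity penalty on the decoder
# weights. The helper name and the eps default are illustrative assumptions;
# training uses the symbolic Theano expression above.
def _kl_cost_sketch(y, r, W1=None, wsp=.001, eps=1e-8):
    import numpy as np
    # Generalized KL divergence between target y and reconstruction r
    cost = np.mean(y * (np.log(y + eps) - np.log(r + eps)) - y + r)
    if W1 is not None:
        cost += wsp * np.mean(np.abs(W1))   # sparsity penalty on the weights
    return cost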
def lasagne_models(M,
                   P,
                   FE,
                   z,
                   K=20,
                   hh=.0001,
                   ep=5000,
                   d=0,
                   wsp=0.0001,
                   plt=True):
    from paris.signal import bss_eval

    # Symbolic input variable
    _M = Th.matrix('_M')

    # Input and forward transform
    I = InputLayer(shape=M.T.shape, input_var=_M)

    # First layer is the transform to a non-negative subspace
    H0 = DenseLayer(I,
                    num_units=K,
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)

    # Optional dropout
    H = DropoutLayer(H0, d)

    # Compute source modulator
    R = DenseLayer(H,
                   num_units=M.T.shape[1],
                   nonlinearity=lambda x: psoftplus(x, 3.),
                   b=None)

    # Cost function (compute the network output once so dropout, if any, is
    # applied consistently across the terms)
    Ro = get_output(R)
    cost = (_M * (Th.log(_M + eps) - Th.log(Ro + eps)) - _M + Ro).mean() \
        + wsp * Th.mean(abs(R.W))

    # Train it using Lasagne
    opt = downhill.build('rprop',
                         loss=cost,
                         inputs=[_M],
                         params=get_all_params(R))
    train = downhill.Dataset(M.T.astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get approximation
    _r = nget(R, _M, M.T.astype(float32)).T
    _h = nget(H, _M, M.T.astype(float32)).T
    o = FE.ife(_r, P)
    sxr = bss_eval(o, 0, array([z]))

    return R, sxr
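# The nonlinearity used throughout is assumed to be a "sharpened" softplus,
# log(1 + exp(p*x)) / p, which approaches a ReLU as p grows. This NumPy sketch
# is an assumption about the psoftplus helper (defined elsewhere in the
# package), not a copy of it.
def _psoftplus_sketch(x, p=3.):
    import numpy as np
    px = p * np.asarray(x, dtype=float)
    # For large arguments softplus(px)/p ~= x; clip the exp to avoid overflow
    return np.where(px > 30., px / p, np.log1p(np.exp(np.minimum(px, 30.))) / p)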
def nmf_sep(Z, FE, K, s=None):
    from paris.signal import bss_eval

    if s is not None:
        random.seed(s)

    # Get features
    M1, P1 = FE.fe(Z[0])
    M2, P2 = FE.fe(Z[1])
    MT, PT = FE.fe(Z[2] + Z[3])

    # Overcomplete or not? (hard-coded toggle: this branch learns K-column
    # bases from each source; the else branch uses the normalized training
    # frames themselves as an overcomplete dictionary)
    t0 = time.time()
    if 1:
        w1, _ = pu(copy(M1), K[0], 300, 0, 0)
        w2, _ = pu(copy(M2), K[1], 300, 0, 0)
        w1 /= sum(w1, axis=0, keepdims=True)
        w2 /= sum(w2, axis=0, keepdims=True)
        w = (w1, w2)
        sp = [0, 0]
    else:
        # Get overcomplete bases
        w = [M1 / sum(M1, axis=0), M2 / sum(M2, axis=0)]
        sp = [.5, 1]

    # Fit 'em on mixture
    t1 = time.time()
    _, h = pu(copy(MT), w, 300, sp[0], sp[1])
    print 'Done in', time.time() - t0, time.time() - t1, 'sec'

    # Get modulator estimates
    q = cumsum([0, w[0].shape[1], w[1].shape[1]])
    fr = [w[i].dot(h[q[i]:q[i + 1], :]) for i in arange(2)]
    fr0 = hstack(w).dot(h) + eps

    # Resynth with Wiener filtering
    r = [FE.ife(fr[0] * (MT / fr0), PT), FE.ife(fr[1] * (MT / fr0), PT)]
    #r = [FE.ife( fr[0], PT),
    #     FE.ife( fr[1], PT)]

    # Get results
    sxr = array(
        [bss_eval(r[i], i, vstack((Z[2], Z[3]))) for i in arange(len(r))])

    return mean(sxr, axis=0), r
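# A minimal NumPy sketch of the post-processing above: split the activations of
# the concatenated dictionary, reconstruct each source, and apply the
# Wiener-like ratio mask to the mixture magnitudes. All names are illustrative.
def _nmf_mask_sketch(w, h, MT, eps=1e-8):
    import numpy as np
    q = np.cumsum([0, w[0].shape[1], w[1].shape[1]])
    fr = [w[i].dot(h[q[i]:q[i + 1], :]) for i in range(2)]   # per-source estimates
    fr0 = np.hstack(w).dot(h) + eps                          # full reconstruction
    # Each source keeps the fraction of the mixture its model explains
    return [fr[i] * (MT / fr0) for i in range(2)]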
def downhill_separate(M,
                      P,
                      FE,
                      W1,
                      W2,
                      z1,
                      z2,
                      hh=.001,
                      ep=5000,
                      d=0,
                      wsp=.0001,
                      plt=True):
    from paris.signal import bss_eval

    # Get dictionary sizes
    K = [W1.shape[1], W2.shape[1]]

    # Cache some things
    y = Th.matrix('y')
    w1 = theano.shared(W1.astype(theano.config.floatX), 'w1')
    w2 = theano.shared(W2.astype(theano.config.floatX), 'w2')

    # Activations to learn
    h1 = theano.shared(
        sqrt(2. / (K[0] + M.shape[1])) *
        random.randn(K[0], M.shape[1]).astype(theano.config.floatX))
    h2 = theano.shared(
        sqrt(2. / (K[1] + M.shape[1])) *
        random.randn(K[1], M.shape[1]).astype(theano.config.floatX))

    # Dropout
    if d > 0:
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)
        dw1 = w1 * 1. / (1. - d) * (rng.uniform(size=w1.shape) > d).astype(
            theano.config.floatX)
        dw2 = w2 * 1. / (1. - d) * (rng.uniform(size=w2.shape) > d).astype(
            theano.config.floatX)
    else:
        dw1 = w1
        dw2 = w2

    # Approximate input
    r1 = psoftplus(dw1.dot(h1), 3.)
    r2 = psoftplus(dw2.dot(h2), 3.)
    r = r1 + r2

    # KL-distance to input
    cost = Th.mean(y * (Th.log(y + eps) - Th.log(r + eps)) - y + r) \
        + wsp * (Th.mean(abs(h1)) + Th.mean(abs(h2)))

    # Make a callable forward pass (without dropout, for evaluation)
    ffwd_f = theano.function(inputs=[],
                             outputs=[psoftplus(w1.dot(h1), 3.),
                                      psoftplus(w2.dot(h2), 3.), h1, h2],
                             updates=[])

    # Make an optimizer and define the inputs
    opt = downhill.build('rprop', loss=cost, inputs=[y], params=[h1, h2])
    train = downhill.Dataset(M.astype(theano.config.floatX), batch_size=0)

    # Train it
    cst = downhill_train(opt, train, hh, ep, None)

    # So what happened?
    _r1, _r2, _h1, _h2 = ffwd_f()
    _r = _r1 + _r2 + eps
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    # Return things of note
    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
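# A minimal NumPy sketch of the inverted-dropout scaling used on the fixed
# dictionaries above: zero a random fraction d of the entries and rescale the
# survivors by 1/(1-d) so the expected value is unchanged. Names are
# illustrative; the training graph uses Theano's random streams instead.
def _inverted_dropout_sketch(w, d=.5, seed=0):
    import numpy as np
    rng = np.random.RandomState(seed)
    mask = (rng.uniform(size=w.shape) > d).astype(w.dtype)
    return w * mask / (1. - d)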
def lasagne_separate2(M,
                      P,
                      FE,
                      W1,
                      W2,
                      z1,
                      z2,
                      hh=.0001,
                      ep=5000,
                      d=0,
                      wsp=.0001,
                      plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.T.astype(float32))
    dum = Th.vector('dum')

    # We have weights to discover
    H = theano.shared(random.rand(M.T.shape[0], K[0] + K[1]).astype(float32))
    fI = InputLayer(shape=(M.T.shape[0], K[0] + K[1]), input_var=H)

    # Split in two pathways
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators using previously learned dictionaries
    R1 = DenseLayer(dfW1,
                    num_units=M.shape[0],
                    W=W1.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)
    R2 = DenseLayer(dfW2,
                    num_units=M.shape[0],
                    W=W2.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function (get_output is computed once so dropout, if any, is applied
    # consistently; the 0*dum term only ties the dummy input into the graph)
    Ro = get_output(R)
    cost = (_M * (Th.log(_M + eps) - Th.log(Ro + eps)) - _M + Ro).mean() \
        + wsp * Th.mean(H) + 0 * Th.mean(dum)

    # Train it using Lasagne
    opt = downhill.build('rprop', loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([0]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, dum, array([0]).astype(float32)) + eps
    _r1 = nget(R1, dum, array([0]).astype(float32))
    _r2 = nget(R2, dum, array([0]).astype(float32))
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
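# The return value above averages the per-source separation metrics, assuming
# bss_eval returns an (SDR, SIR, SAR) triple per call so that the two
# concatenated calls give six numbers. A NumPy sketch of that averaging (the
# triple layout is an assumption about paris.signal.bss_eval):
def _average_sxr_sketch(sxr):
    import numpy as np
    sxr = np.asarray(sxr, dtype=float)
    return (sxr[:3] + sxr[3:]) / 2.   # mean of source-1 and source-2 metrics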
def lasagne_separate(M,
                     P,
                     FE,
                     W1,
                     W2,
                     z1,
                     z2,
                     hh=.0001,
                     ep=5000,
                     d=0,
                     wsp=.0001,
                     plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.astype(float32))

    # Input is the learned dictionary set
    lW = hstack((W1.T, W2.T)).astype(float32)
    _lW = Th.matrix('_lW')
    fI = InputLayer(shape=lW.shape, input_var=_lW)

    # Split in two paths
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators
    R1 = DenseLayer(dfW1,
                    num_units=M.shape[1],
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)
    R2 = DenseLayer(dfW2,
                    num_units=M.shape[1],
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)

    # Add the two source approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function (compute the network output once so dropout, if any, is
    # applied consistently across the terms)
    Ro = get_output(R)
    cost = (_M * (Th.log(_M + eps) - Th.log(Ro + eps)) - _M + Ro).mean() \
        + wsp * (Th.mean(abs(R1.W)) + Th.mean(abs(R2.W)))

    # Train it using Lasagne
    opt = downhill.build('rprop',
                         loss=cost,
                         inputs=[_lW],
                         params=get_all_params(R))
    train = downhill.Dataset(lW, batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, _lW, lW) + eps
    _r1 = nget(R1, _lW, lW)
    _r2 = nget(R2, _lW, lW)
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
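# In lasagne_separate the fixed dictionaries act as the network *input* while
# the per-frame activations are the DenseLayer weights being learned. A minimal
# NumPy sketch of the resulting reconstruction, reusing the assumed sharpened
# softplus from _psoftplus_sketch above (shapes: W_i.T is features x K_i,
# H_i is K_i x frames; all names illustrative):
def _fixed_dictionary_recon_sketch(W1T, W2T, H1, H2, p=3.):
    r1 = _psoftplus_sketch(W1T.dot(H1), p)   # source 1 magnitude estimate
    r2 = _psoftplus_sketch(W2T.dot(H2), p)   # source 2 magnitude estimate
    return r1, r2, r1 + r2                   # per-source estimates and mixture approximation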