Example #1
0
def main(fname, N, n, params):
    """Run GMM EM on the data in @fname"""

    gmm = GaussianMixtureModel.from_file(fname)
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights

    X = gmm.sample(N, n)

    # Set seed for the algorithm
    sc.random.seed(int(params.seed))

    algo = GaussianMixtureEM(k, d)

    O = M, S, w

    def report(i, O_, lhood):
        M_, _, _ = O_

    lhood, Z, O_ = algo.run(X, None, report)

    M_, S_, w_ = O_
    M_ = closest_permuted_matrix(M.T, M_.T).T

    # Table
    print column_aerr(M, M_), column_rerr(M, M_)
def main( prefix, N, n, delta, params ):
    """Run on sample in fname"""
    gmm = GaussianMixtureModel.from_file( prefix )
    k, d, M, w = gmm.k, gmm.d, gmm.means, gmm.weights
    logger.add( "M", M )
    logger.add_consts( "M", M, k, 2 )
    logger.add( "w_min", w.min() )
    logger.add( "w_max", w.max() )

    X = gmm.sample( N, n )
    logger.add( "k", k )
    logger.add( "d", d )
    logger.add( "n", n )

    # Set seed for the algorithm
    sc.random.seed( int( params.seed ) )
    logger.add( "seed", int( params.seed ) )

    P, T = sample_moments( X, k )
    Pe, Te = exact_moments( M, w )

    start = time.time()
    M_ = recover_components( k, P, T, Pe, Te, delta = delta )
    stop = time.time()
    logger.add( "time", stop - start )

    M_ = closest_permuted_matrix( M.T, M_.T ).T
    logger.add( "M_", M )

    # Error data
    logger.add_err( "M", M, M_ )
    logger.add_err( "M", M, M_, 'col' )

    print column_aerr(M, M_), column_rerr(M, M_)
Example #3
0
def test_gaussian_em():
    """Test the Gaussian EM on a small generated dataset"""
    fname = "gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.generate(fname, 3, 3)
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights
    N, n = 1e6, 1e5

    X = gmm.sample(N, n)

    algo = GaussianMixtureEM(k, d)

    def report(i, O_, lhood):
        M_, _, _ = O_

    lhood, Z, O_ = algo.run(X, None, report)

    M_, S_, w_ = O_

    M_ = closest_permuted_matrix(M, M_)
    w_ = closest_permuted_vector(w, w_)

    print w, w_

    print norm(M - M_) / norm(M)
    print abs(S - S_).max()
    print norm(w - w_)

    assert (norm(M - M_) / norm(M) < 1e-1)
    assert (abs(S - S_) < 1).all()
    assert (norm(w - w_) < 1e-2)
Example #4
0
def test_gaussian_em():
    """Test the Gaussian EM on a small generated dataset"""
    fname = "gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.generate( fname, 3, 3 )
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights
    N, n = 1e6, 1e5


    X = gmm.sample( N, n )

    algo = GaussianMixtureEM(k, d)

    def report( i, O_, lhood ):
        M_, _, _ = O_
    lhood, Z, O_ = algo.run( X, None, report )

    M_, S_, w_ = O_

    M_ = closest_permuted_matrix( M, M_ )
    w_ = closest_permuted_vector( w, w_ )

    print w, w_

    print norm( M - M_ )/norm(M)
    print abs(S - S_).max()
    print norm( w - w_ ) 

    assert( norm( M - M_ )/norm(M) < 1e-1 )
    assert (abs(S - S_) < 1 ).all()
    assert( norm( w - w_ ) < 1e-2 )
def main(fname, N, n, params):
    """Run GMM EM on the data in @fname"""

    gmm = GaussianMixtureModel.from_file( fname )
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights
    logger.add( "M", M )

    X = gmm.sample( N, n )
    logger.add( "k", k )
    logger.add( "d", d )
    logger.add( "n", n )

    # Set seed for the algorithm
    sc.random.seed( int( params.seed ) )
    logger.add( "seed", int( params.seed ) )

    algo = GaussianMixtureEM( k, d )

    O = M, S, w
    start = time.time()
    def report( i, O_, lhood ):
        M_, _, _ = O_
        logger.add_err( "M_t%d" % (i), M, M_, 'col' )
        logger.add( "time_%d" % (i), time.time() - start )
    lhood, Z, O_ = algo.run( X, None, report )
    logger.add( "time", time.time() - start )

    M_, S_, w_ = O_
    M_ = closest_permuted_matrix( M.T, M_.T ).T
    logger.add( "M_", M_ )

    # Table
    logger.add_err( "M", M, M_, 2 )
    logger.add_err( "M", M, M_, 'col' )
    print column_aerr( M, M_ ), column_rerr( M, M_ )
def test_gaussian_em():
    """Test the Gaussian EM on a small generated dataset"""
    fname = "./test-data/gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.from_file( fname )
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights
    N, n = 1e6, 1e4


    X = gmm.sample( N, n )

    algo = GaussianMixtureEM(k, d)

    start = time.time()
    def report( i, O_, lhood ):
        M_, _, _ = O_
        logger.add_err( "M_t%d" % (i), M, M_, 'col' )
        logger.add( "time_%d" % (i), time.time() - start )
    lhood, Z, O_ = algo.run( X, None, report )
    logger.add( "time", time.time() - start )

    M_, S_, w_ = O_

    M_ = closest_permuted_matrix( M, M_ )
    w_ = closest_permuted_vector( w, w_ )

    print norm( M - M_ )/norm(M)
    print abs(S - S_) 
    print norm( w - w_ ) 

    assert( norm( M - M_ )/norm(M) < 1e-1 )
    assert( abs(S - S_) < 1 )
    assert( norm( w - w_ ) < 1e-3 )
def compare_error_bounds( model_fname, log_fname, delta = 0.1 ):
    """Compare error bounds theoretical analysis

    Loads the mixture model from @model_fname and a log of measured
    quantities from @log_fname (via sc.load), derives a bound for each of
    the intermediate errors (W, Tw, L, v, mu) from the logged norms and
    singular values, and prints a table of bound vs. logged actual error.

    delta -- enters only the D_Tw term below.

    Returns a list of the ratios actual/bound in the order
    [W, Tw, L, v, mu].
    """
    gmm = GaussianMixtureModel.from_file( model_fname )
    k, d, M, w = gmm.k, gmm.d, gmm.means, gmm.weights

    # NOTE(review): P and T are not used anywhere below.
    P, T = exact_moments( M, w )

    lg = sc.load( log_fname )

    # TODO: Use concentration bounds on aerr_P12
    # NOTE(review): n_M is read from the log but never used.
    n_M, sk_M = lg["norm_M_2"], lg["s_k_M"], 
    e_P, e_T = lg["aerr_P_2"], lg["aerr_T"], 
    n_P, sk_P, n_T = lg["norm_Pe_2"], lg["s_k_P"], lg["norm_Te"]
    w_min = min(w)

    # TODO: Ah, not computing sigma2!

    # alpha_P and \beta_P
    a_P = e_P/sk_P
    b_P = a_P/(1-a_P)

    # Bound on the error in W, next to the logged actual value
    e_Wb = 2/sqrt(sk_P) * b_P
    e_W = lg["aerr_W_2"]

    # Bound on the error in Tw; built from the *measured* e_W, not e_Wb
    e_Twb = 1/sqrt(sk_M * (1-a_P)) * e_T + n_T/sk_M * (1 + 1/sqrt(1-a_P) + 1/(1-a_P)) * e_W
    e_Tw = lg["aerr_Tw"]

    # NOTE(review): the "bound" for lambda is set to the measured e_Tw
    # rather than the bound e_Twb -- confirm this is intended.
    e_Lb = e_Tw
    e_L = lg["aerr_lambda"]

    # Bound on the error in v, scaled by the column separation of M
    D_M = column_sep( M )
    D_Tw = delta/(sqrt(sc.e) * k**2 * (1+sqrt(2 * log(k/delta)))) * D_M
    e_vb = 4 * sqrt(2) * e_Tw / D_Tw
    e_v = lg["aerr_v_col"]

    e_Wtb = 2 * sqrt( n_P + e_P ) * b_P
    n_Wtb = sqrt( n_P + e_P )

    # Bound on the error in the recovered means, combining the terms above
    e_mub = e_Lb + (1+1/sqrt(w_min)) * n_Wtb * e_vb + e_Wtb
    e_mu = lg["aerr_M_col"]

    print "A\t\tbound\t\tactual"
    print "W\t\t%f\t\t%f" % (e_Wb, e_W)
    print "Tw\t\t%f\t\t%f" % (e_Twb, e_Tw)
    print "L\t\t%f\t\t%f" % (e_Lb, e_L)
    print "v\t\t%f\t\t%f" % (e_vb, e_v)
    print "mu\t\t%f\t\t%f" % (e_mub, e_mu)
    return [(e_W/e_Wb), (e_Tw/e_Twb), (e_L / e_Lb), (e_v/e_vb), (e_mu / e_mub),]
def test_exact_recovery():
    """Test the exact recovery of topics"""
    fname = "./test-data/gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.from_file( fname )
    k, d, A, w = gmm.k, gmm.d, gmm.means, gmm.weights

    P, T = exact_moments( A, w )

    A_ = recover_components( k, P, T, P, T, delta = 0.01 )
    A_ = closest_permuted_matrix( A.T, A_.T ).T

    print norm( A - A_ )/norm( A )
    print A
    print A_

    assert norm( A - A_ )/norm(A)  < 1e-3
Example #9
0
    def check( k, d ):
        model = GaussianMixtureModel.generate( k, d )
        M1, M2, M3 = model.means
        w = model.weights

        x1, x2, x3 = model.sample( 1e5 )

        # Get the first moments of the data
        X1 = M1.dot( w )
        X2 = M2.dot( w )
        X3 = M3.dot( w )

        X1_ = x1.mean( axis=0 )
        X2_ = x2.mean( axis=0 )
        X3_ = x3.mean( axis=0 )

        err1 = norm( X1 - X1_) 
        err2 = norm( X2 - X2_) 
        err3 = norm( X3 - X3_) 
        print err1, err2, err3
        assert err1 < 1e-02
        assert err2 < 1e-02
        assert err3 < 1e-02

        # Get pairwise estimates
        P12, P13, P123 = spectral.mixture.exact_moments( w, M1, M2, M3 )

        P12_ = sd.Pairs( x1, x2 )
        P13_ = sd.Pairs( x1, x3 )

        err12 = norm( P12 - P12_)
        err13 = norm( P13 - P13_)
        print err12, err13
        assert err12 < 1e-02
        assert err13 < 1e-02

        eta = sc.randn( d )

        # Get triple estimates
        P123 = M1.dot(  diag( M3.T.dot(eta) * w ).dot( M2.T ) )
        P123_ = sd.Triples( x1, x2, x3, eta )

        err123 = norm( P123 - P123_) 
        print err123
        assert norm( P123 - P123_) < 1e-01
def test_sample_recovery():
    """Test the recovery of topics from samples"""
    fname = "./test-data/gmm-3-10-0.7.npz"
    gmm = GaussianMixtureModel.from_file( fname )
    k, d, A, w = gmm.k, gmm.d, gmm.means, gmm.weights
    X = gmm.sample( 10**5 ) 
    P, T = sample_moments( X, k )

    Pe, Te = exact_moments( A, w )
    del gmm

    A_ = recover_components( k, P, T, Pe, Te )
    A_ = closest_permuted_matrix( A.T, A_.T ).T

    print norm( A - A_ )/norm( A )
    print A
    print A_

    assert norm( A - A_ )/norm( A ) < 5e-1
Example #11
0
def main( fname, dataset_type, N, k, d, params ):
    """Generate a dataset of the requested type and save it.

    dataset_type -- "gmm" or "mvgmm"; anything else raises
                    NotImplementedError.
    params       -- supplies means, cov, weights, sigma2 (and views for
                    "mvgmm").  When cov is "spherical" and sigma2 > 0,
                    params.cov is overwritten in place with stacked
                    sigma2 * eye(d) blocks before the model is generated.
    """
    if dataset_type not in ( "gmm", "mvgmm" ):
        raise NotImplementedError

    if dataset_type == "gmm":
        if params.cov == "spherical" and params.sigma2 > 0:
            per_component = [params.sigma2 * eye(d)] * k
            params.cov = array( per_component )
        model = GaussianMixtureModel.generate(
                fname, k, d, params.means, params.cov, params.weights )
    else:
        views = params.views
        if params.cov == "spherical" and params.sigma2 > 0:
            per_view = [[params.sigma2 * eye(d)] * k] * views
            params.cov = array( per_view )
        model = MultiViewGaussianMixtureModel.generate(
                fname, k, d, views, params.means, params.cov, params.weights )

    model.sample( N )
    model.save()
Example #12
0
def main(fname, N, n, params):
    """Run GMM EM on the data in @fname"""

    gmm = GaussianMixtureModel.from_file( fname )
    k, d, M, S, w = gmm.k, gmm.d, gmm.means, gmm.sigmas, gmm.weights

    X = gmm.sample( N, n )

    # Set seed for the algorithm
    sc.random.seed( int( params.seed ) )

    algo = GaussianMixtureEM( k, d )

    O = M, S, w
    def report( i, O_, lhood ):
        M_, _, _ = O_
    lhood, Z, O_ = algo.run( X, None, report )

    M_, S_, w_ = O_
    M_ = closest_permuted_matrix( M.T, M_.T ).T

    # Table
    print column_aerr( M, M_ ), column_rerr( M, M_ )
def _labels_from_decision(decision):
    """Translate an array of class indices into the dataset's label strings.

    Index 0 maps to 'ped    ' and index 1 to 'bic    '.  An index above 1
    raises IndexError instead of being silently dropped, which would
    misalign the predictions with the reference labels.
    """
    class_labels = np.array(['ped    ', 'bic    '])
    return class_labels[decision]


def _predict_in_batches(net, data, batch_size):
    """Run `net` over `data` in full batches of `batch_size`.

    Returns an array of shape (num_full_batches * batch_size, 2); samples
    past the last full batch are not evaluated.
    """
    num_batches = data.shape[0] // batch_size
    scores = np.zeros((num_batches * batch_size, 2))
    for b in range(num_batches):
        lo, hi = b * batch_size, (b + 1) * batch_size
        scores[lo:hi] = net(data[lo:hi]).detach().numpy()
    return scores


def main(args):
    """Train/evaluate a pedestrian-vs-bike classifier selected by args.model.

    args.model        -- 'GMM' or 'CNN'; any other value only loads and
                         preprocesses the data.
    args.see_features -- (GMM only) show the PCA features.
    args.see_weights  -- (GMM only) show the PCA weights.

    Accuracies are printed; nothing is returned.
    """
    # Load data
    # NOTE(review): the "train" arrays are read from the test_* files and
    # the "test" arrays from the train_* files -- confirm this is intended.
    trainDataPed = np.load('data/processed/test_data_ped.npy').astype(
        np.float32)
    trainDataBic = np.load('data/processed/test_data_bic.npy').astype(
        np.float32)
    trainLabelPed = np.load('data/processed/test_label_ped.npy')
    trainLabelBic = np.load('data/processed/test_label_bic.npy')

    testDataPed = np.load('data/processed/train_data_ped.npy').astype(
        np.float32)
    testDataBic = np.load('data/processed/train_data_bic.npy').astype(
        np.float32)
    testLabelPed = np.load('data/processed/train_label_ped.npy')
    testLabelBic = np.load('data/processed/train_label_bic.npy')

    # Downsample by a factor of 2
    trainDataPed_ds = utils.downsampler_2(trainDataPed)
    trainDataBic_ds = utils.downsampler_2(trainDataBic)
    testDataPed_ds = utils.downsampler_2(testDataPed)
    testDataBic_ds = utils.downsampler_2(testDataBic)

    # Vectorize "image" data
    trainDataPedVec = trainDataPed_ds.reshape((trainDataPed_ds.shape[0], -1),
                                              order='F')
    trainDataBicVec = trainDataBic_ds.reshape((trainDataBic_ds.shape[0], -1),
                                              order='F')
    testDataPedVec = testDataPed_ds.reshape((testDataPed_ds.shape[0], -1),
                                            order='F')
    testDataBicVec = testDataBic_ds.reshape((testDataBic_ds.shape[0], -1),
                                            order='F')

    # Check out the Downsampled data
    #mv.classification_data_visualizer(trainDataPedVec.reshape((trainDataPedVec.shape[0],200,72), order='F'), trainLabelPed)

    ## # --- Use for Feature Plots (PCA)
    #nFeatures = 16
    ## PCA Feature Extraction -- Compute Features via PCA using Mean Centered Ped & Bic spectrograms
    #trainDataPedVecWeights, trainDataPedVecFeatures = pca.PCA(trainDataPedVec-np.mean(trainDataPedVec, axis=0), nFeatures)
    #trainDataBicVecWeights, trainDataBicVecFeatures = pca.PCA(trainDataBicVec-np.mean(trainDataBicVec, axis=0), nFeatures)
    #
    ## NMF Feature Extraction -- Compute Features via NMF using Mean Centered Ped & Bic spectrograms
    ## trainDataPedVecWeightsNMF, trainDataPedVecFeaturesNMF = pca.PCA(trainDataPedVec-np.mean(trainDataPedVec, axis=0), nFeatures)
    ## trainDataBicVecWeightsNMF, trainDataBicVecFeaturesNMF = pca.PCA(trainDataBicVec-np.mean(trainDataBicVec, axis=0), nFeatures)
    #
    ## Check out the features
    ## mv.classification_data_visualizer(trainDataPedVecFeatures.reshape((nFeatures,200,72), order='F'), np.array([str(i) for i in range(nFeatures)]))
    #mv.feature_viewer(trainDataPedVecFeatures.reshape((nFeatures,200,72), order='F'),nFeatures, trainDataPed_ds.shape[1], trainDataPed_ds.shape[2], title='Pedestrian Features')
    #mv.feature_viewer(trainDataBicVecFeatures.reshape((nFeatures,200,72), order='F'),nFeatures, trainDataBic_ds.shape[1], trainDataBic_ds.shape[2], title='Bike Features')
    ## # --- Use for Feature Plots (PCA)

    # =============================================================================
    # 1) Gaussian Mixture Model
    # =============================================================================
    if args.model == 'GMM':
        print("[GMM] Begin GMM Training & Testing")
        nFeatures = 16

        # Produce full set
        fullSet = np.concatenate((trainDataPedVec, trainDataBicVec), axis=0)
        fullSetLabel = np.concatenate((trainLabelPed, trainLabelBic), axis=0)

        # PCA of the mean-centred set; only used by the optional viewers
        trainingDataMean = np.mean(fullSet, axis=0)
        weights, features = pca.PCA(fullSet - trainingDataMean, nFeatures)

        if args.see_features:
            print(args.see_features)
            mv.feature_viewer(features.reshape((nFeatures, 200, 72),
                                               order='F'),
                              nFeatures,
                              trainDataBic_ds.shape[1],
                              trainDataBic_ds.shape[2],
                              title='GMM Features')
        if args.see_weights:
            print(args.see_weights)
            mv.weight_viewer(weights, fullSetLabel)

        # Generate mean and covariance for bike and pedestrian class
        gmm_classifier = GMM.GaussianMixtureModel(fullSet, nFeatures, 2, 1000)

        results = gmm_classifier.fit(fullSet)

        # Make a decision: the class with the largest score along axis 0
        decisionLabeled = _labels_from_decision(np.argmax(results, axis=0))

        # Calculate Statistics
        train_accuracy = np.mean(decisionLabeled == fullSetLabel)
        print("Training set accuracy: ", train_accuracy)

        # -- Now test
        testFullSet = np.concatenate((testDataPedVec, testDataBicVec), axis=0)
        testFullSetLabel = np.concatenate((testLabelPed, testLabelBic), axis=0)

        testResults = gmm_classifier.fit(testFullSet)

        # Make a decision
        testDecisionLabeled = _labels_from_decision(
            np.argmax(testResults, axis=0))

        test_accuracy = np.mean(testDecisionLabeled == testFullSetLabel)
        print("[GMM] Testing set accuracy: ", test_accuracy)

    # =============================================================================
    # 2) Convolutional Neural Net
    # =============================================================================
    elif args.model == 'CNN':
        print("[CNN] Begin CNN Training & Testing")
        # Produce train set
        trainSet = np.concatenate((trainDataPed_ds, trainDataBic_ds), axis=0)
        trainSetLabel = np.concatenate((trainLabelPed, trainLabelBic), axis=0)
        # Convert train set to torch tensor
        trainSet = torch.tensor(trainSet, dtype=torch.float32)
        # Answer to the question: Is it a bike?
        trainSetLabel_binary = np.array(
            [int('bic    ' == elem) for elem in trainSetLabel])
        trainSetLabel_bool = np.array([('bic    ' == elem)
                                       for elem in trainSetLabel])

        # Produce test set
        testSet = np.concatenate((testDataPed_ds, testDataBic_ds), axis=0)
        testSetLabel = np.concatenate((testLabelPed, testLabelBic), axis=0)
        # Convert test set to torch tensor
        testSet = torch.tensor(testSet, dtype=torch.float32)
        # Answer to the question: Is it a bike?
        testSetLabel_bool = np.array(
            ['bic    ' == elem for elem in testSetLabel])

        # Flip to True to retrain instead of loading the saved weights
        train_flag = False

        if train_flag:
            _, net = CNN.fit(trainSet, trainSetLabel_binary, testSet, 10)
        else:
            loss_fn = torch.nn.CrossEntropyLoss()
            in_size = 0
            out_size = 2
            net = CNN.NeuralNet(0.03, loss_fn, in_size, out_size)
            net.load_state_dict(torch.load('net.model'))

        net.eval()
        batch_size = 10

        # Evaluate & decide - Train
        result_train = _predict_in_batches(net, trainSet, batch_size)
        decision_train = result_train[:, 0] < result_train[:, 1]
        # Only full batches are scored, so compare against the labels of
        # the samples that were actually evaluated.
        train_accuracy = np.mean(
            decision_train == trainSetLabel_bool[:len(decision_train)])
        print("[CNN] Training set accuracy: ", train_accuracy)

        # Evaluate & decide - Test
        result = _predict_in_batches(net, testSet, batch_size)
        decision = result[:, 0] < result[:, 1]
        test_accuracy = np.mean(
            decision == testSetLabel_bool[:len(decision)])
        print("[CNN] Testing set accuracy: ", test_accuracy)