Example #1
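All five snippets assume NumPy, SciPy, and Matplotlib are in scope. A plausible import preamble is sketched below; the module paths for the project-specific classes are not shown in the listing, so they are left as a comment rather than guessed.

import time

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import iqr, norm

# KernelDensityEstimate, RegularizedDiscriminantAnalysis,
# ChannelWisePrincipalComponentAnalysis, Pipeline, cross_validation,
# cost_cross_validation_auc, log, and the data/trigger helpers are imported
# from the host project these examples were taken from.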
def _demo_kde():
    n = 100
    np.random.seed(1)

    # generate some dummy data
    x = np.concatenate((np.random.normal(0, 1, int(0.3 * n)),
                        np.random.normal(5, 1, int(0.7 * n))))[:, np.newaxis]

    # assign label 0 to all samples since this is a single-class case
    y = np.zeros(x.shape)

    # a subset of domain of the random variable x
    x_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

    # evaluate the true mixture density on the plotting grid
    true_dens = (0.3 * norm(0, 1).pdf(x_plot[:, 0]) +
                 0.7 * norm(5, 1).pdf(x_plot[:, 0]))

    fig, ax = plt.subplots()
    ax.fill(x_plot[:, 0],
            true_dens,
            fc='black',
            alpha=0.2,
            label='input distribution')

    # Silverman's rule of thumb for bandwidth selection
    bandwidth = 1.06 * min(np.std(x),
                           iqr(x) / 1.34) * np.power(x.shape[0], -0.2)

    # try different kernels and show what they look like
    for kernel in ['gaussian', 'tophat', 'epanechnikov']:
        kde = KernelDensityEstimate(kernel=kernel,
                                    bandwidth=bandwidth,
                                    num_cls=1)
        kde.fit(x, y)
        log_dens = kde.list_den_est[0].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                '-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(n))

    ax.legend(loc='upper left')
    ax.plot(x[:, 0], -0.005 - 0.01 * np.random.random(x.shape[0]), '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
    print('KDE Flows!')
    return 0
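The bandwidth line above implements the rule-of-thumb selector cited in Example #5 (Silverman, 1986), which assumes roughly Gaussian data:

$$ h = 1.06 \cdot \min\!\left(\hat{\sigma},\ \mathrm{IQR}/1.34\right) \cdot n^{-1/5} $$

where $\hat{\sigma}$ is the sample standard deviation, IQR the interquartile range, and $n$ the number of samples the estimator is fit on.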
Example #2
def train_pca_rda_kde_model(x, y, k_folds=10):
    """ Trains the Cw-PCA RDA KDE model given the input data and labels with
        cross validation and returns the model
        Args:
            x(ndarray[float]): C x N x k data array
            y(ndarray[int]): N x 1 observation (class) array
                N is number of samples k is dimensionality of features
                C is number of channels
            k_folds(int): Number of cross validation folds
        Return:
            model(pipeline): trained likelihood model
            """

    # Pipeline is the model. It can be populated manually
    rda = RegularizedDiscriminantAnalysis()
    pca = ChannelWisePrincipalComponentAnalysis(var_tol=1e-5,
                                                num_ch=x.shape[0])
    model = Pipeline()
    model.add(pca)
    model.add(rda)

    # Cross validate
    arg_cv = cross_validation(x, y, model=model, k_folds=k_folds)

    # Get the AUC before optimizing the regularization parameters
    tmp, sc_cv, y_cv = cost_cross_validation_auc(model, 1, x, y, arg_cv,
                                                 k_folds=k_folds,
                                                 split='uniform')
    auc_init = -tmp

    # Apply the regularization parameters found by cross validation
    lam = arg_cv[0]
    gam = arg_cv[1]
    log.debug('Optimized values [lam: {}, gam: {}]'.format(lam, gam))
    model.pipeline[1].lam = lam
    model.pipeline[1].gam = gam
    tmp, sc_cv, y_cv = cost_cross_validation_auc(model, 1, x, y, arg_cv,
                                                 k_folds=k_folds,
                                                 split='uniform')
    auc_cv = -tmp

    # After cross validation, fit once more on all the data to learn the
    # final RDA model
    model.fit(x, y)

    # Insert the density estimator into the model and train it using the
    # cross validated scores to avoid overfitting. Note that these scores
    # were not obtained with the final model.
    bandwidth = 1.06 * min(
        np.std(sc_cv), iqr(sc_cv) / 1.34) * np.power(sc_cv.shape[0], -0.2)
    model.add(KernelDensityEstimate(bandwidth=bandwidth))
    model.pipeline[-1].fit(sc_cv, y_cv)

    # Report AUC
    log.debug('AUC-i: {}, AUC-cv: {}'.format(auc_init, auc_cv))

    return model, auc_cv
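For context, a minimal sketch of how this trainer might be invoked on synthetic data shaped as the docstring describes (C x N x k); the sizes below are illustrative assumptions, not values from the original:

import numpy as np

num_ch, num_trials, dim = 3, 200, 10          # C, N, k (illustrative)
x = np.random.randn(num_ch, num_trials, dim)  # C x N x k feature array
y = np.random.randint(0, 2, num_trials)       # N binary class labels

model, auc_cv = train_pca_rda_kde_model(x, y, k_folds=10)
print('cross validated AUC: {}'.format(auc_cv))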
Example #3
def _demo_validate_data():
    dim_x = 75
    num_x_p = 500
    num_x_n = 500

    num_ch = 20

    x_p_train = np.asarray(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    x_n_train = np.asarray(
        [np.random.randn(num_x_n, dim_x) for i in range(num_ch)])
    y_p_train = [1] * num_x_p
    y_n_train = [0] * num_x_n

    x_train = np.concatenate((x_n_train, x_p_train), axis=1)
    y_train = np.concatenate((y_n_train, y_p_train), axis=0)

    permutation = np.random.permutation(x_train.shape[1])
    x_train = x_train[:, permutation, :]
    y_train = y_train[permutation]

    model, _ = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]),
                         1000)[:, np.newaxis]
    ax.plot(model.line_el[2][y_train == 0],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]),
            'ro',
            label='class(-)')
    ax.plot(model.line_el[2][y_train == 1],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]),
            'go',
            label='class(+)')
    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    x_p_test = np.asarray(
        [np.random.randn(num_x_p, dim_x) for i in range(num_ch)])
    x_n_test = np.asarray(
        [np.random.randn(num_x_n, dim_x) for i in range(num_ch)])
    y_p_test = [1] * num_x_p
    y_n_test = [0] * num_x_n

    x_test = np.concatenate((x_n_test, x_p_test), axis=1)
    y_test = np.concatenate((y_n_test, y_p_test), axis=0)

    permutation = np.random.permutation(x_test.shape[1])
    x_test = x_test[:, permutation, :]
    y_test = y_test[permutation]

    model.transform(x_test)

    ax.plot(model.line_el[2][y_test == 0],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]),
            'bo',
            label='t_class(-)')
    ax.plot(model.line_el[2][y_test == 1],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]),
            'ko',
            label='t_class(+)')

    bandwidth = 1.06 * min(np.std(model.line_el[2]),
                           iqr(model.line_el[2]) / 1.34) * np.power(
                               model.line_el[2].shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(model.line_el[2], y_test)

    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = test_kde.list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training and Test Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    plt.show()
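The per-class density curves come from one estimator per label, stored in list_den_est. The score_samples calls suggest each entry wraps scikit-learn's KernelDensity; under that assumption, the pattern looks roughly like this on made-up 1-D scores:

import numpy as np
from sklearn.neighbors import KernelDensity

# made-up scores: class 0 around 0, class 1 around 3
scores = np.concatenate((np.random.randn(100), 3 + np.random.randn(100)))
labels = np.concatenate((np.zeros(100), np.ones(100)))

list_den_est = []
for cls in (0, 1):
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5)
    kde.fit(scores[labels == cls][:, np.newaxis])  # one KDE per class
    list_den_est.append(kde)

x_plot = np.linspace(-4, 7, 200)[:, np.newaxis]
dens_neg = np.exp(list_den_est[0].score_samples(x_plot))  # p(score | class 0)
dens_pos = np.exp(list_den_est[1].score_samples(x_plot))  # p(score | class 1)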
Example #4
def _demo_validate_real_data():
    ds_rate = 2
    channel_map = [1] * 16 + [0, 0, 1, 1, 0, 1, 1, 1, 0]
    data_train_folder = load_experimental_data()

    mode = 'calibration'

    raw_dat, stamp_time, channels, type_amp, fs = read_data_csv(
        data_train_folder + '/rawdata.csv')

    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    s_i, t_t_i, t_i = trigger_decoder(mode=mode,
                                      trigger_loc=data_train_folder +
                                      '/triggers.txt')
    x_train, y_train, num_seq, _ = trial_reshaper(t_t_i,
                                                  t_i,
                                                  dat,
                                                  mode=mode,
                                                  fs=fs,
                                                  k=ds_rate,
                                                  channel_map=channel_map)

    model, _ = train_pca_rda_kde_model(x_train, y_train, k_folds=10)

    fig = plt.figure()
    ax = fig.add_subplot(211)
    x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]),
                         1000)[:, np.newaxis]
    ax.plot(model.line_el[2][y_train == 0],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 0].shape[0]),
            'ro',
            label='class(-)')
    ax.plot(model.line_el[2][y_train == 1],
            -0.005 -
            0.01 * np.random.random(model.line_el[2][y_train == 1].shape[0]),
            'go',
            label='class(+)')
    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'r-' * (idx == 0) + 'g-' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    # Test
    data_test_folder = load_experimental_data()

    mode = 'calibration'

    raw_dat, stamp_time, channels, type_amp, fs = read_data_csv(
        data_test_folder + '/rawdata.csv')
    dat = sig_pro(raw_dat, fs=fs, k=ds_rate)

    # Get data and labels
    s_i, t_t_i, t_i = trigger_decoder(mode=mode,
                                      trigger_loc=data_test_folder +
                                      '/triggers.txt')
    x_test, y_test, num_seq, _ = trial_reshaper(t_t_i,
                                                t_i,
                                                dat,
                                                mode=mode,
                                                fs=fs,
                                                k=ds_rate,
                                                channel_map=channel_map)

    model.transform(x_test)

    ax.plot(model.line_el[2][y_test == 0],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 0].shape[0]),
            'bo',
            label='t_class(-)')
    ax.plot(model.line_el[2][y_test == 1],
            -0.01 -
            0.01 * np.random.random(model.line_el[2][y_test == 1].shape[0]),
            'ko',
            label='t_class(+)')

    bandwidth = 1.06 * min(np.std(model.line_el[2]),
                           iqr(model.line_el[2]) / 1.34) * np.power(
                               model.line_el[2].shape[0], -0.2)
    test_kde = KernelDensityEstimate(bandwidth=bandwidth)
    test_kde.fit(model.line_el[2], y_test)

    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = test_kde.list_den_est[idx].score_samples(x_plot)
        ax.plot(x_plot[:, 0],
                np.exp(log_dens),
                'b--' * (idx == 0) + 'k--' * (idx == 1),
                linewidth=2.0)

    ax.legend(loc='upper right')
    plt.title('Training and Test Data')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')

    plt.show()
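Both validation demos read intermediate stage outputs from model.line_el after calling transform. A minimal sketch of a pipeline that caches per-stage outputs the same way; this is inferred from how line_el is indexed in the demos, not taken from the host project's implementation:

class TinyPipeline(object):
    """ Applies stages in order and caches each stage's output. """

    def __init__(self):
        self.pipeline = []  # list of fitted stages
        self.line_el = []   # cached input and per-stage outputs

    def add(self, stage):
        self.pipeline.append(stage)

    def transform(self, x):
        self.line_el = [x]
        for stage in self.pipeline:
            x = stage.transform(x)
            self.line_el.append(x)  # line_el[i + 1] = output of stage i
        return x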
Example #5
def _demo_pipeline():
    dim_x = 2
    num_x_p = 200
    num_x_n = 200
    var_tol = .8
    num_ch = 2

    mtx_p = [np.array([[1, 0], [0, 1]]), np.array([[1, 2], [2, 1]])]
    mtx_n = [np.array([[2, 0], [0, 2]]), np.array([[1, -2], [-2, 1]])]

    x_p = np.asarray([
        np.dot(np.random.randn(num_x_p, dim_x), mtx_p[i])
        for i in range(num_ch)
    ])
    x_n = 3 + np.asarray([
        np.dot(np.random.randn(num_x_n, dim_x), mtx_n[i])
        for i in range(num_ch)
    ])
    y_p = [1] * num_x_p
    y_n = [0] * num_x_n

    x = np.concatenate((x_n, x_p), axis=1)
    y = np.concatenate((y_n, y_p), axis=0)

    permutation = np.random.permutation(x.shape[1])
    x = x[:, permutation, :]
    y = y[permutation]
    """ Select bandwidth of the gaussian kernel assuming data is also
        comming from a gaussian distribution.
        Ref: Silverman, Bernard W. Density estimation for statistics and data
        analysis. Vol. 26. CRC press, 1986. """
    bandwidth = 1.06 * min(np.std(x),
                           iqr(x) / 1.34) * np.power(x.shape[0], -0.2)

    pca = ChannelWisePrincipalComponentAnalysis(num_ch=x.shape[0])
    rda = RegularizedDiscriminantAnalysis()
    kde = KernelDensityEstimate(bandwidth=bandwidth)

    model = Pipeline()
    model.add(pca)
    model.add(rda)
    model.add(kde)

    plt.ion()
    fig = plt.figure()
    ax = fig.add_subplot(212)
    ax_2 = fig.add_subplot(221)
    ax_3 = fig.add_subplot(222)

    for gam in [0, .3, .6, .9]:
        for lam in [0, .3, .6, .9]:
            model.pipeline[1].lam = lam
            model.pipeline[1].gam = gam

            if gam == 0 and lam == 0:
                # Show the variance-tolerance comparison only once
                model.pipeline[0].var_tol = 0
                model.fit(x, y)
                sv_init = [
                    model.pipeline[0].list_pca[i].singular_values_
                    for i in range(len(model.pipeline[0].list_pca))
                ]
                model.pipeline[0].var_tol = var_tol
                model.fit(x, y)
                sv_final = [
                    model.pipeline[0].list_pca[i].singular_values_
                    for i in range(len(model.pipeline[0].list_pca))
                ]
                print("Initial SV:{}".format(sv_init))
                print("-- using tolerance:{} -->".format(var_tol))
                print("Final SV:{}".format(sv_final))

                print("Init dim.:{} -> Final dim.:{}".format(
                    x.shape, model.line_el[1].shape))

            model.fit_transform(x, y)

            el = model.line_el[1]
            x_min, x_max = el[:, 0].min() - 1, el[:, 0].max() + 1
            y_min, y_max = el[:, 1].min() - 1, el[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                                 np.arange(y_min, y_max, 0.1))
            z = model.pipeline[1].predict(np.c_[xx.ravel(), yy.ravel()])
            z = z.reshape(xx.shape)

            ax.clear()
            ax_2.clear()
            ax_3.clear()
            ax.contourf(xx, yy, z, alpha=0.2)

            ax.scatter(model.line_el[1][y == 1, 0],
                       model.line_el[1][y == 1, 1],
                       c='r')
            ax.scatter(model.line_el[1][y == 0, 0],
                       model.line_el[1][y == 0, 1],
                       c='g')
            ax.set_title('after PCA')

            ax_2.scatter(x[0, y == 1, 0], x[0, y == 1, 1], c='r')
            ax_2.scatter(x[0, y == 0, 0], x[0, y == 0, 1], c='g')

            ax_3.scatter(x[1, y == 1, 0], x[1, y == 1, 1], c='r')
            ax_3.scatter(x[1, y == 0, 0], x[1, y == 0, 1], c='g')
            ax_2.set_title('1st dim')
            ax_3.set_title('2nd dim')

            fig.canvas.draw()

            time.sleep(.2)

    time.sleep(1)
    plt.ioff()
    fig_2, axn = plt.subplots()
    x_plot = np.linspace(np.min(model.line_el[-1]), np.max(model.line_el[-1]),
                         1000)[:, np.newaxis]
    axn.plot(model.line_el[2][y == 0],
             -0.005 -
             0.01 * np.random.random(model.line_el[2][y == 0].shape[0]),
             'ro',
             label='class(-)')
    axn.plot(model.line_el[2][y == 1],
             -0.005 -
             0.01 * np.random.random(model.line_el[2][y == 1].shape[0]),
             'go',
             label='class(+)')
    for idx in range(len(model.pipeline[2].list_den_est)):
        log_dens = model.pipeline[2].list_den_est[idx].score_samples(x_plot)
        axn.plot(x_plot[:, 0],
                 np.exp(log_dens),
                 'r-' * (idx == 0) + 'g--' * (idx == 1),
                 linewidth=2.0)
    axn.legend(loc='upper right')
    plt.title('Likelihoods Given the Labels')
    plt.ylabel('p(e|l)')
    plt.xlabel('scores')
    fig_2.show()
    time.sleep(10)
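The nested loops over gam and lam sweep the two shrinkage parameters of regularized discriminant analysis. Assuming the estimator follows Friedman's (1989) parameterization, lam blends each class covariance with the pooled covariance and gam then shrinks the result toward a scaled identity:

$$ \hat{\Sigma}_k(\lambda) = (1 - \lambda)\,\hat{\Sigma}_k + \lambda\,\hat{\Sigma}, \qquad \hat{\Sigma}_k(\lambda, \gamma) = (1 - \gamma)\,\hat{\Sigma}_k(\lambda) + \frac{\gamma}{d}\,\mathrm{tr}\!\left[\hat{\Sigma}_k(\lambda)\right] I $$

With gam = lam = 0 the class covariances are untouched; larger values trade variance for bias, which is what the animated decision boundaries visualize.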