Code example #1
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)

    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)

    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)

    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.product(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)

    # ----- Ground truth -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # first plot: gt
    plt.subplot(2, 1, 1)
    im = plt.imshow(prob_gt.T,
                    extent=[min(x), max(x), min(y),
                            max(y)],
                    interpolation='none',
                    cmap=plt.cm.afmhot,
                    aspect='auto',
                    origin='lower')  #'auto'
    plt.colorbar()

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('GT')

    # second plot: prediction
    plt.subplot(2, 1, 2)
    im = plt.imshow(prob_predict.reshape((x.size, y.size)).T,
                    extent=[min(x), max(x), min(y),
                            max(y)],
                    interpolation='none',
                    cmap=plt.cm.afmhot,
                    aspect='auto',
                    origin='lower')  #'auto'
    plt.colorbar()

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Prediction')

    plt.show()
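Every listing consumes a helper generate_grid that is defined elsewhere in the project; its return value evidently has shape (2, nx, ny), since np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0])) flattens it into an (nx*ny, 2) matrix of test points. A minimal sketch consistent with that usage (the exact signature and margin handling are assumptions):

import numpy as np

def generate_grid(X, margin, step):
    # Assumed reconstruction: a regular 2-D mesh of shape (2, nx, ny)
    # covering the data extent plus a margin on each side, with `step`
    # standing in for args.resolution as the grid spacing.
    lo = X.min(axis=0) - margin
    hi = X.max(axis=0) + margin
    return np.mgrid[lo[0]:hi[0]:step, lo[1]:hi[1]:step]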
Code example #2
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # make dataset
    X, y = make_sklearn_dataset(args.dataset, args.n_samples)

    # normalize
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)

    # ----- Create training and testing sets
    labelled_mask = np.zeros(y.shape, np.bool)
    for cid in np.unique(y):
        m = (cid == y)
        for _ in range(args.n_labelled):
            repeat = True
            while repeat:
                sel = np.random.randint(0, y.size)
                # repeat until sel belongs to the target class AND is not yet selected
                repeat = not (m[sel] and not labelled_mask[sel])
            labelled_mask[sel] = True

    X_train = X
    X_train_unlabelled = X_train[~labelled_mask]
    y_train = y.copy()
    y_train[~labelled_mask] = -1
    y_train_unlabelled_gt = y[~labelled_mask]
    X_train_labelled = X_train[labelled_mask]
    y_train_labelled = y[labelled_mask]

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Grid -----
    grid = generate_grid(X_train, X_train.std(), args.resolution)

    # ----- Training -----
    clf = SemiSupervisedRandomForestClassifier(
        random_state=args.seed,
        n_estimators=args.n_trees,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation=None)
    clf.fit(X_train, y_train)

    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Scoring -----
    print 'SCORES:'
    print '\t', accuracy_score(y_train_unlabelled_gt,
                               y_train_result), 'Labeling through transduction'
    print '\t', accuracy_score(
        y_train_unlabelled_gt,
        y_train_prediction), 'Labeling through classification'

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # colour range: pdf
    pdf_vmin = pdf.min()
    pdf_vmax = pdf.max()

    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_unlabelled_gt.astype(np.uint8)),
                s=20,
                alpha=.6)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: PDF + samples')

    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    img = plt.imshow(pdf.reshape((x.size, y.size)).T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)  #'auto'
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_result.astype(np.uint8)),
                s=20,
                alpha=.6)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned forest: PDF + samples labelled through transduction')

    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    img = plt.imshow(pdf.reshape((x.size, y.size)).T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)  #'auto'
    #plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_prediction.astype(np.uint8)), s=20, alpha=.6)
    #plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned forest: PDF + a-posteriori classification')

    # add split-lines
    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)

    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
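make_sklearn_dataset is another project helper not shown here. Code example #5 below spells out the same dataset branches inline, so a plausible reconstruction (name and error handling assumed) is:

from sklearn import datasets

def make_sklearn_dataset(name, n_samples):
    # Assumed reconstruction mirroring the inline branches of code
    # example #5; each maker returns an (X, y) tuple.
    if name == 'moons':
        return datasets.make_moons(n_samples=n_samples, noise=.05)
    if name == 'circles_distant':
        return datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    if name == 'circles_near':
        return datasets.make_circles(n_samples=n_samples, noise=.05)
    if name == 'blobs':
        return datasets.make_blobs(n_samples=n_samples, random_state=8)
    raise ValueError('unknown dataset: %s' % name)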
Code example #3
File: plot_tree_gauss.py  Project: loli/sklearnef
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)
    
    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    
    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)
    
    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train, (X_train, X_train_unlabelled, X_train_labelled, means))
    
    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    
    # ----- Training -----
    clf = DensityTree(random_state=args.seed,
                      min_samples_leaf=2,
                      max_depth=args.max_depth,
                      max_features=args.max_features,
                      min_improvement=args.min_improvement)   
    clf.fit(X_train)
    
    # ----- Prediction -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    cdf = clf.cdf(X_test_pred)
    
    # ----- Ground truth -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov) for mean, cov in zip(means, covs)], 0)
    prob_gt /= args.n_clusters # normalize

    # ----- Goodness of fit measure -----
    X_eval = np.concatenate([scipy.stats.multivariate_normal.rvs(mean, cov, args.n_samples) for mean, cov in zip(means, covs)])
    gof = GoodnessOfFit(clf.cdf, X_eval)
    print 'Goodness of fit evaluation:'
    print '\tmaximum error:', gof.maximum()
    print '\tmean squared error:', gof.mean_squared_error()
    print '\tmean squared error weighted:', gof.mean_squared_error_weighted(clf.pdf)
    
    # ----- E(M)CDF -----
    emcdf = gof.ecdf(X_test_pred)
    
    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y) 
    
    # colour range: pdf
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = max(prob_gt.max(), pdf.max())
    
    # plot: gt - pdf
    plt.subplot(4, 1, 1)
    plt.imshow(prob_gt.T, extent=[min(x),max(x),min(y),max(y)], interpolation='none',
               cmap=plt.cm.afmhot, aspect='auto', origin='lower',
               vmin=pdf_vmin, vmax=pdf_vmax) #'auto'
    plt.colorbar()
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Ground-truth: PDF')
    
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)
    
    # plot: learned - pdf
    plt.subplot(4, 1, 2)
    plt.imshow(pdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
               interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
               vmin=pdf_vmin, vmax=pdf_vmax) #'auto'
    plt.colorbar()
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned: PDF')
    
    # add split-lines
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)
        
    # colour range: cdf
    cdf_vmin = min(emcdf.min(), cdf.min())
    cdf_vmax = max(emcdf.max(), cdf.max())
        
    # plot: gt - ecdf 
    plt.subplot(4, 1, 3)
    plt.imshow(emcdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
               interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
               vmin=cdf_vmin, vmax=cdf_vmax) #'auto'
    plt.colorbar()
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Ground-truth: Empirical CDF')   
        
    # plot: cdf
    plt.subplot(4, 1, 4)
    plt.imshow(cdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
               interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
               vmin=cdf_vmin, vmax=cdf_vmax) #'auto'
    plt.colorbar()
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned: CDF')
    
    plt.show()
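GoodnessOfFit and its ecdf method live in the sklearnef sources rather than in this listing. For a multivariate sample, the empirical CDF at a query point is just the fraction of samples it dominates component-wise, so a hedged stand-in for gof.ecdf could look like:

import numpy as np

def ecdf(samples, points):
    # Fraction of samples dominated component-wise by each query point;
    # a stand-in for GoodnessOfFit.ecdf, whose exact implementation is
    # in the sklearnef sources.
    below = (samples[None, :, :] <= points[:, None, :]).all(axis=2)
    return below.mean(axis=1)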
Code example #4
File: plot_tree_gauss.py  Project: loli/sklearnef
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)

    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)

    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=args.seed,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)

    # ----- plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)

    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Ground truth distribution -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # colour range: pdf
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = max(prob_gt.max(), pdf.max())

    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_gt.astype(np.uint8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    img = plt.imshow(prob_gt.T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)  #'auto'
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: PDF + samples')

    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_result.astype(np.uint8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    img = plt.imshow(pdf.reshape((x.size, y.size)).T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)  #'auto'
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: PDF + samples labelled through transduction')

    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_prediction.astype(np.int8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.int8)),
                s=100)
    plt.colorbar(img)  # just for scale

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: a-posteriori classification / induction')

    # add split-lines
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    plt.show()
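The export_graphviz(clf) call writes a tree.dot file, which the comment above converts with dot -Tps tree.dot -o tree.ps. Assuming Graphviz is installed, the same step can be scripted, and PNG output is often handier than PostScript:

import subprocess

# Render the exported tree; requires the Graphviz `dot` binary on PATH.
subprocess.check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'])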
Code example #5
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)
    
    # create dataset
    if 'circles_distant' == args.dataset:
        dataset = datasets.make_circles(n_samples=args.n_samples, factor=.5, noise=.05)
    elif 'moons' == args.dataset:
        dataset = datasets.make_moons(n_samples=args.n_samples, noise=.05)
    elif 'blobs' == args.dataset:
        dataset = datasets.make_blobs(n_samples=args.n_samples, random_state=8)
    elif 'circles_near' == args.dataset:
        dataset = datasets.make_circles(n_samples=args.n_samples, noise=.05)
    elif 's_curve' == args.dataset:
        dataset = datasets.make_s_curve(n_samples=args.n_samples, noise=.05)
        dataset = np.vstack((dataset[0][:, 0], dataset[0][:, 2])).T, None
    elif 'swiss_roll' == args.dataset:
        dataset = datasets.make_swiss_roll(n_samples=args.n_samples, noise=.05)
        dataset = np.vstack((dataset[0][:,0], dataset[0][:,2])).T, None
        
    # split and normalize
    X, _ = dataset
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)
    
    # ----- Grid -----
    grid = generate_grid(X, X.std(), args.resolution)
    
    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)
    clf.fit(X)
    
    # ----- Prediction -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)
    
    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)
    
    if not args.skipgt:
        if not args.skipdensity:
            plt.subplot(2, 1, 1, axisbg='k')
            plot_gt(X, x, y, args)
            plt.subplot(2, 1, 2)
            plot_density(prob_predict, x, y, args)
        else:
            plt.subplot(1, 1, 1, axisbg='k')
            plot_gt(X, x, y, args)
    else:
        plt.subplot(1, 1, 1)
        plot_density(prob_predict, x, y, args)
    
    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
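plot_gt and plot_density are plotting helpers this listing omits. Judging by the imshow idiom of the other examples, a minimal sketch of plot_density (signature assumed) might be:

import matplotlib.pyplot as plt

def plot_density(prob, x, y, args):
    # Assumed sketch: render the predicted density over the evaluation
    # grid, as the other examples do inline.
    plt.imshow(prob.reshape((x.size, y.size)).T,
               extent=[min(x), max(x), min(y), max(y)],
               interpolation='none', cmap=plt.cm.afmhot,
               aspect='auto', origin='lower')
    plt.colorbar()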
Code example #6
File: plot_ensemble_gauss.py  Project: loli/sklearnef
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)
    
    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    
    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)
    
    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train, (X_train, X_train_unlabelled, X_train_labelled, means))
    
    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    
    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)    
    clf.fit(X_train)
    
    # ----- Prediction -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)
    
    # ----- Ground truth -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov) for mean, cov in zip(means, covs)], 0)
    prob_gt /= args.n_clusters # normalize
    
    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)
    
    # first plot: gt
    plt.subplot(2, 1, 1)
    im = plt.imshow(prob_gt.T, extent=[min(x),max(x),min(y),max(y)], interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower') #'auto'
    plt.colorbar()
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('GT')
    
    # second plot: prediction
    plt.subplot(2, 1, 2)
    im = plt.imshow(prob_predict.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)], interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower') #'auto'
    plt.colorbar()
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Prediction')
    
    plt.show()
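The ground-truth block sums the component densities and divides by args.n_clusters, i.e. it evaluates an equal-weight Gaussian mixture. The same computation as a standalone helper (names assumed):

import numpy as np
import scipy.stats

def mixture_pdf(points, means, covs):
    # Equal-weight Gaussian mixture density; summing the component PDFs
    # and dividing by their number equals taking their mean.
    return np.mean([scipy.stats.multivariate_normal.pdf(points, m, c)
                    for m, c in zip(means, covs)], axis=0)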
Code example #7
File: plot_tree_gauss.py  Project: loli/sklearnef
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)
    
    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    
    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)
    
    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))
    
    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train, (X_train, X_train_unlabelled, X_train_labelled, means))
    
    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    
    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(random_state=args.seed,
                                               max_depth=args.max_depth,
                                               max_features=args.max_features,
                                               supervised_weight=args.supervised_weight,
                                               min_improvement=args.min_improvement,
                                               transduction_method=args.transduction_method,
                                               unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)
    
    # ----- plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)
    
    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    
    # ----- Ground truth distribution -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov) for mean, cov in zip(means, covs)], 0)
    prob_gt /= args.n_clusters # normalize
    
    # ----- Transduction -----
    y_train_result = clf.transduced_labels_
    
    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)
    
    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y) 
    
    # colour range: pdf
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = max(prob_gt.max(), pdf.max())
    
    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_gt.astype(np.uint8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    img = plt.imshow(prob_gt.T, extent=[min(x),max(x),min(y),max(y)], interpolation='none',
                     cmap=plt.cm.afmhot, aspect='auto', origin='lower',
                     vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5) #'auto'
    plt.colorbar(img)
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Ground-truth: PDF + samples')
    
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_result.astype(np.uint8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    img = plt.imshow(pdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
                     interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
                     vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5) #'auto'
    plt.colorbar(img)
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned: PDF + samples labelled through transduction')
    
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_prediction.astype(np.int8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.int8)), s=100)
    plt.colorbar(img) # just for scale
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned: a-posteriori classification / induction')
    
    # add split-lines
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)
    
    plt.show()
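draw_split_lines is defined elsewhere in the project. For axis-aligned trees its job is presumably to draw each internal split threshold across the plot; a hypothetical sketch, assuming the fitted tree exposes sklearn-style tree_ arrays (leaves carry feature == -2):

import matplotlib.pyplot as plt

def draw_split_lines(clf, x, y):
    # Hypothetical sketch: one vertical/horizontal line per internal
    # split of a fitted axis-aligned decision tree.
    t = clf.tree_
    for feature, threshold in zip(t.feature, t.threshold):
        if feature == 0:
            plt.vlines(threshold, min(y), max(y), colors='c', linewidth=.5)
        elif feature == 1:
            plt.hlines(threshold, min(x), max(x), colors='c', linewidth=.5)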
Code example #8
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)
    
    # make dataset
    X, y = make_sklearn_dataset(args.dataset, args.n_samples)
        
    # normalize
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)
    
    # ----- Create training and testing sets
    labelled_mask = np.zeros(y.shape, np.bool)
    for cid in np.unique(y):
        m = (cid == y)
        for _ in range(args.n_labelled):
            repeat = True
            while repeat:
                sel = np.random.randint(0, y.size)
                repeat = not (m[sel] and not labelled_mask[sel]) # belonging to target class AND not yet selected
            labelled_mask[sel] = True
    
    X_train = X
    X_train_unlabelled = X_train[~labelled_mask]
    y_train = y.copy()
    y_train[~labelled_mask] = -1
    y_train_unlabelled_gt = y[~labelled_mask]
    X_train_labelled = X_train[labelled_mask]
    y_train_labelled = y[labelled_mask]
    
    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))
    
    # ----- Grid -----
    grid = generate_grid(X_train, X_train.std(), args.resolution)
    
    # ----- Training -----
    clf = SemiSupervisedRandomForestClassifier(random_state=args.seed,
                                               n_estimators=args.n_trees,
                                               max_depth=args.max_depth,
                                               max_features=args.max_features,
                                               supervised_weight=args.supervised_weight,
                                               min_improvement=args.min_improvement,
                                               transduction_method=args.transduction_method,
                                               unsupervised_transformation=None)
    clf.fit(X_train, y_train)
    
    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape((np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    
    # ----- Transduction -----
    y_train_result = clf.transduced_labels_
    
    # ----- A-posteriori classification -----
    y_train_prediction = clf.predict(X_train_unlabelled)
    
    # ----- Scoring -----
    print 'SCORES:'
    print '\t', accuracy_score(y_train_unlabelled_gt, y_train_result), 'Labeling through transduction'
    print '\t', accuracy_score(y_train_unlabelled_gt, y_train_prediction), 'Labeling through classification'
    
    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y) 
    
    # colour range: pdf
    pdf_vmin = pdf.min()
    pdf_vmax = pdf.max()
    
    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_unlabelled_gt.astype(np.uint8)), s=20, alpha=.6)
    plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)

    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Ground-truth: PDF + samples')
    
    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)
    
    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    img = plt.imshow(pdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
                     interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
                     vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5) #'auto'
    plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_result.astype(np.uint8)), s=20, alpha=.6)
    plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    plt.colorbar(img)
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned forest: PDF + samples labelled through transduction')
    
    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)
    
    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    img = plt.imshow(pdf.reshape((x.size,y.size)).T, extent=[min(x),max(x),min(y),max(y)],
                     interpolation='none', cmap=plt.cm.afmhot, aspect='auto', origin='lower',
                     vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5) #'auto'
    #plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_prediction.astype(np.uint8)), s=20, alpha=.6)
    #plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    plt.colorbar(img)
    
    plt.xlim(min(x),max(x))
    plt.ylim(min(y),max(y))
    plt.title('Learned forest: PDF + a-posteriori classification')
    
    # add split-lines
    if args.split_lines:
        draw_split_lines(clf.estimators_[0], x, y)
    
    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
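The rejection loop that builds labelled_mask (here and in code example #2) can be written more directly with np.random.choice; a functionally equivalent sketch (the random draws themselves will differ):

import numpy as np

def sample_labelled_mask(y, n_labelled):
    # Mark n_labelled distinct samples of each class as labelled, as the
    # rejection loop above does.
    mask = np.zeros(y.shape, bool)
    for cid in np.unique(y):
        idx = np.flatnonzero(y == cid)
        mask[np.random.choice(idx, n_labelled, replace=False)] = True
    return mask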
Code example #9
File: plot_tree_gauss.py  Project: loli/sklearnef
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)

    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)

    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in the right space
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)

    # ----- Training -----
    clf = DensityTree(random_state=args.seed,
                      min_samples_leaf=2,
                      max_depth=args.max_depth,
                      max_features=args.max_features,
                      min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.product(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    cdf = clf.cdf(X_test_pred)

    # ----- Ground truth -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Goodness of fit measure -----
    X_eval = np.concatenate([
        scipy.stats.multivariate_normal.rvs(mean, cov, args.n_samples)
        for mean, cov in zip(means, covs)
    ])
    gof = GoodnessOfFit(clf.cdf, X_eval)
    print 'Goodness of fit evaluation:'
    print '\tmaximum error:', gof.maximum()
    print '\tmean squared error:', gof.mean_squared_error()
    print '\tmean squared error weighted:', gof.mean_squared_error_weighted(
        clf.pdf)

    # ----- E(M)CDF -----
    emcdf = gof.ecdf(X_test_pred)

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # colour range: pdf
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = max(prob_gt.max(), pdf.max())

    # plot: gt - pdf
    plt.subplot(4, 1, 1)
    plt.imshow(prob_gt.T,
               extent=[min(x), max(x), min(y), max(y)],
               interpolation='none',
               cmap=plt.cm.afmhot,
               aspect='auto',
               origin='lower',
               vmin=pdf_vmin,
               vmax=pdf_vmax)  #'auto'
    plt.colorbar()

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: PDF')

    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(4, 1, 2)
    plt.imshow(pdf.reshape((x.size, y.size)).T,
               extent=[min(x), max(x), min(y), max(y)],
               interpolation='none',
               cmap=plt.cm.afmhot,
               aspect='auto',
               origin='lower',
               vmin=pdf_vmin,
               vmax=pdf_vmax)  #'auto'
    plt.colorbar()

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: PDF')

    # add split-lines
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # colour range: cdf
    cdf_vmin = min(emcdf.min(), cdf.min())
    cdf_vmax = max(emcdf.max(), cdf.max())

    # plot: gt - ecdf
    plt.subplot(4, 1, 3)
    plt.imshow(emcdf.reshape((x.size, y.size)).T,
               extent=[min(x), max(x), min(y), max(y)],
               interpolation='none',
               cmap=plt.cm.afmhot,
               aspect='auto',
               origin='lower',
               vmin=cdf_vmin,
               vmax=cdf_vmax)  #'auto'
    plt.colorbar()
    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: Empirical CDF')

    # plot: cdf
    plt.subplot(4, 1, 4)
    plt.imshow(cdf.reshape((x.size, y.size)).T,
               extent=[min(x), max(x), min(y), max(y)],
               interpolation='none',
               cmap=plt.cm.afmhot,
               aspect='auto',
               origin='lower',
               vmin=cdf_vmin,
               vmax=cdf_vmax)  #'auto'
    plt.colorbar()

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: CDF')

    plt.show()
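A quick sanity check that complements the goodness-of-fit numbers, though it is not part of the original script: a proper density should integrate to roughly one over a grid that covers its support. With the regularly spaced x/y vectors from the listing:

import numpy as np

def riemann_mass(pdf_values, x, y):
    # Approximate integral of the learned PDF over the evaluation grid;
    # expect a value near 1.0 when the grid covers the support.
    cell_area = (x[1] - x[0]) * (y[1] - y[0])
    return pdf_values.sum() * cell_area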