def main():
    """Fit a DensityForest on sampled cluster data and plot it against the ground truth.

    Configuration comes from ``getArguments(getParser())``.  NumPy's global
    RNG is seeded with ``args.seed``, clusters are generated and sampled,
    the forest is fitted on the training samples and then evaluated on a
    regular grid.  Two stacked panels are shown: the ground-truth mixture
    density ('GT') and the forest output ('Prediction').
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    # Only the feature arrays are needed here; labels are ignored.
    (X_train, X_train_unlabelled, X_train_labelled), _, _ = sample_data(
        means, covs, args.n_samples)

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)

    # ----- Ground truth -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    def show_panel(position, image, title):
        """Draw one heat-map panel with colour bar, axis limits and title."""
        plt.subplot(2, 1, position)
        plt.imshow(image,
                   extent=extent,
                   interpolation='none',
                   cmap=plt.cm.afmhot,
                   aspect='auto',
                   origin='lower')
        plt.colorbar()
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)

    # first plot: gt
    show_panel(1, prob_gt.T, 'GT')
    # second plot: prediction
    show_panel(2, prob_predict.reshape((xs.size, ys.size)).T, 'Prediction')

    plt.show()
def main():
    """Run the semi-supervised random forest demo on a toy dataset and plot the result.

    Configuration comes from ``getArguments(getParser())``.  The steps are:

    1. Build the dataset with ``make_sklearn_dataset`` and optionally
       standardise it.
    2. Randomly pick ``args.n_labelled`` samples per class to keep their
       label; every other sample gets the label ``-1``.
    3. Fit a ``SemiSupervisedRandomForestClassifier`` on all samples.
    4. Print the accuracy of the transduced labels and of ``predict`` on the
       unlabelled samples.
    5. Draw three stacked panels and either save them to ``args.save`` or
       show them.

    Raises:
        ValueError: if a class contains fewer than ``args.n_labelled``
            samples (the original selection loop would never terminate).
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # make dataset
    X, y = make_sklearn_dataset(args.dataset, args.n_samples)

    # normalize
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)

    # ----- Create training and testing sets
    # Builtin bool instead of np.bool: the alias was removed in NumPy 1.24.
    labelled_mask = np.zeros(y.shape, bool)
    for cid in np.unique(y):
        in_class = (cid == y)
        available = int(in_class.sum())
        if args.n_labelled > available:
            raise ValueError(
                'Cannot label %d samples of class %r: only %d available.'
                % (args.n_labelled, cid, available))
        for _ in range(args.n_labelled):
            # Rejection sampling over all indices: redraw until the index
            # belongs to the target class AND has not yet been selected.
            while True:
                sel = np.random.randint(0, y.size)
                if in_class[sel] and not labelled_mask[sel]:
                    break
            labelled_mask[sel] = True

    X_train = X
    X_train_unlabelled = X_train[~labelled_mask]
    y_train = y.copy()
    y_train[~labelled_mask] = -1  # -1 marks a sample as unlabelled
    y_train_unlabelled_gt = y[~labelled_mask]
    X_train_labelled = X_train[labelled_mask]
    y_train_labelled = y[labelled_mask]

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Grid -----
    grid = generate_grid(X_train, X_train.std(), args.resolution)

    # ----- Training -----
    clf = SemiSupervisedRandomForestClassifier(
        random_state=args.seed,
        n_estimators=args.n_trees,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation=None)
    clf.fit(X_train, y_train)

    # ----- Learned distribution -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Scoring -----
    # Single-argument print calls: valid on Python 2 and 3, same output as
    # the original print statements.
    print('SCORES:')
    print('\t%s Labeling through transduction'
          % (accuracy_score(y_train_unlabelled_gt, y_train_result),))
    print('\t%s Labeling through classification'
          % (accuracy_score(y_train_unlabelled_gt, y_train_prediction),))

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    # colour range: pdf
    pdf_vmin = pdf.min()
    pdf_vmax = pdf.max()
    pdf_image = pdf.reshape((xs.size, ys.size)).T

    def scatter_samples(unlabelled_classes):
        """Scatter unlabelled (small) and labelled (large) training samples."""
        plt.scatter(X_train_unlabelled[:, 0],
                    X_train_unlabelled[:, 1],
                    c=cmap(unlabelled_classes.astype(np.uint8)),
                    s=20,
                    alpha=.6)
        plt.scatter(X_train_labelled[:, 0],
                    X_train_labelled[:, 1],
                    c=cmap(y_train_labelled.astype(np.uint8)),
                    s=100)

    def show_pdf():
        """Draw the learned PDF as a semi-transparent heat map."""
        return plt.imshow(pdf_image,
                          extent=extent,
                          interpolation='none',
                          cmap=plt.cm.afmhot,
                          aspect='auto',
                          origin='lower',
                          vmin=pdf_vmin,
                          vmax=pdf_vmax,
                          alpha=.5)

    def finish_panel(title):
        """Apply axis limits and title, then optionally overlay split lines."""
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if args.split_lines:
            # Only the first tree of the forest is visualised.
            draw_split_lines(clf.estimators_[0], xs, ys)

    # plot: gt - samples coloured by their true class
    plt.subplot(3, 1, 1)
    scatter_samples(y_train_unlabelled_gt)
    finish_panel('Ground-truth: PDF + samples')

    # plot: learned - pdf, samples coloured by transduced label
    plt.subplot(3, 1, 2)
    img = show_pdf()
    scatter_samples(y_train_result)
    plt.colorbar(img)
    finish_panel('Learned forest: PDF + samples labelled through transduction')

    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    img = show_pdf()
    # NOTE(review): the sample overlay for this panel was disabled in the
    # original; kept disabled here.
    #plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_prediction.astype(np.uint8)), s=20, alpha=.6)
    #plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    plt.colorbar(img)
    finish_panel('Learned forest: PDF + a-posteriori classification')

    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
def main():
    """Fit a DensityTree on sampled cluster data and compare its PDF/CDF with references.

    Configuration comes from ``getArguments(getParser())``.  After seeding
    NumPy's global RNG, clusters are generated and sampled, a ``DensityTree``
    is fitted, and its ``pdf`` and ``cdf`` are evaluated on a regular grid.
    Goodness-of-fit figures from ``GoodnessOfFit`` are printed, then four
    stacked panels are shown: ground-truth PDF, learned PDF, empirical CDF
    and learned CDF.
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    # Only the feature arrays are needed here; labels are ignored.
    (X_train, X_train_unlabelled, X_train_labelled), _, _ = sample_data(
        means, covs, args.n_samples)

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = DensityTree(random_state=args.seed,
                      min_samples_leaf=2,
                      max_depth=args.max_depth,
                      max_features=args.max_features,
                      min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    cdf = clf.cdf(X_test_pred)

    # ----- Ground truth -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Goodness of fit measure -----
    # Fresh samples drawn from the true clusters serve as evaluation set.
    X_eval = np.concatenate([
        scipy.stats.multivariate_normal.rvs(mean, cov, args.n_samples)
        for mean, cov in zip(means, covs)
    ])
    gof = GoodnessOfFit(clf.cdf, X_eval)
    # Single-argument print calls: valid on Python 2 and 3, same output as
    # the original print statements.
    print('Goodness of fit evaluation:')
    print('\tmaxium error: %s' % (gof.maximum(),))
    print('\tmean squared error: %s' % (gof.mean_squared_error(),))
    print('\tmean squared error weighted: %s'
          % (gof.mean_squared_error_weighted(clf.pdf),))

    # ----- E(M)CDF -----
    emcdf = gof.ecdf(X_test_pred)

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]
    grid_shape = (xs.size, ys.size)

    def show_panel(position, image, vmin, vmax, title, with_split_lines):
        """Draw one heat-map panel; optionally overlay the tree's split lines."""
        plt.subplot(4, 1, position)
        plt.imshow(image,
                   extent=extent,
                   interpolation='none',
                   cmap=plt.cm.afmhot,
                   aspect='auto',
                   origin='lower',
                   vmin=vmin,
                   vmax=vmax)
        plt.colorbar()
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if with_split_lines and not args.no_split_lines:
            draw_split_lines(clf, xs, ys)

    # colour range: pdf
    # NOTE(review): vmax is the SMALLER of the two maxima, so the panel with
    # the higher peak saturates -- confirm this clipping is intended.
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = min(prob_gt.max(), pdf.max())

    # plot: gt - pdf
    show_panel(1, prob_gt.T, pdf_vmin, pdf_vmax, 'Ground-truth: PDF', True)
    # plot: learned - pdf
    show_panel(2, pdf.reshape(grid_shape).T, pdf_vmin, pdf_vmax,
               'Learned: PDF', True)

    # colour range: cdf (same min-of-maxima convention as for the PDF)
    cdf_vmin = min(emcdf.min(), cdf.min())
    cdf_vmax = min(emcdf.max(), cdf.max())

    # plot: gt - ecdf
    show_panel(3, emcdf.reshape(grid_shape).T, cdf_vmin, cdf_vmax,
               'Ground-truth: Empirical CDF', False)
    # plot: cdf
    show_panel(4, cdf.reshape(grid_shape).T, cdf_vmin, cdf_vmax,
               'Learned: CDF', False)

    plt.show()
def main():
    """Fit a semi-supervised decision tree on sampled cluster data and plot the outcome.

    Configuration comes from ``getArguments(getParser())``.  After seeding
    NumPy's global RNG, clusters are generated and sampled, a
    ``SemiSupervisedDecisionTreeClassifier`` is fitted, the tree is passed to
    ``export_graphviz``, and three stacked panels are shown:

    1. ground-truth PDF with the samples coloured by ``y_train_gt``,
    2. learned PDF with the samples coloured by the transduced labels,
    3. the samples coloured by ``clf.predict`` (no PDF image).
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    (X_train, X_train_unlabelled, X_train_labelled),\
        (y_train, _, y_train_labelled),\
        y_train_gt = sample_data(means, covs, args.n_samples)

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=args.seed,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)

    # ----- plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)

    # ----- Learned distribution -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Ground truth distribution -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    # colour range: pdf
    # NOTE(review): vmax is the SMALLER of the two maxima, so the panel with
    # the higher peak saturates -- confirm this clipping is intended.
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = min(prob_gt.max(), pdf.max())

    def scatter_samples(unlabelled_classes, index_dtype):
        """Scatter unlabelled (small) and labelled (large) training samples."""
        plt.scatter(X_train_unlabelled[:, 0],
                    X_train_unlabelled[:, 1],
                    c=cmap(unlabelled_classes.astype(index_dtype)),
                    s=20,
                    alpha=.5)
        plt.scatter(X_train_labelled[:, 0],
                    X_train_labelled[:, 1],
                    c=cmap(y_train_labelled.astype(index_dtype)),
                    s=100)

    def show_density(image):
        """Draw a density as a semi-transparent heat map."""
        return plt.imshow(image,
                          extent=extent,
                          interpolation='none',
                          cmap=plt.cm.afmhot,
                          aspect='auto',
                          origin='lower',
                          vmin=pdf_vmin,
                          vmax=pdf_vmax,
                          alpha=.5)

    def finish_panel(title):
        """Apply axis limits and title, then optionally overlay split lines."""
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if not args.no_split_lines:
            draw_split_lines(clf, xs, ys)

    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    scatter_samples(y_train_gt, np.uint8)
    img = show_density(prob_gt.T)
    plt.colorbar(img)
    finish_panel('Ground-truth: PDF + samples')

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    scatter_samples(y_train_result, np.uint8)
    img = show_density(pdf.reshape((xs.size, ys.size)).T)
    plt.colorbar(img)
    finish_panel('Learned: PDF + samples labelled through transduction')

    # plot: learned - a-posteriori classification
    plt.subplot(3, 1, 3)
    # NOTE(review): this panel casts labels to int8 while the others use
    # uint8; preserved as in the original -- confirm whether intentional.
    scatter_samples(y_train_prediction, np.int8)
    plt.colorbar(img)  # just for scale; reuses the image of the second panel
    finish_panel('Learned: a-posteriori classification / induction')

    plt.show()
def main():
    """Fit a DensityForest on a scikit-learn toy dataset and plot samples and/or density.

    Configuration comes from ``getArguments(getParser())``.  ``args.dataset``
    selects one of the generators below; for the two 3-D generators
    (``s_curve``, ``swiss_roll``) only the first and third coordinate are
    kept.  The forest is fitted on the (optionally standardised) samples and
    evaluated on a regular grid.  Depending on ``args.skipgt`` and
    ``args.skipdensity`` the figure holds the sample plot, the density plot,
    or both; it is saved to ``args.save`` if given, otherwise shown.

    Raises:
        ValueError: if ``args.dataset`` is not one of the supported names.
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # create dataset
    name = args.dataset
    if name == 'circles_distant':
        dataset = datasets.make_circles(n_samples=args.n_samples,
                                        factor=.5,
                                        noise=.05)
    elif name == 'moons':
        dataset = datasets.make_moons(n_samples=args.n_samples, noise=.05)
    elif name == 'blobs':
        dataset = datasets.make_blobs(n_samples=args.n_samples,
                                      random_state=8)
    elif name == 'circles_near':
        dataset = datasets.make_circles(n_samples=args.n_samples, noise=.05)
    elif name == 's_curve':
        dataset = datasets.make_s_curve(n_samples=args.n_samples, noise=.05)
        # keep only coordinates 0 and 2 to obtain a 2-D dataset
        dataset = np.vstack((dataset[0][:, 0], dataset[0][:, 2])).T, None
    elif name == 'swiss_roll':
        dataset = datasets.make_swiss_roll(n_samples=args.n_samples,
                                           noise=.05)
        # keep only coordinates 0 and 2 to obtain a 2-D dataset
        dataset = np.vstack((dataset[0][:, 0], dataset[0][:, 2])).T, None
    else:
        # Previously `dataset` stayed unbound and the unpacking below failed
        # with an unrelated-looking error.
        raise ValueError('Unknown dataset: %r' % (name,))

    # split and normalize
    X, _ = dataset
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)

    # ----- Grid -----
    grid = generate_grid(X, X.std(), args.resolution)

    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)
    clf.fit(X)

    # ----- Prediction -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)

    # NOTE(review): `axisbg` was renamed to `facecolor` in matplotlib 2.0 and
    # later removed; kept as-is to match the matplotlib version this script
    # targets -- update if running on a newer matplotlib.
    if args.skipgt:
        # Density only.  The index must be 1: a 1x1 grid has no position 2,
        # so the original subplot(1, 1, 2) raised ValueError.
        plt.subplot(1, 1, 1)
        plot_density(prob_predict, xs, ys, args)
    elif args.skipdensity:
        # Samples only.
        plt.subplot(1, 1, 1, axisbg='k')
        plot_gt(X, xs, ys, args)
    else:
        # Samples on top, density below.
        plt.subplot(2, 1, 1, axisbg='k')
        plot_gt(X, xs, ys, args)
        plt.subplot(2, 1, 2)
        plot_density(prob_predict, xs, ys, args)

    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
def main():
    """Fit a DensityForest on sampled cluster data and plot it against the ground truth.

    Configuration comes from ``getArguments(getParser())``.  NumPy's global
    RNG is seeded with ``args.seed``, clusters are generated and sampled,
    the forest is fitted on the training samples and then evaluated on a
    regular grid.  Two stacked panels are shown: the ground-truth mixture
    density ('GT') and the forest output ('Prediction').
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    # Only the feature arrays are needed here; labels are ignored.
    (X_train, X_train_unlabelled, X_train_labelled), _, _ = sample_data(
        means, covs, args.n_samples)

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = DensityForest(n_estimators=args.n_trees,
                        random_state=args.seed,
                        min_samples_leaf=2,
                        n_jobs=-1,
                        max_depth=args.max_depth,
                        max_features=args.max_features,
                        min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    prob_predict = clf.predict_proba(X_test_pred)

    # ----- Ground truth -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    def show_panel(position, image, title):
        """Draw one heat-map panel with colour bar, axis limits and title."""
        plt.subplot(2, 1, position)
        plt.imshow(image,
                   extent=extent,
                   interpolation='none',
                   cmap=plt.cm.afmhot,
                   aspect='auto',
                   origin='lower')
        plt.colorbar()
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)

    # first plot: gt
    show_panel(1, prob_gt.T, 'GT')
    # second plot: prediction
    show_panel(2, prob_predict.reshape((xs.size, ys.size)).T, 'Prediction')

    plt.show()
def main():
    """Fit a semi-supervised decision tree on sampled cluster data and plot the outcome.

    Configuration comes from ``getArguments(getParser())``.  After seeding
    NumPy's global RNG, clusters are generated and sampled, a
    ``SemiSupervisedDecisionTreeClassifier`` is fitted, the tree is passed to
    ``export_graphviz``, and three stacked panels are shown:

    1. ground-truth PDF with the samples coloured by ``y_train_gt``,
    2. learned PDF with the samples coloured by the transduced labels,
    3. the samples coloured by ``clf.predict`` (no PDF image).
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    (X_train, X_train_unlabelled, X_train_labelled),\
        (y_train, _, y_train_labelled),\
        y_train_gt = sample_data(means, covs, args.n_samples)

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=args.seed,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)

    # ----- plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)

    # ----- Learned distribution -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Ground truth distribution -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    # colour range: pdf
    # NOTE(review): vmax is the SMALLER of the two maxima, so the panel with
    # the higher peak saturates -- confirm this clipping is intended.
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = min(prob_gt.max(), pdf.max())

    def scatter_samples(unlabelled_classes, index_dtype):
        """Scatter unlabelled (small) and labelled (large) training samples."""
        plt.scatter(X_train_unlabelled[:, 0],
                    X_train_unlabelled[:, 1],
                    c=cmap(unlabelled_classes.astype(index_dtype)),
                    s=20,
                    alpha=.5)
        plt.scatter(X_train_labelled[:, 0],
                    X_train_labelled[:, 1],
                    c=cmap(y_train_labelled.astype(index_dtype)),
                    s=100)

    def show_density(image):
        """Draw a density as a semi-transparent heat map."""
        return plt.imshow(image,
                          extent=extent,
                          interpolation='none',
                          cmap=plt.cm.afmhot,
                          aspect='auto',
                          origin='lower',
                          vmin=pdf_vmin,
                          vmax=pdf_vmax,
                          alpha=.5)

    def finish_panel(title):
        """Apply axis limits and title, then optionally overlay split lines."""
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if not args.no_split_lines:
            draw_split_lines(clf, xs, ys)

    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    scatter_samples(y_train_gt, np.uint8)
    img = show_density(prob_gt.T)
    plt.colorbar(img)
    finish_panel('Ground-truth: PDF + samples')

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    scatter_samples(y_train_result, np.uint8)
    img = show_density(pdf.reshape((xs.size, ys.size)).T)
    plt.colorbar(img)
    finish_panel('Learned: PDF + samples labelled through transduction')

    # plot: learned - a-posteriori classification
    plt.subplot(3, 1, 3)
    # NOTE(review): this panel casts labels to int8 while the others use
    # uint8; preserved as in the original -- confirm whether intentional.
    scatter_samples(y_train_prediction, np.int8)
    plt.colorbar(img)  # just for scale; reuses the image of the second panel
    finish_panel('Learned: a-posteriori classification / induction')

    plt.show()
def main():
    """Run the semi-supervised random forest demo on a toy dataset and plot the result.

    Configuration comes from ``getArguments(getParser())``.  The steps are:

    1. Build the dataset with ``make_sklearn_dataset`` and optionally
       standardise it.
    2. Randomly pick ``args.n_labelled`` samples per class to keep their
       label; every other sample gets the label ``-1``.
    3. Fit a ``SemiSupervisedRandomForestClassifier`` on all samples.
    4. Print the accuracy of the transduced labels and of ``predict`` on the
       unlabelled samples.
    5. Draw three stacked panels and either save them to ``args.save`` or
       show them.

    Raises:
        ValueError: if a class contains fewer than ``args.n_labelled``
            samples (the original selection loop would never terminate).
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # make dataset
    X, y = make_sklearn_dataset(args.dataset, args.n_samples)

    # normalize
    if args.scaling:
        X = StandardScaler().fit_transform(X).astype(np.float32)

    # ----- Create training and testing sets
    # Builtin bool instead of np.bool: the alias was removed in NumPy 1.24.
    labelled_mask = np.zeros(y.shape, bool)
    for cid in np.unique(y):
        in_class = (cid == y)
        available = int(in_class.sum())
        if args.n_labelled > available:
            raise ValueError(
                'Cannot label %d samples of class %r: only %d available.'
                % (args.n_labelled, cid, available))
        for _ in range(args.n_labelled):
            # Rejection sampling over all indices: redraw until the index
            # belongs to the target class AND has not yet been selected.
            while True:
                sel = np.random.randint(0, y.size)
                if in_class[sel] and not labelled_mask[sel]:
                    break
            labelled_mask[sel] = True

    X_train = X
    X_train_unlabelled = X_train[~labelled_mask]
    y_train = y.copy()
    y_train[~labelled_mask] = -1  # -1 marks a sample as unlabelled
    y_train_unlabelled_gt = y[~labelled_mask]
    X_train_labelled = X_train[labelled_mask]
    y_train_labelled = y[labelled_mask]

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Grid -----
    grid = generate_grid(X_train, X_train.std(), args.resolution)

    # ----- Training -----
    clf = SemiSupervisedRandomForestClassifier(
        random_state=args.seed,
        n_estimators=args.n_trees,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation=None)
    clf.fit(X_train, y_train)

    # ----- Learned distribution -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Scoring -----
    # Single-argument print calls: valid on Python 2 and 3, same output as
    # the original print statements.
    print('SCORES:')
    print('\t%s Labeling through transduction'
          % (accuracy_score(y_train_unlabelled_gt, y_train_result),))
    print('\t%s Labeling through classification'
          % (accuracy_score(y_train_unlabelled_gt, y_train_prediction),))

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]

    # colour range: pdf
    pdf_vmin = pdf.min()
    pdf_vmax = pdf.max()
    pdf_image = pdf.reshape((xs.size, ys.size)).T

    def scatter_samples(unlabelled_classes):
        """Scatter unlabelled (small) and labelled (large) training samples."""
        plt.scatter(X_train_unlabelled[:, 0],
                    X_train_unlabelled[:, 1],
                    c=cmap(unlabelled_classes.astype(np.uint8)),
                    s=20,
                    alpha=.6)
        plt.scatter(X_train_labelled[:, 0],
                    X_train_labelled[:, 1],
                    c=cmap(y_train_labelled.astype(np.uint8)),
                    s=100)

    def show_pdf():
        """Draw the learned PDF as a semi-transparent heat map."""
        return plt.imshow(pdf_image,
                          extent=extent,
                          interpolation='none',
                          cmap=plt.cm.afmhot,
                          aspect='auto',
                          origin='lower',
                          vmin=pdf_vmin,
                          vmax=pdf_vmax,
                          alpha=.5)

    def finish_panel(title):
        """Apply axis limits and title, then optionally overlay split lines."""
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if args.split_lines:
            # Only the first tree of the forest is visualised.
            draw_split_lines(clf.estimators_[0], xs, ys)

    # plot: gt - samples coloured by their true class
    plt.subplot(3, 1, 1)
    scatter_samples(y_train_unlabelled_gt)
    finish_panel('Ground-truth: PDF + samples')

    # plot: learned - pdf, samples coloured by transduced label
    plt.subplot(3, 1, 2)
    img = show_pdf()
    scatter_samples(y_train_result)
    plt.colorbar(img)
    finish_panel('Learned forest: PDF + samples labelled through transduction')

    # plot: learned - pdf
    plt.subplot(3, 1, 3)
    img = show_pdf()
    # NOTE(review): the sample overlay for this panel was disabled in the
    # original; kept disabled here.
    #plt.scatter(X_train_unlabelled[:,0], X_train_unlabelled[:,1], c=cmap(y_train_prediction.astype(np.uint8)), s=20, alpha=.6)
    #plt.scatter(X_train_labelled[:,0], X_train_labelled[:,1], c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    plt.colorbar(img)
    finish_panel('Learned forest: PDF + a-posteriori classification')

    if args.save:
        plt.savefig(args.save)
    else:
        plt.show()
def main():
    """Fit a DensityTree on sampled cluster data and compare its PDF/CDF with references.

    Configuration comes from ``getArguments(getParser())``.  After seeding
    NumPy's global RNG, clusters are generated and sampled, a ``DensityTree``
    is fitted, and its ``pdf`` and ``cdf`` are evaluated on a regular grid.
    Goodness-of-fit figures from ``GoodnessOfFit`` are printed, then four
    stacked panels are shown: ground-truth PDF, learned PDF, empirical CDF
    and learned CDF.
    """
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    # Only the feature arrays are needed here; labels are ignored.
    (X_train, X_train_unlabelled, X_train_labelled), _, _ = sample_data(
        means, covs, args.n_samples)

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space.
    # NOTE(review): the return value is discarded, so scale_data presumably
    # rescales the passed arrays in place -- confirm against its definition.
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)
    # Move the coordinate axis last: one coordinate vector per grid cell.
    grid_points = np.rollaxis(grid, 0, 3)

    # ----- Training -----
    clf = DensityTree(random_state=args.seed,
                      min_samples_leaf=2,
                      max_depth=args.max_depth,
                      max_features=args.max_features,
                      min_improvement=args.min_improvement)
    clf.fit(X_train)

    # ----- Prediction -----
    # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
    X_test_pred = grid_points.reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)
    cdf = clf.cdf(X_test_pred)

    # ----- Ground truth -----
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(grid_points, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Goodness of fit measure -----
    # Fresh samples drawn from the true clusters serve as evaluation set.
    X_eval = np.concatenate([
        scipy.stats.multivariate_normal.rvs(mean, cov, args.n_samples)
        for mean, cov in zip(means, covs)
    ])
    gof = GoodnessOfFit(clf.cdf, X_eval)
    # Single-argument print calls: valid on Python 2 and 3, same output as
    # the original print statements.
    print('Goodness of fit evaluation:')
    print('\tmaxium error: %s' % (gof.maximum(),))
    print('\tmean squared error: %s' % (gof.mean_squared_error(),))
    print('\tmean squared error weighted: %s'
          % (gof.mean_squared_error_weighted(clf.pdf),))

    # ----- E(M)CDF -----
    emcdf = gof.ecdf(X_test_pred)

    # ----- Plotting -----
    xs, ys = grid
    xs = np.unique(xs)
    ys = np.unique(ys)
    extent = [min(xs), max(xs), min(ys), max(ys)]
    grid_shape = (xs.size, ys.size)

    def show_panel(position, image, vmin, vmax, title, with_split_lines):
        """Draw one heat-map panel; optionally overlay the tree's split lines."""
        plt.subplot(4, 1, position)
        plt.imshow(image,
                   extent=extent,
                   interpolation='none',
                   cmap=plt.cm.afmhot,
                   aspect='auto',
                   origin='lower',
                   vmin=vmin,
                   vmax=vmax)
        plt.colorbar()
        plt.xlim(extent[0], extent[1])
        plt.ylim(extent[2], extent[3])
        plt.title(title)
        if with_split_lines and not args.no_split_lines:
            draw_split_lines(clf, xs, ys)

    # colour range: pdf
    # NOTE(review): vmax is the SMALLER of the two maxima, so the panel with
    # the higher peak saturates -- confirm this clipping is intended.
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = min(prob_gt.max(), pdf.max())

    # plot: gt - pdf
    show_panel(1, prob_gt.T, pdf_vmin, pdf_vmax, 'Ground-truth: PDF', True)
    # plot: learned - pdf
    show_panel(2, pdf.reshape(grid_shape).T, pdf_vmin, pdf_vmax,
               'Learned: PDF', True)

    # colour range: cdf (same min-of-maxima convention as for the PDF)
    cdf_vmin = min(emcdf.min(), cdf.min())
    cdf_vmax = min(emcdf.max(), cdf.max())

    # plot: gt - ecdf
    show_panel(3, emcdf.reshape(grid_shape).T, cdf_vmin, cdf_vmax,
               'Ground-truth: Empirical CDF', False)
    # plot: cdf
    show_panel(4, cdf.reshape(grid_shape).T, cdf_vmin, cdf_vmax,
               'Learned: CDF', False)

    plt.show()