def dr_cluster(X, y, X_test, dr_steps, savedir, ds):
    """
    Run KMeans and EM clustering after each dimensionality reduction step.

    For every entry of ``dr_steps`` this calls ``A3.cluster`` over cluster
    counts ``range(2, 21)`` and keeps, from each of the four returned
    sequences, the single entry chosen for the dataset ``ds``.

    :param X: np array, training samples
    :param y: np array, labels
    :param X_test: np.array, test data
    :param dr_steps: list of dimensionality reduction objects
    :param savedir: string, output directory
    :param ds: string, name of dataset ('musk', 'cancer' or 'shoppers')
    :return: tuple of four lists (km train, em train, km test, em test),
        each holding one selected clustering per dr step
    """
    # Position, within each sequence returned by A3.cluster, of the entry
    # to keep for each dataset.
    chosen_position = {'musk': 1, 'cancer': 0, 'shoppers': 0}

    # Order matches the return contract: km, em, km_test, em_test.
    selected = ([], [], [], [])

    for reducer in dr_steps:
        km, em, km_test, em_test = A3.cluster(
            range(2, 21), X, y, savedir, ds,
            tnse_range=range(3, 5),
            dr_step=reducer,
            X_test=X_test)
        # Looked up after clustering, so an unknown dataset name only fails
        # once a dr step has actually been processed.
        position = chosen_position[ds]
        for bucket, clusterings in zip(selected,
                                       (km, em, km_test, em_test)):
            bucket.append(clusterings[position])

    return selected
def main():
    """
    Run the experiment pipeline for the 'musk' and 'shoppers' datasets.

    For each dataset the data is loaded and partitioned, then the following
    stages are run depending on ``args.phase``:

    * ``cluster`` / ``cluster-ann``: cluster the raw features.
    * ``cluster-ann``: train networks on features augmented with (or made
      only of) cluster assignments and write the scores to
      ``<savedir>/<ds>-clusternn.csv``.
    * dimensionality reduction: always run.
    * ``dr-cluster``: cluster the dimension-reduced data.
    * DR + ANN: always run.

    Log output goes both to ``<savedir>/output.log`` and to the console.
    """
    args = get_args()
    savedir = util.mktmpdir(args.outdir)

    # Logging copy-pasted from logging cookbook
    # http://docs.python.org/howto/logging-cookbook.html#logging-to-multiple-destinations
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename='{}/output.log'.format(savedir),
                        level=logging.INFO)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    for ds in ['musk', 'shoppers']:
        # Prefix console messages with the dataset currently being processed.
        formatter = logging.Formatter(
            '{}: %(levelname)-8s %(message)s'.format(ds))
        console.setFormatter(formatter)
        logging.info('==========Starting {} Dataset =============='.format(ds))

        dataset = datajanitor.getDataset(ds)
        dataset.getData()
        x_train, x_test, y_train, y_test = \
            dataset.partitionData(percent=0.3, randomState=10)

        # ********************* #
        # **** Clustering **** #
        # ********************* #
        # 'cluster-ann' needs the cluster assignments, so it triggers
        # clustering as well.
        if 'cluster' in args.phase or 'cluster-ann' in args.phase:
            km_train_clust, em_train_clust, km_test_clust, em_test_clust = \
                A3.cluster(range(2, 21), x_train, y_train, savedir, ds,
                           tnse_range=None, X_test=x_test)

        # ******************************* #
        # **** Clusters as features **** #
        # ******************************* #
        # one-hot encode and then add clusters to train and test features
        cluster_nn_scores = {
            'km': [],
            'em': [],
            'km+em': [],
            'km_only': [],
            'em_only': [],
            'kmem_only': []
        }
        if 'cluster-ann' in args.phase:
            # Only the first five clusterings are evaluated; labels use
            # i + 2 (presumably the cluster count, since the clustering
            # range starts at 2 -- confirm against A3.cluster).
            for i in range(5):
                km_x_train = add_cluster_dims(x_train, km_train_clust[i])
                km_x_test = add_cluster_dims(x_test, km_test_clust[i])
                em_x_train = add_cluster_dims(x_train, em_train_clust[i])
                em_x_test = add_cluster_dims(x_test, em_test_clust[i])

                km_score = cluster_nn(km_x_train, y_train, km_x_test, y_test,
                                      savedir, ds, 'km{}'.format(i + 2))
                em_score = cluster_nn(em_x_train, y_train, em_x_test, y_test,
                                      savedir, ds, 'em{}'.format(i + 2))

                # Stack the EM cluster dims on top of the KM-augmented data.
                kmem_x_train = add_cluster_dims(km_x_train, em_train_clust[i])
                kmem_x_test = add_cluster_dims(km_x_test, em_test_clust[i])
                kmem_score = cluster_nn(kmem_x_train, y_train, kmem_x_test,
                                        y_test, savedir, ds,
                                        'kmem{}'.format(i + 2))

                # do only clusters
                km_only = cluster_nn(km_train_clust[i].reshape(-1, 1),
                                     y_train,
                                     km_test_clust[i].reshape(-1, 1),
                                     y_test, savedir, ds,
                                     'km_only{}'.format(i + 2))
                # Uses the EM assignments; previously this reused the KMeans
                # clusters and so duplicated the km_only experiment.
                em_only = cluster_nn(em_train_clust[i].reshape(-1, 1),
                                     y_train,
                                     em_test_clust[i].reshape(-1, 1),
                                     y_test, savedir, ds,
                                     'em_only{}'.format(i + 2))
                kmem_only = cluster_nn(
                    np.append(km_train_clust[i].reshape(-1, 1),
                              em_train_clust[i].reshape(-1, 1),
                              axis=1),
                    y_train,
                    np.append(km_test_clust[i].reshape(-1, 1),
                              em_test_clust[i].reshape(-1, 1),
                              axis=1),
                    y_test, savedir, ds, 'kmem_only{}'.format(i + 2))

                util.plotBarScores(
                    [km_score, em_score, kmem_score,
                     km_only, em_only, kmem_only],
                    ['km-ann', 'em-ann', 'kmem-ann',
                     'km_only', 'em_only', 'kmem_only'],
                    ds, savedir,
                    phaseName='{}-cluster-{}-ann'.format(ds, i + 2))

                cluster_nn_scores['km'].append(km_score)
                cluster_nn_scores['em'].append(em_score)
                cluster_nn_scores['km+em'].append(kmem_score)
                cluster_nn_scores['km_only'].append(km_only)
                cluster_nn_scores['em_only'].append(em_only)
                cluster_nn_scores['kmem_only'].append(kmem_only)

                # Close all open figures before the next iteration.
                plt.close('all')

            pd.DataFrame.from_dict(data=cluster_nn_scores).to_csv(
                '{}/{}-clusternn.csv'.format(savedir, ds))

        # ************************ #
        # **** Dim Reduction **** #
        # ************************ #
        # You actually have to do dimension reduction, there is no choice
        dr_steps = dr(x_train, y_train, savedir, ds)

        # *********************** #
        # **** DR + Cluster **** #
        # *********************** #
        if 'dr-cluster' in args.phase:
            km_train_clust, em_train_clust, km_test_clust, em_test_clust = \
                dr_cluster(x_train, y_train, x_test, dr_steps, savedir, ds)

        # ******************* #
        # **** DR + ANN **** #
        # ******************* #
        dr_ann(x_train, y_train, x_test, y_test, dr_steps, savedir, ds)