예제 #1
0
def dr_cluster(X, y, X_test, dr_steps, savedir, ds):
    """
    Run KMeans and EM clustering once per dimensionality-reduction step and
    keep a single clustering from each run.

    Every entry of ``dr_steps`` is forwarded to ``A3.cluster`` together with
    ``range(2, 21)`` (presumably the candidate cluster counts -- confirm
    against ``A3.cluster``). From each of the four sequences it returns, the
    element at a dataset-specific position is collected.

    :param X: np array, training samples
    :param y: np array, labels
    :param X_test: np.array, test data
    :param dr_steps: list of dimensionality reduction objects
    :param savedir: string, output directory
    :param ds: string, name of dataset; must be 'musk', 'cancer' or
        'shoppers' whenever ``dr_steps`` is non-empty (KeyError otherwise)
    :return: tuple of four lists (KMeans train, EM train, KMeans test,
        EM test), each holding one selected clustering per dr step
    """
    # Position of the clustering to keep for each dataset.
    keep_at = {'musk': 1, 'cancer': 0, 'shoppers': 0}
    km_picks, em_picks = [], []
    km_test_picks, em_test_picks = [], []

    for reducer in dr_steps:
        km, em, km_test, em_test = A3.cluster(
            range(2, 21), X, y, savedir, ds,
            tnse_range=range(3, 5),
            dr_step=reducer,
            X_test=X_test)

        # Looked up after clustering so that an unknown dataset name fails
        # at the same point as before.
        pos = keep_at[ds]
        km_picks.append(km[pos])
        em_picks.append(em[pos])
        km_test_picks.append(km_test[pos])
        em_test_picks.append(em_test[pos])

    return km_picks, em_picks, km_test_picks, em_test_picks
예제 #2
0
def main():
    """
    Run the clustering / dimensionality-reduction experiments per dataset.

    For each of the 'musk' and 'shoppers' datasets the data is loaded and
    split into train/test partitions, then:

    * phase 'cluster' or 'cluster-ann': the raw features are clustered with
      ``A3.cluster``;
    * phase 'cluster-ann': for the first five clusterings, ``cluster_nn`` is
      scored on features augmented with the KMeans / EM / both cluster
      assignments and on the assignments alone; the scores are plotted and
      written to ``<savedir>/<ds>-clusternn.csv``;
    * always: ``dr`` builds the dimensionality-reduction steps and
      ``dr_ann`` is run on them;
    * phase 'dr-cluster': ``dr_cluster`` is run on those steps.

    Log output goes both to ``<savedir>/output.log`` and to the console.

    :return: None
    """
    args = get_args()
    savedir = util.mktmpdir(args.outdir)

    # Logging copy-pasted from logging cookbook
    # http://docs.python.org/howto/logging-cookbook.html#logging-to-multiple-destinations
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename='{}/output.log'.format(savedir),
                        level=logging.INFO)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    for ds in ['musk', 'shoppers']:
        # Prefix every console line with the dataset currently processed.
        formatter = logging.Formatter(
            '{}: %(levelname)-8s %(message)s'.format(ds))
        console.setFormatter(formatter)
        logging.info('==========Starting {} Dataset =============='.format(ds))

        dataset = datajanitor.getDataset(ds)
        dataset.getData()
        x_train, x_test, y_train, y_test = \
            dataset.partitionData(percent=0.3, randomState=10)

        # ********************* #
        # **** Clustering  **** #
        # ********************* #
        if 'cluster' in args.phase or 'cluster-ann' in args.phase:
            km_train_clust, em_train_clust, km_test_clust, em_test_clust = \
                A3.cluster(range(2, 21), x_train, y_train, savedir, ds,
                           tnse_range=None,
                           X_test=x_test)

            # ******************************* #
            # **** Clusters as features  **** #
            # ******************************* #
            if 'cluster-ann' in args.phase:
                cluster_nn_scores = {
                    'km': [],
                    'em': [],
                    'km+em': [],
                    'km_only': [],
                    'em_only': [],
                    'kmem_only': []
                }
                # Only the first five clusterings are evaluated. The i + 2
                # in the run names presumably maps index i to the cluster
                # count, since the range passed to A3.cluster starts at 2.
                for i in range(5):
                    # Add cluster assignments to the train and test features
                    # (add_cluster_dims presumably one-hot encodes them --
                    # confirm against its definition).
                    km_x_train = add_cluster_dims(x_train, km_train_clust[i])
                    km_x_test = add_cluster_dims(x_test, km_test_clust[i])
                    em_x_train = add_cluster_dims(x_train, em_train_clust[i])
                    em_x_test = add_cluster_dims(x_test, em_test_clust[i])
                    km_score = cluster_nn(km_x_train, y_train, km_x_test,
                                          y_test, savedir, ds,
                                          'km{}'.format(i + 2))
                    em_score = cluster_nn(em_x_train, y_train, em_x_test,
                                          y_test, savedir, ds,
                                          'em{}'.format(i + 2))
                    kmem_x_train = add_cluster_dims(km_x_train,
                                                    em_train_clust[i])
                    kmem_x_test = add_cluster_dims(km_x_test, em_test_clust[i])
                    kmem_score = cluster_nn(kmem_x_train, y_train, kmem_x_test,
                                            y_test, savedir, ds,
                                            'kmem{}'.format(i + 2))

                    # Cluster assignments alone, as single-column features.
                    km_train_col = km_train_clust[i].reshape(-1, 1)
                    km_test_col = km_test_clust[i].reshape(-1, 1)
                    em_train_col = em_train_clust[i].reshape(-1, 1)
                    em_test_col = em_test_clust[i].reshape(-1, 1)

                    km_only = cluster_nn(km_train_col, y_train, km_test_col,
                                         y_test, savedir, ds,
                                         'km_only{}'.format(i + 2))
                    # Uses the EM assignments; this run previously reused
                    # the KMeans assignments, duplicating the km_only score.
                    em_only = cluster_nn(em_train_col, y_train, em_test_col,
                                         y_test, savedir, ds,
                                         'em_only{}'.format(i + 2))
                    kmem_only = cluster_nn(
                        np.append(km_train_col, em_train_col, axis=1),
                        y_train,
                        np.append(km_test_col, em_test_col, axis=1),
                        y_test, savedir, ds,
                        'kmem_only{}'.format(i + 2))

                    util.plotBarScores(
                        [km_score, em_score, kmem_score,
                         km_only, em_only, kmem_only],
                        ['km-ann', 'em-ann', 'kmem-ann',
                         'km_only', 'em_only', 'kmem_only'],
                        ds,
                        savedir,
                        phaseName='{}-cluster-{}-ann'.format(ds, i + 2))

                    cluster_nn_scores['km'].append(km_score)
                    cluster_nn_scores['em'].append(em_score)
                    cluster_nn_scores['km+em'].append(kmem_score)
                    cluster_nn_scores['km_only'].append(km_only)
                    cluster_nn_scores['em_only'].append(em_only)
                    cluster_nn_scores['kmem_only'].append(kmem_only)
                    # Close all figures before the next iteration.
                    plt.close('all')

                pd.DataFrame.from_dict(data=cluster_nn_scores).to_csv(
                    '{}/{}-clusternn.csv'.format(savedir, ds))

        # ************************ #
        # **** Dim Reduction  **** #
        # ************************ #
        # You actually have to do dimension reduction, there is no choice
        dr_steps = dr(x_train, y_train, savedir, ds)

        # *********************** #
        # **** DR + Cluster  **** #
        # *********************** #
        if 'dr-cluster' in args.phase:
            # The returned clusterings are not used any further here.
            dr_cluster(x_train, y_train, x_test, dr_steps, savedir, ds)

        # ******************* #
        # **** DR + ANN  **** #
        # ******************* #
        dr_ann(x_train, y_train, x_test, y_test, dr_steps, savedir, ds)