def visualizeLatentSpace(dataset_name, arch): ''' Plots and saves graphs for visualized images space, autoencoder latent space, and the final clustering latent space :param dataset_name: Name of dataset [MNIST, COIL20] :param arch: Architectures as a dictionary :return: None - (side effect) saved graphs in plots/ folder ''' rootLogger.info("Loading dataset") dataset = DatasetHelper(dataset_name) dataset.loadDataset() rootLogger.info("Done loading dataset") # We consider only the first 5000 point or less for better visualization max_points = min(dataset.input_flat.shape[0], 5000) # Image space visualizeData(dataset.input_flat[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/raw.png" % dataset.name) # Latent space - autoencoder Z = numpy.load('saved_params/' + dataset.name + '/z_' + arch['name'] + '.npy') visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/autoencoder.png" % dataset.name) # Latent space - kl div clustering network Z = numpy.load('saved_params/' + dataset.name + '/pc_z_' + arch['name'] + '.npy') visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/clustered_kld.png" % dataset.name) # Latent space - kmeans clustering network Z = numpy.load('saved_params/' + dataset.name + '/pc_km_z_' + arch['name'] + '.npy') visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/clustered_km.png" % dataset.name)
def testKMeans(dataset_name, archs): ''' Performs kMeans clustering, and report metrics on the output latent space produced by the networks defined in archs, with given dataset. Assumes that testOnlyClusterInitialization and testOnlyClusterImprovement have been run before this for the specified archs/datasets, as the results saved by them are used for clustering :param dataset_name: Name of dataset [MNIST, COIL20] :param archs: Architectures as a dictionary :return: None - reports the accuracy and nmi clustering metrics ''' rootLogger.info('Initial Cluster Quality Comparison') rootLogger.info(80 * '_') rootLogger.info('%-50s %8s %8s' % ('method', 'ACC', 'NMI')) rootLogger.info(80 * '_') dataset = DatasetHelper(dataset_name) dataset.loadDataset() rootLogger.info( evaluateKMeans(dataset.input_flat, dataset.labels, dataset.getClusterCount(), 'image')[0]) for arch in archs: Z = numpy.load('saved_params/' + dataset.name + '/z_' + arch['name'] + '.npy') rootLogger.info( evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0]) Z = numpy.load('saved_params/' + dataset.name + '/pc_z_' + arch['name'] + '.npy') rootLogger.info( evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0]) Z = numpy.load('saved_params/' + dataset.name + '/pc_km_z_' + arch['name'] + '.npy') rootLogger.info( evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0]) rootLogger.info(80 * '_')
def testOnlyClusterInitialization(dataset_name, arch, epochs): ''' Train an autoencoder defined by architecture arch and trains it with the dataset defined :param dataset_name: Name of the dataset with which the network will be trained [MNIST, COIL20] :param arch: Architecture of the network as a dictionary. Specification for architecture can be found in readme.md :param epochs: Number of train epochs :return: None - (side effect) saves the latent space and params of trained network in an appropriate location in saved_params folder ''' rootLogger.info("Loading dataset") dataset = DatasetHelper(dataset_name) dataset.loadDataset() rootLogger.info("Done loading dataset") rootLogger.info("Creating network") dcjc = DCJC(arch) rootLogger.info("Done creating network") rootLogger.info("Starting training") dcjc.pretrainWithData(dataset, epochs, False)
def testOnlyClusterImprovement(dataset_name, arch, epochs, method): ''' Use an initialized autoencoder and train it along with clustering loss. Assumed that pretrained autoencoder params are available, i.e. testOnlyClusterInitialization has been run already with the given params :param dataset_name: Name of the dataset with which the network will be trained [MNIST, COIL20] :param arch: Architecture of the network as a dictionary. Specification for architecture can be found in readme.md :param epochs: Number of train epochs :param method: Can be KM or KLD - depending on whether the clustering loss is KLDivergence loss between the current KMeans distribution(Q) and a more desired one(Q^2), or if the clustering loss is just the Kmeans loss :return: None - (side effect) saves latent space and params of the trained network ''' rootLogger.info("Loading dataset") dataset = DatasetHelper(dataset_name) dataset.loadDataset() rootLogger.info("Done loading dataset") rootLogger.info("Creating network") dcjc = DCJC(arch) rootLogger.info("Starting cluster improvement") if method == 'KM': dcjc.doClusteringWithKMeansLoss(dataset, epochs) elif method == 'KLD': dcjc.doClusteringWithKLdivLoss(dataset, True, epochs)