Example #1
0
def main(datadir, outdir, run_configs):
    """
    Run the main program.

    Parameters
    ----------
    datadir : string
        The path to the directory containing the input data files yeastEx.txt
        and yeastNames.txt.

    outdir : string
        The path to the top-level directory to which the output files will be
        saved. The output for each configuration will be stored in a
        subdirectory named with that configuration's key.

    run_configs : list of string
        A list of keys of the configurations to run (see ngcluster.config).
    """

    # Rebound to a per-configuration file handle inside the main loop below;
    # log() closes over this name, so reassigning it redirects the log output.
    logfile = None

    def log(msg):
        # Echo the message both to stdout and to the current log file.
        print(msg)
        print(msg, file=logfile)

    # Expression data matrix and the parallel array of gene names.
    data = np.loadtxt(os.path.join(datadir, 'yeastEx.txt'))
    names = np.loadtxt(os.path.join(datadir, 'yeastNames.txt'),
                       dtype=bytes).astype(str)

    if run_configs == []:
        # No arguments given: print usage and the available configurations.
        print("Usage:\n"
              "    python3 run.py <config1> [<config2> ...]\n"
              "  or\n"
              "    python3 run.py all\n"
              "  to run all configurations\n"
              "  or\n"
              "    python3 run.py compile\n"
              "  to compile results from all previous runs into a CSV file\n"
              "Available configurations (see ngcluster/config.py):")
        for key, config in configurations.items():
            print("  {0}: {1}".format(key, config['description']))
        sys.exit(1)

    elif run_configs == ['all']:
        run_configs = list(configurations.keys())
        print("Running all {0} configurations: {1}".format(
            len(run_configs), ", ".join(run_configs)))
    elif run_configs == ['compile']:
        # Run no configurations; fall through to compile_results() below.
        run_configs = []
    else:
        # Validate every requested key up front, before doing any work.
        for key in run_configs:
            if key not in configurations:
                print("Error: '{0}' is not a valid configuration".format(key))
                sys.exit(1)

    # Reference clusterings, loaded once and compared against every run.
    external_clusterings = [
        (filename,
         load_external_clusters(names, os.path.join(datadir, filename)))
        for filename in external_cluster_files
    ]

    for key in run_configs:
        config = configurations[key]
        config_outdir = os.path.join(outdir, key)
        os.makedirs(config_outdir, exist_ok=True)
        logfile = open(os.path.join(config_outdir, key + '-log.txt'), 'w')

        # try/finally guarantees the log file is closed even when this
        # configuration is skipped via `continue` or an exception propagates;
        # previously the `continue` path leaked the open file handle.
        try:
            print(
                "===============================================================")
            log(datetime.datetime.now().strftime('%c'))
            log("Running configuration " + key)
            log(str(config))

            cluster_fn, cluster_kwargs = config['cluster']
            # 'graph' is optional; absence selects non-graph-based clustering.
            graph_fn, graph_kwargs = config.get('graph', (None, {}))

            # Output data to be stored in a JSON file for later compilation
            # into a table for comparing the results of the various
            # configurations
            outdict = OrderedDict([
                ('key', key),
                ('graph', graph_fn.__name__ if graph_fn else ''),
                ('metric', graph_kwargs.get('metric', '')),
                ('graph_threshold', graph_kwargs.get('threshold', '')),
                ('cluster', cluster_fn.__name__),
                ('k', cluster_kwargs.get('k', '')),
                ('cluster_threshold', cluster_kwargs.get('threshold', '')),
                ('max_clusters', cluster_kwargs.get('max_clusters', '')),
                ('iterations', cluster_kwargs.get('iterations', '')),
            ])

            log("Calculating aggregate FOM")
            try:
                fom = aggregate_fom(data, graph_fn, graph_kwargs, cluster_fn,
                                    cluster_kwargs)
                log("Aggregate FOM = {0}".format(fom))
                outdict['aggregate_fom'] = fom
            except ClusterEvaluationError as e:
                # FOM is best-effort; some configurations cannot be scored.
                log("Cannot calculate aggregate FOM: {0}".format(e))

            log("Clustering entire dataset")
            if graph_fn is None:
                # Do non-graph-based clustering
                outdict['edges_to_nodes_ratio'] = ''
                log("Computing clusters")
                clusters = cluster_fn(data, **cluster_kwargs)
            else:
                # Do graph-based clustering
                log("Computing graph")
                if graph_fn.__name__ == 'gabriel_graph':
                    # Record the metric explicitly for this graph type.
                    outdict['metric'] = 'euclidean'
                adj = graph_fn(data, **graph_kwargs)
                edges_to_nodes_ratio = float(count_edges(adj)) / data.shape[0]
                log("Edges-to-nodes ratio = {}".format(edges_to_nodes_ratio))
                outdict['edges_to_nodes_ratio'] = edges_to_nodes_ratio

                log("Computing clusters")
                clusters = cluster_fn(adj, **cluster_kwargs)

            # Cluster labels are 0-based; negative labels mean "unclustered".
            num_clusters = int(clusters.max() + 1)
            log("{0} clusters generated".format(num_clusters))
            outdict['num_clusters'] = num_clusters
            if num_clusters <= 0:
                log("Error: There are no clusters. Skipping configuration")
                continue
            total_genes = len(data)
            clustered_genes = int((clusters >= 0).sum())
            clustered_genes_pct = round(
                100 * float(clustered_genes) / total_genes)
            log("{0} of {1} genes clustered ({2}%)".format(
                clustered_genes, total_genes, clustered_genes_pct))
            outdict['clustered_genes'] = clustered_genes
            outdict['clustered_genes_pct'] = clustered_genes_pct

            # Save gene-name / cluster-label pairs, one pair per line.
            clusters_outdata = np.vstack((names, clusters)).transpose()
            np.savetxt(os.path.join(config_outdir, key + '-clusters.txt'),
                       clusters_outdata,
                       fmt='%s')

            log("\nSilhouette statistics:")
            log("{:11} {:>13} {:>9} {:>9}".format("metric", "weighted_mean",
                                                  "min", "max"))
            for metric in 'euclidean', 'correlation', 'cosine':
                widths = silhouette_widths(clusters, data, metric)
                stats, summary = silhouette_stats(clusters, widths)
                log("{:11} {:13.3f} {:9.3f} {:9.3f}".format(
                    metric, summary['weighted_mean'], summary['min'],
                    summary['max']))
                outdict['silhouette_' + metric + '_weighted_mean'] = (
                    summary['weighted_mean'])
                outdict['silhouette_' + metric + '_min'] = summary['min']
                outdict['silhouette_' + metric + '_max'] = summary['max']

                np.savetxt(os.path.join(
                    config_outdir,
                    "{0}-silhouette-{1}.txt".format(key, metric)),
                           stats,
                           header=' '.join(stats.dtype.names),
                           fmt="%d %3d %6.3f %6.3f %6.3f",
                           comments='')

            log("\nCluster size:")
            log("{:>8} {:>8} {:>8}".format("mean", "min", "max"))
            # NOTE(review): `stats` here is from the final metric iteration
            # ('cosine'); the 'count' column presumably depends only on
            # cluster membership, not the metric — verify in silhouette_stats.
            cluster_size_mean = stats['count'].mean()
            cluster_size_min = stats['count'].min()
            cluster_size_max = stats['count'].max()
            log("{:8.2f} {:8d} {:8d}".format(cluster_size_mean,
                                             cluster_size_min,
                                             cluster_size_max))
            log('')
            outdict['cluster_size_mean'] = cluster_size_mean
            outdict['cluster_size_min'] = int(cluster_size_min)
            outdict['cluster_size_max'] = int(cluster_size_max)

            for ext_filename, ext_clusters in external_clusterings:

                # Only consider genes that are clustered in both clusterings
                ext_clusters = ext_clusters.copy()
                ext_clusters[clusters < 0] = -1
                int_clusters = clusters.copy()
                int_clusters[ext_clusters < 0] = -1

                rand_index_val = rand_index(int_clusters, ext_clusters)
                log("Rand index = {0} ({1})".format(rand_index_val,
                                                    ext_filename))
                outdict['rand_' + ext_filename] = rand_index_val

            log("Plotting cluster expression levels")
            figs = plot_cluster_expression(names, data, clusters)
            for i, fig in enumerate(figs):
                fig.savefig(
                    os.path.join(config_outdir,
                                 key + '-cluster-{0}.png'.format(i)))
            plt.close('all')

            log("Finished running configuration {0}".format(key))
            log(datetime.datetime.now().strftime('%c'))
            print()
        finally:
            logfile.close()

        # Written after the log file is closed; skipped (like the epilogue
        # above) when the configuration was skipped via `continue`.
        with open(os.path.join(config_outdir, 'results.json'), 'w') as fp:
            json.dump(outdict, fp, indent=4)

    # Merge the results.json files from all runs into a summary table.
    compile_results(outdir)
Example #2
0
 def testZeroAgreementWithUnclustered(self):
     """Singletons vs. one big cluster: no pair agrees, index is 0."""
     split = np.array([0, 1, 2, -1])   # every gene in its own cluster
     merged = np.array([0, 0, 0, -1])  # one shared cluster
     self.assertEqual(rand_index(split, merged), 0.0)
Example #3
0
 def testHalfAgreementWithUnclustered(self):
     """Splitting one gene out of the single cluster halves the index."""
     one_cluster = np.array([0, 0, 0, 0, -1])  # all clustered genes together
     split_off = np.array([0, 0, 0, 1, -1])    # last gene moved out
     self.assertEqual(rand_index(one_cluster, split_off), 0.5)
Example #4
0
 def testPerfectAgreementSame(self):
     """Two identical single-cluster partitions score a perfect 1."""
     everyone_together = np.array([0, 0, 0])
     self.assertEqual(
         rand_index(everyone_together, np.array([0, 0, 0])), 1.0)
Example #5
0
 def testRand1971Example(self):
     """Reproduce the worked example from Rand's 1971 paper."""
     # These partitions come from William Rand's 1971 paper that defined
     # the index.
     left = np.array([0, 0, 0, 1, 1, 1])
     right = np.array([0, 0, 1, 1, 1, 2])
     self.assertEqual(rand_index(left, right), 0.6)
Example #6
0
 def testHalfAgreementWithUnclustered(self):
     """Half of the gene pairs agree when one gene changes cluster."""
     first = np.array([0, 0, 0, 0, -1])   # single cluster of clustered genes
     second = np.array([0, 0, 0, 1, -1])  # one gene split into a new cluster
     # Exactly half of the pairs are treated the same way in both.
     self.assertEqual(rand_index(first, second), 0.5)
Example #7
0
 def testPerfectAgreementDifferent(self):
     """Identical all-singleton partitions agree perfectly."""
     singletons = np.array([0, 1, 2])
     self.assertEqual(rand_index(singletons, np.array([0, 1, 2])), 1.0)
Example #8
0
 def testZeroAgreementWithUnclustered(self):
     """All-singletons against one big cluster yields zero agreement."""
     all_apart = np.array([0, 1, 2, -1])
     all_together = np.array([0, 0, 0, -1])
     # No pair is handled the same way by both partitions.
     self.assertEqual(rand_index(all_apart, all_together), 0.0)
Example #9
0
 def testRand1971Example(self):
     """Check the canonical example from Rand (1971)."""
     # Partition pair taken from William Rand's 1971 paper that defined
     # the index.
     a = np.array([0, 0, 0, 1, 1, 1])
     b = np.array([0, 0, 1, 1, 1, 2])
     self.assertEqual(rand_index(a, b), 0.6)
Example #10
0
 def testPerfectAgreementSame(self):
     """A single all-inclusive cluster matched with itself scores 1."""
     everyone = np.array([0, 0, 0])
     twin = np.array([0, 0, 0])
     self.assertEqual(rand_index(everyone, twin), 1.0)
Example #11
0
 def testPerfectAgreementDifferent(self):
     """All-singleton partitions compared with themselves score 1."""
     a = np.array([0, 1, 2])
     b = np.array([0, 1, 2])
     # Every pair is split in both partitions, so agreement is total.
     self.assertEqual(rand_index(a, b), 1.0)
Example #12
0
def main(datadir, outdir, run_configs):
    """
    Run the main program.

    Parameters
    ----------
    datadir : string
        The path to the directory containing the input data files yeastEx.txt
        and yeastNames.txt.

    outdir : string
        The path to the top-level directory to which the output files will be
        saved. The output for each configuration will be stored in a
        subdirectory named with that configuration's key.

    run_configs : list of string
        A list of keys of the configurations to run (see ngcluster.config).
    """

    # Rebound to a per-configuration file handle inside the loop below;
    # log() closes over this name, so reassigning it redirects log output.
    logfile = None
    def log(msg):
        # Echo the message to stdout and to the current log file.
        print(msg)
        print(msg, file=logfile)

    # Expression data matrix and the parallel array of gene names.
    data = np.loadtxt(os.path.join(datadir, 'yeastEx.txt'))
    names = np.loadtxt(os.path.join(datadir, 'yeastNames.txt'),
            dtype=bytes).astype(str)

    if run_configs == []:
        # No arguments given: print usage and the available configurations.
        print("Usage:\n"
              "    python3 run.py <config1> [<config2> ...]\n"
              "  or\n"
              "    python3 run.py all\n"
              "  to run all configurations\n"
              "  or\n"
              "    python3 run.py compile\n"
              "  to compile results from all previous runs into a CSV file\n"
              "Available configurations (see ngcluster/config.py):")
        for key, config in configurations.items():
            print("  {0}: {1}".format(key, config['description']))
        sys.exit(1)

    elif run_configs == ['all']:
        run_configs = list(configurations.keys())
        print("Running all {0} configurations: {1}"
                .format(len(run_configs), ", ".join(run_configs)))
    elif run_configs == ['compile']:
        # Run no configurations; fall through to compile_results() below.
        run_configs = []
    else:
        # Validate every requested key up front, before doing any work.
        for key in run_configs:
            if key not in configurations:
                print("Error: '{0}' is not a valid configuration".format(key))
                sys.exit(1)

    # Reference clusterings, loaded once and compared against every run.
    external_clusterings = [
            (filename,
                load_external_clusters(names, os.path.join(datadir, filename)))
            for filename in external_cluster_files]

    for key in run_configs:
        config = configurations[key]
        config_outdir = os.path.join(outdir, key)
        os.makedirs(config_outdir, exist_ok=True)
        logfile = open(os.path.join(config_outdir, key + '-log.txt'), 'w')

        print("===============================================================")
        log(datetime.datetime.now().strftime('%c'))
        log("Running configuration " + key)
        log(str(config))

        cluster_fn, cluster_kwargs = config['cluster']
        # 'graph' is optional; absence selects non-graph-based clustering.
        graph_fn, graph_kwargs = config.get('graph', (None, {}))

        # Output data to be stored in a JSON file for later compilation into a
        # table for comparing the results of the various configurations
        outdict = OrderedDict([
            ('key', key),
            ('graph', graph_fn.__name__ if graph_fn else ''),
            ('metric', graph_kwargs.get('metric', '')),
            ('graph_threshold', graph_kwargs.get('threshold', '')),
            ('cluster', cluster_fn.__name__),
            ('k', cluster_kwargs.get('k', '')),
            ('cluster_threshold', cluster_kwargs.get('threshold', '')),
            ('max_clusters', cluster_kwargs.get('max_clusters', '')),
            ('iterations', cluster_kwargs.get('iterations', '')),
            ])

        log("Calculating aggregate FOM")
        try:
            fom = aggregate_fom(data,
                    graph_fn, graph_kwargs, cluster_fn, cluster_kwargs)
            log("Aggregate FOM = {0}".format(fom))
            outdict['aggregate_fom'] = fom
            # NOTE(review): the following `pass` is redundant dead code.
            pass
        except ClusterEvaluationError as e:
            # FOM is best-effort; some configurations cannot be scored.
            log("Cannot calculate aggregate FOM: {0}".format(e))

        log("Clustering entire dataset")
        if graph_fn is None:
            # Do non-graph-based clustering
            outdict['edges_to_nodes_ratio'] = ''
            log("Computing clusters")
            clusters = cluster_fn(data, **cluster_kwargs)
        else:
            # Do graph-based clustering
            log("Computing graph")
            if graph_fn.__name__ == 'gabriel_graph':
                # Record the metric explicitly for this graph type.
                outdict['metric'] = 'euclidean'
            adj = graph_fn(data, **graph_kwargs)
            edges_to_nodes_ratio = float(count_edges(adj)) / data.shape[0]
            log("Edges-to-nodes ratio = {}".format(edges_to_nodes_ratio))
            outdict['edges_to_nodes_ratio'] = edges_to_nodes_ratio

            log("Computing clusters")
            clusters = cluster_fn(adj, **cluster_kwargs)

        # Cluster labels are 0-based; negative labels mean "unclustered".
        num_clusters = int(clusters.max() + 1)
        log("{0} clusters generated".format(num_clusters))
        outdict['num_clusters'] = num_clusters
        if num_clusters <= 0:
            # NOTE(review): this path skips logfile.close() below, leaking
            # the open file handle — consider try/finally.
            log("Error: There are no clusters. Skipping configuration")
            continue
        total_genes = len(data)
        clustered_genes = int((clusters >= 0).sum())
        clustered_genes_pct = round(100 * float(clustered_genes) / total_genes)
        log("{0} of {1} genes clustered ({2}%)"
                .format(clustered_genes, total_genes, clustered_genes_pct))
        outdict['clustered_genes'] = clustered_genes
        outdict['clustered_genes_pct'] = clustered_genes_pct

        # Save gene-name / cluster-label pairs, one pair per line.
        clusters_outdata = np.vstack((names, clusters)).transpose()
        np.savetxt(os.path.join(config_outdir, key + '-clusters.txt'),
                clusters_outdata, fmt='%s')

        log("\nSilhouette statistics:")
        log("{:11} {:>13} {:>9} {:>9}".format(
            "metric", "weighted_mean", "min",  "max"))
        for metric in 'euclidean', 'correlation', 'cosine':
            widths = silhouette_widths(clusters, data, metric)
            stats, summary = silhouette_stats(clusters, widths)
            log("{:11} {:13.3f} {:9.3f} {:9.3f}".format(metric,
                summary['weighted_mean'], summary['min'], summary['max']))
            outdict['silhouette_' + metric + '_weighted_mean'] = (
                    summary['weighted_mean'])
            outdict['silhouette_' + metric + '_min'] = summary['min']
            outdict['silhouette_' + metric + '_max'] = summary['max']

            np.savetxt(
                    os.path.join(
                        config_outdir,
                        "{0}-silhouette-{1}.txt".format(key, metric)),
                    stats,
                    header=' '.join(stats.dtype.names),
                    fmt="%d %3d %6.3f %6.3f %6.3f",
                    comments='')

        log("\nCluster size:")
        log("{:>8} {:>8} {:>8}".format("mean", "min", "max"))
        # NOTE(review): `stats` here is from the final metric iteration
        # ('cosine'); the 'count' column presumably depends only on cluster
        # membership, not on the metric — verify in silhouette_stats.
        cluster_size_mean = stats['count'].mean()
        cluster_size_min = stats['count'].min()
        cluster_size_max = stats['count'].max()
        log("{:8.2f} {:8d} {:8d}".format(
            cluster_size_mean, cluster_size_min, cluster_size_max))
        log('')
        outdict['cluster_size_mean'] = cluster_size_mean
        outdict['cluster_size_min'] = int(cluster_size_min)
        outdict['cluster_size_max'] = int(cluster_size_max)

        for ext_filename, ext_clusters in external_clusterings:

            # Only consider genes that are clustered in both clusterings
            ext_clusters = ext_clusters.copy()
            ext_clusters[clusters < 0] = -1
            int_clusters = clusters.copy()
            int_clusters[ext_clusters < 0] = -1

            rand_index_val = rand_index(int_clusters, ext_clusters)
            log("Rand index = {0} ({1})".format(rand_index_val, ext_filename))
            outdict['rand_' + ext_filename] = rand_index_val

        log("Plotting cluster expression levels")
        figs = plot_cluster_expression(names, data, clusters)
        #save_pdf(figs, os.path.join(config_outdir, key + '-figures.pdf'))
        for i, fig in enumerate(figs):
            fig.savefig(os.path.join(config_outdir, key + '-cluster-{0}.png'
                .format(i)))
        plt.close('all')

        log("Finished running configuration {0}".format(key))
        log(datetime.datetime.now().strftime('%c'))
        print()
        logfile.close()

        # Per-configuration metrics, consumed later by compile_results().
        with open(os.path.join(config_outdir, 'results.json'), 'w') as fp:
            json.dump(outdict, fp, indent=4)

    # Merge the results.json files from all runs into a summary table.
    compile_results(outdir)