Example #1
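The snippets below are Martian stage functions from a Cell Ranger graph-clustering pipeline and assume a shared module preamble that the excerpts omit. A plausible reconstruction is sketched here; the module paths and constant values are assumptions, not the verbatim original:

import os
import sys

import martian
import numpy as np
import h5py as h5

import cellranger.analysis.clustering as cr_clustering
import cellranger.analysis.constants as analysis_constants
import cellranger.analysis.graphclust as cr_graphclust
import cellranger.analysis.io as analysis_io
import cellranger.analysis.lsa as cr_lsa      # assumed module path
import cellranger.analysis.pca as cr_pca      # assumed module path
import cellranger.analysis.plsa as cr_plsa    # assumed module path
import cellranger.h5_constants as h5_constants
import cellranger.io as cr_io
import cellranger.matrix as cr_matrix
from cellranger.analysis.singlegenome import SingleGenomeAnalysis
from cellranger.logperf import LogPerf        # assumed location of LogPerf

# Module-level constants referenced below; the values are illustrative guesses.
DEFAULT_BALLTREE_LEAFSIZE = 40
NN_QUERIES_PER_CHUNK = 50000
NN_ENTRIES_PER_MEM_GB = 5e6
SNN_SIMILARITY = 'snn'
NN_SIMILARITY = 'nn'
SIMILARITY_TYPES = [SNN_SIMILARITY, NN_SIMILARITY]
ALLOWED_FACTORIZATIONS = set(['pca', 'lsa', 'plsa'])
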
def main(args, outs):
    np.random.seed(0)

    if args.skip:
        return

    with LogPerf('submatrix_load'):
        submatrix = cr_graphclust.load_ndarray_h5(args.submatrix, 'submatrix')

    with LogPerf('nn_idx_load'):
        balltree = cr_graphclust.load_neighbor_index(args.neighbor_index)

    with LogPerf('nn_query'):
        nn_matrix = cr_graphclust.compute_nearest_neighbors(
            submatrix, balltree, args.k_nearest, args.row_start)
        cr_graphclust.write_nearest_neighbors(nn_matrix,
                                              outs.chunked_neighbors)
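
For context, here is a minimal sketch of what the compute_nearest_neighbors step could look like if the index is an sklearn BallTree and the result is a boolean sparse adjacency slice. The helper name, its total_rows argument, and the self-match handling are assumptions; the real cr_graphclust implementation may differ:

import numpy as np
import scipy.sparse as sp

def query_nearest_neighbors(submatrix, balltree, k_nearest, row_start, total_rows):
    # Hypothetical sketch, not the cr_graphclust implementation.
    # Ask for k+1 neighbors because each query point finds itself at distance 0.
    _, ind = balltree.query(submatrix, k=k_nearest + 1)
    rows, cols = [], []
    for i in xrange(submatrix.shape[0]):
        # Drop the self-match, keep at most k true neighbors.
        neighbors = [j for j in ind[i] if j != row_start + i][:k_nearest]
        rows.extend([row_start + i] * len(neighbors))
        cols.extend(neighbors)
    data = np.ones(len(rows), dtype=bool)
    # Boolean adjacency over the full barcode set; only this chunk's rows are filled.
    return sp.coo_matrix((data, (rows, cols)),
                         shape=(total_rows, total_rows)).tocsr()
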
Example #2
def compute_snn_matrix(nn, k_nearest):
    """ Compute shared-nearest-neighbor matrix from a nearest-neighbor boolean matrix """
    with LogPerf('tocsr'):
        nn = nn.tocsr(copy=False)

    # The SNN (shared nearest neighbor) similarity between two points is
    #   the size of the intersection of their nearest-neighbor sets,
    #   divided by the maximum possible number of shared neighbors, k.
    # This can be computed via dot products of rows in the boolean NN matrix.
    with LogPerf('snn'):
        snn = (nn.dot(nn.T)) / float(k_nearest)

    # The SNN similarity is used by the modularity-optimization (Louvain) step.
    # Louvain takes a text edge list and converts it to its own binary format;
    # COO form makes it easy to stream the matrix out as (row, col, weight) triples.
    with LogPerf('tocoo'):
        snn = snn.tocoo(copy=False)

    return snn
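
A toy check of the dot-product identity noted in the comments above (not part of the pipeline): if two points share m of their k neighbors, the corresponding entry of nn.dot(nn.T) is m. The cast to an integer dtype makes the dot product count shared neighbors rather than OR them together:

import numpy as np
import scipy.sparse as sp

# Row i marks the k_nearest = 2 neighbors of point i.
nn = sp.csr_matrix(np.array([[0, 1, 1],
                             [1, 0, 1],
                             [1, 1, 0]], dtype=np.int8))
snn = nn.dot(nn.T) / float(2)
# Points 0 and 1 share exactly one neighbor (point 2), so snn[0, 1] == 0.5.
print snn.toarray()
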
Example #3
def split(args):
    np.random.seed(0)

    if args.skip:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if args.similarity_type not in SIMILARITY_TYPES:
        martian.exit("Unsupported similarity type: %s. Must be one of: %s" %
                     (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    with LogPerf('load'):
        pca_mat = SingleGenomeAnalysis.load_pca_from_h5(
            args.pca_h5).transformed_pca_matrix

    # Subselect barcodes if desired
    if args.num_bcs is None:
        use_bcs = np.arange(pca_mat.shape[0])
    else:
        use_bcs = np.random.choice(pca_mat.shape[0],
                                   args.num_bcs,
                                   replace=False)
        pca_mat = pca_mat[use_bcs, :]

    # Record indices of selected barcodes
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Subselect PCs if desired
    if args.input_pcs is not None:
        n_pcs = min(pca_mat.shape[1], args.input_pcs)
        pca_mat = pca_mat[:, np.arange(n_pcs)]

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        balltree = cr_graphclust.build_neighbor_index(
            pca_mat, args.balltree_leaf_size or DEFAULT_BALLTREE_LEAFSIZE)
        neighbor_index = martian.make_path('neighbor_index.pickle')
        cr_graphclust.save_neighbor_index(balltree, neighbor_index)

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a +
                     given_neighbor_b * np.log10(len(use_bcs)))))
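    # Worked example with hypothetical values: if num_neighbors is unset,
    # a = -230, b = 120 and n = 10,000 barcodes, the heuristic term gives
    # round(-230 + 120 * log10(10000)) = round(250) = 250 neighbors.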

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide the PCA matrix up into rows for NN queries
    with LogPerf('chunk_pca'):
        chunks = []
        for row_start in xrange(0, pca_mat.shape[0], NN_QUERIES_PER_CHUNK):
            row_end = min(row_start + NN_QUERIES_PER_CHUNK, pca_mat.shape[0])

            # Write the pca submatrix to an h5 file
            submatrix_path = martian.make_path('%d_submatrix.h5' % row_start)
            cr_graphclust.save_ndarray_h5(pca_mat[row_start:row_end, :],
                                          submatrix_path, 'submatrix')

            chunks.append({
                'neighbor_index': neighbor_index,
                'submatrix': submatrix_path,
                'row_start': row_start,
                'total_rows': pca_mat.shape[0],
                'k_nearest': num_neighbors,
                'use_bcs': use_bcs_path,
            })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil(
                (num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
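        # Worked example with hypothetical values: 15 neighbors * 1e6 barcodes
        # = 1.5e7 NN entries; with NN_ENTRIES_PER_MEM_GB = 5e6 this requests
        # max(MIN_MEM_GB, ceil(3.0)) = 3 GB (assuming MIN_MEM_GB <= 3).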
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
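
Examples #1 and #3 round-trip small arrays through cr_graphclust.save_ndarray_h5 and load_ndarray_h5. A minimal sketch of how such helpers could be written with h5py; the real helpers' behavior is assumed here, not quoted:

import h5py

def save_ndarray_h5(data, path, key):
    # Store a single ndarray under the given dataset key.
    with h5py.File(path, 'w') as f:
        f.create_dataset(key, data=data)

def load_ndarray_h5(path, key):
    # Read the dataset back into memory as a numpy array.
    with h5py.File(path, 'r') as f:
        return f[key][:]
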
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return
    # Merge the neighbor matrices
    with LogPerf('merge_nn'):
        nn = cr_graphclust.merge_nearest_neighbors(
            [chunk.chunked_neighbors for chunk in chunk_outs],
            chunk_defs[0].total_rows)
    print 'nn\tnn_nodes\t%d' % nn.shape[0]
    print 'nn\tnn_links\t%d' % nn.nnz
    print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
    sys.stdout.flush()

    matrix_bin = martian.make_path('matrix.bin')
    matrix_weights = martian.make_path('matrix.weights')
    louvain_out = martian.make_path('louvain.out')

    if args.similarity_type == SNN_SIMILARITY:
        snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs[0].k_nearest)

        print 'snn\tsnn_nodes\t%d' % snn.shape[0]
        print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
        print 'snn\tsnn_density\t%0.4f' % (
            (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
        sys.stdout.flush()

        with LogPerf('convert'):
            cr_graphclust.pipe_weighted_edgelist_to_convert(
                snn, matrix_bin, matrix_weights)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_weighted_clustering(
                matrix_bin, matrix_weights, louvain_out)

    else:
        with LogPerf('tocoo'):
            nn = nn.tocoo(copy=False)

        with LogPerf('convert'):
            cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_unweighted_clustering(
                matrix_bin, louvain_out)

    with LogPerf('load_bcs'):
        barcodes = SingleGenomeAnalysis.load_bcs_from_matrix_h5(args.matrix_h5)

    use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs[0].use_bcs, 'use_bcs')

    labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                louvain_out)

    labels = cr_clustering.relabel_by_size(labels)

    # Save cluster results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      labels, barcodes)

    outs.chunked_neighbors = None
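
The nn_density figure printed in this join step is presumably the fraction of entries that are nonzero; a one-line helper consistent with that reading (an assumption about cr_graphclust.matrix_density, not its verbatim source):

def matrix_density(m):
    # Fraction of nonzero entries in a scipy sparse matrix.
    return m.nnz / float(m.shape[0] * m.shape[1])
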
Example #5
def split(args):
    np.random.seed(0)

    if args.matrix_h5 is None:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if not os.path.exists(args.reduced_data):
        raise IOError('reduced data not found at {}'.format(args.reduced_data))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError(
            'Invalid factorization(s) provided: {}'.format(args.factorization))

    if args.similarity_type not in SIMILARITY_TYPES:
        raise ValueError(
            'Unsupported similarity type: %s. Must be one of: %s' %
            (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    reduction_summary = args.reduction_summary['h5']

    method_dict = {}
    for method in args.factorization:
        method_dict[method] = {}

    with LogPerf('load'):
        for method in args.factorization:
            if method == 'pca':
                method_dict[method]['transformed_matrix'] = (
                    cr_pca.load_pca_from_h5(
                        reduction_summary[method]).transformed_pca_matrix)
            elif method == 'lsa':
                method_dict[method]['transformed_matrix'] = (
                    cr_lsa.load_lsa_from_h5(
                        reduction_summary[method]).transformed_lsa_matrix)
            elif method == 'plsa':
                method_dict[method]['transformed_matrix'] = (
                    cr_plsa.load_plsa_from_h5(
                        reduction_summary[method]).transformed_plsa_matrix)

    # Record indices of the selected barcodes; all methods must use the same barcodes
    use_bcs = np.arange(
        method_dict[args.factorization[0]]['transformed_matrix'].shape[0])
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            # Normalize rows for lsa/plsa so that Euclidean distance in the
            # normalized space corresponds to cosine distance in the original space
            if method in ['plsa', 'lsa']:
                method_mat = method_mat / np.linalg.norm(
                    method_mat, axis=1, keepdims=True)
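            # On unit-norm rows, squared Euclidean distance is a monotone
            # function of cosine similarity: ||u - v||^2 = 2 * (1 - cos(u, v)),
            # so a Euclidean BallTree ranks neighbors exactly as cosine would.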
            balltree = cr_graphclust.build_neighbor_index(
                method_mat, args.balltree_leaf_size
                or DEFAULT_BALLTREE_LEAFSIZE)
            method_dict[method]['neighbor_index'] = martian.make_path(
                'neighbor_index_{}.pickle'.format(method))
            cr_graphclust.save_neighbor_index(
                balltree, method_dict[method]['neighbor_index'])

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a +
                     given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide each method's transformed matrix into row blocks for NN queries
    with LogPerf('chunk_matrix'):
        chunks = []
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            for row_start in xrange(0, method_mat.shape[0],
                                    NN_QUERIES_PER_CHUNK):
                row_end = min(row_start + NN_QUERIES_PER_CHUNK,
                              method_mat.shape[0])

                # Write the submatrix to an h5 file
                submatrix_path = martian.make_path('{}_{}_submatrix.h5'.format(
                    method, row_start))
                cr_graphclust.save_ndarray_h5(method_mat[row_start:row_end, :],
                                              submatrix_path, 'submatrix')

                chunks.append({
                    'method': method,
                    'neighbor_index': method_dict[method]['neighbor_index'],
                    'submatrix': submatrix_path,
                    'row_start': row_start,
                    'total_rows': method_mat.shape[0],
                    'k_nearest': num_neighbors,
                    'use_bcs': use_bcs_path,
                })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil(
                (num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
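
For reference, the dict returned to Martian by this split has the usual shape: one definition per chunk plus resource hints for the join phase. Schematically, with illustrative values:

example_split = {
    'chunks': [
        {'method': 'pca',
         'neighbor_index': 'neighbor_index_pca.pickle',
         'submatrix': 'pca_0_submatrix.h5',
         'row_start': 0,
         'total_rows': 5000,
         'k_nearest': 15,
         'use_bcs': 'use_bcs.h5'},
        # ... one entry per (method, row block) pair
    ],
    'join': {'__mem_gb': 64, '__threads': 4},
}
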
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    if args.matrix_h5 is None:
        outs.graph_clustering_summary = {}
        return

    outs.graph_clustering_summary = {'h5': {}, 'csv': {}}
    # Merge the neighbor matrices
    for method in args.factorization:
        chunk_outs_def_method = [
            (chunk_out, chunk_def)
            for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
            if chunk_def.method == method
        ]
        chunk_outs_method = [c[0] for c in chunk_outs_def_method]
        chunk_defs_method = [c[1] for c in chunk_outs_def_method]

        with LogPerf('merge_nn'):
            nn = cr_graphclust.merge_nearest_neighbors(
                [chunk.chunked_neighbors for chunk in chunk_outs_method],
                chunk_defs_method[0].total_rows)
        print 'nn\tnn_nodes\t%d' % nn.shape[0]
        print 'nn\tnn_links\t%d' % nn.nnz
        print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
        sys.stdout.flush()

        matrix_bin = martian.make_path('matrix_{}.bin'.format(method))
        matrix_weights = martian.make_path('matrix_{}.weights'.format(method))
        louvain_out = martian.make_path('louvain_{}.out'.format(method))

        if args.similarity_type == SNN_SIMILARITY:
            snn = cr_graphclust.compute_snn_matrix(
                nn, chunk_defs_method[0].k_nearest)

            print 'snn\tsnn_nodes\t%d' % snn.shape[0]
            print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
            print 'snn\tsnn_density\t%0.4f' % (
                (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
            sys.stdout.flush()

            with LogPerf('convert'):
                cr_graphclust.pipe_weighted_edgelist_to_convert(
                    snn, matrix_bin, matrix_weights)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_weighted_clustering(
                    matrix_bin, matrix_weights, louvain_out)

        else:
            with LogPerf('tocoo'):
                nn = nn.tocoo(copy=False)

            with LogPerf('convert'):
                cr_graphclust.pipe_unweighted_edgelist_to_convert(
                    nn, matrix_bin)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_unweighted_clustering(
                    matrix_bin, louvain_out)

        with LogPerf('load_bcs'):
            with h5.File(args.matrix_h5, 'r') as f:
                group_name = f.keys()[0]
                barcodes = cr_matrix.CountMatrix.load_bcs_from_h5_group(
                    f[group_name])

        use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs_method[0].use_bcs,
                                                'use_bcs')

        labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                    louvain_out)

        labels = cr_clustering.relabel_by_size(labels)

        # Save cluster results
        cr_io.mkdir(outs.knn_clusters, allow_existing=True)
        method_dir = os.path.join(outs.knn_clusters, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        _h5 = os.path.join(method_dir, "clusters.h5")
        _csv = os.path.join(method_dir, "clusters_csv")
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_graphclust.save_graphclust_h5(f, labels)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)
        cr_clustering.save_clustering_csv(_csv, clustering_key, labels,
                                          barcodes)
        outs.graph_clustering_summary['h5'][method] = _h5
        outs.graph_clustering_summary['csv'][method] = _csv

    outs.chunked_neighbors = None
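
Both join functions call cr_clustering.relabel_by_size before saving. A minimal sketch of the likely behavior, renumbering clusters so that label 1 is the largest; this is an assumption about the helper, and it presumes labels are integers 1..K:

import numpy as np

def relabel_by_size(labels):
    # Hypothetical sketch: renumber so cluster 1 is the largest, 2 the next, etc.
    counts = np.bincount(labels)[1:]    # counts for labels 1..K
    order = np.argsort(-counts) + 1     # old labels, largest cluster first
    new_labels = np.zeros_like(labels)
    for new, old in enumerate(order, start=1):
        new_labels[labels == old] = new
    return new_labels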