Example #1
def main(args, outs):
    np.random.seed(0)

    if args.skip:
        return

    with LogPerf('submatrix_load'):
        submatrix = cr_graphclust.load_ndarray_h5(args.submatrix, 'submatrix')

    with LogPerf('nn_idx_load'):
        balltree = cr_graphclust.load_neighbor_index(args.neighbor_index)

    with LogPerf('nn_query'):
        nn_matrix = cr_graphclust.compute_nearest_neighbors(
            submatrix, balltree, args.k_nearest, args.row_start)
        cr_graphclust.write_nearest_neighbors(nn_matrix,
                                              outs.chunked_neighbors)
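
The cr_graphclust helpers above are Cell Ranger internals. As a rough illustration only, here is a minimal sketch of how a chunked nearest-neighbor query like this could work, assuming the index is a sklearn BallTree; every name below is a hypothetical stand-in, not the actual cr_graphclust API (which may, for example, exclude self-matches).

import numpy as np
import scipy.sparse as sp_sparse
from sklearn.neighbors import BallTree

def compute_nearest_neighbors_sketch(submatrix, balltree, k, row_start, total_rows):
    # Query the index for each chunk row's k nearest neighbors
    _, nn_idx = balltree.query(submatrix, k=k)
    # Chunk row i corresponds to global row (row_start + i)
    rows = np.repeat(row_start + np.arange(submatrix.shape[0]), k)
    cols = nn_idx.ravel()
    data = np.ones(len(rows), dtype=bool)
    # Boolean adjacency: entry (i, j) is True if j is among i's k nearest
    return sp_sparse.coo_matrix((data, (rows, cols)),
                                shape=(total_rows, total_rows))

# Toy usage on random data
x = np.random.rand(100, 10)
tree = BallTree(x, leaf_size=40)
nn = compute_nearest_neighbors_sketch(x[:50], tree, k=5, row_start=0, total_rows=100)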
Example #2
def compute_snn_matrix(nn, k_nearest):
    """ Compute shared-nearest-neighbor matrix from a nearest-neighbor boolean matrix """
    with LogPerf('tocsr'):
        nn = nn.tocsr(copy=False)

    # The SNN (shared nearest neighbor) similarity is
    #   The length of the nearest-neighbor intersection between two rows
    #   (divided by the max number of neighbors)
    # This can be computed via the dot products of rows in the boolean NN matrix
    with LogPerf('snn'):
        snn = (nn.dot(nn.T)) / float(k_nearest)

    # Use the SNN similarity in the modularity optimization algorithm
    # Louvain takes a text edge-list and converts to its own binary format
    with LogPerf('tocoo'):
        snn = snn.tocoo(copy=False)

    return snn
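
To see why the boolean dot product computes shared-neighbor counts, consider a tiny hand-built NN matrix (toy data, not from the pipeline): entry (i, j) of nn.dot(nn.T) sums over the columns where rows i and j are both 1, i.e. the number of neighbors the two points share.

import numpy as np
import scipy.sparse as sp_sparse

# Each row marks a point's k_nearest = 2 neighbors among 4 candidates
nn = sp_sparse.csr_matrix(np.array([[1, 1, 0, 0],
                                    [0, 1, 1, 0],
                                    [1, 1, 0, 0]], dtype=float))
snn = nn.dot(nn.T) / 2.0  # divide by k_nearest
print(snn.toarray())      # snn[0, 1] == 0.5: one shared neighbor out of two
                          # snn[0, 2] == 1.0: identical neighbor sets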
Example #3
def split(args):
    np.random.seed(0)

    if args.skip:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if args.similarity_type not in SIMILARITY_TYPES:
        martian.exit("Unsupported similarity type: %s. Must be one of: %s" %
                     (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    with LogPerf('load'):
        pca_mat = SingleGenomeAnalysis.load_pca_from_h5(
            args.pca_h5).transformed_pca_matrix

    # Subselect barcodes if desired
    if args.num_bcs is None:
        use_bcs = np.arange(pca_mat.shape[0])
    else:
        use_bcs = np.random.choice(pca_mat.shape[0],
                                   args.num_bcs,
                                   replace=False)
        pca_mat = pca_mat[use_bcs, :]

    # Record indices of selected barcodes
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Subselect PCs if desired
    if args.input_pcs is not None:
        n_pcs = min(pca_mat.shape[1], args.input_pcs)
        pca_mat = pca_mat[:, np.arange(n_pcs)]

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        balltree = cr_graphclust.build_neighbor_index(
            pca_mat, args.balltree_leaf_size or DEFAULT_BALLTREE_LEAFSIZE)
        neighbor_index = martian.make_path('neighbor_index.pickle')
        cr_graphclust.save_neighbor_index(balltree, neighbor_index)

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = (args.num_neighbors if args.num_neighbors is not None
                           else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT)
    given_neighbor_a = (args.neighbor_a if args.neighbor_a is not None
                        else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT)
    given_neighbor_b = (args.neighbor_b if args.neighbor_b is not None
                        else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT)

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a +
                     given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
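    # Illustrative arithmetic with made-up values (not the real defaults):
    # num_neighbors=15, a=5, b=10 and n=10000 barcodes gives
    # max(15, round(5 + 10 * log10(10000))) = max(15, 45) = 45,
    # which already lies inside [1, 9999] and is used as-is.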
    print "Using %d neighbors" % num_neighbors

    # Divide the PCA matrix up into rows for NN queries
    with LogPerf('chunk_pca'):
        chunks = []
        for row_start in xrange(0, pca_mat.shape[0], NN_QUERIES_PER_CHUNK):
            row_end = min(row_start + NN_QUERIES_PER_CHUNK, pca_mat.shape[0])

            # Write the pca submatrix to an h5 file
            submatrix_path = martian.make_path('%d_submatrix.h5' % row_start)
            cr_graphclust.save_ndarray_h5(pca_mat[row_start:row_end, :],
                                          submatrix_path, 'submatrix')

            chunks.append({
                'neighbor_index': neighbor_index,
                'submatrix': submatrix_path,
                'row_start': row_start,
                'total_rows': pca_mat.shape[0],
                'k_nearest': num_neighbors,
                'use_bcs': use_bcs_path,
            })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil(
                (num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
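
build_neighbor_index, save_neighbor_index and load_neighbor_index are internal helpers; a plausible sketch, assuming they wrap sklearn's BallTree plus pickling (the real implementation may differ):

import cPickle as pickle  # Python 2, as in the examples; use `import pickle` on Python 3
from sklearn.neighbors import BallTree

def build_neighbor_index_sketch(x, leaf_size):
    # A ball tree supports fast k-NN queries in moderate dimensions (e.g. PCA space)
    return BallTree(x, leaf_size=leaf_size)

def save_neighbor_index_sketch(index, path):
    with open(path, 'wb') as f:
        pickle.dump(index, f, pickle.HIGHEST_PROTOCOL)

def load_neighbor_index_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)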
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return
    # Merge the neighbor matrices
    with LogPerf('merge_nn'):
        nn = cr_graphclust.merge_nearest_neighbors(
            [chunk.chunked_neighbors for chunk in chunk_outs],
            chunk_defs[0].total_rows)
    print 'nn\tnn_nodes\t%d' % nn.shape[0]
    print 'nn\tnn_links\t%d' % nn.nnz
    print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
    sys.stdout.flush()

    matrix_bin = martian.make_path('matrix.bin')
    matrix_weights = martian.make_path('matrix.weights')
    louvain_out = martian.make_path('louvain.out')

    if args.similarity_type == SNN_SIMILARITY:
        snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs[0].k_nearest)

        print 'snn\tsnn_nodes\t%d' % snn.shape[0]
        print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
        print 'snn\tsnn_density\t%0.4f' % (
            (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
        sys.stdout.flush()

        with LogPerf('convert'):
            cr_graphclust.pipe_weighted_edgelist_to_convert(
                snn, matrix_bin, matrix_weights)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_weighted_clustering(
                matrix_bin, matrix_weights, louvain_out)

    else:
        with LogPerf('tocoo'):
            nn = nn.tocoo(copy=False)

        with LogPerf('convert'):
            cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_unweighted_clustering(
                matrix_bin, louvain_out)

    with LogPerf('load_bcs'):
        barcodes = SingleGenomeAnalysis.load_bcs_from_matrix_h5(args.matrix_h5)

    use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs[0].use_bcs, 'use_bcs')

    labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                louvain_out)

    labels = cr_clustering.relabel_by_size(labels)

    # Save cluster results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      labels, barcodes)

    outs.chunked_neighbors = None
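
cr_clustering.relabel_by_size is also internal; assuming from its name that it renumbers clusters so that label 1 is the largest, a minimal sketch with 1-based labels:

import numpy as np

def relabel_by_size_sketch(labels):
    counts = np.bincount(labels)[1:]     # sizes of clusters 1..k
    order = np.argsort(-counts) + 1      # cluster labels, largest first
    new_label = np.zeros(labels.max() + 1, dtype=labels.dtype)
    new_label[order] = np.arange(1, len(order) + 1)
    return new_label[labels]

print(relabel_by_size_sketch(np.array([2, 2, 2, 1, 1, 3])))  # -> [1 1 1 2 2 3]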
Example #5
def split(args):
    np.random.seed(0)

    if args.matrix_h5 is None:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if not os.path.exists(args.reduced_data):
        raise IOError('reduced data not found at {}'.format(args.reduced_data))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization(s): %s. Must be a subset of: %s' %
                         (','.join(args.factorization),
                          ','.join(ALLOWED_FACTORIZATIONS)))

    if args.similarity_type not in SIMILARITY_TYPES:
        raise ValueError(
            'Unsupported similarity type: %s. Must be one of: %s' %
            (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    reduction_summary = args.reduction_summary['h5']

    method_dict = {}
    for method in args.factorization:
        method_dict[method] = {}

    with LogPerf('load'):
        for method in args.factorization:
            if method == 'pca':
                method_dict[method]['transformed_matrix'] = cr_pca.load_pca_from_h5(
                    reduction_summary[method]).transformed_pca_matrix
            elif method == 'lsa':
                method_dict[method]['transformed_matrix'] = cr_lsa.load_lsa_from_h5(
                    reduction_summary[method]).transformed_lsa_matrix
            elif method == 'plsa':
                method_dict[method]['transformed_matrix'] = cr_plsa.load_plsa_from_h5(
                    reduction_summary[method]).transformed_plsa_matrix

    # Record indices of selected barcodes. All methods must use same barcodes
    use_bcs = np.arange(
        method_dict[args.factorization[0]]['transformed_matrix'].shape[0])
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            # Normalize lsa/plsa rows to unit length so that euclidean distance
            # in the normalized space corresponds to cosine distance in the
            # original space (verified numerically in the check after this example)
            if method in ['plsa', 'lsa']:
                method_mat = method_mat / np.linalg.norm(
                    method_mat, axis=1, keepdims=True)
            balltree = cr_graphclust.build_neighbor_index(
                method_mat, args.balltree_leaf_size
                or DEFAULT_BALLTREE_LEAFSIZE)
            method_dict[method]['neighbor_index'] = martian.make_path(
                'neighbor_index_{}.pickle'.format(method))
            cr_graphclust.save_neighbor_index(
                balltree, method_dict[method]['neighbor_index'])

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = (args.num_neighbors if args.num_neighbors is not None
                           else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT)
    given_neighbor_a = (args.neighbor_a if args.neighbor_a is not None
                        else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT)
    given_neighbor_b = (args.neighbor_b if args.neighbor_b is not None
                        else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT)

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a +
                     given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide each method's matrix up into rows for NN queries
    with LogPerf('chunk_matrix'):
        chunks = []
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            for row_start in xrange(0, method_mat.shape[0],
                                    NN_QUERIES_PER_CHUNK):
                row_end = min(row_start + NN_QUERIES_PER_CHUNK,
                              method_mat.shape[0])

                # Write the submatrix to an h5 file
                submatrix_path = martian.make_path('{}_{}_submatrix.h5'.format(
                    method, row_start))
                cr_graphclust.save_ndarray_h5(method_mat[row_start:row_end, :],
                                              submatrix_path, 'submatrix')

                chunks.append({
                    'method': method,
                    'neighbor_index': method_dict[method]['neighbor_index'],
                    'submatrix': submatrix_path,
                    'row_start': row_start,
                    'total_rows': method_mat.shape[0],
                    'k_nearest': num_neighbors,
                    'use_bcs': use_bcs_path,
                })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil(
                (num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
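
A quick numeric check of the lsa/plsa normalization in this split(): after L2 normalization, squared euclidean distance equals 2 * (1 - cosine similarity), so euclidean nearest neighbors in the normalized space are exactly the cosine nearest neighbors in the original space.

import numpy as np

a = np.array([3.0, 4.0])
b = np.array([1.0, 2.0])
an = a / np.linalg.norm(a)
bn = b / np.linalg.norm(b)
cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# ||an - bn||^2 == 2 - 2 * cos_sim, i.e. twice the cosine distance
assert np.isclose(np.sum((an - bn) ** 2), 2.0 * (1.0 - cos_sim))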
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    if args.matrix_h5 is None:
        outs.graph_clustering_summary = {}
        return

    outs.graph_clustering_summary = {'h5': {}, 'csv': {}}
    # Merge the neighbor matrices
    for method in args.factorization:
        chunk_outs_def_method = [
            (chunk_out, chunk_def)
            for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
            if chunk_def.method == method
        ]
        chunk_outs_method = [c[0] for c in chunk_outs_def_method]
        chunk_defs_method = [c[1] for c in chunk_outs_def_method]

        with LogPerf('merge_nn'):
            nn = cr_graphclust.merge_nearest_neighbors(
                [chunk.chunked_neighbors for chunk in chunk_outs_method],
                chunk_defs_method[0].total_rows)
        print 'nn\tnn_nodes\t%d' % nn.shape[0]
        print 'nn\tnn_links\t%d' % nn.nnz
        print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
        sys.stdout.flush()

        matrix_bin = martian.make_path('matrix_{}.bin'.format(method))
        matrix_weights = martian.make_path('matrix_{}.weights'.format(method))
        louvain_out = martian.make_path('louvain_{}.out'.format(method))

        if args.similarity_type == SNN_SIMILARITY:
            snn = cr_graphclust.compute_snn_matrix(
                nn, chunk_defs_method[0].k_nearest)

            print 'snn\tsnn_nodes\t%d' % snn.shape[0]
            print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
            print 'snn\tsnn_density\t%0.4f' % (
                (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
            sys.stdout.flush()

            with LogPerf('convert'):
                cr_graphclust.pipe_weighted_edgelist_to_convert(
                    snn, matrix_bin, matrix_weights)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_weighted_clustering(
                    matrix_bin, matrix_weights, louvain_out)

        else:
            with LogPerf('tocoo'):
                nn = nn.tocoo(copy=False)

            with LogPerf('convert'):
                cr_graphclust.pipe_unweighted_edgelist_to_convert(
                    nn, matrix_bin)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_unweighted_clustering(
                    matrix_bin, louvain_out)

        with LogPerf('load_bcs'):
            barcodes = None
            with h5.File(args.matrix_h5, 'r') as f:
                group_name = f.keys()[0]
                barcodes = cr_matrix.CountMatrix.load_bcs_from_h5_group(
                    f[group_name])

        use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs_method[0].use_bcs,
                                                'use_bcs')

        labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                    louvain_out)

        labels = cr_clustering.relabel_by_size(labels)

        # Save cluster results
        cr_io.mkdir(outs.knn_clusters, allow_existing=True)
        method_dir = os.path.join(outs.knn_clusters, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        _h5 = os.path.join(method_dir, "clusters.h5")
        _csv = os.path.join(method_dir, "clusters_csv")
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_graphclust.save_graphclust_h5(f, labels)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)
        cr_clustering.save_clustering_csv(_csv, clustering_key, labels,
                                          barcodes)
        outs.graph_clustering_summary['h5'][method] = _h5
        outs.graph_clustering_summary['csv'][method] = _csv

    outs.chunked_neighbors = None
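
cr_graphclust.matrix_density, printed by both join() examples, is presumably the fraction of nonzero entries; a one-line sketch of that assumption:

def matrix_density_sketch(m):
    # Fraction of nonzero entries in a scipy sparse matrix
    return m.nnz / float(m.shape[0] * m.shape[1])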
Example #7
def main(args, outs):
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            lib_metrics = metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(lib_idx)]
            lib_metrics[cr_mol_counter.DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert (min(gem_groups) == 1 and
                np.all(np.diff(np.array(gem_groups, dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(
                args.gem_group_barcode_ranges.iteritems(),
                key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's gem group
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]
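        # Worked example (hypothetical numbers): if gem groups 1 and 2 share one
        # whitelist of length W, a gem-group-2 molecule with barcode_idx b first
        # maps back to b - gg_barcode_idx_start[2], then shifts up by
        # gg_barcode_matrix_start[1] == W, landing in the second whitelist
        # copy [W, 2W) of the matrix barcode space.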

        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print 'num_barcode_seqs\t%d\tnum_gem_groups\t%d' % (len(barcode_seqs),
                                                            len(gem_groups))
        print 'creating barcode strings'
        LogPerf.mem()

        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per (library, genome) pair
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()
        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary),
                      f,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
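
The heart of the downsampling in main() above is the per-molecule binomial draw; a toy run with made-up counts, showing that molecules whose reads all drop out are removed:

import numpy as np

np.random.seed(0)
read_pairs = np.array([4, 1, 10, 2])   # read pairs per molecule (toy data)
rate = np.array([0.5, 0.5, 0.1, 1.0])  # per-molecule subsampling rates
new_read_pairs = np.random.binomial(read_pairs, rate)
keep_mol = np.flatnonzero(new_read_pairs)  # keep molecules with surviving reads
print(new_read_pairs[keep_mol])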