Code Example #1
File: __init__.py Project: mosquitoCat/cellranger
def main(args, outs):
    np.random.seed(args.random_seed)

    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)
    pca_mat = pca.transformed_pca_matrix

    # Subsample barcodes
    if args.num_bcs is not None:
        use_bcs = np.random.choice(pca_mat.shape[0],
                                   args.num_bcs,
                                   replace=False)
        matrix = matrix.select_barcodes(use_bcs)
        pca_mat = pca_mat[use_bcs, :]

    # Subset principal components
    if args.num_pcs is not None:
        pca_mat = pca_mat[:, np.arange(args.num_pcs)]

    kmeans = cr_kmeans.run_kmeans(pca_mat,
                                  args.n_clusters,
                                  random_state=args.random_seed)

    with cr_io.open_h5_for_writing(outs.kmeans_h5) as f:
        cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

    cr_clustering.save_clustering_csv(outs.kmeans_csv, clustering_key,
                                      kmeans.clusters, matrix.bcs)
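
The pattern in this stage — optionally subsample barcodes, truncate principal components, then cluster — is easy to reproduce outside the pipeline. The sketch below is a minimal standalone approximation with scikit-learn; treating cr_kmeans.run_kmeans as roughly equivalent to sklearn's KMeans is an assumption, and the random data is a stand-in for pca.transformed_pca_matrix.

# Minimal sketch of the subsample-then-cluster pattern above.
# Assumption: cr_kmeans.run_kmeans behaves like sklearn's KMeans here.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
pca_mat = rng.randn(1000, 50)      # stand-in for pca.transformed_pca_matrix
num_bcs, num_pcs, n_clusters = 500, 10, 8

# Subsample barcodes (rows)
use_bcs = rng.choice(pca_mat.shape[0], num_bcs, replace=False)
pca_mat = pca_mat[use_bcs, :]

# Subset principal components (columns)
pca_mat = pca_mat[:, :num_pcs]

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_mat)
print(kmeans.labels_[:10])         # one cluster label per kept barcode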
Code Example #2
def main(args, outs):
    np.random.seed(0)

    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    matrix_bcs = cr_matrix.CountMatrix.load_bcs_from_h5_file(
        args.filtered_matrix)
    for method in args.factorization:
        transformed_matrix = args.transformed_matrix[method]
        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        file_head = CLUSTER_FILE_HEAD[method]
        _h5 = os.path.join(method_dir, file_head + ".h5")
        _csv = os.path.join(method_dir, file_head + "_csv")
        dr_mat = None

        if not os.path.exists(transformed_matrix):
            raise IOError('transformed matrix not found: %s' % transformed_matrix)

        if method == 'pca':
            pca = cr_pca.load_pca_from_h5(transformed_matrix)
            dr_mat = pca.transformed_pca_matrix
        elif method == 'lsa':
            lsa = cr_lsa.load_lsa_from_h5(transformed_matrix)
            # Add a tiny epsilon so all-zero rows don't divide by zero below
            lsa = lsa._replace(
                transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
            # Unit-normalize rows so that Euclidean distance in the new space
            # is cosine distance in the original space
            dr_mat = lsa.transformed_lsa_matrix / np.linalg.norm(
                lsa.transformed_lsa_matrix, axis=1, keepdims=True)
        elif method == 'plsa':
            plsa = cr_plsa.load_plsa_from_h5(transformed_matrix)
            plsa = plsa._replace(
                transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
            dr_mat = plsa.transformed_plsa_matrix / np.linalg.norm(
                plsa.transformed_plsa_matrix, axis=1, keepdims=True)
        else:
            raise ValueError('unknown factorization method: %s' % method)

        if args.num_dims is not None:
            if args.num_dims > dr_mat.shape[1]:
                raise ValueError(
                    'num_dims (%d) exceeds the dimensionality of the data (%d)'
                    % (args.num_dims, dr_mat.shape[1]))
            dr_mat = dr_mat[:, :args.num_dims]

        kmeans = cr_kmeans.run_kmeans(dr_mat,
                                      args.n_clusters,
                                      random_state=args.random_seed)
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)
        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)
        cr_clustering.save_clustering_csv(_csv, clustering_key,
                                          kmeans.clusters, matrix_bcs)
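
The 1e-120 offset and the row normalization above deserve a note: the epsilon keeps all-zero rows from producing a zero norm (and a divide-by-zero), and unit-normalizing the rows makes Euclidean k-means act on cosine distance, since for unit vectors ||u - v||^2 = 2 - 2*cos(u, v). A short NumPy check of that identity:

# Verify: squared Euclidean distance between unit-normalized vectors equals
# 2 - 2 * cosine_similarity of the original vectors.
import numpy as np

rng = np.random.RandomState(0)
a, b = rng.randn(30), rng.randn(30)

u = a / np.linalg.norm(a)
v = b / np.linalg.norm(b)

cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
sq_euclid = np.sum((u - v) ** 2)

assert np.isclose(sq_euclid, 2 - 2 * cos_sim)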
Code Example #3
def main(args, outs):
    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)

    if args.method == 'pca':
        transformed_matrix = cr_pca.load_pca_from_h5(
            args.transformed_matrix_h5).transformed_pca_matrix
    elif args.method == 'lsa':
        lsa = cr_lsa.load_lsa_from_h5(args.transformed_matrix_h5)
        # Add a tiny epsilon so all-zero rows don't divide by zero below
        lsa = lsa._replace(
            transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
        # Unit-normalize rows so that Euclidean distance in the new space is
        # cosine distance in the original space
        transformed_matrix = lsa.transformed_lsa_matrix / np.linalg.norm(
            lsa.transformed_lsa_matrix, axis=1, keepdims=True)
    elif args.method == 'plsa':
        plsa = cr_plsa.load_plsa_from_h5(args.transformed_matrix_h5)
        plsa = plsa._replace(
            transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
        transformed_matrix = plsa.transformed_plsa_matrix / np.linalg.norm(
            plsa.transformed_plsa_matrix, axis=1, keepdims=True)
    else:
        raise ValueError('unknown method: %s' % args.method)

    tsne_dims = args.tsne_dims
    tsne = cr_tsne.run_tsne(transformed_matrix,
                            key=str(tsne_dims),
                            tsne_dims=tsne_dims,
                            input_pcs=args.tsne_input_pcs,
                            perplexity=args.tsne_perplexity,
                            theta=args.tsne_theta,
                            max_iter=args.tsne_max_iter,
                            stop_lying_iter=args.tsne_stop_lying_iter,
                            mom_switch_iter=args.tsne_mom_switch_iter,
                            random_state=args.random_seed)

    filters = tables.Filters(complevel=h5_constants.H5_COMPRESSION_LEVEL)
    _h5 = os.path.join(outs.tsne, args.method + '_tsne.h5')
    _csv = os.path.join(outs.tsne, args.method + '_tsne_csv')
    with tables.open_file(_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne, f)

    cr_tsne.save_tsne_csv(tsne, matrix, _csv)
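
The theta, stop_lying_iter, and mom_switch_iter arguments indicate that cr_tsne.run_tsne wraps a Barnes-Hut t-SNE implementation. A rough standalone analogue with scikit-learn follows; the parameter mapping (theta to angle) is an assumption, and sklearn does not expose the lying/momentum schedule knobs at all.

# Rough standalone analogue of the run_tsne call using scikit-learn's
# Barnes-Hut t-SNE. This is an approximation, not what cellranger calls.
import numpy as np
from sklearn.manifold import TSNE

rng = np.random.RandomState(0)
transformed_matrix = rng.randn(500, 50)   # stand-in for the PCA/LSA/PLSA matrix

embedding = TSNE(n_components=2,          # tsne_dims
                 perplexity=30.0,         # args.tsne_perplexity
                 angle=0.5,               # args.tsne_theta
                 random_state=0).fit_transform(transformed_matrix)
print(embedding.shape)                    # (500, 2)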
Code Example #4
File: __init__.py Project: yu1033704806/cellranger
def main(args, outs):
    if args.skip:
        return

    tsne_dims = args.tsne_dims

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)

    if args.feature_type == lib_constants.GENE_EXPRESSION_LIBRARY_TYPE:
        # Use PCA for gene expression
        pca = cr_pca.load_pca_from_h5(args.pca_h5)
        tsne_input = pca.transformed_pca_matrix
    else:
        # Use feature space for other feature types
        # Assumes other feature types are much lower dimension than gene expression
        matrix = matrix.select_features_by_type(args.feature_type)
        matrix.m.data = np.log2(1 + matrix.m.data)
        tsne_input = matrix.m.transpose().todense()

    name = get_tsne_name(args.feature_type, args.tsne_dims)
    key = get_tsne_key(args.feature_type, args.tsne_dims)

    tsne = cr_tsne.run_tsne(tsne_input,
                            name=name,
                            key=key,
                            input_pcs=args.input_pcs,
                            perplexity=args.perplexity,
                            theta=args.theta,
                            tsne_dims=tsne_dims,
                            max_iter=args.max_iter,
                            stop_lying_iter=args.stop_lying_iter,
                            mom_switch_iter=args.mom_switch_iter,
                            random_state=args.random_seed)

    filters = tables.Filters(complevel=h5_constants.H5_COMPRESSION_LEVEL)
    with tables.open_file(outs.tsne_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne, f)

    cr_tsne.save_tsne_csv(tsne, matrix, outs.tsne_csv)
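
The log2(1 + x) transform in the non-gene-expression branch is applied to matrix.m.data, i.e. only to the stored nonzeros of the sparse matrix; since log2(1 + 0) == 0, implicit zeros stay zero and the sparsity pattern is preserved. A small SciPy sketch of the same prep (the shapes here are illustrative):

# Log-transform a sparse features x barcodes matrix in place, then produce
# the dense barcodes x features array that t-SNE expects.
import numpy as np
import scipy.sparse as sp

m = sp.random(20, 100, density=0.1, format='csc', random_state=0) * 50

# Touches only stored nonzeros; log2(1 + 0) == 0, so sparsity is preserved
m.data = np.log2(1 + m.data)

tsne_input = np.asarray(m.transpose().todense())  # barcodes x features
print(tsne_input.shape)                           # (100, 20)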
Code Example #5
File: __init__.py Project: GWW/cellranger_211_mirror
def main(args, outs):
    if args.skip or args.is_multi_genome:
        return

    tsne_dims = args.tsne_dims

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)
    tsne = cr_tsne.run_tsne(pca.transformed_pca_matrix,
                            input_pcs=args.input_pcs,
                            perplexity=args.perplexity,
                            theta=args.theta,
                            tsne_dims=tsne_dims,
                            max_iter=args.max_iter,
                            stop_lying_iter=args.stop_lying_iter,
                            mom_switch_iter=args.mom_switch_iter,
                            random_state=args.random_seed)
    tsne_map = {tsne_dims: tsne}

    filters = tables.Filters(complevel=cr_constants.H5_COMPRESSION_LEVEL)
    with tables.open_file(outs.tsne_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne_map, f)

    cr_tsne.save_tsne_csv(tsne_map, matrix, outs.tsne_csv)
Code Example #6
def split(args):
    np.random.seed(0)

    if args.matrix_h5 is None:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if not os.path.exists(args.reduced_data):
        raise IOError('reduced data not found at {}'.format(args.reduced_data))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    if args.similarity_type not in SIMILARITY_TYPES:
        raise ValueError(
            'Unsupported similarity type: %s. Must be one of: %s' %
            (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    reduction_summary = args.reduction_summary['h5']

    method_dict = {}
    for method in args.factorization:
        method_dict[method] = {}

    with LogPerf('load'):
        for method in args.factorization:
            if method == 'pca':
                method_dict[method][
                    'transformed_matrix'] = cr_pca.load_pca_from_h5(
                        reduction_summary[method]).transformed_pca_matrix
            if method == 'lsa':
                method_dict[method][
                    'transformed_matrix'] = cr_lsa.load_lsa_from_h5(
                        reduction_summary[method]).transformed_lsa_matrix
            if method == 'plsa':
                method_dict[method][
                    'transformed_matrix'] = cr_plsa.load_plsa_from_h5(
                        reduction_summary[method]).transformed_plsa_matrix

    # Record indices of selected barcodes. All methods must use same barcodes
    use_bcs = np.arange(
        method_dict[args.factorization[0]]['transformed_matrix'].shape[0])
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            # normalize for plsa/lsa so that standard euclidean distance in normalized space is cosine distance in original space
            if method in ['plsa', 'lsa']:
                method_mat = method_mat / np.linalg.norm(
                    method_mat, axis=1, keepdims=True)
            balltree = cr_graphclust.build_neighbor_index(
                method_mat, args.balltree_leaf_size
                or DEFAULT_BALLTREE_LEAFSIZE)
            method_dict[method]['neighbor_index'] = martian.make_path(
                'neighbor_index_{}.pickle'.format(method))
            cr_graphclust.save_neighbor_index(
                balltree, method_dict[method]['neighbor_index'])

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a +
                     given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide the PCA matrix up into rows for NN queries
    with LogPerf('chunk_matrix'):
        chunks = []
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            for row_start in range(0, method_mat.shape[0],
                                   NN_QUERIES_PER_CHUNK):
                row_end = min(row_start + NN_QUERIES_PER_CHUNK,
                              method_mat.shape[0])

                # Write the submatrix to an h5 file
                submatrix_path = martian.make_path('{}_{}_submatrix.h5'.format(
                    method, row_start))
                cr_graphclust.save_ndarray_h5(method_mat[row_start:row_end, :],
                                              submatrix_path, 'submatrix')

                chunks.append({
                    'method': method,
                    'neighbor_index': method_dict[method]['neighbor_index'],
                    'submatrix': submatrix_path,
                    'row_start': row_start,
                    'total_rows': method_mat.shape[0],
                    'k_nearest': num_neighbors,
                    'use_bcs': use_bcs_path,
                })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil(
                (num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
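
The neighbor-count heuristic in split() is worth isolating: it takes the larger of a fixed floor and an affine function of log10(n), then clamps the result to [1, n - 1] so a nearest-neighbor query is always valid. A standalone sketch follows; the default constants here are illustrative placeholders, not cellranger's actual GRAPHCLUST_* values.

# k = max(num_neighbors, round(a + b * log10(n))), clamped to [1, n - 1].
# The defaults below are placeholders, not cellranger's real constants.
import numpy as np

def compute_num_neighbors(n_bcs, num_neighbors=20, a=-230.0, b=120.0):
    use_neighbors = int(max(num_neighbors,
                            np.round(a + b * np.log10(n_bcs))))
    return max(1, min(use_neighbors, n_bcs - 1))

for n in (100, 1000, 10000, 100000):
    print(n, compute_num_neighbors(n))

The clamp matters at both ends: with very few barcodes the log term can go negative, and with a small dataset even the fixed floor could exceed n - 1, which would make a k-nearest-neighbor query against the index (presumably a BallTree, per build_neighbor_index above) fail.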