コード例 #1
0
ファイル: __init__.py プロジェクト: GWW/cellranger_211_mirror
def split(args):
    """Plan the chunk and join memory requests for this stage.

    One chunk is emitted per entry of ``args.inputs``; each requests the
    whitelist-derived memory. The join request is the whitelist memory
    computed across all gem groups plus a worst-case estimate for sorting
    the molecule info, clamped to the range
    ``[cr_constants.MIN_MEM_GB, MAX_MEM_GB]``.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist``, ``gem_groups``,
            ``extract_reads_summary`` and ``inputs``.

    Returns:
        dict: ``{'chunks': [...], 'join': {...}}`` where every chunk has
        ``chunk_input`` and ``__mem_gb`` and the join has ``__mem_gb``.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    all_groups_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Worst case for the final molecule info: one row per read.
    row_count = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                              'total_reads')

    # Sorting in MoleculeCounter.concatenate_sort needs, for N rows:
    #   8*N bytes for the sort indices, plus
    #   (8+8+8)*N bytes to load, concatenate, and index into a 64-bit
    #   data column.
    bytes_per_row = 32
    sort_gb = int(math.ceil((bytes_per_row * row_count) / 1e9))

    # Clamp: raise to the floor first, then cap at the ceiling.
    join_request_gb = max(cr_constants.MIN_MEM_GB, all_groups_gb + sort_gb)
    join_request_gb = min(MAX_MEM_GB, join_request_gb)

    return {
        'chunks': [{
            'chunk_input': item,
            '__mem_gb': per_chunk_gb,
        } for item in args.inputs],
        'join': {
            '__mem_gb': join_request_gb,
        },
    }
コード例 #2
0
ファイル: __init__.py プロジェクト: mosquitoCat/cellranger
def split(args):
    """Emit a single chunk whose memory request comes from the whitelist.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist`` and
            ``gem_groups``.

    Returns:
        dict: ``{'chunks': [{'__mem_gb': <request>}]}``.
    """
    whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)
    only_chunk = {'__mem_gb': whitelist_gb}
    return {'chunks': [only_chunk]}
コード例 #3
0
ファイル: __init__.py プロジェクト: GWW/cellranger_211_mirror
def split(args):
    """Create one chunk per gem-group slice of every molecule file.

    Each file in ``args.molecule_chunks`` is opened read-only and walked
    with ``get_chunks_by_gem_group()``. Every resulting chunk carries the
    file path, the gem group (as a string), the slice bounds, the gem
    group's recovered/force cell metrics (``None`` when absent) and a memory
    request equal to the whitelist memory plus the estimate for the slice.

    Args:
        args: Stage arguments. Reads ``molecule_chunks``.

    Returns:
        dict: ``{'chunks': [...]}``.
    """
    chunks = []
    for molecule_h5 in args.molecule_chunks:
        with cr_mol_counter.MoleculeCounter.open(molecule_h5,
                                                 'r') as mol_counter:
            barcode_whitelist = mol_counter.get_barcode_whitelist()
            whitelist_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
                barcode_whitelist, use_min=False)
            for gem_group, chunk_start, chunk_len in mol_counter.get_chunks_by_gem_group(
            ):
                mol_mem_gb = cr_mol_counter.MoleculeCounter.estimate_mem_gb(
                    chunk_len)
                # Fetch this gem group's metrics once and read both values
                # from the same mapping instead of repeating the lookup.
                gem_group_metrics = mol_counter.get_metric(
                    cr_mol_counter.GEM_GROUPS_METRIC)[gem_group]
                chunks.append({
                    'molecule_h5': molecule_h5,
                    'gem_group': str(gem_group),
                    'recovered_cells': gem_group_metrics.get(
                        cr_mol_counter.GG_RECOVERED_CELLS_METRIC, None),
                    'force_cells': gem_group_metrics.get(
                        cr_mol_counter.GG_FORCE_CELLS_METRIC, None),
                    'chunk_start': chunk_start,
                    'chunk_len': chunk_len,
                    '__mem_gb': whitelist_mem_gb + mol_mem_gb,
                })

    return {'chunks': chunks}
コード例 #4
0
def split(args):
    """Pair genome inputs, trimmed inputs and gem groups into chunks.

    The three sequences are combined with ``itertools.izip_longest``, so a
    shorter sequence is padded with ``None``. A missing
    ``args.trimmed_inputs`` is treated as an empty list.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist``, ``gem_groups``,
            ``genome_inputs`` and ``trimmed_inputs``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(args.barcode_whitelist)
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(args.barcode_whitelist, args.gem_groups)

    triples = itertools.izip_longest(
        args.genome_inputs, args.trimmed_inputs or [], args.gem_groups)
    chunk_defs = [{
        'chunk_genome_input': genome_in,
        'chunk_trimmed_input': trimmed_in,
        'gem_group': group,
        '__mem_gb': per_chunk_gb,
    } for genome_in, trimmed_in, group in triples]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_gb}}
コード例 #5
0
ファイル: __init__.py プロジェクト: mosquitoCat/cellranger
def split(args):
    """Emit one empty chunk and size only the join step.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist`` and
            ``gem_groups``.

    Returns:
        dict: ``{'chunks': [{}], 'join': {'__mem_gb': <request>}}``.
    """
    join_request = {
        '__mem_gb': cr_utils.get_mem_gb_request_from_barcode_whitelist(
            args.barcode_whitelist, args.gem_groups),
    }
    return {'chunks': [{}], 'join': join_request}
コード例 #6
0
def split(args):
    """Plan chunks per input and a join sized for the merged reporters.

    Each entry of ``args.inputs`` becomes a chunk requesting the
    whitelist-derived memory. The join request starts from the whitelist
    memory across all gem groups and adds estimates for the barcode and UMI
    diversity dictionaries held by the reporters, then is clamped to
    ``[cr_constants.MIN_MEM_GB, cr_constants.COUNT_GENES_MAX_MEM_GB]``.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist``, ``inputs``,
            ``gem_groups``, ``reference_path``, ``barcode_summary`` and
            ``chemistry_def``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    chunk_defs = [{
        'chunk_input': item,
        '__mem_gb': per_chunk_gb,
    } for item in args.inputs]

    whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # The reporters' memory (particularly the bc and umi diversity dicts)
    # has to be accounted for as well.
    genomes = cr_utils.get_reference_genomes(args.reference_path)

    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if whitelist is None:
        # No whitelist available: take the count from the barcode summary.
        bc_entries = cr_utils.get_num_barcodes_from_barcode_summary(
            args.barcode_summary)
    else:
        bc_entries = len(whitelist) * max(args.gem_groups)

    umi_entries = 4**cr_chem.get_umi_length(args.chemistry_def)

    def reporter_gb(entry_count):
        # The factor of 2 holds the current reporter plus the accumulating
        # reporter during the merge.
        return (2 * entry_count *
                cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                (len(genomes) + 1) *
                len(cr_constants.READ_TYPES)) / 1e9

    bc_gb = reporter_gb(bc_entries)
    umi_gb = reporter_gb(umi_entries)

    requested_gb = int(whitelist_gb + bc_gb + umi_gb)
    floored_gb = max(cr_constants.MIN_MEM_GB, requested_gb)
    capped_gb = min(cr_constants.COUNT_GENES_MAX_MEM_GB, floored_gb)

    return {'chunks': chunk_defs, 'join': {'__mem_gb': capped_gb}}
コード例 #7
0
ファイル: __init__.py プロジェクト: GWW/cellranger_211_mirror
def split(args):
    """Annotate the incoming chunk definitions with memory and read counts.

    Every chunk in ``args.chunks`` is copied and extended with a
    ``__mem_gb`` request and an ``initial_reads`` value, which is
    ``args.initial_reads`` divided by the number of chunks, or ``None``
    when no initial read count was given. The caller's chunk dicts are not
    modified.

    Args:
        args: Stage arguments. Reads ``chunks`` (each providing a
            ``gem_group`` key), ``barcode_whitelist`` and ``initial_reads``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``.
    """
    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    chunk_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(args.barcode_whitelist)
    join_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(args.barcode_whitelist, gem_groups)

    # The per-chunk share is the same for every chunk, so compute it once.
    # The emptiness guard avoids dividing by zero when there are no chunks
    # (in which case the value is never used).
    if args.initial_reads is None or not args.chunks:
        reads_per_chunk = None
    else:
        reads_per_chunk = args.initial_reads / len(args.chunks)

    chunks = []
    for chunk in args.chunks:
        # Work on a copy so the input chunk definitions stay untouched.
        chunk_def = dict(chunk)
        chunk_def['__mem_gb'] = chunk_mem_gb
        chunk_def['initial_reads'] = reads_per_chunk
        chunks.append(chunk_def)

    join = {
        '__mem_gb': join_mem_gb,
    }
    return {'chunks': chunks, 'join': join}
コード例 #8
0
def split(args):
    """Persist the BAM comments and plan one chunk per genome input.

    ``args.bam_comments`` is dumped as JSON to ``bam_comments.json`` (path
    obtained from ``martian.make_path``) and that path is handed to every
    chunk. Genome inputs and gem groups are paired with
    ``itertools.izip_longest``, so the shorter sequence is padded with
    ``None``.

    Args:
        args: Stage arguments. Reads ``bam_comments``, ``barcode_whitelist``,
            ``gem_groups`` and ``genome_inputs``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``.
    """
    # Store the BAM comments in a JSON file that the chunks can read.
    comments_path = martian.make_path('bam_comments.json')
    with open(comments_path, 'w') as out:
        json.dump(args.bam_comments, out)

    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)

    pairs = itertools.izip_longest(args.genome_inputs, args.gem_groups)
    chunk_defs = [{
        'chunk_genome_input': genome_in,
        'gem_group': group,
        'bam_comments_json': comments_path,
        '__mem_gb': per_chunk_gb,
    } for genome_in, group in pairs]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_gb}}
コード例 #9
0
ファイル: __init__.py プロジェクト: GWW/cellranger_211_mirror
def split(args):
    """Slice the raw molecule file into chunks with memory/thread requests.

    The file at ``args.raw_molecules`` is opened read-only and divided via
    ``get_chunks(cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK)``. Each
    chunk requests four times the sum of the matrix memory and the slice's
    estimated memory, and one thread per 8 GB of that request (rounded,
    never fewer than one). The join requests the matrix memory, but at
    least ``cr_constants.MIN_MEM_GB``.

    Args:
        args: Stage arguments. Reads ``raw_molecules``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``; chunk
        bounds are emitted as strings.
    """
    chunk_defs = []
    with cr_mol_counter.MoleculeCounter.open(args.raw_molecules,
                                             'r') as counter:
        whitelist = counter.get_barcode_whitelist()
        matrix_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
            whitelist, use_min=False)
        slices = counter.get_chunks(
            cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK)
        for start, length in slices:
            slice_gb = cr_mol_counter.MoleculeCounter.estimate_mem_gb(length)
            request_gb = 4 * (matrix_gb + slice_gb)
            thread_count = max(1, int(round(request_gb / 8.0)))
            chunk_defs.append({
                'chunk_start': str(start),
                'chunk_len': str(length),
                '__mem_gb': request_gb,
                '__threads': thread_count,
            })

    return {
        'chunks': chunk_defs,
        'join': {
            '__mem_gb': max(matrix_gb, cr_constants.MIN_MEM_GB),
        },
    }
コード例 #10
0
def split(args):
    """Plan subsampling chunks over the molecule info file.

    The number of cells is the count of distinct barcodes across all
    genomes in ``args.filtered_barcodes``. For each subsample type (raw and
    mapped) and each target reads-per-cell depth, a subsampling rate is
    derived; depths whose rate exceeds 1.0 are dropped. Every remaining
    subsampling definition is crossed with fixed-size slices of the
    molecule info to form the chunk list. If that yields no chunks, a
    single empty placeholder chunk is emitted.

    Args:
        args: Stage arguments. Reads ``filtered_barcodes`` and
            ``molecule_info``.

    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``. When there
        are no cells, only ``{'chunks': [<placeholder>]}`` is returned.
    """
    # Count the cells: distinct barcodes over all genomes.
    per_genome_bcs = cr_utils.load_barcode_csv(args.filtered_barcodes)
    unique_bcs = set()
    for _genome, genome_bcs in per_genome_bcs.iteritems():
        unique_bcs.update(genome_bcs)
    n_cells = len(unique_bcs)

    if n_cells == 0:
        # NOTE(review): this early return has no 'join' entry and uses int
        # (not str) chunk bounds, unlike the placeholder at the end of the
        # function. Confirm that this difference is intended.
        placeholder = {
            'chunk_start': 0,
            'chunk_len': 0,
            'subsample_info': {},
        }
        return {'chunks': [placeholder]}

    # Pull everything that is needed out of the molecule info.
    with MoleculeCounter.open(args.molecule_info, 'r') as counter:
        total_rows = counter.nrows()
        whitelist = counter.get_barcode_whitelist()
        gem_groups = counter.get_gem_groups()

        raw_reads = counter.get_total_raw_reads()
        raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
        mapped_reads = counter.get_total_conf_mapped_filtered_bc_reads()

    mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

    # Extra target depths at each decile of the raw reads per cell.
    decile_rpcs = ([
        round(fraction * raw_rpc) for fraction in np.arange(0.1, 1.1, 0.1)
    ] if raw_reads > 0 else [])

    # The full list of target depths.
    target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + decile_rpcs

    type_multipliers = [
        (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
        (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0),
    ]

    # Build the subsampling definitions.
    subsamplings = []
    for subsample_type, rpc_multiplier in type_multipliers:
        for target_rpc in target_rpcs:
            wanted_mapped_reads = int(
                float(target_rpc) * float(n_cells) * rpc_multiplier)
            rate = tk_stats.robust_divide(wanted_mapped_reads, mapped_reads)

            # Kept as a "> 1.0 -> skip" test on purpose: a rate that does
            # not compare greater than 1.0 is retained.
            if rate > 1.0:
                continue

            subsamplings.append({
                'subsample_type': subsample_type,
                'target_rpc': target_rpc,
                'subsample_rate': rate,
                'all_target_rpc': target_rpcs,
            })

    # A chunk holds the whole gene-bc matrix plus one slice of the mol
    # info h5.
    matrix_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        whitelist, gem_groups)
    rows_per_chunk = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
    per_chunk_gb = matrix_gb + MoleculeCounter.estimate_mem_gb(rows_per_chunk)

    # Cross every subsampling definition with equal-sized row slices; the
    # final slice is shortened to the rows that remain.
    chunk_defs = [{
        'chunk_start': str(row_start),
        'chunk_len': str(min(total_rows - row_start, rows_per_chunk)),
        'subsample_info': definition,
        '__mem_gb': per_chunk_gb,
    } for definition in subsamplings
                  for row_start in xrange(0, total_rows, rows_per_chunk)]

    if not chunk_defs:
        # Nothing to subsample: emit one empty placeholder chunk.
        chunk_defs.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': {},
        })

    return {'chunks': chunk_defs, 'join': {'__mem_gb': matrix_gb}}