예제 #1
0
def split(args):
    """Build the chunk list and the memory requests for this stage.

    One chunk is emitted per entry in ``args.inputs``. Memory requests are
    derived from the ``total_reads`` metric stored in
    ``args.extract_reads_summary``, which is used as a worst-case row count
    for the final molecule info.

    Returns:
        A dict with two keys: ``'chunks'``, a list holding one dict per input
        (``chunk_input`` and ``__mem_gb``), and ``'join'``, a dict holding the
        join's ``__mem_gb``.
    """
    # Worst case: every read ends up as a row in the final molecule info.
    total_reads = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                'total_reads')

    # Per-chunk request, scaled from the average number of rows per input.
    # At least 8 GB, capped at MAX_MEM_GB; 1 GB when there are no inputs.
    num_inputs = len(args.inputs)
    if num_inputs > 0:
        rows_per_chunk = int(total_reads / num_inputs)
        chunk_estimate_gb = int(math.ceil((32 * rows_per_chunk) / 2.5e8))
        chunk_mem_gb = min(MAX_MEM_GB, max(8, chunk_estimate_gb))
    else:
        chunk_mem_gb = 1

    # Join request for concatenating molecule info, with N = total rows:
    #   8*N bytes to store the sort indices
    #   (8+8+8)*N bytes to load, concatenate, and index into a 64-bit column
    # NOTE(review): 32*N bytes would be N*32/1e9 GB; the 2.5e8 divisor gives
    # four times that -- presumably deliberate headroom, confirm.
    join_estimate_gb = int(math.ceil((32 * total_reads) / 2.5e8))
    join_mem_gb = min(MAX_MEM_GB, max(4, join_estimate_gb))

    chunk_defs = [{
        'chunk_input': chunk_input,
        '__mem_gb': chunk_mem_gb,
    } for chunk_input in args.inputs]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
예제 #2
0
def split(args):
    """Build the chunk list and the memory requests for this stage.

    Every chunk (one per entry in ``args.inputs``) requests the memory
    reported by ``cr_utils.get_mem_gb_request_from_barcode_whitelist`` for
    ``args.barcode_whitelist``. The join requests the whitelist figure
    computed with ``args.gem_groups`` and ``use_min=False`` plus an estimate
    for sorting the molecule info, kept at or above
    ``cr_constants.MIN_MEM_GB`` and capped at ``MAX_MEM_GB``.

    Returns:
        A dict with two keys: ``'chunks'``, a list holding one dict per input
        (``chunk_input`` and ``__mem_gb``), and ``'join'``, a dict holding the
        join's ``__mem_gb``.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Worst case: every read ends up as a row in the final molecule info.
    total_reads = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                'total_reads')

    # Memory for sorting in MoleculeCounter.concatenate_sort, N = total rows:
    #   8*N bytes to store the sort indices
    #   (8+8+8)*N bytes to load, concatenate, and index into a 64-bit column
    sort_gb = int(math.ceil((32 * total_reads) / 1e9))
    requested_join_gb = max(cr_constants.MIN_MEM_GB, whitelist_gb + sort_gb)
    join_mem_gb = min(MAX_MEM_GB, requested_join_gb)

    chunk_defs = [{
        'chunk_input': chunk_input,
        '__mem_gb': per_chunk_gb,
    } for chunk_input in args.inputs]

    return {'chunks': chunk_defs, 'join': {'__mem_gb': join_mem_gb}}
예제 #3
0
def main(args, outs):
    """Call cell barcodes per gem group and write the filtering outputs.

    For each gem group (only when a barcode whitelist is loaded) this calls
    ``call_cell_barcodes``, records the returned RPU/UMI thresholds and
    confidence on the reporter, optionally overrides the called barcodes with
    the ``args.force_cells`` best-supported ones, and feeds the reporter's
    barcode-filtering callback.

    Outputs written:
        outs.cell_barcodes: the called cell barcodes, via
            ``save_cell_barcodes_json``.
        outs.barcode_support: CSV with header ``barcode,count``.
        outs.barcode_umi_summary: via ``write_barcode_umi_summary``.
        outs.summary: the reporter's summary JSON.
    """
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    # Without a whitelist there is nothing to process per gem group.
    if barcode_whitelist is None:
        gem_groups_to_process = []
    else:
        gem_groups_to_process = all_gem_groups

    # The total raw read-pair count does not depend on the gem group, so the
    # extract_reads summary is read once, and only if a gem group is processed.
    total_read_pairs = None
    if gem_groups_to_process:
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

    for gem_group in gem_groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        # list() keeps this a plain sequence even where values() is a view.
        counts = np.array(list(barcode_dist.values()))

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        (gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold,
         confidence) = call_cell_barcodes(args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Override the call: keep the force_cells barcodes with the
                # highest support. Slicing already clamps to the list length.
                ranked = sorted(gg_bc_support.items(),
                                key=lambda kv: kv[1],
                                reverse=True)
                gg_cell_bcs = [bc for bc, _ in ranked[:args.force_cells]]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.items():
                bc_support[bc] += count

        # cell_barcodes is cumulative over the gem groups processed so far.
        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.items():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
예제 #4
0
def main(args, outs):
    """Call cell barcodes per gem group and write the filtering outputs.

    For each gem group (only when a barcode whitelist is loaded) this calls
    ``call_cell_barcodes``, records the returned threshold on the reporter,
    optionally overrides the called barcodes with the ``args.force_cells``
    best-supported ones, and feeds the reporter's barcode-filtering callback
    with the total and assemblable read-pair counts.

    Outputs written:
        outs.cell_barcodes: the called cell barcodes, via
            ``save_cell_barcodes_json``.
        outs.barcode_support: CSV with header ``barcode,count``.
        outs.barcode_umi_summary: via ``write_barcode_umi_summary``.
        outs.summary: the reporter's summary JSON.
    """
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    # Without a whitelist there is nothing to process per gem group.
    if barcode_whitelist is None:
        gem_groups_to_process = []
    else:
        gem_groups_to_process = all_gem_groups

    # Neither summary depends on the gem group, so each is read once, and
    # only if at least one gem group is processed.
    total_read_pairs = None
    assemblable_read_pairs_by_bc = None
    if gem_groups_to_process:
        # Total raw read pairs from the extract_reads summary
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')
        # Per-barcode assemblable read pairs from the assembly metrics summary
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')

    for gem_group in gem_groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        # list() keeps this a plain sequence even where values() is a view.
        counts = np.array(list(barcode_dist.values()))

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Override the call: keep the force_cells barcodes with the
                # highest support. Slicing already clamps to the list length.
                ranked = sorted(gg_bc_support.items(),
                                key=lambda kv: kv[1],
                                reverse=True)
                gg_cell_bcs = [bc for bc, _ in ranked[:args.force_cells]]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

        # Summed over the cell barcodes accumulated so far (cumulative across
        # the gem groups already processed).
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs,
                                        assemblable_read_pairs,
                                        recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.items():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
예제 #5
0
def main(args, outs):
    """Report per-barcode assembly metrics and autodetect the chain type.

    Steps:
      1. Load contig annotations (JSON) and index them by ``contig_name``.
      2. Optionally load the contig and UMI summary tables (tab-separated),
         grouped by ``barcode``, and the filter summary table.
      3. Group FASTA contigs by barcode, skipping any contig for which
         ``vdj_utils.is_contig_filtered`` returns False when a filter
         summary is given.
      4. For each barcode, feed the reporter's barcode-contig and assembly
         callbacks.
      5. If both ``args.assemble_metrics_summary`` and ``args.reads_summary``
         are set, record the assemblable-read-pair metrics.
      6. Set ``outs.chain_type`` to the single chain type that reaches
         ``MIN_CHAIN_TYPE_CONTIG_FRAC`` of the counted chain annotations, or
         to ``vdj_constants.ALL_CHAIN_TYPES`` otherwise.
      7. Write the reporter summary to ``outs.summary``.
    """
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)

    # Get annotations for each contig. The file is closed as soon as it has
    # been parsed instead of being left to the garbage collector.
    with open(args.annotations) as annotations_file:
        contig_annotations = {
            annotation['contig_name']: annotation
            for annotation in json.load(annotations_file)
        }

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode. The FASTA handle stays open only while
    # the iterator is being consumed.
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            if (filter_summary is not None
                    and not vdj_utils.is_contig_filtered(filter_summary,
                                                         contig_name)):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Bulk case: use a single empty barcode
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.

    # Count chain types over the annotations of contigs that are flagged
    # is_cell, high_confidence and productive.
    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.values():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    # Parenthesised so the statement parses on Python 2 and 3 alike; the
    # printed text is unchanged.
    print(chain_count)

    if len(chain_count) > 0:
        n_chain_annotations = sum(chain_count.values())
        sig_chains = [
            ct for ct, count in chain_count.items()
            if tk_stats.robust_divide(count, n_chain_annotations) >=
            MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)