Ejemplo n.º 1
0
def split(args):
    """Partition the contigs of the input BAM into chunks.

    Args:
        args: Stage arguments. Reads ``contig_bam`` (BAM path, may be None)
            and ``barcodes_in_chunks`` (an iterable of paths to JSON files,
            each holding a collection of barcodes, or None).

    Returns:
        dict: ``{'chunks': [...]}``. Every chunk has a ``'contigs'`` entry:
        a list of contig names, or None for the single dummy chunk emitted
        when there is no input. Barcode-grouped chunks also carry
        ``'__mem_gb': 6``.
    """
    # Use a dummy chunk to appease Martian if there was no input
    if args.contig_bam is None or not vdj_utils.bam_has_seqs(args.contig_bam):
        return {'chunks': [{'contigs': None}]}

    # Only the reference names are needed, so release the BAM handle as soon
    # as they are read -- including when reading them fails.
    bam_iter = tk_bam.create_bam_infile(args.contig_bam)
    try:
        all_contigs = list(bam_iter.references)
    finally:
        bam_iter.close()

    # Reuse the chunk structure from assemble_vdj_pd to chunk sets of contigs by barcode chunks
    chunks = []

    if args.barcodes_in_chunks is None:
        # Contigs are not grouped by barcode. Just split into a few chunks.
        # Trailing chunks may be empty when there are fewer contigs than
        # BULK_NCHUNKS; they are still emitted.
        contigs_per_chunk = int(np.ceil(
            len(all_contigs) / float(BULK_NCHUNKS)))
        for ch in range(BULK_NCHUNKS):
            start = ch * contigs_per_chunk
            stop = min(len(all_contigs), (ch + 1) * contigs_per_chunk)
            chunks.append({'contigs': all_contigs[start:stop]})
    else:
        # Parse each contig's barcode once instead of once per barcode chunk.
        # NOTE(review): assumes get_barcode_from_contig_name has no side
        # effects -- confirm.
        contig_barcodes = [
            (contig, vdj_utils.get_barcode_from_contig_name(contig))
            for contig in all_contigs
        ]
        for barcode_chunk_path in args.barcodes_in_chunks:
            with open(barcode_chunk_path) as f:
                chunk_barcodes = set(json.load(f))
            # Filtering the ordered pair list keeps contigs in BAM order.
            contig_names = [
                contig for contig, barcode in contig_barcodes
                if barcode in chunk_barcodes
            ]
            chunks.append({'contigs': contig_names, '__mem_gb': 6})

    return {'chunks': chunks}
Ejemplo n.º 2
0
def main(args, outs):
    """Feed per-barcode contigs and summaries to the reporter and write a summary JSON.

    Loads the contig annotations, the optional per-barcode contig and UMI
    summary tables and the optional filter table, groups the contig sequences
    by barcode, invokes the reporter callbacks once per barcode, and finally
    writes the report to ``outs.summary``.

    Args:
        args: Stage arguments. Reads ``annotations``, ``contig_summary``,
            ``umi_summary``, ``filter_summary``, ``contigs``,
            ``cell_barcodes`` and ``vdj_reference_path``.
        outs: Stage outputs. ``outs.summary`` is passed to
            ``reporter.report_summary_json``.
    """
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)

    # Get annotations for each contig, keyed by contig name
    with open(args.annotations) as annotations_file:
        contig_annotations = {
            annotation['contig_name']: annotation
            for annotation in json.load(annotations_file)
        }

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t',
                                     dtype={'component': int, 'num_reads': int,
                                            'num_pairs': int, 'num_umis': int,
                                            'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            # When a filter table is available, skip contigs it does not flag.
            if filter_summary is not None and \
               not vdj_utils.is_contig_filtered(filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        # defaultdict lookup: barcodes without contigs yield an empty list.
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    reporter.report_summary_json(outs.summary)
Ejemplo n.º 3
0
def main(args, outs):
    """Annotate the contigs of this chunk's barcodes and pickle the results.

    Reads contigs from ``args.contigs`` (and matching qualities from
    ``args.contigs_fastq`` when given), builds an ``AnnotatedContig`` for
    every contig whose barcode is in ``args.barcodes``, annotates features,
    primers (if ``args.primers`` is set) and CDR3, and pickles the list of
    annotated contigs to ``outs.chunked_annotations``.

    Args:
        args: Stage arguments. Reads ``vdj_reference_path``, ``barcodes``,
            ``cell_barcodes``, ``min_score_ratios``, ``min_word_sizes``,
            ``primers``, ``contig_summary``, ``filter_summary``,
            ``contigs_fastq`` and ``contigs``.
        outs: Stage outputs. ``outs.chunked_annotations`` is the pickle path;
            it is set to None (and nothing is written) when no reference
            path is given.
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences (only used when args.primers is set)
    primer_aligner, primer_filter = None, None
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(args.primers,
                                                                       vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    # Per-contig read and UMI counts; stay empty when there is no summary.
    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        # Walk the three columns in lockstep rather than building a Series
        # per row with iterrows().
        for name, num_reads, num_umis in zip(contig_summary['contig_name'],
                                             contig_summary['num_reads'],
                                             contig_summary['num_umis']):
            read_counts[name] = int(num_reads)
            umi_counts[name] = int(num_umis)

    filter_summary = None
    if args.filter_summary:
        try:
            # NOTE(review): assumes load_contig_summary_table reads the whole
            # table before returning, so the handle can be closed here -- confirm.
            with open(args.filter_summary) as filter_file:
                filter_summary = vdj_utils.load_contig_summary_table(filter_file)
        except EmptyDataError:
            filter_summary = None

    # The FASTQ is optional; it is consumed in lockstep with the FASTA below.
    fq_file = None
    fq_iter = None
    if args.contigs_fastq is not None:
        fq_file = open(args.contigs_fastq)
        fq_iter = tk_fasta.read_generator_fastq(fq_file, paired_end=False)

    try:
        with open(args.contigs) as contigs_file:
            for header, contig_sequence in cr_utils.get_fasta_iter(contigs_file):
                if fq_iter is None:
                    contig_quals = None
                else:
                    # next() works on both Python 2 and 3 iterators.
                    header_fq, contig_sequence_fq, contig_quals = next(fq_iter)
                    assert(contig_sequence_fq == contig_sequence)
                    assert(header_fq == header)

                barcode = vdj_utils.get_barcode_from_contig_name(header)
                contig_name = header.split(' ')[0]

                # Only annotate barcodes assigned to this chunk
                if barcode not in barcodes_in_chunk:
                    continue

                if filter_summary is not None:
                    filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
                else:
                    # Without a filter table every contig is marked filtered=True.
                    filtered = True

                contig = vdj_annot.AnnotatedContig(contig_name,
                                                   contig_sequence,
                                                   quals=contig_quals,
                                                   barcode=barcode,
                                                   is_cell=barcode in cell_barcodes_set,
                                                   filtered=filtered,
                                                   read_count=read_counts.get(contig_name),
                                                   umi_count=umi_counts.get(contig_name),
                                                   )

                contig.annotations = contig.annotate_features(feature_types,
                                                              feature_aligners,
                                                              feature_filters)

                if args.primers:
                    contig.primer_annotations = contig.annotate_features_by_group(primer_aligner,
                                                                                  alignment_filter=primer_filter)

                contig.annotate_cdr3()

                chunk_contigs.append(contig)
    finally:
        if fq_file is not None:
            fq_file.close()

    # Close the output explicitly so the pickle is flushed to disk before
    # the stage returns.
    with open(outs.chunked_annotations, 'wb') as out_file:
        cPickle.dump(chunk_contigs, out_file,
                     protocol=cPickle.HIGHEST_PROTOCOL)
Ejemplo n.º 4
0
def main(args, outs):
    """Report per-barcode metrics, efficiency metrics and the detected chain type.

    Loads the contig annotations, the optional per-barcode contig and UMI
    summary tables and the optional filter table, groups contig sequences by
    barcode and invokes the reporter callbacks once per barcode. If both the
    assembly metrics summary and the reads summary are given, two read-pair
    metrics are set on the reporter. Finally the chain type is autodetected
    and the report is written to ``outs.summary``.

    Args:
        args: Stage arguments. Reads ``annotations``, ``contig_summary``,
            ``umi_summary``, ``filter_summary``, ``contigs``,
            ``cell_barcodes``, ``vdj_reference_path``,
            ``assemble_metrics_summary`` and ``reads_summary``.
        outs: Stage outputs. ``outs.chain_type`` is set here;
            ``outs.summary`` is passed to ``reporter.report_summary_json``.
    """
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)

    # Get annotations for each contig, keyed by contig name
    with open(args.annotations) as annotations_file:
        contig_annotations = {
            annotation['contig_name']: annotation
            for annotation in json.load(annotations_file)
        }

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            # When a filter table is available, skip contigs it does not flag.
            if filter_summary is not None and not vdj_utils.is_contig_filtered(
                    filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        # defaultdict lookup: barcodes without contigs yield an empty list.
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        # Barcodes missing from the per-barcode metric contribute zero.
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.

    chain_count = defaultdict(int)
    # .values()/.items() behave the same on Python 2 and also run on Python 3.
    for anno_dict in contig_annotations.values():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if not (contig.is_cell and contig.high_confidence and
                contig.productive):
            continue
        for anno in contig.annotations:
            chain_type = anno.feature.chain_type
            if chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                chain_count[chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    # Parenthesised form prints identically under the Python 2 print
    # statement and also parses on Python 3.
    print(chain_count)

    if chain_count:
        n_contigs = sum(chain_count.values())
        sig_chains = [
            ct for ct, count in chain_count.items()
            if tk_stats.robust_divide(count, n_contigs) >=
            MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)