예제 #1
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk consensus outputs into the final stage outputs.

    Args:
        args: Stage arguments. This function reads ``vdj_reference_path`` and
            ``clonotype_assignments`` (path to a JSON file mapping clonotype
            id to a dict that has a ``'freq'`` entry).
        outs: Stage outputs. Output paths are read from it and the index
            paths / chunked BAM lists are assigned on it.
        chunk_defs: Not used by this function.
        chunk_outs: Per-chunk output objects. When empty, every slot listed
            in ``outs.slots`` is set to ``None`` and nothing else happens.

    Returns:
        None.
    """
    if not chunk_outs:
        # No chunks ran: set all outputs to null.
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

    # Merge the per-chunk reporters and emit the combined summary.
    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

    consensus_contigs = []
    ref_contigs = []
    all_bams = []
    all_ref_bams = []

    for chunk in chunk_outs:
        # Only chunks whose consensus annotation file exists contribute.
        if not (chunk.consensus_annotations_json and
                os.path.isfile(chunk.consensus_annotations_json)):
            continue

        # Collect consensus annotations
        new_contigs = vdj_annot.load_cell_contigs_from_json(
            chunk.consensus_annotations_json,
            args.vdj_reference_path,
            group_key='clonotype')
        for cl in new_contigs:
            consensus_contigs.extend(cl.chains)

        # Collect concat_ref annotations
        # NOTE(review): this file is loaded without its own existence check;
        # presumably it is always written alongside the consensus file.
        new_ref_contigs = vdj_annot.load_cell_contigs_from_json(
            chunk.concat_ref_annotations_json,
            args.vdj_reference_path,
            group_key='clonotype')
        for cl in new_ref_contigs:
            ref_contigs.extend(cl.chains)

        all_bams.extend(chunk.chunked_consensus_bams)
        all_ref_bams.extend(chunk.chunked_concat_ref_bams)

    if consensus_contigs:
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_io.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

    # Sort contigs (and clonotypes) by frequency, most frequent first.
    # The frequency table is loaded whenever either list has something to
    # sort, so the sort key can never refer to an unbound name (previously it
    # was only bound when consensus_contigs was non-empty).
    if consensus_contigs or ref_contigs:
        with open(args.clonotype_assignments) as f:
            clonotypes = json.load(f)
        clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.items()}

        def clonotype_frequency(contig):
            return clonotype_freqs[contig.clonotype]

        consensus_contigs.sort(key=clonotype_frequency, reverse=True)
        ref_contigs.sort(key=clonotype_frequency, reverse=True)

    # The annotation outputs are always written, even when the lists are empty.
    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

    # Clear the chunk-level BAM lists on the joined outputs.
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []
예제 #2
0
def main(args, outs):
    """Assign raw clonotype ids to barcodes and write annotated contig outputs.

    Args:
        args: Stage arguments. This function reads ``cell_barcodes``,
            ``annotations``, ``vdj_reference_path``, ``use_non_productive``
            and ``use_non_full_len``.
        outs: Stage outputs. Files are written to ``clonotype_assignments``,
            ``contig_annotations``, ``contig_annotations_csv``,
            ``contig_annotations_pickle``, ``filtered_contig_annotations_csv``
            and ``summary``.

    Returns:
        None.
    """
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}

    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:

        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences, in order of first
        # appearance.
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.items()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3, then add this
                # to the clonotype tuple).
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            # The first pass treats the result of clonotype_tuple as possibly
            # falsy; do the same here so that case means "no clonotype"
            # instead of failing on iteration.
            # NOTE(review): confirm whether clonotype_tuple can return None.
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True) or ()

        # A clonotype is the sorted, de-duplicated tuple of CDR sequence ids.
        barcode_clonotype = tuple(
            sorted(set(sequences[s] for s in barcode_clonotype_tuple)))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.items()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes, clonotype_ids,
                                                 sequence_ids, barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file,
                                pretty=True)

    # Add clonotype assignments to contig annotations.
    # Drop the per-barcode grouping before loading the flat contig list.
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file,
                                       all_contigs,
                                       write_inferred=False)

    # HIGHEST_PROTOCOL is a binary pickle protocol, so the file must be
    # opened in binary mode.
    with open(outs.contig_annotations_pickle, 'wb') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations: high-confidence contigs from cells.
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = [
            contig for contig in all_contigs
            if contig.high_confidence and contig.is_cell
        ]
        vdj_annot.save_contig_list_csv(out_file,
                                       filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)