def join(args, outs, chunk_defs, chunk_outs):
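    # Concatenate the per-chunk cell barcode CSVs into a single output file.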
    barcodes_csv = [chunk_out.cell_barcodes for chunk_out in chunk_outs]
    cr_utils.concatenate_files(outs.cell_barcodes, barcodes_csv)
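    # Map each GEM group to the metrics reported by its chunk.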
    outs.gem_group_metrics = {
        cd.gem_group: co.gem_group_metrics
        for (cd, co) in zip(chunk_defs, chunk_outs)
    }
def concatenate_and_index_fastas(out_fasta, fastas):
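    # Concatenate chunk FASTAs into one file and index it with samtools faidx.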
    cr_utils.concatenate_files(out_fasta, fastas)
    tk_subproc.check_call(['samtools', 'faidx', out_fasta], cwd=os.getcwd())
def join(args, outs, chunk_defs, chunk_outs):
    if len(chunk_outs) == 0:
        # Set all outputs to null
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

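    # Merge the per-chunk reporters and write the combined metrics summary JSON.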
    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

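    # Accumulate consensus and concatenated-reference contig annotations and BAMs across chunks.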
    consensus_contigs = []
    ref_contigs = []
    all_bams = []
    all_ref_bams = []

    for chunk in chunk_outs:
        if chunk.consensus_annotations_json and os.path.isfile(chunk.consensus_annotations_json):
            # Collect consensus annotations
            new_contigs = vdj_annot.load_cell_contigs_from_json(chunk.consensus_annotations_json,
                                                                args.vdj_reference_path,
                                                                group_key='clonotype')
            for cl in new_contigs:
                consensus_contigs.extend(cl.chains)

            # Collect concat_ref annotations
            new_ref_contigs = vdj_annot.load_cell_contigs_from_json(chunk.concat_ref_annotations_json,
                                                                    args.vdj_reference_path,
                                                                    group_key='clonotype')
            for cl in new_ref_contigs:
                ref_contigs.extend(cl.chains)

            all_bams.extend(chunk.chunked_consensus_bams)
            all_ref_bams.extend(chunk.chunked_concat_ref_bams)

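    # Consensus outputs are only written when at least one chunk produced consensus contigs.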
    if consensus_contigs:
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_utils.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

        # Sort contigs (and clonotypes) by frequency.
        with open(args.clonotype_assignments) as f:
            clonotypes = json.load(f)
        clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.items()}

    consensus_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)
    ref_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

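    # Clear the per-chunk BAM lists in the outputs; their contents were merged above.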
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []
Example #4
def join(args, outs, chunk_defs, chunk_outs):
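    # Merge per-chunk contig assembly outputs (FASTA, FASTQ, BAM, summary files),
    # writing empty outputs when there were no input reads.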
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

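    # Collect outputs from chunks that produced a contig FASTA; chunks without one are skipped.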
    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

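    # Index the combined contig FASTA only when it is non-empty.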
    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.umi_summary_tsv,
                                            umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        #  risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_utils.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_utils.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_utils.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
Example #5
def join(args, outs, chunk_defs, chunk_outs):
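    # Merge per-chunk contig outputs; summary TSVs are combined via pandas DataFrames
    # and all chunk BAMs are merged in a single pass.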
    contigs = []
    contig_fastqs = []
    contig_bams = []

    summary_df_parts = []
    umi_summary_df_parts = []

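    # Collect per-chunk contig outputs and load the summary TSVs into typed DataFrames.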
    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_df_parts.append(
            pd.read_csv(chunk_out.summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'component': int,
                            'num_reads': int,
                            'num_pairs': int,
                            'num_umis': int
                        }))

        umi_summary_df_parts.append(
            pd.read_csv(chunk_out.umi_summary_tsv,
                        header=0,
                        index_col=None,
                        sep='\t',
                        dtype={
                            'umi_id': int,
                            'reads': int,
                            'min_umi_reads': int,
                            'contigs': str
                        }))

    # Guard against empty part lists so pd.concat is never called with nothing to merge.
    summary_df = pd.concat(summary_df_parts, ignore_index=True) if summary_df_parts else None
    umi_summary_df = pd.concat(umi_summary_df_parts, ignore_index=True) if umi_summary_df_parts else None

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        subprocess.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_df is not None:
        summary_df.to_csv(outs.summary_tsv, header=True, index=False, sep='\t')
    if umi_summary_df is not None:
        umi_summary_df.to_csv(outs.umi_summary_tsv,
                              header=True,
                              index=False,
                              sep='\t')

    if contig_bams:
        tk_bam.merge(outs.contig_bam, contig_bams, threads=args.__threads)
        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)