Example #1
def main(args, outs):
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(
        args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]

    # Note: this is Martian 3 incompatible; revert back to summary_chunk if merging
    # back into master (also applies to additional references to `qc_summary` in the main function)
    #
    # see https://github.com/10XDev/tenkit/commit/2c59c9a24b0e7cd81945544f62ffde7ab632ed42
    outs.qc_summary = {'barcode': [], 'read1': [], 'read2': []}
    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [
            'barcodeqc', bf.filename, output_json_path, "--whitelist",
            whitelist_path, "--bc-start-index",
            str(args.bc_start_index), "--bc-length",
            str(args.bc_length)
        ]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            martian.throw("Could not QC barcodes: return code %s" %
                          e.returncode)

        # needs to be summary_chunk in Martian 3
        outs.qc_summary['barcode'].append(output_json_path)
Example #2
    def align_reads_paired(self, in_fastq_r1_fn, in_fastq_r2_fn, out_file, write_sam=False, **kwargs):
        """
        Perform paired-end alignment of reads to reference and produce BAM as output using bowtie2.

        Args:
            in_fastq_r1_fn (str): name of fastq file with R1 to align
            in_fastq_r2_fn (str): name of fastq file with R2 to align
            out_file (str):       name of BAM/SAM file to output aligned reads
            write_sam (bool):     set to True to write SAM instead of BAM
            **kwargs: Any additional arguments to bowtie2 may be included.
                      Flags may have value set to None. Values are not
                      validated except for conflicts with index and read
                      input arguments. Parameters with hyphens in their names
                      should be defined using underscores in place of hyphens.

        Examples:
            kwargs can be specified as such: myBowtie2.align_reads_paired(f1, f2, bam, p=2, very_fast=None, N=3)
        """

        assert self.indexed

        reserved_arguments = {'x', '1', '2', 'U'}
        additional_options = cr_utils.kwargs_to_command_line_options(reserved_arguments, replace_chars={'_': '-'}, **kwargs)

        if write_sam:
            cmd = 'bowtie2 %s -x %s -1 %s -2 %s -S %s' % \
                  (additional_options, self.index_path, in_fastq_r1_fn, in_fastq_r2_fn, out_file)
        else:
            cmd = 'bowtie2 %s -x %s -1 %s -2 %s | samtools view -bS - -o %s' % \
                  (additional_options, self.index_path, in_fastq_r1_fn, in_fastq_r2_fn, out_file)

        tk_subproc.check_call(cmd, shell=True)
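# A minimal sketch (not the actual cr_utils implementation) of the
# kwargs-to-command-line conversion the docstring above describes:
# underscores become hyphens, a value of None marks a bare flag, and names
# colliding with reserved bowtie2 options are rejected.
def _kwargs_to_options_sketch(reserved, **kwargs):
    opts = []
    for key, value in sorted(kwargs.items()):
        name = key.replace('_', '-')
        if name in reserved:
            raise ValueError('conflicts with reserved option: %s' % name)
        prefix = '-' if len(name) == 1 else '--'
        if value is None:
            opts.append(prefix + name)  # bare flag, e.g. --very-fast
        else:
            opts.append('%s%s %s' % (prefix, name, value))  # e.g. -p 2
    return ' '.join(opts)

# _kwargs_to_options_sketch({'x', '1', '2', 'U'}, p=2, very_fast=None, N=3)
# returns '-N 3 -p 2 --very-fast'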
Example #3
def run_assembly(fastq_pref, fasta_pref, args):
    cmd = [
        'vdj_asm', 'asm', fastq_pref, fasta_pref,
        '--kmers=' + str(args.min_kmer_count),
        '--min-contig=' + str(args.min_contig_len),
        '--min-qual=' + str(args.min_qual),
        '--score-factor=' + str(args.score_factor),
        '--qual-factor=' + str(args.qual_factor),
        '--min-sw-score=' + str(args.min_sw_score),
        '--rt-error=' + str(args.rt_error)
    ]

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
    if cr_chem.is_paired_end(args.chemistry_def):
        cmd.append('--min-umi-reads=' + str(2 * cutoff))
    else:
        cmd.append('--min-umi-reads=' + str(cutoff))
        cmd.append('--single-end')

    if args.use_unmapped:
        cmd.append('--use-unmapped')

    #cmd.append('--mixture-filter')

    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
Example #4
def main(args, outs):

    # Write read_chunk for consumption by Rust
    with open("chunk_args.json", "w") as f:
        json.dump(args.read_chunk, f)

    output_path = martian.make_path("")
    prefix = "fastq_chunk"
    chunk_reads_args = [
        'chunk_reads', '--reads-per-fastq',
        str(args.reads_per_file), output_path, prefix, "--martian-args",
        "chunk_args.json", '--compress', 'lz4'
    ]
    print "running chunk reads: [%s]" % str(chunk_reads_args)
    tk_subproc.check_call(chunk_reads_args)

    with open(os.path.join(output_path, "read_chunks.json")) as f:
        chunk_results = json.load(f)

    outs.out_chunks = []

    # Write out a new chunk entry for each resulting chunk
    for chunk in chunk_results:
        print args.read_chunk
        chunk_copy = args.read_chunk.copy()
        print chunk_copy
        chunk_copy['read_chunks'] = chunk
        outs.out_chunks.append(chunk_copy)
Example #5
def main(args, outs):
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]
    outs.summary_chunk = {
        'barcode': [],
        'read1': [],
        'read2': []
    }
    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [ 'barcodeqc', bf.filename, output_json_path, "--whitelist", whitelist_path,
                         "--bc-start-index", str(args.bc_start_index), "--bc-length",
                         str(args.bc_length)]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            martian.throw("Could not QC barcodes: return code %s" % e.returncode)

        outs.summary_chunk['barcode'].append(output_json_path)
Example #6
def sort_bed(input_bed, output_bed, genome, threads=1, leave_key=False, has_key=False):
    """Use unix sort to properly sort a bed file, including a custom sort order on chromosomes.

    Warning!  sort does not have the --parallel argument on all forms of unix!  As such, we are
    dropping threading support for BED handling through unix sort.
    """
    main_cmds = []
    # sort_thread_args = "" if threads == 1 else " --parallel={}".format(threads)
    sort_thread_args = ""
    if not has_key:
        # If the bed file doesn't already have a contig key, we need to create the key file used to add it.
        tmpdir = os.path.dirname(output_bed)
        tmp_chroms = os.path.join(tmpdir, "chrom_order.txt")
        chroms = os.path.join(tmpdir, "sorted_order.txt")
        with open(genome, "r") as infile, open(tmp_chroms, "w") as outfile:
            for i, line in enumerate(infile):
                chrom = line.split()[0]
                outfile.write("{}\t{}\n".format(chrom, i))
        tk_subproc.check_call("sort -k1b,1 -o {} {}".format(chroms, tmp_chroms), shell=True)
        # Now we'll add the commands to join the key onto the file between the contig & start/stop positions
        main_cmds.extend(["sort -k1b,1{} {}".format(sort_thread_args, input_bed),
                          "join -t '\t' -j1 {} -".format(chroms)])
    else:
        main_cmds.append("cat {}".format(input_bed))
    # Next we sort on the contig key and start/stop positions
    main_cmds.append("sort -k2n -k3n -k4n{}".format(sort_thread_args))

    if not leave_key:
        # Finally we remove the key from the output file
        main_cmds.append("cut -f 1,3-8")

    with open(output_bed, 'w') as outfile:
        tk_subproc.check_call(" | ".join(main_cmds), shell=True, stdout=outfile)
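# A pure-Python sketch of the same ordering, for illustration: rank each
# chromosome by its position in the genome file, then sort records on
# (rank, start, stop). Assumes the BED fits in memory; the unix-sort pipeline
# above exists precisely because real inputs may not.
def sort_bed_in_memory(bed_lines, genome_chrom_order):
    rank = {chrom: i for i, chrom in enumerate(genome_chrom_order)}
    def record_key(line):
        fields = line.split('\t')
        return (rank[fields[0]], int(fields[1]), int(fields[2]))
    return sorted(bed_lines, key=record_key)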
Example #7
    def index_reference(self,
                        in_fasta_fn,
                        in_gtf_fn,
                        num_threads=1,
                        sa_sparse_d=None,
                        sa_index_n_bases=None,
                        chr_bin_n_bits=None,
                        limit_ram=None):
        if os.path.exists(self.reference_star_path):
            raise Exception('STAR reference path %s already exists' %
                            self.reference_star_path)

        os.mkdir(self.reference_star_path)

        args = [
            'STAR', '--runMode', 'genomeGenerate', '--genomeDir',
            self.reference_star_path, '--runThreadN',
            str(num_threads), '--genomeFastaFiles', in_fasta_fn,
            '--sjdbGTFfile', in_gtf_fn
        ]
        if limit_ram is not None:
            args += ['--limitGenomeGenerateRAM', str(limit_ram)]
        if sa_sparse_d is not None:
            args += ['--genomeSAsparseD', str(sa_sparse_d)]
        if sa_index_n_bases is not None:
            args += ['--genomeSAindexNbases', str(sa_index_n_bases)]
        if chr_bin_n_bits is not None:
            args += ['--genomeChrBinNbits', str(chr_bin_n_bits)]

        tk_subproc.check_call(args)
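# Usage sketch (paths and values are placeholders): the optional parameters
# above map one-to-one onto STAR genomeGenerate flags, e.g.
#   ref.index_reference('genome.fa', 'genes.gtf', num_threads=4,
#                       sa_sparse_d=2, limit_ram=32 * 1024**3)
# adds '--genomeSAsparseD 2' and '--limitGenomeGenerateRAM 34359738368'.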
Example #8
def join(args, outs, chunk_defs, chunk_outs):
    outs.output = [str(chunk_out.output) for chunk_out in chunk_outs]
    outs.chunked_reporter = None
    outs.coerce_strings()

    # Write chunk info to a temporary file for the rust code to consume
    chunk_metrics = []
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        chunk_metrics.append({
            'metrics': chunk_out.chunked_reporter,
            'library_type': chunk_def.library_type,
        })
    with open('chunk_metrics.json', 'w') as f:
        json.dump(chunk_metrics, f)

    cmd = [
        'annotate_reads',
        'join',
        'chunk_metrics.json',
        outs.summary,
        outs.barcodes_detected,
    ]
    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
    outs.num_alignments = [
        chunk_out.num_alignments for chunk_out in chunk_outs
    ]
Example #9
def run_louvain_unweighted_clustering(bin_filename, louvain_out):
    """ Run Louvain clustering on an unweighted edge-list """
    with open(louvain_out, 'w') as f:
        tk_subproc.check_call(
            [LOUVAIN_BINPATH, bin_filename, '-q', '0', '-l', '-1'],
            stdout=f)
Example #10
    def load_from_index(index_path):
        b = Bowtie2Reference()
        b.reference_fasta_path = None

        # Check existence and validity of index
        try:
            tk_subproc.check_call('bowtie2-inspect -n %s' % index_path, shell=True)
        except subprocess.CalledProcessError:
            raise ValueError('Bowtie2 index could not be found or was invalid')

        b.index_path = index_path
        b.indexed = True
        return b
Example #11
def merge_keyed_bed(input_beds, output_bed, threads=1):
    """Merge sorted bedfiles retaining their chromosome keys, dropping the key afterwards.

    Warning!  sort does not have the --parallel argument on all forms of unix!  As such, we are
    dropping threading support for BED handling through unix sort.
    """
    main_cmds = []
    # sort_thread_args = "" if threads == 1 else " --parallel={}".format(threads)
    sort_thread_args = ""
    main_cmds.append("sort -m -k2n -k3n -k4n{} {}".format(sort_thread_args, ' '.join(input_beds)))
    main_cmds.append("cut -f 1,3-8")

    with open(output_bed, 'w') as outfile:
        tk_subproc.check_call(" | ".join(main_cmds), shell=True, stdout=outfile)
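# Keyed-BED layout assumed by sort_bed and merge_keyed_bed above (inferred
# from the sort and cut invocations): field 1 = chromosome name, field 2 =
# numeric genome-order key, fields 3+ = start, stop and the remaining BED
# columns. "cut -f 1,3-8" drops the key column once the numeric sort is done.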
Example #12
def main(args, outs):
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    if args.barcode_whitelist is None:
        barcode_whitelist = 'null'
    elif not os.path.exists(args.barcode_whitelist):
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(
            args.barcode_whitelist)
    else:
        barcode_whitelist = args.barcode_whitelist

    cmd = [
        'annotate_reads',
        'main',
        args.chunk_genome_input,
        args.chunk_tags,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        args.feature_counts,
        args.library_type or lib_constants.DEFAULT_LIBRARY_TYPE,
        args.library_id,
        args.library_info_json,
        '--bam-comments',
        args.bam_comments_json,
    ]

    if cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')
    if args.skip_translate:
        cmd.append('--skip-translate')
    if args.feature_reference is not None:
        cmd.extend(['--feature-ref', args.feature_reference])

    print >> sys.stderr, 'Running', ' '.join(map(lambda x: "'%s'" % x, cmd))
    tk_subproc.check_call(cmd, cwd=os.getcwd())
    with open(outs.chunk_metadata) as f:
        metadata = json.load(f)
    outs.num_alignments = metadata['num_alignments']
Example #13
def run_read_match(read1_path, read2_path, fasta_path, out_bam_filename, strand, sw_params):
    assert strand in ('+', '-')
    cmd = ['vdj_asm', 'read-match',
           '--ref', fasta_path,
           '--r1', read1_path,
           '--outbam', out_bam_filename,
           '--seed=' + str(sw_params['seed']),
           '--min-sw-score=' + str(sw_params['min_sw_score'])]

    if strand == '-':
        cmd.append('--rev-strand')

    if read2_path:
        cmd.extend(['--r2', read2_path])

    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
Example #14
def main(args, outs):
    if not args.split_by_tile:
        return

    os.makedirs(outs.demultiplexed_fastq_path)

    demux_read_types = ("RA", "I1", "I2")  # covering the bases
    # like tenkit.fasta.find_input_fastq_files_10x_preprocess but allow Ns
    # from combined barcode list
    for read_type in demux_read_types:
        for barcode in args.bcs:
            file_glob = "read-%s_si-%s_lane-%03d[_\-]*.fastq*" % (read_type, barcode, args.lane)
            dir_glob = os.path.join(args.demultiplexed_fastq_path, "Tile*", file_glob)
            files = glob.glob(dir_glob)
            # assuming here that all files are already gzipped
            out_path = os.path.join(outs.demultiplexed_fastq_path,
                                    "read-%s_si-%s_lane-%03d-chunk-001.fastq.gz" % (read_type, barcode, args.lane))
            if files:
                subprocess_args = ["cat"] + files + [">", out_path]
                tk_proc.check_call(" ".join(subprocess_args), shell=True)
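# A sketch of the same concatenation without shell=True, in case a path ever
# contains whitespace or shell metacharacters: point the child's stdout at the
# open output file instead of building a "> out" redirection string.
import subprocess

def concatenate_files_no_shell(paths, out_path):
    with open(out_path, 'wb') as out_f:
        subprocess.check_call(['cat'] + list(paths), stdout=out_f)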
Example #15
def join(args, outs, chunk_defs, chunk_outs):
    outs.output = [str(chunk_out.output) for chunk_out in chunk_outs]
    outs.chunked_reporter = None
    outs.coerce_strings()

    with open(outs.metric_chunk_list, 'w') as f:
        for chunk_out in chunk_outs:
            f.write(chunk_out.chunked_reporter + '\n')
    cmd = [
        'annotate_reads',
        'join',
        outs.metric_chunk_list,
        outs.summary,
        outs.barcodes_detected,
    ]
    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
    outs.num_alignments = [
        chunk_out.num_alignments for chunk_out in chunk_outs
    ]
Example #16
def main(args, outs):
    args.coerce_strings()
    bam_prefix, ext = op.splitext(outs.default)

    # Sort based on the five prime position tag
    sort_args = [
        "samtools", "sort", "-t", SELF_FIVE_PRIME_POS_TAG, "-o", outs.default,
        args.chunk_input
    ]
    check_call(sort_args)

    perfect_read_count = 0
    bam = tk_bam.create_bam_infile(str(args.chunk_input))
    for read in bam:
        if crdna.read_filter.stringent_read_filter(read, True):
            perfect_read_count += 1
    outs.perfect_read_count = perfect_read_count
Example #17
def main(args, outs):
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    if args.barcode_whitelist is None:
        barcode_whitelist = 'null'
    elif not os.path.exists(args.barcode_whitelist):
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(
            args.barcode_whitelist)
    else:
        barcode_whitelist = args.barcode_whitelist

    cmd = [
        'annotate_reads',
        'main',
        args.chunk_genome_input,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        '--bam-comments',
        args.bam_comments_json,
    ]

    if cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')

    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
    with open(outs.chunk_metadata) as f:
        metadata = json.load(f)
    outs.num_alignments = metadata['num_alignments']
Example #18
    def _index_reference(self, index_path, **kwargs):
        """
        Generates a bowtie2 index for the specified reference file.

        Args:
            index_path (str): path to index prefix
            **kwargs: Any additional arguments to bowtie2-build may be included. Flags may have value set to None. Values
                are not validated. Parameters with hyphens in their names should be defined using underscores in place of hyphens.

        Notes:
            Bowtie2 generates temporary files for indexing as a side-effect.

        Examples:
            kwargs can be specified as such: myBowtie2._index_reference(index_path, large_index=None, bmax=4)

        """

        additional_arguments = cr_utils.kwargs_to_command_line_options(set(), replace_chars={'_': '-'}, **kwargs)

        command = 'bowtie2-build %s %s %s' % (additional_arguments, self.reference_fasta_path, index_path)
        tk_subproc.check_call(command, shell=True)

        self.index_path = index_path
        self.indexed = True
Example #19
def run(args):
    """ Run tk_subproc.check_call and print command """
    print ' '.join(args)
    tk_subproc.check_call(args)
Example #20
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv,
                                         umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        #  risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
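# The merge loop above is a batched fold: merge at most MERGE_BAMS_N inputs at
# a time and push each intermediate result onto the end of the work list, so
# no single merge opens too many file handles. A generic sketch of the pattern
# (merge_fn and batch_size are placeholders):
def batched_reduce(items, merge_fn, batch_size):
    items = list(items)
    while len(items) > 1:
        batch, items = items[:batch_size], items[batch_size:]
        items.append(merge_fn(batch))  # intermediate result joins later rounds
    return items[0]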
Example #21
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def, stdout=sys.stdout):
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')

    # Note: The complexity of forcing cutadapt to output a compressed file
    #       means we'll have to give up on that for now.
    cmd = ['cutadapt',
           '-e', '0.12', '--times', '3', '--overlap', '5',
           '-f', 'fastq',
           '-o', '/proc/%d/fd/%d' % (os.getpid(), out_r1_file.fileno())]

    out_r2_file = None
    if paired_end:
        out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
        cmd.extend(['-p', '/proc/%d/fd/%d' % (os.getpid(), out_r2_file.fileno())])

    primers = {anno['name']:anno['seq'] for anno in args.primers}

    if paired_end or single_read == 'R1':
        # R1 adapters
        for name in R1_ANCHORED_FIVE_PRIME_SEQS:
            if name in primers:
                cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

        for name in R1_THREE_PRIME_REV_COMP_SEQS:
            if name in primers:
                cmd.extend(['-a', '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

        for name in R1_THREE_PRIME_SEQS:
            if name in primers:
                cmd.extend(['-a', '%s=%s' % (name, primers[name])])

    if paired_end or single_read == 'R2':
        for name in R2_THREE_PRIME_REV_COMP_SEQS:
            if name in primers:
                flag = '-A' if paired_end else '-a'
                cmd.extend([flag, '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

        for name in R2_THREE_PRIME_SEQS:
            if name in primers:
                flag = '-A' if paired_end else '-a'
                cmd.extend([flag, '%s=%s' % (name, primers[name])])


    read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
    cmd.extend(['/proc/%d/fd/%d' % (os.getpid(), read1_file.fileno())])

    read2_file = None
    if paired_end:
        read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
        cmd.extend(['/proc/%d/fd/%d' % (os.getpid(), read2_file.fileno())])

    print cmd

    status = tk_subproc.check_call(cmd, stdout=stdout)

    # closing these files is important both because we need to wait on the
    # subprocess, if any, or else its rusage isn't accounted for for this
    # process, and because if we don't have a reference to the objects down
    # here, then python's garbage collector is free to finalize the objects
    # before cmd runs, which would result in a failure.
    out_r1_file.close()
    if out_r2_file:
        out_r2_file.close()
    read1_file.close()
    if read2_file:
        read2_file.close()

    return status
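# The '/proc/<pid>/fd/<n>' paths built above are a Linux-specific way to hand
# an already-open file descriptor to a child process by name, so cutadapt can
# read and write files this process opened without knowing their real paths.
# A minimal illustration of the technique:
import os

def fd_path(file_obj):
    # Path through which a subprocess can open this process's file object.
    return '/proc/%d/fd/%d' % (os.getpid(), file_obj.fileno())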
Example #22
                file_info.read,
                str(file_info.group).zfill(3)
            )
            out_path = os.path.join(output_folder, os.path.basename(out_name))
            os.rename(paths[0], out_path)

        else:
            out_file = "%s_S0%s_L%s_%s_%s.fastq.gz" % (
                file_info.prefix,
                args.output_snum,
                str(file_info.lane).zfill(3),
                file_info.read,
                str(file_info.group).zfill(3))
            out_path = os.path.join(output_folder, os.path.basename(out_file))
            subprocess_args = ["cat"] + paths + [">", out_path]
            log_subprocess.check_call(" ".join(subprocess_args), shell=True)
        out_files.append(out_path)

    # need something for non-blank chunk_outs (Martian foible)
    outs.files_merged = True
    outs.merged_file_paths = out_files


def join(args, outs, chunk_args, chunk_outs):
    if args.remove_split_fastqs:
        unique_folders = set([])
        for chunk_arg, chunk_out in zip(chunk_args, chunk_outs):
            # if no files present or no files need merging, no folders necessary to remove
            if not chunk_out.files_merged:
                continue
            for input_file in chunk_arg.input_files:
Example #23
    r1_files = [
        f for f in file_infos if args.file_read_types_map['R1'] == f.read
    ]
    for idx, r1f in enumerate(r1_files):
        output_json_path = os.path.join(out_base, "output_%d_R1.json" % idx)
        if bc_file_type == 'R1':
            start_index = args.bc_start_index
        else:
            start_index = 0
        subproc_args = [
            'q30count', r1f.filename, output_json_path, '--read-start-index',
            str(start_index)
        ]
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            martian.throw("Could not count Q30 reads on R1: return code %s" %
                          e.returncode)

        # needs to be summary_chunk in Martian 3
        outs.qc_summary['read1'].append(output_json_path)

    r2_files = [
        f for f in file_infos if args.file_read_types_map['R2'] == f.read
    ]
    for idx, r2f in enumerate(r2_files):
        output_json_path = os.path.join(out_base, "output_%d_R2.json" % idx)
        if bc_file_type == 'R2':
            start_index = args.bc_start_index
        else:
Example #24
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def):
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')

    # Note: The complexity of forcing cutadapt to output a compressed file
    #       means we'll have to give up on that for now.
    cmd = [
        'cutadapt', '-e', '0.12', '--times', '3', '--overlap', '5', '-f',
        'fastq', '-o',
        '/proc/%d/fd/%d' % (os.getpid(), out_r1_file.fileno())
    ]

    if paired_end:
        out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
        cmd.extend(
            ['-p',
             '/proc/%d/fd/%d' % (os.getpid(), out_r2_file.fileno())])

    primers = {anno['name']: anno['seq'] for anno in args.primers}

    if paired_end or single_read == 'R1':
        # R1 adapters
        for name in R1_ANCHORED_FIVE_PRIME_SEQS:
            if name in primers:
                cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

        for name in R1_THREE_PRIME_REV_COMP_SEQS:
            if name in primers:
                cmd.extend([
                    '-a',
                    '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                ])

        for name in R1_THREE_PRIME_SEQS:
            if name in primers:
                cmd.extend(['-a', '%s=%s' % (name, primers[name])])

    if paired_end or single_read == 'R2':
        for name in R2_THREE_PRIME_REV_COMP_SEQS:
            if name in primers:
                flag = '-A' if paired_end else '-a'
                cmd.extend([
                    flag,
                    '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                ])

        for name in R2_THREE_PRIME_SEQS:
            if name in primers:
                flag = '-A' if paired_end else '-a'
                cmd.extend([flag, '%s=%s' % (name, primers[name])])

    read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
    cmd.extend(['/proc/%d/fd/%d' % (os.getpid(), read1_file.fileno())])

    if paired_end:
        read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
        cmd.extend(['/proc/%d/fd/%d' % (os.getpid(), read2_file.fileno())])

    print cmd

    status = tk_subproc.check_call(cmd)
    return status
Example #25
def concatenate_and_index_fastas(out_fasta, fastas):
    cr_io.concatenate_files(out_fasta, fastas)
    tk_subproc.check_call(['samtools', 'faidx', out_fasta], cwd=os.getcwd())
Example #26
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs,
                        contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Args:
    - in_bam: bam file to get the list of reads assigned to UMIs on the selected contigs
    - clonotype_name: Used for naming output files.
    - in_fasta: FASTA with the consensus sequence; its basename determines the
        names of the intermediate FASTQs and of the output FASTQ.
    - sel_contigs: Contigs that led to the consensus sequence above
    - contig_umis: dict from contig name to list of umis assigned to that contig
    - out_dir: dir used for temporary results

    Return value:
    String with base qualities (in FASTQ format).
    """

    pref = re.sub('.fasta', '', os.path.basename(in_fasta))
    fastq1 = re.sub('.fasta', '_1.fastq', in_fasta)
    fastq2 = re.sub('.fasta', '_2.fastq', in_fasta)

    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)
            if umi in contig_umis[contig] and not read.is_secondary:
                umi_read_count[umi] += 1
                if umi_read_count[umi] >= MAX_READS_PER_UMI:
                    continue

                contig_read_count += 1
                if contig_read_count >= MAX_READS_PER_CONTIG:
                    continue

                if read.qname not in sel_reads:
                    sel_reads[read.qname] = [None, None]
                sel_reads[read.qname][read.is_read2] = read

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.iteritems():
            read1, read2 = pair[0], pair[1]

            if read1 is None:
                # Replace the UMI with <BC>_<UMI>.
                umi = read2.get_tag(PROCESSED_UMI_TAG)
            else:
                umi = read1.get_tag(PROCESSED_UMI_TAG)

            header = cr_fastq.AugmentedFastqHeader(read_name)
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            if read1 is None:
                out_seq1 = ""
                out_quals1 = ""
            else:
                out_seq1 = tk_seq.get_rev_comp(read1.seq) if read1.is_reverse else read1.seq
                out_quals1 = read1.qual[::-1] if read1.is_reverse else read1.qual
            tk_fasta.write_read_fastq(f1, header.to_string(), out_seq1,
                                      out_quals1)

            if read2 is None:
                out_seq2 = ""
                out_quals2 = ""
            else:
                out_seq2 = tk_seq.get_rev_comp(read2.seq) if read2.is_reverse else read2.seq
                out_quals2 = read2.qual[::-1] if read2.is_reverse else read2.qual
            tk_fasta.write_read_fastq(f2, header.to_string(), out_seq2,
                                      out_quals2)

    assert (len(sel_reads) > 0)

    cmd = ['vdj_asm', 'base-quals', re.sub('.fasta', '', in_fasta), out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

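    # lines[3] is the quality string of the first FASTQ record (FASTQ layout:
    # line 1 @name, line 2 sequence, line 3 '+', line 4 qualities); this
    # assumes vdj_asm wrote a single-record FASTQ for this consensus.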
    return lines[3].strip()
Example #27
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name, out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence, or None if no consensus could be
        built (eg. too few contigs or a failed assembly).
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name,
                                                  'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq,
                                          quals)

    out_bam.close()
    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'),
                  'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'),
               out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
Example #28
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version,
                                       mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change the
    output in cases where there are multiple entries with the same transcript id
    and the same feature type (eg. V-region).
    """

    transcripts = collections.defaultdict(list)

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set([line.strip() for line in f.readlines()])
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print 'Copying genome reference sequence...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print '...done.\n'

    print 'Indexing genome reference sequence...'
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print '...done.\n'

    print 'Loading genome reference sequence...'
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print '...done.\n'

    print 'Computing hash of genome FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
    print '...done.\n'

    for gtf in gtf_paths:
        print 'Reading GTF {}'.format(gtf)

        for line_no, entry in enumerate(get_gtf_iter(open(gtf))):
            if entry.feature not in (ENSEMBL_FIVE_PRIME_UTR_FEATURE,
                                     ENSEMBL_CDS_FEATURE):
                continue
            entry = parse_attributes(entry)
            transcript_id = entry.attributes.get('transcript_id')
            transcript_biotype = entry.attributes.get('transcript_biotype')
            gene_biotype = entry.attributes.get('gene_biotype')
            gene_name = entry.attributes.get('gene_name')

            # Skip irrelevant biotypes
            if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                continue

            # Skip blacklisted gene names
            if transcript_id in rm_transcripts:
                continue

            # Warn and skip if transcript_id missing
            if transcript_id is None:
                print 'Warning: Entry on row %d has no transcript_id' % line_no
                continue

            # Warn and skip if gene_name missing
            if gene_name is None:
                print 'Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                    transcript_id, line_no, transcript_biotype)
                continue

            # Infer region type from biotype
            if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, transcript_biotype)
            else:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, gene_biotype)

            # Warn and skip if region type could not be inferred
            if vdj_feature is None:
                print 'Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                    transcript_id, transcript_biotype)
                continue

            # Features that share a transcript_id and feature type are presumably exons
            # so keep them together.
            transcripts[(transcript_id, vdj_feature)].append(entry)

        print '...done.\n'

    print 'Computing hash of genes GTF files...'
    digest = hashlib.sha1()
    # concatenate all the hashes into a string and then hash that string
    digest.update(''.join(
        cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths))
    gtf_hash = digest.hexdigest()
    print '...done.\n'

    print 'Fetching sequences...'
    out_fasta = open(get_vdj_reference_fasta(reference_path), 'w')

    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(list(set([r.chrom for r in regions])))
            print 'Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print 'Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(
            regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print 'Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom)
            continue

        # Build sequence
        regions.sort(key=lambda r: r.start)
        seq = ''
        for region in regions:
            # GTF coordinates are 1-based
            start, end = int(region.start) - 1, int(region.end)
            seq += genome_fasta.fetch(chrom, start, end)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print 'Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (ens_gene_name, transcript_id, region_type))
            seq = seq.strip('N')

        if len(seq) == 0:
            print 'Warning: Feature %s is all Ns. Skipping.' % str(
                (ens_gene_name, transcript_id, region_type))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' %
                             region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' %
                             gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' %
                             record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print ('Warning: Could not infer chain type for: %s. ' + \
                'Expected the first two characters of the gene name to be in %s. Feature skipped.') % \
                (str((gene_name, record_id, region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id)
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    print '...done.\n'

    print 'Deleting copy of genome fasta...'
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY:
        reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY:
        fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY:
        gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY:
        os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY:
        ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY:
        ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY:
        mkref_version,
        cr_constants.REFERENCE_TYPE_KEY:
        vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print '...done.\n'
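# The GTF hash above hashes the concatenation of per-file hashes, so it
# depends on both file contents and the order in which the GTFs were supplied
# (consistent with the docstring's note that input order matters). A
# standalone sketch of the scheme; compute_hash_of_file is approximated by an
# in-memory sha1, where a real implementation would stream in chunks:
import hashlib

def hash_of_hashes(paths):
    digest = hashlib.sha1()
    for path in paths:
        with open(path, 'rb') as f:
            file_hash = hashlib.sha1(f.read()).hexdigest()
        digest.update(file_hash.encode('ascii'))
    return digest.hexdigest()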
Example #29
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and \
                    best_contig.umi_count < contig.umi_count):

                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = [c.contig_name
                              for c in sorted(sel_contigs,
                                              key=lambda c: c.umi_count,
                                              reverse=True)]
                contig_ids = contig_ids[:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = \
                    ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)