Example #1
0
def main(args, outs):
    """Distribute one chunk of paired FASTQ reads into sorted bucket files.

    Both read files of the chunk are loaded fully into memory. Read pairs
    whose barcode is None are dropped; every other pair is appended to the
    bucket chosen by get_bucket_name. Each bucket is then sorted with
    vdj_utils.fastq_barcode_sort_key and written to its own FASTQ file.

    Args:
        args: stage arguments; reads chunks_per_gem_group, read1s_chunk and
            read2s_chunk. chunks_per_gem_group is replaced in place by a
            copy with integer keys.
        outs: stage outputs; outs.buckets is set to a dict mapping each
            bucket name to the path of that bucket's FASTQ file.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = list(tk_fasta.read_generator_fastq(f1))

    with open(args.read2s_chunk) as f2:
        read2s = list(tk_fasta.read_generator_fastq(f2))

    assert len(read1s) == len(read2s)

    bucket_paths = {}
    buckets = {}

    # Register every bucket up front so that each one gets an output file,
    # even when no read ends up in it.
    for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        bucket_paths[bucket_name] = martian.make_path(
            "%s.fastq" % bucket_name)
        buckets[bucket_name] = []

    outs.buckets = bucket_paths

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        # R1 and R2 are stored as two consecutive records of the bucket
        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket. A file is only open while its own bucket
    # is being written, so handles are released even if a write fails and
    # the number of simultaneously open files does not grow with the
    # number of buckets.
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        with open(bucket_paths[bucket_name], 'w') as fastq_out:
            for read in bucket:
                tk_fasta.write_read_fastq(fastq_out, *read)
Example #2
0
def split(args):
    """Build the chunk definitions for this stage.

    When the chemistry has a barcode whitelist, one chunk is produced per
    (read1, read2, barcodes-json) triple, carrying the barcode list loaded
    from the JSON file. Otherwise all read files are concatenated into a
    single pair of FASTQ files that forms one chunk.

    Args:
        args: stage arguments; reads read1s, read2s, chunk_barcodes and
            chemistry_def.

    Returns:
        A dict with a 'chunks' list, or the result of get_dummy_chunk()
        when no chunk was produced.
    """
    assert args.read1s is not None and args.read2s is not None

    chunk_defs = []
    barcoded = cr_chem.get_barcode_whitelist(args.chemistry_def) is not None

    if not barcoded:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        merged_r1_path = martian.make_path('chunk0_1.fastq')
        merged_r2_path = martian.make_path('chunk0_2.fastq')

        with open(merged_r1_path, 'w') as r1_sink:
            with open(merged_r2_path, 'w') as r2_sink:
                for r1_path, r2_path in zip(args.read1s, args.read2s):
                    with open(r1_path) as r1_src:
                        with open(r2_path) as r2_src:
                            r1_records = tk_fasta.read_generator_fastq(
                                r1_src, paired_end=False)
                            r2_records = tk_fasta.read_generator_fastq(
                                r2_src, paired_end=False)

                            # R1 drives the loop; one R2 record is pulled
                            # for every R1 record.
                            for r1_record in r1_records:
                                r2_record = next(r2_records)
                                tk_fasta.write_read_fastq(r1_sink, *r1_record)
                                tk_fasta.write_read_fastq(r2_sink, *r2_record)

        chunk_defs.append({
            'read1_chunk': merged_r1_path,
            'read2_chunk': merged_r2_path,
            'barcodes_chunk': [""],
        })

    else:
        # Data are barcoded
        triples = zip(args.read1s, args.read2s, args.chunk_barcodes)
        for r1_path, r2_path, barcodes_path in triples:
            with open(barcodes_path) as handle:
                barcode_list = json.load(handle)

            chunk_defs.append({
                'read1_chunk': r1_path,
                'read2_chunk': r2_path,
                'barcodes_chunk': barcode_list,
                '__mem_gb': 3.0,
            })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if not chunk_defs:
        return get_dummy_chunk()

    return {'chunks': chunk_defs}
Example #3
0
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    """Heap-merge several FASTQ files into one output ordered by sort key.

    The smallest pending record (by vdj_utils.fastq_barcode_sort_key) is
    written out and replaced by the next record from the same input file,
    until every input is exhausted. Presumably each input is already sorted
    by that key -- the merge itself does not check this.

    Args:
        in_filenames: paths of the input FASTQ files.
        r1_out_file: open handle receiving fields 0-2 of every record.
        r2_out_file: open handle receiving fields 3-5 of every record; only
            written when paired_end is true.
        bcs_out_file: open handle receiving a JSON list of the first key
            element (the barcode) of every record seen.
        paired_end: forwarded to tk_fasta.read_generator_fastq.
    """
    # Note: The filehandle cache precludes the use of compressed files
    handle_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    sort_key = vdj_utils.fastq_barcode_sort_key
    seen_barcodes = set()
    pending = []

    def reader_for(path):
        # A fresh generator is built over whatever handle the cache
        # currently returns for this path.
        return tk_fasta.read_generator_fastq(handle_cache.get(path),
                                             paired_end=paired_end)

    def enqueue_next(path, reader=None):
        # Pull one record for `path` and push it on the heap; an exhausted
        # input simply contributes nothing more.
        try:
            if reader is None:
                reader = reader_for(path)
            record = next(reader)

            record_key = sort_key(record[0:3])
            seen_barcodes.add(record_key[0])

            heapq.heappush(pending, (record_key, record, path))
        except StopIteration:
            pass

    # Seed the heap with the first record of every input
    for path in in_filenames:
        enqueue_next(path)

    while pending:
        # Get the minimum item and write it.
        _, record, source_path = heapq.heappop(pending)

        source_reader = reader_for(source_path)

        tk_fasta.write_read_fastq(r1_out_file, *record[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *record[3:6])

        # Get the next item from the source file we just wrote from
        # If that file is out of items, then we leave that one out
        enqueue_next(source_path, source_reader)

    json.dump(tk_safe_json.json_sanitize(list(seen_barcodes)), bcs_out_file)
Example #4
0
 def write_data(self, data):
     """Write one record to self.curr_file as FASTQ.

     `data` is unpacked into the positional arguments that follow the file
     handle in tk_fasta.write_read_fastq.
     """
     out_handle = self.curr_file
     tk_fasta.write_read_fastq(out_handle, *data)
Example #5
0
def main(args, outs):
    """Trim, subsample and re-chunk the reads of one input FASTQ chunk.

    Reads come from chunk['read1'] (optionally interleaved) and
    chunk['read2']. Barcode reads come either from a separate barcode FASTQ
    or from the first bc_length bases of R1/R2; sample-index reads come
    from an optional FASTQ. Output is written as numbered sub-chunk files
    holding at most args.max_read_num reads each, with both mates of a pair
    written to the same read file. When a barcode whitelist is given,
    per-barcode counts are tallied and dumped to outs.bc_counts.

    Args:
        args: stage arguments; reads chunk, read1_trim_length,
            read2_trim_length, max_read_num and barcode_whitelist.
        outs: stage outputs; outs.placeholder locates the output directory,
            outs.chunks receives one chunk dict per sub-chunk file and
            outs.bc_counts is set to None when no counts are produced.
    """

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length

    subsample_rate = chunk['subsample_rate']

    # BC config -- BC come from separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have bc in read, use that & ignore a separate BC read
    if chunk.get('bc_in_read') is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        # The barcode bases are removed from the read together with the trim
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            # NOTE(review): this path reads name2/seq2/qual2 below, which
            # only exist for paired input -- confirm callers guarantee that.
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit(
                "bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True

    have_sample_index = chunk['sample_index'] is not None

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    def out_path(prefix, number):
        """Return the path of sub-chunk file `number` for a name prefix."""
        return output_directory + "/" + prefix + str(number) + ".fastq"

    # Every input handle is recorded so it can be closed after the read loop
    # (assumes the objects returned by openfq expose close() -- TODO confirm).
    input_handles = []

    def open_input(path):
        """Open an input FASTQ via openfq and remember the handle."""
        handle = openfq(path)
        input_handles.append(handle)
        return handle

    # counter for sub-chunked files
    file_number = 1

    # open the available read files and make the appropriate iterators
    if interleaved:
        read_in = open_input(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    elif have_read2:
        read1_in = open_input(chunk['read1'])
        read1_iter = tk_fasta.read_generator_fastq(read1_in)

        read2_in = open_input(chunk['read2'])
        read2_iter = tk_fasta.read_generator_fastq(read2_in)

        # Present separate R1/R2 files as the same 6-tuples an interleaved
        # file yields.
        read_iter = itertools.imap(
            lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter,
            read2_iter)
    else:
        read1_in = open_input(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read1_in)

    # open read file
    read_name = out_path("read", file_number)
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # Bail out if there's no barcodes or whitelist
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(
            tk_seq.load_barcode_whitelist(args.barcode_whitelist))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = out_path("BC", file_number)
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]
        if bc_in_fastq:
            bc_in = open_input(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = out_path("SI", file_number)
        out_si_fastq = open(si_name, 'w')
        si_in = open_input(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # loop through reads
    read_num = 0
    for read, barcode_read, sample_index_read in itertools.izip(
            read_iter, bc_iter, si_iter):
        # The first read is never subsampled away (read_num is still 0)
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        new_seq1 = seq1[read1_trim:]
        new_qual1 = qual1[read1_trim:]
        if paired:
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]

        # Get BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])

        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])

        read_num += 1
        if read_num > max_read_num:
            # Current sub-chunk is full: roll every output over to the next
            # numbered file.
            read_num = 1
            file_number += 1
            read_name = out_path("read", file_number)
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = out_path("BC", file_number)
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_sample_index:
                si_name = out_path("SI", file_number)
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                # Qualities must be reversed along with the sequence
                barcode_qual = barcode_qual[::-1]
            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2,
                                      new_qual2)

    # Reading is finished; release the input files.
    for handle in input_handles:
        handle.close()

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)
    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()

    # Describe every sub-chunk written. The barcode has already been
    # reverse-complemented when requested, hence the hard-coded False, and
    # both mates live in the read1 file, hence read2 is None.
    chunks = []
    for (r, bc, si) in zip(read_names, bc_names, si_names):
        new_chunk = {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        chunks.append(new_chunk)

    outs.chunks = chunks
Example #6
0
def main(args, outs):
    """Mark low-confidence contigs and write the filtered contig outputs.

    Contigs are grouped by (barcode, single chain). Within a group, ordered
    by productive / UMI count / read count / length, a contig is marked low
    confidence when its CDR3 differs from the group's first CDR3 and it has
    a single UMI or too few UMIs relative to the first contig, or when its
    CDR3 was already seen in the group.

    Args:
        args: stage arguments; reads contig_annotations,
            vdj_reference_path, contig_fasta and contig_fastq.
        outs: stage outputs; writes contig_annotations,
            filtered_contig_fasta, filtered_contig_fastq and summary.
    """
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as annot_file:
        contigs = vdj_annot.load_contig_list_from_json(
            annot_file, args.vdj_reference_path)

    def group_key(c):
        return (c.barcode, c.get_single_chain())

    def rank_key(c):
        # Same grouping prefix as group_key, then best contig first
        return (c.barcode, c.get_single_chain(), not c.productive,
                -c.umi_count, -c.read_count, -len(c))

    contigs.sort(key=rank_key)

    rejected_names = set()
    cell_names = set()
    metric_name = 'vdj_high_conf_prod_contig_frac'

    for (_bc, chain), members in itertools.groupby(contigs, key=group_key):
        top_cdr3 = None
        top_cdr3_umis = None
        cdr3s_so_far = set()

        for contig in members:
            contig.high_confidence = True

            if contig.is_cell:
                cell_names.add(contig.contig_name)

            if top_cdr3 is None:
                top_cdr3 = contig.cdr3_seq
                top_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length) contig's CDR3
            #    with a single UMI or low UMIs relative to the first contig, or
            is_extraneous = False
            if top_cdr3 is not None and contig.cdr3_seq != top_cdr3:
                is_extraneous = (
                    contig.umi_count == 1
                    or (float(contig.umi_count) / top_cdr3_umis) <
                    EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            is_repeat = contig.cdr3_seq in cdr3s_so_far

            if is_extraneous or is_repeat:
                contig.high_confidence = False
                rejected_names.add(contig.contig_name)

            cdr3s_so_far.add(contig.cdr3_seq)

            # Tally per chain (known chains only) and across all chains
            if chain in vdj_constants.VDJ_GENES:
                per_chain = reporter._get_metric_attr(metric_name, chain)
                per_chain.add(1, filter=contig.high_confidence)
            overall = reporter._get_metric_attr(
                metric_name, cr_constants.MULTI_REFS_PREFIX)
            overall.add(1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as annot_out:
        vdj_annot.save_annotation_list_json(annot_out, contigs)

    def is_kept(record_name):
        # Keep contigs that are high confidence & in cells
        return record_name not in rejected_names and record_name in cell_names

    # Write filtered fasta
    with open(args.contig_fasta) as fasta_in:
        with open(outs.filtered_contig_fasta, 'w') as fasta_out:
            for hdr, seq in cr_utils.get_fasta_iter(fasta_in):
                if is_kept(hdr):
                    tk_fasta.write_read_fasta(fasta_out, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as fastq_in:
        with open(outs.filtered_contig_fastq, 'w') as fastq_out:
            for name, seq, qual in tk_fasta.read_generator_fastq(fastq_in):
                if is_kept(name):
                    tk_fasta.write_read_fastq(fastq_out, name, seq, qual)

    reporter.report_summary_json(outs.summary)
Example #7
0
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs,
                        contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Selects reads from in_bam that support the given contigs, writes them
    as <prefix>_1.fastq / <prefix>_2.fastq next to in_fasta, runs
    `vdj_asm base-quals` on that prefix and reads the qualities back from
    the FASTQ it leaves in out_dir.

    Args:
    - in_bam: bam file to get the list of reads assigned to UMIs on the selected contigs
    - clonotype_name: Not used by this function.
    - in_fasta: Path of the FASTA with the sequence; the path minus its
        trailing '.fasta' is the prefix of the read FASTQs and of the
        vdj_asm call.
    - sel_contigs: Contigs that led to the consensus sequence above
    - contig_umis: from contig name to list of umis assigned to that contig
    - out_dir: Output directory handed to vdj_asm; <basename prefix>.fastq
        is read from it.

    Return value:
    String with base qualities (in FASTQ format): the stripped fourth line
    of the FASTQ produced by vdj_asm.
    """

    # Strip only a literal trailing '.fasta'. (A regex substitution of
    # '.fasta' would treat the dot as a wildcard and also rewrite matches
    # elsewhere in the path, e.g. inside a directory name.)
    suffix = '.fasta'
    if in_fasta.endswith(suffix):
        fasta_prefix = in_fasta[:-len(suffix)]
    else:
        fasta_prefix = in_fasta

    pref = os.path.basename(fasta_prefix)
    fastq1 = fasta_prefix + '_1.fastq'
    fastq2 = fasta_prefix + '_2.fastq'

    # read name -> [read1, read2]; a slot stays None when that mate was
    # not selected.
    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        # NOTE(review): the barcode of the LAST contig is the one used for
        # every header written below -- confirm that all selected contigs
        # are meant to share one barcode tag.
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)
            if umi in contig_umis[contig] and not read.is_secondary:
                # Cap the reads taken per UMI and per contig
                umi_read_count[umi] += 1
                if umi_read_count[umi] >= MAX_READS_PER_UMI:
                    continue

                contig_read_count += 1
                if contig_read_count >= MAX_READS_PER_CONTIG:
                    continue

                if read.qname not in sel_reads:
                    sel_reads[read.qname] = [None, None]
                # is_read2 (False/True) indexes slot 0 or 1
                sel_reads[read.qname][read.is_read2] = read

    def oriented(read):
        """Return (seq, quals) in sequencing orientation; empty if no read."""
        if read is None:
            return "", ""
        if read.is_reverse:
            return tk_seq.get_rev_comp(read.seq), read.qual[::-1]
        return read.seq, read.qual

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.iteritems():
            read1, read2 = pair[0], pair[1]

            # Replace the UMI with <BC>_<UMI>.
            tagged_read = read2 if read1 is None else read1
            umi = tagged_read.get_tag(PROCESSED_UMI_TAG)

            header = cr_fastq.AugmentedFastqHeader(read_name)
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            # Both files get a record for every read name so they stay in
            # step; a missing mate is written with empty sequence/quals.
            out_seq1, out_quals1 = oriented(read1)
            tk_fasta.write_read_fastq(f1, header.to_string(), out_seq1,
                                      out_quals1)

            out_seq2, out_quals2 = oriented(read2)
            tk_fasta.write_read_fastq(f2, header.to_string(), out_seq2,
                                      out_quals2)

    assert (len(sel_reads) > 0)

    cmd = ['vdj_asm', 'base-quals', fasta_prefix, out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

    # Line 4 of a FASTQ record is its quality string
    return lines[3].strip()
Example #8
0
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args; args.contigs_fastq is read.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name,
    out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence or None if no consensus could be
        built (fewer than MIN_CONTIGS_FOR_CONSENSUS contigs, or the assembler
        produced no sequence).
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with the consensus sequence, or with the best
        contig's sequence when there is no consensus.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # Membership is tested once per FASTQ record, so use a set
    selected_names = set(sel_contigs)

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    try:
        with open(args.contigs_fastq, 'r') as f, open(out_fastq_name,
                                                      'w') as out_fq:
            fq_iter = tk_fasta.read_generator_fastq(f)
            for (name, seq, quals) in fq_iter:
                if name not in selected_names:
                    continue

                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq,
                                          quals)
    finally:
        # Close the BAM even if copying the contigs failed
        out_bam.close()

    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    out_seq = None
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'),
                  'r') as contig_f:
            lines = contig_f.readlines()

        # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
        # The sequence is the second line, so a file with fewer than two
        # lines carries no consensus either.
        if len(lines) >= 2:
            out_seq = lines[1].strip()

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'),
               out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
Example #9
0
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    #   get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and \
                    best_contig.umi_count < contig.umi_count):

                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(
                    lambda c: c.contig_name,
                    sorted(sel_contigs,
                           key=lambda c: c.umi_count,
                           reverse=True))
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1,
                                 filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert (not contig.cdr3_seq is None and contig.cdr3_seq == cdr)

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].
                                                      annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id
                                      for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end'
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([
                consensus_id + '_contigs.fasta',
                consensus_id + '_contigs.fastq'
            ])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(
                    outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Example #10
0
def main(args, outs):
    """Tag the reads of one chunk with their corrected barcode and write them to per-bucket FASTQs.

    Reads R1 (and R2 when ``args.read2s_chunk`` is set) in lockstep with the
    corrected-barcode file ``args.bcs`` (one barcode per line). Reads with an
    empty barcode, or with a sequence shorter than ``MIN_READ_LENGTH``, are
    dropped. Each remaining read gets the barcode attached to its header and
    is appended to the bucket chosen by ``get_bucket_name``. Every non-empty
    bucket is sorted with ``vdj_utils.fastq_barcode_sort_key`` and written to
    its own file; ``outs.buckets`` maps bucket name -> file path for the
    buckets that were written.

    NOTE(review): the inputs are zipped with ``izip_longest``, so if the read
    and barcode files differ in length a ``None`` read/barcode reaches the
    loop body - confirm upstream guarantees equal lengths.
    """
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    buckets = {}
    bucket_filenames = {}

    # Track every opened input so all of them get closed, even on error
    in_files = []
    try:
        # Lazy load R1
        r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
        in_files.append(r1_file)
        read1s = tk_fasta.read_generator_fastq(r1_file)

        # Lazy load R2
        if paired_end:
            r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
            in_files.append(r2_file)
            read2s = tk_fasta.read_generator_fastq(r2_file)
        else:
            # izip_longest pads the missing R2 with None
            read2s = []

        # Lazy load corrected BCs
        bc_file = cr_io.open_maybe_gzip(args.bcs)
        in_files.append(bc_file)
        bcs = (line.strip() for line in bc_file)

        for _, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
            filename = martian.make_path("%s.fastq" % bucket_name)
            bucket_filenames[bucket_name] = filename
            buckets[bucket_name] = []

        for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
            # Exclude unbarcoded reads
            if barcode == '':
                continue

            # Exclude short reads
            if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
                continue

            # Attach processed barcode to reads
            r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
            r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r1_new_qname = r1_hdr.to_string()

            if paired_end:
                r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
                r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
                r2_new_qname = r2_hdr.to_string()

            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

            # R1 and R2 of a pair are stored consecutively in the same bucket
            buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
            if paired_end:
                buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))
    finally:
        for in_file in in_files:
            in_file.close()

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = filename
def main(args, outs):
    """Adapter-trim the reads of one chunk and re-chunk them into interleaved FASTQ files.

    Step 1 runs ``run_cutadapt_single_end`` on the input reads (once when
    ``chunk['read2']`` is None, otherwise once per mate) and records the
    initial and trimmed read counts.

    Step 2 walks the trimmed reads together with the trim info, optionally
    subsamples them at ``chunk['subsample_rate']`` (fixed seed), finds the
    matching barcode / sample-index read for each kept read, and writes
    everything to numbered output files holding at most ``args.max_read_num``
    reads each. When a barcode whitelist is given, per-barcode counts are
    written to ``outs.bc_counts``.

    Outputs: ``outs.read_counts`` (JSON), ``outs.bc_counts`` (JSON, or None),
    and ``outs.chunks`` (one dict per output file set).
    """
    chunk = args.chunk
    subsample_rate = chunk['subsample_rate']
    have_barcode = chunk['barcode'] is not None
    have_sample_index = chunk['sample_index'] is not None

    # STEP 1:  We run the R1/R2 reads through cutadapt, writing them to a temporary file with appropriate adapters
    # trimmed, optionally filtering out reads where adapters weren't found
    interleaved = chunk['read2'] is None
    # can't do discard_untrimmed because we're running cutadapt in single-end mode
    if args.trim_def['discard_untrimmed']:
        martian.exit("discard_untrimmed was set in trim_def")
    if interleaved:
        trimmed_reads = martian.make_path("trimmed_reads.fastq")
        trim_info_fn = martian.make_path("trim_info.txt")
        initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end(
            chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def,
            args.adapters)
    else:
        trimmed_r1 = martian.make_path("trimmed_r1.fastq")
        trimmed_r2 = martian.make_path("trimmed_r2.fastq")
        trim_info_r1_fn = martian.make_path("trim_info_r1.txt")
        trim_info_r2_fn = martian.make_path("trim_info_r2.txt")
        initial1, trimmed1 = run_cutadapt_single_end(chunk['read1'],
                                                     trimmed_r1,
                                                     trim_info_r1_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R1")
        initial2, trimmed2 = run_cutadapt_single_end(chunk['read2'],
                                                     trimmed_r2,
                                                     trim_info_r2_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R2")
        # NOTE(review): the per-mate counts are summed here; presumably
        # run_cutadapt_single_end reports half the number of reads it saw -
        # confirm against its implementation.
        initial_read_pairs = initial1 + initial2
        trimmed_read_pairs = trimmed1 + trimmed2
        if initial1 != initial2:
            martian.exit(
                "Input fastq files for R1 and R2 are not the same length")
        if trimmed1 != trimmed2:
            raise ValueError(
                "Cutadapt produced differing numbers of reads for R1 and R2")

    # STEP 2:  We run through the trimmed R1/R2 reads along with sample index and barcode reads, chunking into files of
    # max_read_num reads or less, and skipping sample index/barcode reads that don't match the trimmed & filtered R1/R2
    # reads
    max_read_num = args.max_read_num
    file_number = 1

    # open the available input read files and get the iterator over them
    if interleaved:
        reads_in = open_maybe_gzip(trimmed_reads, 'r')
        read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True)
        trim_info = open_maybe_gzip(trim_info_fn, 'r')
        trim_iter = read_generator_trim_info(trim_info, paired_end=True)
    else:
        r1_in = open_maybe_gzip(trimmed_r1, 'r')
        r2_in = open_maybe_gzip(trimmed_r2, 'r')
        # Stitch the two mates into the same 6-tuple shape that the
        # interleaved reader is unpacked into below.
        read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2])
                     for r1, r2 in itertools.izip_longest(
                         tk_fasta.read_generator_fastq(r1_in),
                         tk_fasta.read_generator_fastq(r2_in)))
        trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r')
        trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r')
        trim_iter = (t1 + t2 for t1, t2 in itertools.izip(
            read_generator_trim_info(trim_info_r1),
            read_generator_trim_info(trim_info_r2)))

    # open output read file, which will be interleaved
    read_name = martian.make_path("read{}.fastq".format(file_number))
    out_readfiles = [read_name]
    out_read_fastq = open(read_name, 'w')

    # open trimmed read file, which will be interleaved
    trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number))
    out_trimfiles = [trim_out_name]
    out_trim_fastq = open(trim_out_name, 'w')

    if args.barcode_whitelist is None:
        outs.bc_counts = None
        barcode_indices = None
    else:
        # Sorted so that index -> barcode is deterministic
        barcode_whitelist = sorted(
            list(load_barcode_whitelist(args.barcode_whitelist)))
        barcode_indices = {
            bc: idx
            for (idx, bc) in enumerate(barcode_whitelist)
        }
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = martian.make_path("BC{}.fastq".format(file_number))
        out_bc_fastq = open(bc_name, 'w')
        out_barcodefiles = [bc_name]
        barcode_read = None
        bc_in = open_maybe_gzip(chunk['barcode'], 'r')
        bc_iter = tk_fasta.read_generator_fastq(bc_in)
        # Determine if barcode sequences need to be reverse complemented.
        # A second, independent handle is used so bc_iter is not advanced.
        with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2:
            bc_iter2 = tk_fasta.read_generator_fastq(bc_in2)
            rc_whitelist = load_barcode_whitelist(args.barcode_whitelist)
            barcode_rc = infer_barcode_reverse_complement(
                rc_whitelist, bc_iter2)
    else:
        out_barcodefiles = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = martian.make_path("SI{}.fastq".format(file_number))
        out_si_fastq = open(si_name, 'w')
        si_in = open_maybe_gzip(chunk['sample_index'], 'r')
        sample_index_read = None
        si_iter = tk_fasta.read_generator_fastq(si_in)
        out_sampleindex_files = [si_name]
    else:
        out_sampleindex_files = [None]

    read_num = 0
    random.seed(0)
    for (read, trim) in itertools.izip(read_iter, trim_iter):
        # Downsample (other than the first read).  Note we've set a fixed seed to make this deterministic.
        if read_num > 0 and random.random() > subsample_rate:
            continue

        # Now we need to step through the barcode and sample index reads to find the matching reads
        if have_barcode:
            try:
                while barcode_read is None or not read_match(
                        read, barcode_read):
                    barcode_read = bc_iter.next()
                # reverse complement if all barcodes are RC-ed
                if barcode_rc:
                    barcode_read = (barcode_read[0],
                                    tk_seq.get_rev_comp(barcode_read[1]),
                                    barcode_read[2][::-1])
            except StopIteration:
                raise ValueError(
                    "Couldn't find barcode read matching {}".format(
                        get_read_name(read)))
        if have_sample_index:
            try:
                while sample_index_read is None or not read_match(
                        read, sample_index_read):
                    sample_index_read = si_iter.next()
            except StopIteration:
                raise ValueError(
                    "Couldn't find sample index read matching {}".format(
                        get_read_name(read)))

        (name1, seq1, qual1, name2, seq2, qual2) = read
        (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim

        read_num += 1
        if read_num > max_read_num:
            # Current output files are full: roll over to the next file set
            read_num = 1
            file_number += 1
            read_name = martian.make_path("read{}.fastq".format(file_number))
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            out_readfiles.append(read_name)

            trim_out_name = martian.make_path(
                "TRIM{}.fastq".format(file_number))
            out_trim_fastq.close()
            out_trim_fastq = open(trim_out_name, 'w')
            out_trimfiles.append(trim_out_name)

            if have_barcode:
                bc_name = martian.make_path("BC{}.fastq".format(file_number))
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                out_barcodefiles.append(bc_name)
            else:
                out_barcodefiles.append(None)

            if have_sample_index:
                si_name = martian.make_path("SI{}.fastq".format(file_number))
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                out_sampleindex_files.append(si_name)
            else:
                out_sampleindex_files.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if barcode_indices is not None:
                # Tally whitelisted barcodes; anything else counts as bad
                idx = barcode_indices.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1)
        tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2)

        tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1)
        tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, tr_qual2)

    # Close every input handle opened above, including the trim-info ones
    if interleaved:
        reads_in.close()
        trim_info.close()
    else:
        r1_in.close()
        r2_in.close()
        trim_info_r1.close()
        trim_info_r2.close()

    if have_barcode:
        bc_in.close()
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    with open(outs.read_counts, 'w') as outfile:
        read_counts = {
            'total_read_pairs': initial_read_pairs,
            'filtered_read_pairs': trimmed_read_pairs
        }
        tenkit.safe_json.dump_numpy(read_counts, outfile)

    if have_sample_index:
        si_in.close()
        out_si_fastq.close()
    out_read_fastq.close()
    out_trim_fastq.close()

    outs.chunks = [
        {
            'read1': r,  # output chunked trimmed read file
            'read2': None,
            'trim': t,  # output chunked trim file
            'barcode': bc,  # output chunked barcode file
            'sample_index': si,  # output chunked sample index file
            'barcode_reverse_complement':
            False,  # we always keep BC in correct orientation
            'reads_interleaved': True,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        } for (r, t, bc, si) in zip(out_readfiles, out_trimfiles,
                                    out_barcodefiles, out_sampleindex_files)
    ]
Example #12
0
def main(args, outs):
    """Build, annotate and write consensus sequences for this chunk's clonotypes.

    For every consensus of every clonotype whose id is listed in
    ``args.chunk_clonotypes`` this stage:

      * picks a "best" member contig (productive over non-productive, then
        the one with more good UMIs) to fall back on,
      * calls ``get_consensus_seq`` and, if a consensus was actually built,
        recomputes its base qualities with ``get_consensus_quals``,
      * annotates the consensus and falls back to the best contig if the
        annotated CDR3 does not match the CDR3 recorded for the consensus,
      * appends the consensus to the consensus FASTA/FASTQ outputs,
      * if a concatenated reference can be derived from the annotation,
        writes it out, annotates it and runs ``vdj_asm base-quals`` to
        produce a BAM against that reference.

    Args:
        args: stage arguments. Fields read here: ``chunk_clonotypes``,
            ``clonotype_assignments``, ``contig_bam``, ``annotations``,
            ``umi_summary_tsv``, ``metric_prefix``, ``vdj_reference_path``,
            ``min_score_ratios``, ``min_word_sizes`` and ``primers``
            (``args`` is also forwarded whole to ``get_consensus_seq``).
        outs: stage outputs. ``chunked_consensus_bams`` and
            ``chunked_concat_ref_bams`` are filled with BAM paths; files are
            written to ``consensus_fastq``, ``consensus_fasta``,
            ``concat_ref_fasta``, ``consensus_annotations_json``,
            ``concat_ref_annotations_json`` and ``chunked_reporter``.

    Returns:
        None. If there are no clonotype assignments or the contig BAM has no
        sequences, only the (empty) reporter is saved and the function
        returns early.

    Raises:
        AssertionError: if one of the internal consistency checks fails
            (unknown metric prefix, no best contig, unexpected BAM header,
            sequence/quality length mismatch, CDR3 mismatch after fallback).
        subprocess.CalledProcessError: if ``vdj_asm`` exits non-zero.
    """
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()
    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(
            args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # NOTE(review): unpickling can execute arbitrary code; this presumably
    # relies on args.annotations being written by a trusted upstream stage.
    with open(args.annotations) as f:
        contigs = cPickle.load(f)
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)
    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Map contig name -> set of "good" UMIs supporting it.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            # Check the column count BEFORE indexing so that blank or
            # truncated lines are skipped instead of raising IndexError.
            if len(fields) < 7:
                continue
            umi = fields[2]
            # Skip the header row
            if umi == 'umi':
                continue
            good_umi = fields[5] == 'True'
            if good_umi:
                # Column 7 holds a comma-separated list of contig names
                for c in fields[6].split(','):
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert (args.metric_prefix in reporter.vdj_clonotype_types)

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        # Only handle the clonotypes assigned to this chunk
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            # CDR3 the final consensus annotation is required to reproduce
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])
            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:

                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if best_contig is None or (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and \
                        len(contig_umis[best_contig.contig_name]) < len(contig_umis[contig.contig_name])):

                        best_contig = contig

            assert best_contig is not None

            # Record whether the merged contigs disagree on their V annotation
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # ... and likewise for their J annotation
            anno_count = np.max(
                [len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac',
                args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support and keep at most
            # MAX_CELLS_FOR_BASE_QUALS of them (slicing clamps to the length)
            ordered_contigs = sorted(sel_contigs,
                                     key=lambda x: len(contig_umis[x]),
                                     reverse=True)
            ordered_contigs = ordered_contigs[:MAX_CELLS_FOR_BASE_QUALS]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            # Per-consensus scratch directory; removed at the end of the iteration
            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert (list(tmp_bam.references) == [consensus_id])
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                # No consensus was built: use the best contig as-is
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert (len(consensus_seq) == len(consensus_quals))

            # Support for the consensus is the sum over all merged contigs
            total_read_count = np.sum(
                [contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum(
                [contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id,
                                               clonotype_id,
                                               consensus_seq,
                                               consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            # The annotated consensus must reproduce the expected CDR3
            wrong_cdr = contig.cdr3_seq is None or contig.cdr3_seq != cdr
            wrong_cdr_metric.add(1, filter=wrong_cdr)

            if wrong_cdr:
                # Something went wrong. Use "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id,
                                                   clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert (contig.cdr3_seq is not None and contig.cdr3_seq == cdr)

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert (len(consensus_seq) == len(consensus_quals))

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                #   to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].
                                                      annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                #   reduce the risk of discordance between the consensus and
                #   concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name,
                    clonotype_id,
                    ref_seq,
                    'I' * len(ref_seq),  # placeholder quality string, one 'I' per base
                    use_features={a.feature.feature_id for a in ref_annos},
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id +
                                                   '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm',
                    'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    '--global'  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(
                    os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                    rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
Example #13
0
def main(args, outs):
    """Correct barcodes for one chunk of read pairs and tag the FASTQ headers.

    Reads up to ``args.initial_reads`` read pairs from ``args.read1_chunk`` /
    ``args.read2_chunk``. For each pair whose read-1 header carries a raw
    barcode, a processed barcode is determined:

      * if a whitelist is given and the raw barcode is not on it, the result
        of ``cr_stats.correct_bc_error`` is used,
      * otherwise the raw barcode is kept unless it contains an 'N'.

    A valid processed barcode is counted, suffixed with the gem group and
    stored in the processed-barcode tag of both read headers. Every read pair
    (tagged or not) is written to ``outs.corrected_read1s`` /
    ``outs.corrected_read2s``.

    Args:
        args: stage arguments. Fields read here: ``barcode_whitelist``,
            ``barcode_counts``, ``gem_group``, ``read1_chunk``,
            ``read2_chunk``, ``initial_reads`` and
            ``barcode_confidence_threshold``.
        outs: stage outputs. Fields used here: ``corrected_read1s``,
            ``corrected_read2s``, ``corrected_barcode_counts`` and
            ``chunked_reporter``.

    Returns:
        None.
    """
    has_whitelist = args.barcode_whitelist is not None

    # Load barcode whitelist
    barcode_whitelist = None
    if has_whitelist:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    if has_whitelist:
        # Load barcode count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  args.gem_group)
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        # Without a whitelist no correction is attempted below, so the count
        # distribution is never consulted. Previously this path referenced
        # the unbound name `barcode_whitelist` and raised UnboundLocalError.
        barcode_dist = None
        barcode_whitelist_set = None

    # The context manager closes all four FASTQ handles even if a read fails
    # to parse or a write raises.
    with open(args.read1_chunk) as in_read1_fastq, \
            open(args.read2_chunk) as in_read2_fastq, \
            open(outs.corrected_read1s, 'w') as out_read1_fastq, \
            open(outs.corrected_read2s, 'w') as out_read2_fastq:

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
            read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

            # The raw barcode and its qualities are taken from read 1's header
            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            if raw_bc:
                if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                    # Off-whitelist barcode: attempt error correction
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                elif 'N' in raw_bc:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None
                else:
                    processed_bc = raw_bc

                if processed_bc:
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)
                    read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)
                    read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # Reads are always emitted, whether or not a barcode was assigned
            tk_fasta.write_read_fastq(out_read1_fastq,
                                      read1_header.to_string(), read1[1],
                                      read1[2])
            tk_fasta.write_read_fastq(out_read2_fastq,
                                      read2_header.to_string(), read2[1],
                                      read2[2])

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
Example #14
0
def write_bam_read_fastq(out, read):
    """Write one aligned read to `out` as a FASTQ record.

    When ``read.is_reverse`` is set, the sequence is reverse-complemented and
    the quality string is reversed before writing; otherwise both are written
    unchanged. The record is named after ``read.qname``.

    Args:
        out: destination handle passed through to ``tk_fasta.write_read_fastq``.
        read: record exposing ``is_reverse``, ``seq``, ``qual`` and ``qname``
            (presumably a pysam aligned read -- confirm against callers).
    """
    on_reverse_strand = read.is_reverse
    bases = tk_seq.get_rev_comp(read.seq) if on_reverse_strand else read.seq
    quals = read.qual[::-1] if on_reverse_strand else read.qual
    tk_fasta.write_read_fastq(out, read.qname, bases, quals)