def get_feature_generator_fastq(files,
                                extractor,
                                interleaved,
                                read_types,
                                r1_length=None,
                                r2_length=None):
    '''Extract feature barcodes from FASTQs.

    Args:
       files (list of File): FASTQ file handles for R1, R2
       extractor (FeatureExtractor): Extracts feature barcodes
       interleaved (bool): Are R1,R2 interleaved in a single file
       read_types (list of str): List of read types (e.g. R1,R2) we need to inspect
       r1_length (int): Length to hard-trim R1 to
       r2_length (int): Length to hard-trim R2 to
    Returns:
       iterator of FeatureMatchResult: yields the feature extraction result
           for each read pair
    '''
    assert len(files) == 2
    assert 'R1' in read_types or 'R2' in read_types

    # Apply hard trimming on input
    r1_hard_end = sys.maxint if r1_length is None else r1_length
    r2_hard_end = sys.maxint if r2_length is None else r2_length

    if interleaved:
        f = files[0]
        assert f
        # Get R1 and R2 seqs from interleaved FASTQ
        pair_iter = itertools.imap(
            lambda x: (x[0:3], x[3:6]),
            tk_fasta.read_generator_fastq(f, paired_end=True))
    else:
        r1_iter = tk_fasta.read_generator_fastq(
            files[0], paired_end=False) if 'R1' in read_types else iter([])
        r2_iter = tk_fasta.read_generator_fastq(
            files[1], paired_end=False) if 'R2' in read_types else iter([])
        pair_iter = itertools.izip_longest(r1_iter, r2_iter)

    if read_types == ['R1']:
        match_func = lambda x: extractor.extract_single_end(
            x[0][1][0:r1_hard_end],  # seq
            x[0][2][0:r1_hard_end],  # qual
            'R1')

    elif read_types == ['R2']:
        match_func = lambda x: extractor.extract_single_end(
            x[1][1][0:r2_hard_end],  # seq
            x[1][2][0:r2_hard_end],  # qual
            'R2')

    elif read_types == ['R1', 'R2']:
        match_func = lambda x: extractor.extract_paired_end(
            x[0][1][0:r1_hard_end],  # seq
            x[0][2][0:r1_hard_end],  # qual
            x[1][1][0:r2_hard_end],  # seq
            x[1][2][0:r2_hard_end])  # qual

    return itertools.imap(match_func, pair_iter)
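A minimal usage sketch of the generator above (hedged: the file names and the
FeatureExtractor instance are assumptions for illustration):

with open('chunk_R1.fastq') as r1, open('chunk_R2.fastq') as r2:
    matches = get_feature_generator_fastq(
        files=[r1, r2],
        extractor=extractor,   # assumed: a configured FeatureExtractor
        interleaved=False,
        read_types=['R2'],     # feature barcode is read from R2 here
        r2_length=90)
    for match in matches:
        pass                   # each item is a FeatureMatchResult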
Example #2
def create_unaligned_bam(args, outs):
    star_ref_path = cr_utils.get_reference_star_path(args.reference_path)

    header_buf = cStringIO.StringIO()

    header_buf.write('@HD\tVN:1.4\n')

    # SQ header lines
    with open(os.path.join(star_ref_path, 'chrNameLength.txt')) as f:
        for line in f:
            chr_name, chr_len = line.strip().split('\t')
            header_buf.write('@SQ\tSN:{}\tLN:{}\n'.format(chr_name, chr_len))

    # RG header lines
    for packed_rg in args.read_groups:
        header_buf.write(
            re.sub('\\\\t', '\t', tk_bam.make_rg_header(packed_rg)) + '\n')

    # Get read group ID for this chunk of reads
    read_group = args.read_group

    # pysam doesn't support reading SAM from a StringIO object
    with open('tmphdr', 'w') as f:
        f.write(header_buf.getvalue())
    samfile = pysam.AlignmentFile('tmphdr', 'r', check_sq=False)

    outbam = pysam.AlignmentFile(outs.genome_output, 'wb', template=samfile)

    fastq_file1 = cr_io.open_maybe_gzip(args.read_chunk)
    fastq_file2 = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else None
    read1s = tk_fasta.read_generator_fastq(fastq_file1)
    read2s = tk_fasta.read_generator_fastq(fastq_file2) if fastq_file2 else []

    record = pysam.AlignedSegment()
    # flag 4 = unmapped; one record object is reused safely because
    # pysam serializes the record at each write() call
    record.flag = 4

    for read1, read2 in itertools.izip_longest(read1s, read2s):
        name, seq, qual = read1
        record.query_name, record.query_sequence = name.split(' ')[0], seq
        record.query_qualities = tk_fasta.get_qvs(qual)
        record.set_tag('RG', read_group, 'Z')
        outbam.write(record)

        if read2:
            name, seq, qual = read2
            record.query_name, record.query_sequence = name.split(' ')[0], seq
            record.query_qualities = tk_fasta.get_qvs(qual)
            record.set_tag('RG', read_group, 'Z')
            outbam.write(record)

    samfile.close()
    fastq_file1.close()
    if fastq_file2 is not None:
        fastq_file2.close()
    outbam.close()
Example #3
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = list(tk_fasta.read_generator_fastq(f1))

    with open(args.read2s_chunk) as f2:
        read2s = list(tk_fasta.read_generator_fastq(f2))

    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}

    outs.buckets = {}

    for gem_group, bucket_name in enumerate_bucket_names(
            args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)

        fastq_out.close()
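The helpers enumerate_bucket_names and get_bucket_name are not shown in this
example. A hedged sketch of what they plausibly do (the pipeline's actual
bucketing scheme may differ; prefix hashing here is an assumption):

def enumerate_bucket_names(chunks_per_gem_group):
    # Yield (gem_group, bucket_name) for every bucket of every gem group
    for gem_group, num_chunks in sorted(chunks_per_gem_group.iteritems()):
        for chunk_idx in xrange(num_chunks):
            yield gem_group, '%d-%d' % (gem_group, chunk_idx)

def get_bucket_name(gem_group, barcode_seq, num_chunks):
    # Deterministically assign a barcode to one of num_chunks buckets
    return '%d-%d' % (gem_group, hash(barcode_seq) % num_chunks)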
Example #4
def split(args):
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is not None:

        # Data are barcoded
        for read1_fq, read2_fq, barcodes_json in zip(args.read1s, args.read2s,
                                                     args.chunk_barcodes):
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)

            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })

    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')
        with open(read1_out_filename, 'w') as read1_out, \
             open(read2_out_filename, 'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(
                        in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(
                        in2, paired_end=False)

                    for read1_tuple in fastq1_iter:
                        read2_tuple = fastq2_iter.next()
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if len(chunks) == 0:
        return get_dummy_chunk()

    return {'chunks': chunks}
Example #5
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    heap = []

    key_func = vdj_utils.fastq_barcode_sort_key

    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = fastq.next()

            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, first_readpair, filename))

        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        # Re-opening the generator is cheap: the FileHandleCache hands back
        # the same file object, which keeps its read position
        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from
        # If that file is out of items, then we leave that one out
        try:
            next_readpair = fastq.next()

            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, next_readpair, in_filename))

        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)
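The heap above is a hand-rolled k-way merge keyed on the barcode sort key,
holding one pending record per source file; it cannot use heapq.merge directly
because each pop re-derives its generator through the file-handle cache. The
ordering behavior in miniature (a sketch, not pipeline code):

import heapq

streams = [iter([1, 4, 7]), iter([2, 5]), iter([3, 6])]
assert list(heapq.merge(*streams)) == [1, 2, 3, 4, 5, 6, 7]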
Example #6
def infer_barcode_reverse_complement(barcode_whitelist, barcode_files):
    if barcode_whitelist:
        barcode_rc = []
        for barcode_file in barcode_files:
            # Count whitelist hits per file so that one file's reads do not
            # influence another file's orientation call
            rc_valid_count = 0
            reg_valid_count = 0
            read_num = 0

            if barcode_file[-3:] == ".gz":
                barcode_open_file = gzip.open(barcode_file)
            else:
                barcode_open_file = open(barcode_file, 'r')
            read_iter = tk_fasta.read_generator_fastq(barcode_open_file)
            for (name, seq, qual) in read_iter:
                if seq in barcode_whitelist:
                    reg_valid_count += 1
                if tk_seq.get_rev_comp(seq) in barcode_whitelist:
                    rc_valid_count += 1
                if read_num > 1000:
                    break
                read_num += 1

            if tk_stats.robust_divide(float(rc_valid_count), float(rc_valid_count + reg_valid_count)) > 0.75:
                barcode_rc.append(True)
            else:
                barcode_rc.append(False)
            barcode_open_file.close()
        return barcode_rc
    else:
        return [False] * len(barcode_files)
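Hedged usage sketch (the whitelist contents and FASTQ names are illustrative):

toy_whitelist = set(['AAAA', 'CCCC', 'GGGG', 'TTTT'])
flags = infer_barcode_reverse_complement(toy_whitelist,
                                         ['bc_1.fastq', 'bc_2.fastq'])
# flags[i] is True when >75% of the whitelist hits sampled from file i
# matched in reverse-complement orientation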
Example #7
def estimate_read_count_and_length(fn, num_reads=1000):
    '''
    Estimate the number of reads AND the average read length
    in the fastq file fn by reading only the first
    num_reads (default 1000) reads.
    '''
    # Open reader
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
        is_gz = True
    else:
        reader = open(fn, 'r')
        is_gz = False
    ## first compute the average read length
    avg_read_length = 0.0
    gen = tk_fasta.read_generator_fastq(reader)
    rec_count = 0
    for (header, r, qual) in gen:
        avg_read_length += len(r)
        rec_count += 1
        if rec_count == num_reads:
            break
    if rec_count == 0:
        return (0.0, 0.0)
    avg_read_length = avg_read_length / rec_count
    if is_gz:
        file_len = reader.myfileobj.tell()
    else:
        file_len = reader.tell()
    ## total file size
    file_sz = os.path.getsize(fn)
    # Extrapolate from the records actually read; the file may hold fewer
    # than num_reads reads
    total_reads_est = float(rec_count) / file_len * file_sz
    return (total_reads_est, avg_read_length)
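The estimate is a simple proportion: if the first rec_count records span
file_len bytes of a file_sz-byte file, the file holds roughly
rec_count * file_sz / file_len reads. A worked toy example:

# 1000 reads consumed 250,000 bytes of a 10,000,000-byte FASTQ
# => total_reads_est = 1000.0 / 250000 * 10000000 = 40000.0 reads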
Example #8
def fastq_data_estimate(fn, num_reads=8000):
    # Open reader
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
        is_gz = True
    else:
        reader = open(fn, 'r')
        is_gz = False

    gen = tk_fasta.read_generator_fastq(reader)
    rds = itertools.islice(gen, num_reads)

    input_lens = [(len(header) + len(r) + len(qual) + 4, len(r)) for (header,r,qual) in rds]
    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    if is_gz:
        #file_len = reader.myfileobj.tell()
        uncomp_size = estimate_gzip_uncompressed_size(fn) * 0.8  # fudge factor so the estimate covers >= the requested amount
    else:
        #file_len = data_len
        uncomp_size = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * uncomp_size
    predicted_seq = seq_yield * uncomp_size

    # For debugging
    #predicted_sz = float(total_data_len) / file_len * file_sz
    #gzip_sz = parse_gzip_sz(fn)
    #print "comp: %.2f, pred: %.2f, pred_mod2: %.2f, gzip_mod2: %.2f, gzip_est: %.2f" % (float(file_sz)/1e9, float(predicted_sz)/1e9, float(predicted_sz % 2**32)/1e9, float(gzip_sz)/1e9, float(uncomp_gzip_est2)/1e9)

    return (predicted_reads, predicted_seq)
Example #9
def _compute_r1_length(fastqs, reads_interleaved):
    """ Infer the length of R1 """
    num_reads = 0
    r1_max_len = 0

    def get_r1_noninterleaved(read_iter):
        for _, seq, _ in read_iter:
            yield seq
    def get_r1_interleaved(read_iter):
        for _, seq, _, _, _, _ in read_iter:
            yield seq
    get_r1 = get_r1_interleaved if reads_interleaved else get_r1_noninterleaved

    for fastq in fastqs:
        with cr_utils.open_maybe_gzip(fastq, 'r') as fq_file:
            reads = tk_fasta.read_generator_fastq(fq_file, reads_interleaved)

            for r1 in get_r1(reads):
                if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                    break
                r1_max_len = max(len(r1), r1_max_len)
                num_reads += 1

        if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    return r1_max_len
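The two inner generators mirror the tuple shapes that
tk_fasta.read_generator_fastq yields: (name, seq, qual) per read in
single-end mode, and a 6-tuple of (name, seq, qual) for both mates when
paired_end=True. Hedged usage sketch (the file name is illustrative):

r1_len = _compute_r1_length(['sample_S1_L001_R1_001.fastq.gz'],
                            reads_interleaved=False)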
Example #10
    def test_align(self):
        args = {
            'chunk_input': IN_FASTQ,
            'aligner': 'bwa',
            'aligner_method': 'MEM',
            'reference_path': 'hg19',
            '__threads': 1,
            'reads_interleaved': True
        }
        outs = {'default': OUT_BAM}

        args = martian.Record(args)
        outs = martian.Record(outs)

        main(args, outs)

        # Ensure the BAM has one record per input FASTQ read
        out_bam = pysam.Samfile(OUT_BAM)
        bam_reads = list(out_bam)

        fq_file = open(IN_FASTQ)
        fq_reads = list(
            tk_fasta.read_generator_fastq(fq_file, paired_end=False))

        self.assertEqual(len(bam_reads), len(fq_reads))
Example #11
def split(args):
    '''We just align each chunk independently -- joining will happen in the join step of SORT_READS'''

    # Pull some reads from fastq files -- bail out if it's less than 25bp
    fastq_tests = [x['read1'] for x in args.chunks]

    for fastq_test in fastq_tests:
        with open(fastq_test) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            for name, read, qual in itertools.islice(reader, 10):
                if len(read) < MIN_READ_LENGTH:
                    martian.alarm(
                        "BWA-MEM can't handle reads <25bp -- reads will be unmapped."
                    )
                    continue

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    reference_pyfasta = tenkit.reference.open_reference(args.reference_path)
    reference_bases = sum(
        len(reference_pyfasta[contig]) for contig in reference_pyfasta)
    base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0**3)))

    mem_in_gb = base_mem_in_gb + 4
    chunks = [{
        'chunk': x,
        '__threads': args.num_threads,
        '__mem_gb': mem_in_gb
    } for x in args.chunks]
    return {'chunks': chunks}
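A worked example of the memory formula above (the reference size is
illustrative): a ~3.1 Gb human reference needs

import math

reference_bases = 3.1e9
base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0 ** 3)))  # == 6
mem_in_gb = base_mem_in_gb + 4                                        # == 10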
Example #12
def get_raw_processed_barcodes(barcode_file, barcode_whitelist, bc_confidence_threshold, gem_group, barcodes_reverse_complement, wl_idxs, wl_dist):
    """ Stream the barcodes and the 'processed' barcode """
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)

    gem_group_str = "-" + str(gem_group)

    for (name, seq, qual) in bc_iterator:
        if barcodes_reverse_complement:
            seq = tk_seq.get_rev_comp(seq)
            qual = qual[::-1]  # reverse qual string
        # Check for valid bc sequences
        if barcode_whitelist is None:
            # No whitelist case -- attach BC if there are no Ns
            if not ('N' in seq):
                processed_bc = seq + gem_group_str
                yield (name, seq, processed_bc, qual)
            else:
                yield (name, seq, None, qual)
        else:
            # whitelist case -- attach bc if posterior probability of best
            # BC sequence exceeds the confidence threshold
            bc_seq = handle_10x_barcode(bc_confidence_threshold, seq, qual, wl_idxs, wl_dist)
            if bc_seq is None:
                yield (name, seq, None, qual)
            else:
                processed_bc = bc_seq + gem_group_str
                yield (name, seq, processed_bc, qual)
Example #13
def get_read_generator_fastq(fastq_open_file,
                             read_def,
                             reads_interleaved,
                             r1_length=None,
                             r2_length=None):
    read_iter = tk_fasta.read_generator_fastq(
        fastq_open_file,
        paired_end=reads_interleaved and read_def.read_type in ['R1', 'R2'])
    for read_tuple in read_iter:
        yield extract_read_maybe_paired(read_tuple, read_def,
                                        reads_interleaved, r1_length,
                                        r2_length)
Example #14
def get_raw_processed_barcodes(barcode_file, barcode_whitelist,
                               bc_confidence_threshold, gem_group,
                               barcodes_reverse_complement, wl_idxs, wl_dist):
    """Stream through the raw barcodes and generate processed barcodes (which may be none)"""
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)
    for (name, seq, qual) in bc_iterator:
        if barcodes_reverse_complement:
            seq = tk_seq.get_rev_comp(seq)
            qual = qual[::-1]
        if barcode_whitelist is None:
            corrected_bc = None if ('N' in seq) else seq
        else:
            corrected_bc = correct_barcode(bc_confidence_threshold, seq, qual,
                                           wl_idxs, wl_dist, MAXDIST_CORRECT)
        if corrected_bc is not None:
            corrected_bc = '{}-{}'.format(corrected_bc, gem_group)
        yield (name, seq, corrected_bc, qual)
Example #15
def get_run_data(fn):
    """ Parse flowcell + lane from the first FASTQ record.
    NOTE: we don't check whether there are multiple FC / lanes in this file.
    """
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
    else:
        reader = open(fn, 'r')
        
    gen = tk_fasta.read_generator_fastq(reader)

    try:
        (name, seq, qual) = gen.next()
        (flowcell, lane) = re.split(':', name)[2:4]
        return (flowcell, lane)
    except StopIteration:
        # empty fastq
        martian.exit("FASTQ is empty: %s" % fn)
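Illumina read names are colon-delimited, with flowcell and lane in fields 2
and 3 (0-based). A toy example of the parse (the read name is illustrative):

import re

name = 'E00489:42:HNY7TCCXX:3:1101:1234:5678'
flowcell, lane = re.split(':', name)[2:4]
assert (flowcell, lane) == ('HNY7TCCXX', '3')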
Example #16
def fastq_data_estimate(fn, num_reads=5000):
    # Open reader
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
        is_gz = True
    else:
        reader = open(fn, 'r')
        is_gz = False

    gen = tk_fasta.read_generator_fastq(reader)
    rds = itertools.islice(gen, num_reads)

    input_lens = [(len(header) + len(r) + len(qual) + 4, len(r))
                  for (header, r, qual) in rds]
    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    read_length = total_seq_len / len(input_lens)

    if is_gz:
        (uncomp_size, predicted_sz) = estimate_gzip_uncompressed_size(fn)
    else:
        uncomp_size = file_sz
        predicted_sz = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * uncomp_size
    predicted_seq = seq_yield * uncomp_size

    # Log estimate of downsampling
    gzip_sz = parse_gzip_sz(fn)
    martian.log_info("Estimates for: %s" % fn)
    dbg_str = "compressed_size: %.2f, predicted_size: %.2f, predicted_size_mod: %.2f, gzip_size_mod: %.2f, gzip_predicted_size: %.2f" % (
        float(file_sz) / 1e9, float(predicted_sz) / 1e9,
        float(predicted_sz % 2**32) / 1e9, float(gzip_sz) / 1e9,
        float(uncomp_size) / 1e9)
    martian.log_info(dbg_str)

    return (predicted_reads, predicted_seq, read_length)
Example #17
def fastq_data_estimate(fn, num_reads=1000000):
    # Open reader
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
        is_gz = True
    else:
        reader = open(fn, 'r')
        is_gz = False

    gen = tk_fasta.read_generator_fastq(reader)
    rds = itertools.islice(gen, num_reads)

    input_lens = [(len(header) + len(r) + len(qual) + 4, len(r))
                  for (header, r, qual) in rds]
    total_seq_len = sum(x[1] for x in input_lens)
    total_data_len = sum(x[0] for x in input_lens)
    file_sz = os.path.getsize(fn)

    # NOTE: do not try and use the gzip footer containing the length of the compressed data
    # that only reflects the length of the final gzip block. A valid gzip file may have
    # many blocks, so that field cannot be relied upon.

    if is_gz:
        compressed_sz = reader.myfileobj.tell()
        # Use float division -- in Python 2 these are ints and would floor
        predicted_sz = float(total_data_len) / compressed_sz * file_sz
    else:
        predicted_sz = file_sz

    read_yield = float(len(input_lens)) / total_data_len
    seq_yield = float(total_seq_len) / total_data_len
    predicted_reads = read_yield * predicted_sz
    predicted_seq = seq_yield * predicted_sz

    # Log estimate of downsampling
    martian.log_info("Estimates for: %s" % fn)
    dbg_str =  "compressed_size: %.2f, predicted_size: %.2f" % \
               (file_sz / 1e9, predicted_sz / 1e9)
    martian.log_info(dbg_str)

    return (predicted_reads, predicted_seq)
Example #18
def main(args, outs):
    """ Count whitelist barcode occurrences in this chunk's barcode FASTQ """

    # Bail out if there are no barcodes or no whitelist
    if args.barcode_whitelist is None or args.chunk['barcode'] is None:
        outs.bc_counts = None
        return

    def open_maybe_gzip(fn):
        if fn[-2:] == "gz":
            return gzip.open(fn)
        else:
            return open(fn)

    barcode_whitelist = sorted(
        list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
    bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
    bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
    bad_count = 0

    barcode_file = open_maybe_gzip(args.chunk['barcode'])
    bc_iterator = tk_fasta.read_generator_fastq(barcode_file)

    for (bc_read, raw_bc_seq, raw_bc_qual) in bc_iterator:
        idx = bc_idx.get(raw_bc_seq)

        if idx is not None:
            bc_counts[idx] += 1
        else:
            bad_count += 1

    # Write BC count array and bad count as JSON
    result = {}
    result['bad_bc_count'] = bad_count
    result['bc_counts'] = list(bc_counts)

    with open(outs.bc_counts, 'w') as bc_counts_out:
        tenkit.safe_json.dump_numpy(result, bc_counts_out)
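The counts file is JSON (written via tenkit.safe_json.dump_numpy), so it can
be read back directly. A hedged sketch (the path is illustrative):

import json

with open('bc_counts.json') as f:
    result = json.load(f)
valid = sum(result['bc_counts'])        # reads matching the whitelist
total = valid + result['bad_bc_count']  # all barcode reads inspected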
Example #19
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length) contig's CDR3
            #    with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
               and contig.cdr3_seq != first_cdr3 \
               and (contig.umi_count == 1 or \
                    (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(
                                              1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
Example #20
def main(args, outs):
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return
    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(args.primers,
                                                                       vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    if args.filter_summary:
        try:
            filter_summary = vdj_utils.load_contig_summary_table(open(args.filter_summary))
        except EmptyDataError:
            filter_summary = None
    else:
        filter_summary = None

    if args.contigs_fastq is not None:
        fq_iter = tk_fasta.read_generator_fastq(open(args.contigs_fastq), paired_end=False)

    for header, contig_sequence in cr_utils.get_fasta_iter(open(args.contigs)):
        if args.contigs_fastq is None:
            contig_quals = None
        else:
            header_fq, contig_sequence_fq, contig_quals = fq_iter.next()
            assert(contig_sequence_fq == contig_sequence)
            assert(header_fq == header)

        barcode = vdj_utils.get_barcode_from_contig_name(header)
        contig_name = header.split(' ')[0]

        # Only annotate barcodes assigned to this chunk and contigs with enough read support
        if barcode in barcodes_in_chunk:
            if filter_summary is not None:
                filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
            else:
                filtered = True

            contig = vdj_annot.AnnotatedContig(contig_name,
                                               contig_sequence,
                                               quals=contig_quals,
                                               barcode=barcode,
                                               is_cell=barcode in cell_barcodes_set,
                                               filtered=filtered,
                                               read_count=read_counts.get(contig_name),
                                               umi_count=umi_counts.get(contig_name),
                                               )

            contig.annotations = contig.annotate_features(feature_types,
                                                          feature_aligners,
                                                          feature_filters)

            if args.primers:
                contig.primer_annotations = contig.annotate_features_by_group(primer_aligner,
                                                                              alignment_filter=primer_filter)

            contig.annotate_cdr3()

            chunk_contigs.append(contig)

    with open(outs.chunked_annotations, 'wb') as f:
        cPickle.dump(chunk_contigs, f, protocol=cPickle.HIGHEST_PROTOCOL)
Example #21
0
def main(args, outs):
    """ Trim the reads in a series of fastq files """

    # Set a fixed random seed to eliminate noise in metrics
    random.seed(0)

    chunk = args.chunk
    interleaved = chunk['reads_interleaved']
    have_read2 = chunk['read2'] is not None
    paired = interleaved or have_read2

    read1_trim = args.read1_trim_length
    read2_trim = args.read2_trim_length

    subsample_rate = chunk['subsample_rate']

    # BC config -- BC come from separate fastq, or are embedded in R1 or R2
    have_barcode = False
    bc_in_read1 = False
    bc_in_read2 = False
    bc_in_fastq = False

    # If we have bc in read, use that & ignore a separate BC read
    if chunk.get('bc_in_read',
                 None) is not None and chunk.get('bc_length', 0) > 0:
        have_barcode = True
        bc_length = chunk['bc_length']
        if chunk['bc_in_read'] == 1:
            bc_in_read1 = True
            read1_trim += bc_length
        elif chunk['bc_in_read'] == 2:
            bc_in_read2 = True
            read2_trim += bc_length
        else:
            martian.exit(
                "bc_in_read configuration incorrect -- read must be 1 or 2")

    # Otherwise use the BC file
    elif chunk['barcode'] is not None:
        have_barcode = True
        bc_in_fastq = True

    have_sample_index = chunk['sample_index'] is not None

    output_directory = os.path.dirname(os.path.realpath(outs.placeholder))
    max_read_num = args.max_read_num

    # counter for sub-chunked files
    file_number = 1

    # open the available read files and make the appropriate iterators
    if interleaved:
        read_in = openfq(chunk['read1'])
        read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True)
    else:
        if have_read2:
            read1_in = openfq(chunk['read1'])
            read1_iter = tk_fasta.read_generator_fastq(read1_in)

            read2_in = openfq(chunk['read2'])
            read2_iter = tk_fasta.read_generator_fastq(read2_in)

            read_iter = itertools.imap(
                lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter,
                read2_iter)
        else:
            read1_in = openfq(chunk['read1'])
            read_iter = tk_fasta.read_generator_fastq(read1_in)

    # open read file
    read_name = output_directory + "/read" + str(file_number) + ".fastq"
    read_names = [read_name]
    out_read_fastq = open(read_name, 'w')

    # Bail out if there's no whitelist
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        bc_idx = None
    else:
        barcode_whitelist = sorted(
            list(tk_seq.load_barcode_whitelist(args.barcode_whitelist)))
        bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)}
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = output_directory + "/BC" + str(file_number) + ".fastq"
        out_bc_fastq = open(bc_name, 'w')
        bc_names = [bc_name]
        if bc_in_fastq:
            bc_in = openfq(chunk['barcode'])
            bc_iter = tk_fasta.read_generator_fastq(bc_in)
        elif bc_in_read1 or bc_in_read2:
            # BC in read -- have output file but no input file
            bc_iter = itertools.repeat(None)
    else:
        bc_iter = itertools.repeat(None)
        bc_names = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = output_directory + "/SI" + str(file_number) + ".fastq"
        out_si_fastq = open(si_name, 'w')
        si_in = openfq(chunk['sample_index'])
        si_iter = tk_fasta.read_generator_fastq(si_in)
        si_names = [si_name]
    else:
        si_iter = itertools.repeat(None)
        si_names = [None]

    # loop through reads
    read_num = 0
    for read, barcode_read, sample_index_read in itertools.izip(
            read_iter, bc_iter, si_iter):
        if read_num > 0 and random.random() > subsample_rate:
            continue

        if paired:
            (name1, seq1, qual1, name2, seq2, qual2) = read
        else:
            (name1, seq1, qual1) = read

        new_seq1 = seq1[read1_trim:]
        new_qual1 = qual1[read1_trim:]
        if paired:
            new_seq2 = seq2[read2_trim:]
            new_qual2 = qual2[read2_trim:]

        # Get BC sequence out of the read, for BC-in-read schemes
        if bc_in_read1:
            barcode_read = (name1, seq1[:bc_length], qual1[:bc_length])

        if bc_in_read2:
            barcode_read = (name2, seq2[:bc_length], qual2[:bc_length])

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1
            read_name = output_directory + "/read" + str(
                file_number) + ".fastq"
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            read_names.append(read_name)

            if have_barcode:
                bc_name = output_directory + "/BC" + str(
                    file_number) + ".fastq"
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                bc_names.append(bc_name)
            else:
                bc_names.append(None)

            if have_sample_index:
                si_name = output_directory + "/SI" + str(
                    file_number) + ".fastq"
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                si_names.append(si_name)
            else:
                si_names.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if chunk['barcode_reverse_complement']:
                barcode_seq = tk_seq.get_rev_comp(barcode_seq)
                barcode_qual = barcode_qual[::-1]  # reverse qual string
            if bc_idx is not None:
                idx = bc_idx.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1)
        if paired:
            tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2,
                                      new_qual2)

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)
    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()

    chunks = []
    for (r, bc, si) in zip(read_names, bc_names, si_names):
        new_chunk = {
            'read1': r,
            'read2': None,
            'barcode': bc,
            'sample_index': si,
            'barcode_reverse_complement': False,
            'reads_interleaved': have_read2 or interleaved,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        }
        chunks.append(new_chunk)

    outs.chunks = chunks
Example #22
def main(args, outs):
    """ Trim the reads in a series of fastq files """
    chunk = args.chunk
    subsample_rate = chunk['subsample_rate']
    have_barcode = chunk['barcode'] is not None
    have_sample_index = chunk['sample_index'] is not None

    # STEP 1:  We run the R1/R2 reads through cutadapt, writing them to a temporary file with appropriate adapters
    # trimmed, optionally filtering out reads where adapters weren't found
    interleaved = chunk['read2'] is None
    # can't do discard_untrimmed because we're running cutadapt in single-end mode
    if args.trim_def['discard_untrimmed']:
        martian.exit("discard_untrimmed was set in trim_def")
    if interleaved:
        trimmed_reads = martian.make_path("trimmed_reads.fastq")
        trim_info_fn = martian.make_path("trim_info.txt")
        initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end(
            chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def,
            args.adapters)
    else:
        trimmed_r1 = martian.make_path("trimmed_r1.fastq")
        trimmed_r2 = martian.make_path("trimmed_r2.fastq")
        trim_info_r1_fn = martian.make_path("trim_info_r1.txt")
        trim_info_r2_fn = martian.make_path("trim_info_r2.txt")
        initial1, trimmed1 = run_cutadapt_single_end(chunk['read1'],
                                                     trimmed_r1,
                                                     trim_info_r1_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R1")
        initial2, trimmed2 = run_cutadapt_single_end(chunk['read2'],
                                                     trimmed_r2,
                                                     trim_info_r2_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R2")
        initial_read_pairs = initial1 + initial2
        trimmed_read_pairs = trimmed1 + trimmed2
        if initial1 != initial2:
            martian.exit(
                "Input fastq files for R1 and R2 are not the same length")
        if trimmed1 != trimmed2:
            raise ValueError(
                "Cutadapt produced differing numbers of reads for R1 and R2")

    # STEP 2:  We run through the trimmed R1/R2 reads along with sample index and barcode reads, chunking into files of
    # max_read_num reads or less, and skipping sample index/barcode reads that don't match the trimmed & filtered R1/R2
    # reads
    max_read_num = args.max_read_num
    file_number = 1

    # open the available input read files and get the iterator over them
    if interleaved:
        reads_in = open_maybe_gzip(trimmed_reads, 'r')
        read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True)
        trim_info = open_maybe_gzip(trim_info_fn, 'r')
        trim_iter = read_generator_trim_info(trim_info, paired_end=True)
    else:
        r1_in = open_maybe_gzip(trimmed_r1, 'r')
        r2_in = open_maybe_gzip(trimmed_r2, 'r')
        read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2])
                     for r1, r2 in itertools.izip_longest(
                         tk_fasta.read_generator_fastq(r1_in),
                         tk_fasta.read_generator_fastq(r2_in)))
        trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r')
        trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r')
        trim_iter = (t1 + t2 for t1, t2 in itertools.izip(
            read_generator_trim_info(trim_info_r1),
            read_generator_trim_info(trim_info_r2)))

    # open output read file, which will be interleaved
    read_name = martian.make_path("read{}.fastq".format(file_number))
    out_readfiles = [read_name]
    out_read_fastq = open(read_name, 'w')

    # open trimmed read file, which will be interleaved
    trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number))
    out_trimfiles = [trim_out_name]
    out_trim_fastq = open(trim_out_name, 'w')

    if args.barcode_whitelist is None:
        outs.bc_counts = None
        barcode_indices = None
    else:
        barcode_whitelist = sorted(
            list(load_barcode_whitelist(args.barcode_whitelist)))
        barcode_indices = {
            bc: idx
            for (idx, bc) in enumerate(barcode_whitelist)
        }
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = martian.make_path("BC{}.fastq".format(file_number))
        out_bc_fastq = open(bc_name, 'w')
        out_barcodefiles = [bc_name]
        barcode_read = None
        bc_in = open_maybe_gzip(chunk['barcode'], 'r')
        bc_iter = tk_fasta.read_generator_fastq(bc_in)
        # Determine if barcode sequences need to be reverse complemented.
        with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2:
            bc_iter2 = tk_fasta.read_generator_fastq(bc_in2)
            barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
            barcode_rc = infer_barcode_reverse_complement(
                barcode_whitelist, bc_iter2)
    else:
        out_barcodefiles = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = martian.make_path("SI{}.fastq".format(file_number))
        out_si_fastq = open(si_name, 'w')
        si_in = open_maybe_gzip(chunk['sample_index'], 'r')
        sample_index_read = None
        si_iter = tk_fasta.read_generator_fastq(si_in)
        out_sampleindex_files = [si_name]
    else:
        out_sampleindex_files = [None]

    read_num = 0
    random.seed(0)
    for (read, trim) in itertools.izip(read_iter, trim_iter):
        # Downsample (other than the first read).  Note we've set a fixed seed to make this deterministic.
        if read_num > 0 and random.random() > subsample_rate:
            continue

        # Now we need to step through the barcode and sample index reads to find the matching reads
        if have_barcode:
            try:
                while barcode_read is None or not read_match(
                        read, barcode_read):
                    barcode_read = bc_iter.next()
                # reverse complement if all barcodes are RC-ed
                if barcode_rc:
                    barcode_read = (barcode_read[0],
                                    tk_seq.get_rev_comp(barcode_read[1]),
                                    barcode_read[2][::-1])
            except StopIteration:
                raise ValueError(
                    "Couldn't find barcode read matching {}".format(
                        get_read_name(read)))
        if have_sample_index:
            try:
                while sample_index_read is None or not read_match(
                        read, sample_index_read):
                    sample_index_read = si_iter.next()
            except StopIteration:
                raise ValueError(
                    "Couldn't find sample index read matching {}".format(
                        get_read_name(read)))

        (name1, seq1, qual1, name2, seq2, qual2) = read
        (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1
            read_name = martian.make_path("read{}.fastq".format(file_number))
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            out_readfiles.append(read_name)

            trim_out_name = martian.make_path(
                "TRIM{}.fastq".format(file_number))
            out_trim_fastq.close()
            out_trim_fastq = open(trim_out_name, 'w')
            out_trimfiles.append(trim_out_name)

            if have_barcode:
                bc_name = martian.make_path("BC{}.fastq".format(file_number))
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                out_barcodefiles.append(bc_name)
            else:
                out_barcodefiles.append(None)

            if have_sample_index:
                si_name = martian.make_path("SI{}.fastq".format(file_number))
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                out_sampleindex_files.append(si_name)
            else:
                out_sampleindex_files.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if barcode_indices is not None:
                idx = barcode_indices.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1)
        tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2)

        tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1)
        tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, tr_qual2)

    if interleaved:
        reads_in.close()
    else:
        r1_in.close()
        r2_in.close()

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    with open(outs.read_counts, 'w') as outfile:
        read_counts = {
            'total_read_pairs': initial_read_pairs,
            'filtered_read_pairs': trimmed_read_pairs
        }
        tenkit.safe_json.dump_numpy(read_counts, outfile)

    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()
    out_trim_fastq.close()

    outs.chunks = [
        {
            'read1': r,  # output chunked trimmed read file
            'read2': None,
            'trim': t,  # output chunked trim file
            'barcode': bc,  # output chunked barcode file
            'sample_index': si,  # output chunked sample index file
            'barcode_reverse_complement':
            False,  # we always keep BC in correct orientation
            'reads_interleaved': True,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        } for (r, t, bc, si) in zip(out_readfiles, out_trimfiles,
                                    out_barcodefiles, out_sampleindex_files)
    ]
Example #23
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    # Lazy load R1
    r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
    read1s = tk_fasta.read_generator_fastq(r1_file)

    # Lazy load R2
    if paired_end:
        r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
        read2s = tk_fasta.read_generator_fastq(r2_file)
    else:
        read2s = []

    # Lazy load corrected BCs
    bc_file = cr_io.open_maybe_gzip(args.bcs)
    bcs = (line.strip() for line in bc_file)

    buckets = {}

    bucket_filenames = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        bucket_filenames[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
        # Exclude unbarcoded reads
        if barcode == '':
            continue

        # Exclude short reads
        if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
            continue

        # Attach processed barcode to reads
        r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
        r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
        r1_new_qname = r1_hdr.to_string()

        if paired_end:
            r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
            r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r2_new_qname = r2_hdr.to_string()

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
        if paired_end:
            buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
Example #24
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSED_BARCODES """

    chunk = args.chunk

    bam_in = create_bam_infile(args.align_chunk)

    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs", TENX_PRODUCT_NAME)
        ])

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx
            for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    if chunk['barcode'] is not None:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index'] is not None:
        sample_index_iter = tk_fasta.read_generator_fastq(
            open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    if chunk['trim'] is not None:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(
            chunk['trim']),
                                                  paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    # First read
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if get_read_barcode(r) is not None:
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if get_read_barcode(r) is not None:
                    bam_out.write(r)
            else:
                bam_out.write(r)
        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert reads_attached >= 2

    outs.perfect_read_count = perfect_read_count
    bam_out.close()
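
The bc_dist prior and wl_idxs index built above feed get_raw_processed_barcodes, whose body is not shown in these examples. As a rough illustration of how such a pseudo-count prior is typically used, here is a minimal, hypothetical sketch of quality-weighted whitelist correction: each one-base substitution of an off-whitelist barcode is scored by prior probability times the probability that the observed base was a sequencing error, and the top candidate is accepted only if it dominates the posterior. correct_barcode_sketch and its confidence threshold are illustrative, not the pipeline's actual API.

def correct_barcode_sketch(raw_bc, raw_qual, wl_idxs, bc_dist, conf=0.975):
    # Barcode already on the whitelist: accept as-is.
    if raw_bc in wl_idxs:
        return raw_bc
    scores = {}
    for pos in xrange(len(raw_bc)):
        # Phred+33 quality -> probability this base was miscalled
        p_err = 10.0 ** (-(ord(raw_qual[pos]) - 33) / 10.0)
        for base in 'ACGT':
            if base == raw_bc[pos]:
                continue
            cand = raw_bc[:pos] + base + raw_bc[pos + 1:]
            idx = wl_idxs.get(cand)
            if idx is not None:
                # Unnormalized posterior: prior * P(observed | candidate)
                scores[cand] = scores.get(cand, 0.0) + bc_dist[idx] * p_err
    if not scores:
        return None
    total = sum(scores.itervalues())
    best = max(scores, key=scores.get)
    return best if scores[best] / total >= conf else None
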
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """

    chunk = args.chunk

    #subsample_rate = 1.0
    #if args.subsample_rate is not None:
    #    subsample_rate = args.subsample_rate

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs"))

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = tk_seq.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        # Choose gzip vs. plain open based on the file extension
        if fn.endswith(".gz"):
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter)

    # First read (None if the BAM chunk is empty)
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        reads_attached = 0
        #emit_read_pair = random.random() < subsample_rate
        emit_read_pair = True

        while read.qname == read_name or read_name is None:
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            reads_attached += 1
            if read_name is not None:
                assert read.qname == read_name

            if emit_read_pair:
                # Count the perfect reads -- will be used when subsampling in dedup
                if tenkit.read_filter.stringent_read_filter(read, require_barcode_for_stringent):
                    perfect_read_count += 1

                if args.exclude_non_bc_reads:
                    if tk_io.get_read_barcode(read) is not None:
                        bam_out.write(read)
                else:
                    bam_out.write(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert reads_attached >= 2


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
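
The hand-rolled while-loop above exists so that tags land on every alignment sharing a query name, since secondary alignments duplicate the name. Assuming a name-collated BAM iterator, the same grouping can be sketched with itertools.groupby; attach_tags below is a hypothetical stand-in for the tagging logic.

import itertools

def reads_by_qname(bam_iter):
    # Yield (qname, [alignments]) for each run of records sharing a query
    # name; assumes same-name records (secondaries included) are adjacent.
    for qname, grp in itertools.groupby(bam_iter, key=lambda r: r.qname):
        yield qname, list(grp)

# for qname, reads in reads_by_qname(bam_in):
#     attach_tags(reads)  # hypothetical: add BC/SI tags, then write out
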
Example #26
def main(args, outs):
    # Load barcode whitelist (None if no whitelist was given)
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
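
One detail in the loop above: itertools.islice(read_pair_iter, args.initial_reads) works whether or not a read cap was given, because a stop value of None means "no limit".

import itertools

assert list(itertools.islice('ABCDE', 2)) == ['A', 'B']        # capped at 2
assert list(itertools.islice('ABCDE', None)) == list('ABCDE')  # None = take everything
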
def main(args, outs):
    """ Attaches barcodes. Attaches raw barcode to RAW_BC tag and filters those to form set of PROCESSES_BARCODES """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.outputs = []

    chunk = args.chunk

    bam_in = tk_bam.create_bam_infile(args.align_chunk)
    bc_spec = "{}:{}".format(RAW_BARCODE_TAG, RAW_BARCODE_QUAL_TAG)

    # only comment the first chunk, otherwise later merge will duplicate the comments and could lead to:
    # samtools merge ... : '[finish_merged_header] Output header text too long'
    if args.chunk_index > 0:
        COs = None
    elif chunk['trim']:
        COs = ['10x_bam_to_fastq:R1({},TR:TQ,SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']
    else:
        COs = ['10x_bam_to_fastq:R1({},SEQ:QUAL)'.format(bc_spec), '10x_bam_to_fastq:R2(SEQ:QUAL)', '10x_bam_to_fastq:I1(BC:QT)']

    bam_out, tids = tk_bam.create_bam_outfile(outs.output, None, None, template=bam_in, pgs=[tk_bam.make_pg_header(martian.get_pipelines_version(), "attach_bcs")], cos = COs)

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or counts then all high quality BC reads get allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = { bc:idx for (idx,bc) in enumerate(sorted(list(barcode_whitelist))) }

    # set random seed to get deterministic subsampling
    random.seed(0)

    def open_maybe_gzip(fn):
        # Choose gzip vs. plain open based on the file extension
        if fn.endswith(".gz"):
            return gzip.open(fn)
        else:
            return open(fn)

    if chunk['barcode']:
        processed_barcode_iter = get_raw_processed_barcodes(open_maybe_gzip(chunk['barcode']), barcode_whitelist, args.bc_confidence_threshold, chunk['gem_group'], chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['trim']:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['trim']), paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    if chunk['sample_index']:
        sample_index_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter, trim_iter)

    # First read (None if the BAM chunk is empty)
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info:
            (bc_read_name, raw_bc_seq, processed_bc_seq, raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]


        if sample_index_info:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info("mismatch: si_read_name: %s, bam_read_name: %s" % (si_read_name, read_name))
                assert(si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

        # Give R2 its own copy so read-specific trim tags don't leak into R1
        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name, trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

        while read.qname == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if read_name is not None:
                assert read.qname == read_name

            if reads_to_attach and (read.query_name != reads_to_attach[0].query_name or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if crdna_io.get_read_barcode(r) is not None:
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if crdna_io.get_read_barcode(r) is not None:
                    bam_out.write(r)
            else:
                bam_out.write(r)

        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert reads_attached >= 2


    outs.perfect_read_count = perfect_read_count
    bam_out.close()
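
random.seed(0) above (and the commented-out subsample_rate logic in the earlier variant of this stage) points at the deterministic-subsampling pattern: with a fixed seed, a per-pair coin flip keeps the same subset on every rerun of the chunk. A minimal sketch, assuming one decision per read pair:

import random

def subsample_pairs(pairs, rate, seed=0):
    # Fixed seed -> identical keep/drop decisions across reruns.
    rng = random.Random(seed)
    for pair in pairs:
        if rng.random() < rate:
            yield pair
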
Example #28
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's sequence
        and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    - Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name, out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence, or None if no consensus could be
        built (e.g. not enough contigs for consensus).
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, \
            open(out_fastq_name, 'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)
        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()

                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq,
                                          quals)

    out_bam.close()
    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from out_bam_name.
    # The resulting sequences will be in out_dir/<clonotype_name>_contigs.fasta. This is the
    # only output of the assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm',
            'asm',
            out_bam_name,
            out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0'
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'),
                  'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the best contig sequence on a new fasta. We need to make sure this has the
    # right contig name because this will be the name written in the bam alignments
    # of the contigs against the consensus
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus that you just got.
    # The output will be in out_dir/<clonotype_name> + '_contigs.bam'
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir, '--single-end'
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (Will overwrite this bam which was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'),
               out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
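
A hypothetical invocation, just to show the shape of the inputs and outputs; the clonotype name, contig names, and scratch directory are placeholders:

# Placeholder arguments; 'args' is the surrounding stage's args object.
(best_seq, best_quals, cons_seq,
 bam_name, fq_name, fa_name) = get_consensus_seq(
    'clonotype1_consensus_1',           # prefixes the temporary files
    ['bc1_contig_1', 'bc2_contig_1'],   # contigs to fold into the consensus
    'bc1_contig_1',                     # its seq/quals come back as best_seq/best_quals
    '/tmp/consensus_work',              # scratch dir for vdj_asm outputs
    args)
if cons_seq is None:
    cons_seq = best_seq  # assembly failed or too few contigs; fall back
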
Example #29
def main(args, outs):
    # Load barcode whitelist (None if no whitelist was given)
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
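
Both barcode-correction stages above lean on cr_fastq.AugmentedFastqHeader to carry tags through a FASTQ read name. Its exact wire format is not shown in these examples, so the toy class below is only a stand-in illustrating the round-trip idea (pack TAG/VALUE pairs after the name, split them back out on read); the '|||' separator is an assumption, not necessarily cr_fastq's.

class TagHeaderSketch(object):
    # Toy stand-in for an augmented FASTQ header.
    SEP = '|||'  # assumed separator

    def __init__(self, header):
        parts = header.split(self.SEP)
        self.fastq_header = parts[0]
        self.tags = dict(zip(parts[1::2], parts[2::2]))

    def get_tag(self, key):
        return self.tags.get(key, '')

    def set_tag(self, key, value):
        self.tags[key] = value

    def to_string(self):
        flat = [self.fastq_header]
        for key, value in sorted(self.tags.items()):
            flat.extend([key, value])
        return self.SEP.join(flat)

# Round-trip: TagHeaderSketch(h.to_string()).get_tag('BC') recovers the tag.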