Beispiel #1
0
def main(args, outs):
    """Trim one chunk of reads with cutadapt and save the trimming metrics.

    The LZ4 suffix is appended to both output read paths, cutadapt's stdout
    is captured in a ``cutadapt_stdout`` file located in the same directory
    as ``outs.chunked_reporter``, and ``outs.read2s`` is reset to ``None``
    when the chunk has no second read file.

    If ``run_cutadapt`` returns a non-zero status, the failure is only logged
    and no reporter is written. Otherwise a ``VdjReporter`` is handed to
    ``get_vdj_trim_metrics`` together with the captured stdout file and then
    saved to ``outs.chunked_reporter``.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Write compressed files
    outs.read1s += cr_constants.LZ4_SUFFIX
    outs.read2s += cr_constants.LZ4_SUFFIX

    cutadapt_out = os.path.join(os.path.dirname(outs.chunked_reporter), 'cutadapt_stdout')
    with open(cutadapt_out, 'w') as cut_stdout:
        status = run_cutadapt(args,
                              outs.read1s, outs.read2s,
                              args.chemistry_def,
                              cut_stdout)

    # None is a singleton, so test identity rather than equality.
    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
        return

    reporter = vdj_report.VdjReporter(primers=cr_utils.get_primers_from_dicts(args.primers))
    get_vdj_trim_metrics(reporter,
                         cutadapt_out,
                         paired_end)
    reporter.save(outs.chunked_reporter)
Beispiel #2
0
def main(args, outs):
    """Merge the input FASTQs by barcode into this chunk's output files.

    Sets ``outs.read1s`` (and ``outs.read2s`` for paired-end chemistries,
    otherwise ``None``) to freshly made LZ4-suffixed paths, opens them along
    with ``outs.chunk_barcodes`` for writing, and passes the handles to
    ``merge_by_barcode``.

    Every handle that was successfully opened is closed before returning,
    including when a later open or the merge itself raises.
    """
    outs.coerce_strings()

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Handles are registered as soon as they are opened so the ``finally``
    # clause can close exactly the ones that exist, in opening order.
    open_handles = []
    try:
        outs.read1s = martian.make_path('reads_1.fastq' + h5_constants.LZ4_SUFFIX)
        r1_fq_out = cr_io.open_maybe_gzip(outs.read1s, 'w')
        open_handles.append(r1_fq_out)

        if paired_end:
            outs.read2s = martian.make_path('reads_2.fastq' +
                                            h5_constants.LZ4_SUFFIX)
            r2_fq_out = cr_io.open_maybe_gzip(outs.read2s, 'w')
            open_handles.append(r2_fq_out)
        else:
            outs.read2s = None
            r2_fq_out = None

        barcodes_out = cr_io.open_maybe_gzip(outs.chunk_barcodes, 'w')
        open_handles.append(barcodes_out)

        merge_by_barcode(args.fastqs, r1_fq_out, r2_fq_out, barcodes_out,
                         paired_end)
    finally:
        for handle in open_handles:
            handle.close()
Beispiel #3
0
def run_assembly(fastq_pref, fasta_pref, args):
    """Assemble the ``vdj_asm asm`` command line from ``args`` and run it.

    ``fastq_pref`` and ``fasta_pref`` are passed through as the two
    positional arguments. The minimum UMI read count comes from
    ``args.min_readpairs_per_umi`` keyed by the stringified GEM group; it is
    doubled for paired-end chemistries, while single-end chemistries get the
    raw value plus ``--single-end``. The command is echoed to stderr and
    executed in the current working directory.
    """
    # (option prefix, value) pairs in the order they appear on the command line.
    tunables = [
        ('--kmers=', args.min_kmer_count),
        ('--min-contig=', args.min_contig_len),
        ('--min-qual=', args.min_qual),
        ('--score-factor=', args.score_factor),
        ('--qual-factor=', args.qual_factor),
        ('--min-sw-score=', args.min_sw_score),
        ('--rt-error=', args.rt_error),
    ]
    command = ['vdj_asm', 'asm', fastq_pref, fasta_pref]
    command.extend([prefix + str(value) for prefix, value in tunables])

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    min_pairs = args.min_readpairs_per_umi[str(args.gem_group)]
    if not cr_chem.is_paired_end(args.chemistry_def):
        command += ['--min-umi-reads=' + str(min_pairs), '--single-end']
    else:
        # The threshold is in read pairs; a pair contributes two reads.
        command.append('--min-umi-reads=' + str(2 * min_pairs))

    if args.use_unmapped:
        command.append('--use-unmapped')

    # The '--mixture-filter' option is currently not passed:
    #command.append('--mixture-filter')

    print >> sys.stderr, 'Running', ' '.join(command)
    tk_subproc.check_call(command, cwd=os.getcwd())
Beispiel #4
0
def split(args):
    """Define one chunk per input FASTQ (pair).

    For paired-end chemistries the read1 and read2 lists must be the same
    length. The lists are zipped with ``izip_longest``, so a missing partner
    shows up as ``None`` in the chunk definition.
    """
    if cr_chem.is_paired_end(args.chemistry_def):
        assert len(args.read1s) == len(args.read2s)

    fastq_pairs = itertools.izip_longest(args.read1s, args.read2s)
    return {
        'chunks': [
            {'read1s_chunk': r1_path, 'read2s_chunk': r2_path}
            for r1_path, r2_path in fastq_pairs
        ],
    }
Beispiel #5
0
def split(args):
    """Define one chunk per input FASTQ (pair), but only if FASTQs are retained.

    Returns an empty chunk list when ``args.retain_fastqs`` is falsy. For
    single-end chemistries ``read2`` is always ``None``.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    if paired_end:
        assert len(args.read1s) == len(args.read2s)

    # Nothing to chunk over when the FASTQs are not being kept.
    if not args.retain_fastqs:
        return {'chunks': []}

    chunk_defs = []
    for r1_path, r2_path in itertools.izip_longest(args.read1s, args.read2s):
        chunk_def = {'read1': r1_path, 'read2': None}
        if paired_end:
            chunk_def['read2'] = r2_path
        chunk_defs.append(chunk_def)

    return {'chunks': chunk_defs}
Beispiel #6
0
def split(args):
    """Define one chunk per (read1, read2, gem_group) triple.

    The GEM group list must match the read1 list in length, and so must the
    read2 list for paired-end chemistries. Shorter inputs are padded with
    ``None`` by ``izip_longest``.
    """
    if cr_chem.is_paired_end(args.chemistry_def):
        assert len(args.read1s) == len(args.read2s)

    assert len(args.gem_groups) == len(args.read1s)

    # Field names, in the same order as the zipped input columns.
    field_names = ('read1_chunk', 'read2_chunk', 'gem_group')
    rows = itertools.izip_longest(args.read1s, args.read2s, args.gem_groups)
    return {'chunks': [dict(zip(field_names, row)) for row in rows]}
Beispiel #7
0
def run_cutadapt(args, out_read1s, out_read2s):
    """Run cutadapt on one chunk of reads and return its exit status.

    Primers named in the module-level adapter lists are turned into cutadapt
    adapter options: an anchored 5' adapter (``-g name=^seq``) and 3'
    adapters (``-a``) for read 1, and for paired-end chemistries 3' adapters
    for read 2 (``-A``). Primers missing from ``args.primers`` are skipped.

    ``subprocess.check_call`` raises ``CalledProcessError`` on a non-zero
    exit, so the returned status is 0 whenever this function returns.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    command = ['cutadapt', '--minimum-length', '50', '--times', '3',
               '--overlap', '5']
    command += ['-o', out_read1s]
    if paired_end:
        command += ['-p', out_read2s]

    primer_seqs = dict((anno['name'], anno['seq']) for anno in args.primers)

    def add_adapters(option, names, spec, rev_comp=False):
        # Append one adapter option per listed primer that is actually present.
        for primer_name in names:
            if primer_name not in primer_seqs:
                continue
            seq = primer_seqs[primer_name]
            if rev_comp:
                seq = tk_seq.get_rev_comp(seq)
            command.extend([option, spec % (primer_name, seq)])

    # Read 1 adapters.
    add_adapters('-g', R1_ANCHORED_FIVE_PRIME_SEQS, '%s=^%s')
    add_adapters('-a', R1_THREE_PRIME_REV_COMP_SEQS, '%s_rc=%s', rev_comp=True)
    add_adapters('-a', R1_THREE_PRIME_SEQS, '%s=%s')

    # Read 2 adapters only apply when there is a second read.
    if paired_end:
        add_adapters('-A', R2_THREE_PRIME_REV_COMP_SEQS, '%s_rc=%s',
                     rev_comp=True)
        add_adapters('-A', R2_THREE_PRIME_SEQS, '%s=%s')

    # Input files go last on the command line.
    command.append(args.read1s_chunk)
    if paired_end:
        command.append(args.read2s_chunk)

    return subprocess.check_call(command)
Beispiel #8
0
def split(args):
    """Define one chunk per (read1, read2, gem_group, library_type) tuple.

    The GEM group list must match the read1 list in length, and so must the
    read2 list for paired-end chemistries. Shorter inputs are padded with
    ``None`` by ``izip_longest``. The join step requests 2 GB of memory.
    """
    if cr_chem.is_paired_end(args.chemistry_def):
        assert len(args.read1s) == len(args.read2s)

    assert len(args.gem_groups) == len(args.read1s)

    per_chunk_inputs = itertools.izip_longest(
        args.read1s, args.read2s, args.gem_groups, args.library_types)
    chunk_defs = [
        {
            'read1_chunk': r1_path,
            'read2_chunk': r2_path,
            'gem_group': group,
            'library_type': library_type,
        }
        for r1_path, r2_path, group, library_type in per_chunk_inputs
    ]
    return {'join': {'__mem_gb': 2}, 'chunks': chunk_defs}
Beispiel #9
0
def split(args):
    """Define one chunk per (read1, read2, tags) triple.

    The tag FASTQ is always carried into the chunk. The read FASTQs are only
    carried when ``args.retain_fastqs`` is set, and read 2 additionally
    requires a paired-end chemistry; otherwise those fields are ``None``.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    if paired_end:
        assert len(args.read1s) == len(args.read2s)
    assert len(args.tags) == len(args.read1s)

    chunk_defs = []
    zipped_inputs = itertools.izip_longest(args.read1s, args.read2s, args.tags)
    for r1_path, r2_path, tag_path in zipped_inputs:
        kept_read1 = None
        kept_read2 = None
        if args.retain_fastqs:
            kept_read1 = r1_path
            if paired_end:
                kept_read2 = r2_path

        chunk_defs.append({
            'read1': kept_read1,
            'read2': kept_read2,
            'chunk_tags': tag_path,
        })

    return {'chunks': chunk_defs}
Beispiel #10
0
def split(args):
    """Define one chunk per (read1, read2, corrected-barcodes) triple.

    Each chunk also receives a shared ``chunks_per_gem_group`` mapping. For
    every GEM group it holds the number of buckets needed to stay within
    ``args.readpairs_per_chunk`` read pairs, computed from the library's
    total read pairs in the reads summary JSON, with a floor of 2. Each
    chunk requests 6 GB of memory and the join step 2 GB.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    if paired_end:
        assert len(args.read1s) == len(args.read2s)

    assert len(args.corrected_bcs) == len(args.read1s)

    # Bucket count per GEM group, keyed by the stringified GEM group.
    chunks_per_gem_group = {}
    with open(args.reads_summary) as summary_file:
        reads_summary = json.load(summary_file)
        for gem_group in args.gem_groups:
            # Exactly one library is expected to carry this GEM group.
            matching_ids = [
                lib['library_id'] for lib in args.library_info
                if lib['gem_group'] == gem_group
            ]
            assert len(matching_ids) == 1

            metric_prefix = rna_library.get_library_type_metric_prefix(
                lib_constants.VDJ_LIBRARY_TYPE)
            metric_name = '%s%s_total_read_pairs_per_library' % (
                metric_prefix, matching_ids[0])
            readpairs = reads_summary[metric_name]

            buckets_needed = int(
                math.ceil(float(readpairs) / args.readpairs_per_chunk))
            chunks_per_gem_group[str(gem_group)] = max(2, buckets_needed)

    zipped_inputs = itertools.izip_longest(args.read1s, args.read2s,
                                           args.corrected_bcs)
    chunk_defs = [
        {
            'read1s_chunk': r1_path,
            'read2s_chunk': r2_path if paired_end else None,
            'bcs': bc_path,
            'chunks_per_gem_group': chunks_per_gem_group,
            '__mem_gb': 6,
        }
        for r1_path, r2_path, bc_path in zipped_inputs
    ]
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 2}}
Beispiel #11
0
def main(args, outs):
    """Filter one chunk of reads against the V(D)J reference and correct UMIs.

    Steps, in order:
      1. ``vdj_filt.run_read_match`` writes a temporary BAM (``tmp.bam``).
      2. That BAM is opened twice and walked barcode by barcode in lock-step:
         the first pass counts raw UMIs, the second pass hands the corrected
         UMIs to ``process_bam_barcode`` together with the output BAM.
      3. A ``barcode<TAB>nreads`` line per barcode is written to
         ``outs.reads_per_bc``.
      4. The per-barcode gene/UMI counts are pickled to
         ``outs.chunked_gene_umi_counts``, the input barcode file is copied
         (or the output set to ``None``), and the reporter is saved.

    The paired-end flag of the chemistry must agree with whether
    ``args.read2_chunk`` is provided.
    """
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    assert paired_end != (args.read2_chunk is None)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    # The context manager guarantees the per-barcode read counts are flushed
    # and the handle is released once the loop is done.
    with open(outs.reads_per_bc, 'w') as reads_per_bc:
        out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                               None,
                                               None,
                                               template=bam1)

        for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1,
                                                                bc_iter2):
            nreads = 0

            # Pass 1: UMI correction
            umi_counts = defaultdict(int)
            for header, (_read1, _read2) in pair_iter1:
                # NOTE(review): two reads are counted per record even when
                # the chemistry is single-end -- confirm this is intended.
                nreads += 2
                umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

            corrected_umis = correct_umis(umi_counts)

            # Pass 2: Write the UMI-corrected records
            process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                                gene_umi_counts_per_bc, strand, out_bam,
                                paired_end)

            reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Write bc-gene-umi counts
    with open(outs.chunked_gene_umi_counts, 'w') as counts_file:
        cPickle.dump(gene_umi_counts_per_bc, counts_file)

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
Beispiel #12
0
def main(args, outs):
    """Extract RNA reads, barcodes, sample indices and UMIs for one chunk.

    Reads the input FASTQ chunk through five lock-stepped ``FastqReader``
    instances (RNA read 1, RNA read 2, barcode, sample index, UMI), optionally
    subsamples, and for every retained read:

      * counts the (possibly reverse-complemented) barcode,
      * feeds the raw sequences to the reporter,
      * writes the RNA read(s) to chunked FASTQ output with the barcode,
        sample-index and UMI sequences/qualities attached as header tags,
      * writes the concatenated trimmed sequence to a chunked BAM file.

    On completion the stage outputs (``outs.reads``, ``outs.read2s``,
    ``outs.gem_groups``, ``outs.read_groups``, ``outs.trimmed_seqs``,
    ``outs.bam_comments``) are set and the reporter is saved to
    ``outs.chunked_reporter``.
    """
    # Fixed seed so the subsampling decisions below are reproducible.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    # read_defs and read_tags are parallel lists; the RNA reads carry no tags.
    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    # De-duplicated, sorted list of the bam_to_fastq entries of all trim defs.
    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    # A dedicated reader is used (and closed) so the main barcode reader below
    # starts from the beginning of the input.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    # The first reader to claim a read type gets the trim defs; later readers
    # of the same read type keep None. The set starts empty, so the RNA read 1
    # reader always claims its read type.
    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # Placeholder reader with no input and no read def; presumably it
        # yields nothing, which izip_longest below pads with None.
        umi_reads = FastqReader(None, None, False, None)

    # Order must match the unpacking of `extractions` in the main loop.
    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    # NOTE(review): the indices returned by argsort refer to the *filtered*
    # list (readers whose read_def is not None) but are later used to index
    # `extractions`, which is unfiltered. These agree only if the readers
    # without a read_def come last in fastq_readers -- confirm.
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Advance all readers in lock-step; exhausted readers contribute None.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    # Stand-in (name, sequence, quality) triple for a missing extraction.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # Only the first args.initial_reads records are considered (islice treats
    # None as "no limit").
    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        # (truncates both sequence and quality of RNA read 1 to the same length)
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            # (the quality string is reversed to stay aligned with the bases)
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              args.gem_group,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        # The query name is the first word of the augmented header string.
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        # 4 == 0x4, the SAM "segment unmapped" flag bit.
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            # Read 2 gets the same tag values as read 1.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    # NOTE(review): rna_read2s is constructed unconditionally above but only
    # closed for paired-end chemistries -- confirm this is intentional.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    trimmed_seq_writer.close()

    # Set stage output parameters.
    # The per-file lists (gem groups, read groups) are sized to match the
    # number of read 1 output files.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    # Sanity checks: all per-file output lists must line up with outs.reads.
    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
Beispiel #13
0
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def):
    """Run cutadapt on one chunk of reads and return its exit status.

    Input and output files are opened in this process with
    ``cr_utils.open_maybe_gzip`` and handed to cutadapt as
    ``/proc/<pid>/fd/<fd>`` paths rather than by file name.

    Adapter options depend on the chemistry:
      * paired-end: read 1 adapters use ``-g``/``-a``, read 2 adapters ``-A``;
      * single-end on R1: only the read 1 adapters (``-g``/``-a``);
      * single-end on R2: only the read 2 adapters, passed with ``-a``
        because the single input file is the read being trimmed.
    Primers that are not present in ``args.primers`` are skipped.

    Every handle opened here is closed before returning (also on error), so
    the outputs are finalised once cutadapt has exited.

    Returns whatever ``tk_subproc.check_call`` returns for the command.
    """
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    def fd_path(handle):
        # Path through which a child process can reach our open descriptor.
        return '/proc/%d/fd/%d' % (os.getpid(), handle.fileno())

    # Handles are registered as soon as they are opened so the ``finally``
    # clause closes exactly the ones that exist.
    open_handles = []
    try:
        out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')
        open_handles.append(out_r1_file)

        # Note: The complexity of forcing cutadapt to output a compressed file
        #       means we'll have to give up on that for now.
        cmd = [
            'cutadapt', '-e', '0.12', '--times', '3', '--overlap', '5', '-f',
            'fastq', '-o',
            fd_path(out_r1_file)
        ]

        if paired_end:
            out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
            open_handles.append(out_r2_file)
            cmd.extend(['-p', fd_path(out_r2_file)])

        primers = {anno['name']: anno['seq'] for anno in args.primers}

        if paired_end or single_read == 'R1':
            # R1 adapters
            for name in R1_ANCHORED_FIVE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

            for name in R1_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([
                        '-a',
                        '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                    ])

            for name in R1_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s=%s' % (name, primers[name])])

        if paired_end or single_read == 'R2':
            # R2 adapters: '-A' targets the second read of a pair; for a
            # single-end R2 run the read is the only input, hence '-a'.
            r2_flag = '-A' if paired_end else '-a'

            for name in R2_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([
                        r2_flag,
                        '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))
                    ])

            for name in R2_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend([r2_flag, '%s=%s' % (name, primers[name])])

        read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
        open_handles.append(read1_file)
        cmd.append(fd_path(read1_file))

        if paired_end:
            read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
            open_handles.append(read2_file)
            cmd.append(fd_path(read2_file))

        # Single-argument call form: prints the same text under the Python 2
        # print statement and the print function.
        print(cmd)

        return tk_subproc.check_call(cmd)
    finally:
        for handle in open_handles:
            handle.close()
Beispiel #14
0
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
Beispiel #15
0
def run_cutadapt(args, out_read1s, out_read2s, chemistry_def, stdout=sys.stdout):
    """Run cutadapt on one chunk of reads to trim the configured primers.

    The input and output files are opened in this process with
    ``cr_utils.open_maybe_gzip`` and handed to cutadapt as
    ``/proc/<pid>/fd/<fd>`` paths. Every file opened here is closed before
    returning, including when building or running the command raises.

    Args:
        args: Stage arguments. Reads ``args.primers`` (dicts with ``name``
            and ``seq`` keys), ``args.read1s_chunk`` and, for paired-end
            chemistries, ``args.read2s_chunk``.
        out_read1s: Path of the trimmed read-1 output file.
        out_read2s: Path of the trimmed read-2 output file; only opened when
            the chemistry is paired-end.
        chemistry_def: Chemistry definition used to decide whether the data
            is paired-end and, if not, which read (R1 or R2) is present.
        stdout: Passed through as ``stdout`` to ``tk_subproc.check_call``.

    Returns:
        The value returned by ``tk_subproc.check_call`` for the cutadapt
        command.
    """
    paired_end = cr_chem.is_paired_end(chemistry_def)

    # If single end, determine which read the single read is (R1 or R2)
    if paired_end:
        single_read = None
    else:
        single_read = cr_chem.get_rna_read_def(chemistry_def).read_type
        assert single_read in ('R1', 'R2')

    pid = os.getpid()

    def fd_path(handle):
        # Address a file this process already has open via /proc.
        return '/proc/%d/fd/%d' % (pid, handle.fileno())

    # Every handle opened below is recorded here. Holding the references
    # keeps the garbage collector from finalizing the files before cutadapt
    # runs, and lets the finally-clause close them on every exit path.
    opened = []
    try:
        out_r1_file = cr_utils.open_maybe_gzip(out_read1s, 'w')
        opened.append(out_r1_file)

        # Note: The complexity of forcing cutadapt to output a compressed file
        #       means we'll have to give up on that for now.
        cmd = ['cutadapt',
               '-e', '0.12', '--times', '3', '--overlap', '5',
               '-f', 'fastq',
               '-o', fd_path(out_r1_file)]

        if paired_end:
            out_r2_file = cr_utils.open_maybe_gzip(out_read2s, 'w')
            opened.append(out_r2_file)
            cmd.extend(['-p', fd_path(out_r2_file)])

        primers = {anno['name']: anno['seq'] for anno in args.primers}

        if paired_end or single_read == 'R1':
            # R1 adapters
            for name in R1_ANCHORED_FIVE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-g', '%s=^%s' % (name, primers[name])])

            for name in R1_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

            for name in R1_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend(['-a', '%s=%s' % (name, primers[name])])

        if paired_end or single_read == 'R2':
            # R2 adapters: '-A' when a paired-end run is trimming its second
            # read, '-a' when R2 is the only read given to cutadapt.
            r2_flag = '-A' if paired_end else '-a'

            for name in R2_THREE_PRIME_REV_COMP_SEQS:
                if name in primers:
                    cmd.extend([r2_flag, '%s_rc=%s' % (name, tk_seq.get_rev_comp(primers[name]))])

            for name in R2_THREE_PRIME_SEQS:
                if name in primers:
                    cmd.extend([r2_flag, '%s=%s' % (name, primers[name])])

        read1_file = cr_utils.open_maybe_gzip(args.read1s_chunk)
        opened.append(read1_file)
        cmd.extend([fd_path(read1_file)])

        if paired_end:
            read2_file = cr_utils.open_maybe_gzip(args.read2s_chunk)
            opened.append(read2_file)
            cmd.extend([fd_path(read2_file)])

        print(cmd)

        return tk_subproc.check_call(cmd, stdout=stdout)
    finally:
        # closing these files is important both because we need to wait on
        # the subprocess, if any, or else its rusage isn't accounted for for
        # this process, and because a failure above must not leave the
        # handles open.
        for handle in opened:
            handle.close()
Beispiel #16
0
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
             ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()


    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)