def construct_chunks(filename_lists, sample_id, gem_group, library_id,
                     reads_interleaved, chemistry, library_type,
                     subsample_rate):
    """Build one chunk definition per set of parallel FASTQ files.

    Args:
        filename_lists (dict<str, list>): Maps each read type to a list of
            FASTQ filenames. Entry ``i`` of every list belongs to chunk
            ``i``; an entry may be None. The number of chunks is the length
            of the first list in the mapping, so all lists are assumed to
            have the same length.
        sample_id: Sample identifier embedded in the read group string.
        gem_group: Stored on every chunk; stringified for the read group.
        library_id: Stored on every chunk and embedded in the read group.
        reads_interleaved: Stored verbatim on every chunk.
        chemistry: Stored verbatim on every chunk.
        library_type: Stored verbatim on every chunk.
        subsample_rate: Stored verbatim on every chunk.

    Returns:
        list of dict: One dict per chunk with the keys 'gem_group',
        'library_type', 'library_id', 'reads_interleaved', 'read_chunks',
        'chemistry', 'subsample_rate' and 'read_group'. Empty when
        ``filename_lists`` is empty or its lists are empty.

    Raises:
        KeyError: If ``filename_lists`` lacks one of the keys of
            ``cr_constants.FASTQ_READ_TYPES``.
        IndexError: If a chunk has no non-None FASTQ for any read type.
    """
    # Nothing to chunk: avoid indexing into an empty mapping.
    if not filename_lists:
        return []

    # dict.values() is a list on Python 2 but a non-indexable view on
    # Python 3; taking the first element through an iterator works on both.
    num_chunks = len(next(iter(filename_lists.values())))

    chunks = []
    for chunk_idx in range(num_chunks):
        read_chunks = {}
        for read_type in cr_constants.FASTQ_READ_TYPES.keys():
            read_chunks[read_type] = filename_lists[read_type][chunk_idx]

        # Build read group (@RG) string.
        # Infer flowcell and lane from the first FASTQ that is present.
        present_fastqs = [fq for fq in read_chunks.values() if fq is not None]
        if not present_fastqs:
            raise IndexError(
                "chunk %d has no FASTQ file for any read type" % chunk_idx)
        flowcell, lane = tk_fasta.get_run_data(present_fastqs[0])
        rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                          str(gem_group), flowcell, lane)

        chunks.append({
            'gem_group': gem_group,
            'library_type': library_type,
            'library_id': library_id,
            'reads_interleaved': reads_interleaved,
            'read_chunks': read_chunks,
            'chemistry': chemistry,
            'subsample_rate': subsample_rate,
            'read_group': rg_string,
        })

    return chunks
def construct_chunks(filename_lists, sample_id, sample_def, reads_interleaved,
                     chemistry):
    """Build one chunk definition per set of parallel FASTQ files.

    Args:
        filename_lists (dict<str, list>): Maps each read type to a list of
            FASTQ filenames. Entry ``i`` of every list belongs to chunk
            ``i``; an entry may be None. The number of chunks is the length
            of the first list in the mapping, so all lists are assumed to
            have the same length.
        sample_id: Sample identifier embedded in the read group string.
        sample_def (dict): Must contain 'gem_group'; 'library_id' is
            optional and defaults to 'MissingLibrary'.
        reads_interleaved: Stored verbatim on every chunk.
        chemistry: Stored verbatim on every chunk.

    Returns:
        list of dict: One dict per chunk with the keys 'gem_group',
        'reads_interleaved', 'read_chunks', 'chemistry' and 'read_group'.
        Empty when ``filename_lists`` is empty or its lists are empty.

    Raises:
        KeyError: If ``sample_def`` has no 'gem_group' (and there is at
            least one chunk), or ``filename_lists`` lacks one of the keys of
            ``cr_constants.FASTQ_READ_TYPES``.
        IndexError: If a chunk has no non-None FASTQ for any read type.
    """
    # Nothing to chunk: avoid indexing into an empty mapping.
    if not filename_lists:
        return []

    # dict.values() is a list on Python 2 but a non-indexable view on
    # Python 3; taking the first element through an iterator works on both.
    num_chunks = len(next(iter(filename_lists.values())))
    if num_chunks == 0:
        return []

    # These do not depend on the chunk, so look them up once.
    library_id = sample_def.get('library_id', 'MissingLibrary')
    # A falsy gem_group (None or 0) is labelled "1" in the read group only;
    # the chunk itself keeps the value exactly as given in sample_def.
    rg_gem_group = str(sample_def['gem_group'] or 1)

    chunks = []
    for chunk_idx in range(num_chunks):
        read_chunks = {}
        for read_type in cr_constants.FASTQ_READ_TYPES.keys():
            read_chunks[read_type] = filename_lists[read_type][chunk_idx]

        # Build read group (@RG) string.
        # Infer flowcell and lane from the first FASTQ that is present.
        present_fastqs = [fq for fq in read_chunks.values() if fq is not None]
        if not present_fastqs:
            raise IndexError(
                "chunk %d has no FASTQ file for any read type" % chunk_idx)
        flowcell, lane = tk_fasta.get_run_data(present_fastqs[0])
        rg_string = tk_bam.pack_rg_string(sample_id, library_id, rg_gem_group,
                                          flowcell, lane)

        chunks.append({
            'gem_group': sample_def['gem_group'],
            'reads_interleaved': reads_interleaved,
            'read_chunks': read_chunks,
            'chemistry': chemistry,
            'read_group': rg_string,
        })

    return chunks
def main(args, outs):
    """Locate input FASTQs, estimate their size and set up read chunks.

    For every entry of ``args.sample_def`` this finds the matching FASTQ
    files (layout depends on ``args.input_mode``), estimates the number of
    sequenced bases, and emits one chunk dict per set of files. It then
    derives a read subsampling rate from ``args.downsample`` and applies it
    to every chunk.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    Args:
        args: Stage arguments. Reads ``sample_def`` (list of dict),
            ``input_mode`` ("BCL_PROCESSOR" or "ILMN_BCL2FASTQ"),
            ``sample_id``, ``downsample`` (dict or None),
            ``downsample_overage`` and ``barcode_whitelist``.
            Entries of ``sample_def`` whose 'gem_group' is null are updated
            in place to 1 when all entries are null.
        outs: Stage outputs. Sets ``requested_read_pairs``, ``chunks``,
            ``read_groups`` and ``barcode_whitelist_path``, and writes a
            JSON summary to the path in ``outs.downsample_info``.

    Exits through ``martian.exit`` / ``martian.throw`` on invalid
    ``sample_def`` entries, an unknown input mode, mismatched read files or
    when no FASTQ files are found.
    """

    def check_key(n, dict_in, name, tys):
        """Exit unless entry ``n`` has field ``name`` with a type in ``tys``."""
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" %
                         (n, name))

        # Exact type match on purpose (e.g. bool is not accepted as int).
        if not (type(dict_in[name]) in tys):
            martian.exit(
                "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                % (n, name, str(tys), type(dict_in[name])))

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    # Validate fields before anything indexes into the entries, so that a
    # missing field is reported by check_key instead of a raw KeyError.
    for (idx, sample_item) in enumerate(args.sample_def):
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])

    if not (all_null or all_int):
        martian.exit(
            "Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer"
        )

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0

    # Read length assumed when no FASTQ is available to estimate it from.
    default_read_length = 100

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # Each sample_def entry can have a separate pre-applied downsampling
        # rate. We adjust the estimated data in that chunk to account for
        # this subsampling.
        chunk_subsample_rate = read_chunk.get('subsample_rate', 1.0)

        bc_in_read = {}
        if read_chunk.get('bc_in_read') is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library', 'MissingLibrary')

        # TODO confirm that this works with cellranger
        # By default I1 holds the sample index and I2 the barcode; an entry
        # with barcode_read == 'I1' swaps the two.
        si_read, bc_read = ("I1", "I2")
        if read_chunk.get('barcode_read') == 'I1':
            si_read, bc_read = ("I2", "I1")

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads
        # and labels FASTQs by sample index; whereas ILMN_BCL2FASTQ uses
        # R1/R2 and labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            read_length = default_read_length  # Should be overwritten below
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = 2 * read_length

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are
                # same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                # NOTE(review): zip() silently truncates to the shortest
                # list; unlike the ILMN_BCL2FASTQ branch there is no length
                # check here -- confirm that is intended.
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            # NOTE(review): validation above accepts sample_names == None,
            # but the loops below require an iterable -- confirm how a null
            # value is meant to be handled.
            sample_names = read_chunk['sample_names']

            sample_seq_bases = 0
            # Start from the same default as the BCL_PROCESSOR branch so an
            # entry without R1/R2 files reaches the explicit error messages
            # below instead of failing on an unbound variable.
            read_length1 = default_read_length
            read_length2 = default_read_length
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length1 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length2 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = read_length1 + read_length2

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are
                # same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate
                # barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of
                # Nones, but just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit(
                        "Read1, Read2, and Barcode files are mismatched. Exiting pipline"
                    )

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" %
                     (float(total_seq_bases) / 1e9))

    if len(chunks) == 0:
        martian.exit(
            "No input FASTQs were found for the requested parameters.")

    #
    # Downsampling setup
    #

    # The total available input raw gigabases of input data (est_gb) is
    # estimated above. bp_per_read_pair is the value computed for the last
    # sample_def entry processed.
    est_gb = float(total_seq_bases) / 1e9

    downsample = args.downsample if args.downsample is not None else {}

    # Possible BC subsampling -- try to get the requested amount of data
    # _after_ bc subsampling
    est_gb_post_bc = est_gb * downsample.get("bc_subsample_rate", 1.0)

    # Aim high to ensure that we won't be left with too few reads
    # if the rest of pipeline can trim this down for us.
    fudge_factor = args.downsample_overage

    downsample_succeeded = True

    # NOTE(review): in the two branches below downsample_succeeded is True
    # when MORE data is requested than is estimated to be available, which
    # reads as inverted for a flag with this name. Left unchanged because
    # the consumers of downsample_info are not visible here -- confirm.
    if "gigabases" in downsample:
        read_sample_rate = min(
            1.0, fudge_factor * downsample['gigabases'] / est_gb_post_bc)
        requested_read_pairs = int(1e9 * downsample['gigabases'] /
                                   bp_per_read_pair)
        downsample_succeeded = downsample['gigabases'] > est_gb_post_bc

    elif "target_reads" in downsample:
        requested_read_pairs = int(downsample['target_reads'] / 2)
        est_read_pair_post_bc = 1e9 * est_gb_post_bc / bp_per_read_pair
        read_sample_rate = min(
            1.0, fudge_factor * requested_read_pairs / est_read_pair_post_bc)
        downsample_succeeded = requested_read_pairs > est_read_pair_post_bc

    elif "subsample_rate" in downsample:
        read_sample_rate = min(
            1.0, downsample["subsample_rate"] /
            downsample.get("bc_subsample_rate", 1.0))
        requested_read_pairs = None
    else:
        read_sample_rate = 1.0
        requested_read_pairs = None

    martian.log_info("Downsampling request: %s" % str(downsample))
    martian.log_info("Base pairs per read pair: %s" % bp_per_read_pair)
    martian.log_info(
        "Estimated Input: %.2f GB, Initial Downsample Rate: %.3f. Requested total reads: %s"
        % (est_gb, read_sample_rate, str(requested_read_pairs)))

    # Copy over the per-chunk subsample rates. read_sample_rate is always
    # set by one of the branches above, so no None check is needed.
    for chunk in chunks:
        chunk['subsample_rate'] = chunk.get('subsample_rate',
                                            1.0) * read_sample_rate
        if "bc_subsample_rate" in downsample:
            chunk["bc_subsample_rate"] = downsample["bc_subsample_rate"]

    outs.requested_read_pairs = requested_read_pairs

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    if requested_read_pairs is not None:
        post_downsample_gb = float(
            requested_read_pairs * bp_per_read_pair) / 1e9
    else:
        post_downsample_gb = None

    downsample_info = {
        'available_gb': est_gb,
        'requested_gb': downsample.get('gigabases', None),
        'requested_rate': read_sample_rate,
        'post_downsample_gb': post_downsample_gb,
        'downsample_succeeded': downsample_succeeded,
    }

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)

    # Give out full path to BC whitelist
    if args.barcode_whitelist:
        outs.barcode_whitelist_path = (BARCODE_LOCATION + "/" +
                                       args.barcode_whitelist + ".txt")
    else:
        outs.barcode_whitelist_path = None