def split(args):
    """Build the chunk definitions for this stage.

    When the chemistry has a barcode whitelist, one chunk is emitted per
    (read1, read2, barcodes JSON) triple taken in lockstep from
    ``args.read1s``, ``args.read2s`` and ``args.chunk_barcodes``; the JSON
    file is loaded and its contents are passed along as ``barcodes_chunk``.

    Otherwise every input FASTQ pair is concatenated into a single pair of
    output files (``chunk0_1.fastq`` / ``chunk0_2.fastq``) and one chunk with
    ``barcodes_chunk == [""]`` is emitted.

    Returns:
        ``{'chunks': [...]}``, or the result of ``get_dummy_chunk()`` when no
        chunk was produced.

    Raises:
        AssertionError: if ``args.read1s`` or ``args.read2s`` is None.
        StopIteration: if a read2 file yields fewer records than its read1
            file (propagated from ``next``).
    """
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is not None:
        # Data are barcoded
        for read1_fq, read2_fq, barcodes_json in zip(
                args.read1s, args.read2s, args.chunk_barcodes):
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)

            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })
    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk, otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')

        with open(read1_out_filename, 'w') as read1_out, \
                open(read2_out_filename, 'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(
                        in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(
                        in2, paired_end=False)

                    # read1 drives the loop; records left over in read2 after
                    # read1 is exhausted are not copied.
                    for read1_tuple in fastq1_iter:
                        # Builtin next() works on both Python 2.6+ and 3,
                        # unlike the Python 2-only iterator.next() method.
                        read2_tuple = next(fastq2_iter)
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists so create a chunk w/ empty data
    if not chunks:
        return get_dummy_chunk()

    return {'chunks': chunks}
def main(args, outs):
    """Validate the sample definitions and assemble the FASTQ chunk list.

    Checks the gem groups of ``args.sample_def``, resolves the chemistry
    (custom definition or lookup by name), builds chunks for every sample
    definition via ``setup_chunks``, runs the FASTQ and chemistry checks on
    the combined chunk list, and finally writes ``outs.chunks``,
    ``outs.chemistry_def`` and ``outs.barcode_whitelist``.

    ``martian.exit`` is called with an explanatory message when the gem
    group check fails, when no chemistry name is available, or when no
    chunks were found.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    if args.chemistry_name is None:
        martian.exit(
            "The chemistry was unable to be automatically determined. "
            "This can happen if not enough reads originate from the given "
            "reference. Please verify your choice of reference or explicitly "
            "specify the chemistry via the --chemistry argument."
        )

    # A custom chemistry carries its own definition; anything else is looked
    # up by name.
    uses_custom_chemistry = (
        args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME)
    chemistry = (args.custom_chemistry_def if uses_custom_chemistry
                 else cr_chem.get_chemistry(args.chemistry_name))

    # Collect the chunks of every sample definition.
    outs.chunks = []
    for sdef in args.sample_def:
        spec = cr_fastq.FastqSpec.from_sample_def(sdef)
        group = sdef['gem_group']
        lib_id = sdef.get('library_id', 'MissingLibrary')

        sample_chunks = setup_chunks(args.sample_id, spec, group, lib_id,
                                     chemistry)

        if not sample_chunks:
            # This sample definition matched no FASTQs.
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    if not outs.chunks:
        # Nothing was found for any sample definition.
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    # Validate the FASTQ files, then the chemistry specifications.
    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # The chemistry of the first chunk is reported as the stage output,
    # together with its barcode whitelist.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk outputs into the stage outputs.

    Merges the chunk reporters, gathers the per-chunk file lists (FASTQs
    when ``args.output_fastqs`` is set, BAMs otherwise), records which
    barcodes went into which chunk, writes UMI info for barcoded data, and
    writes the summary JSON to ``outs.summary``.
    """
    outs.chunked_reporter = None
    merged_reporter = cr_report.merge_reporters(
        [co.chunked_reporter for co in chunk_outs])

    outs.reads_per_bc = [co.reads_per_bc for co in chunk_outs]

    # Exactly one of the FASTQ / BAM output families is populated; the other
    # is left as empty lists.
    if not args.output_fastqs:
        outs.barcode_chunked_read1 = []
        outs.barcode_chunked_read2 = []
        outs.barcode_chunked_bams = [
            co.barcode_chunked_bams for co in chunk_outs
        ]
    else:
        outs.barcode_chunked_read1 = [
            co.barcode_chunked_read1 for co in chunk_outs
        ]
        outs.barcode_chunked_read2 = [
            co.barcode_chunked_read2 for co in chunk_outs
        ]
        outs.barcode_chunked_bams = []

    # Barcodes assigned to each chunk, in chunk order.
    barcodes_by_chunk = [cd.barcodes_chunk for cd in chunk_defs]
    outs.barcodes_in_chunks = barcodes_by_chunk

    # A lone chunk whose first barcode is the empty string means there is no
    # barcode information; report null instead.
    if len(barcodes_by_chunk) == 1 and barcodes_by_chunk[0][0] == '':
        outs.barcodes_in_chunks = None

    # UMI info is only written for barcoded data.
    whitelist = cr_chem.get_barcode_whitelist(args.chemistry_def)
    if whitelist is not None:
        umi_count_files = [co.chunked_gene_umi_counts for co in chunk_outs]
        write_umi_info(umi_count_files, outs.umi_info)

    merged_reporter.store_reference_metadata(
        args.vdj_reference_path,
        vdj_constants.REFERENCE_TYPE,
        vdj_constants.REFERENCE_METRIC_PREFIX)

    # Emit the summary JSON.
    merged_reporter.report_summary_json(outs.summary)
def split(args):
    """Emit one chunk per (read1, read2, barcodes) triple.

    The inputs ``args.read1s``, ``args.read2s`` and ``args.chunk_barcodes``
    are walked in lockstep. Each chunk requests 3 GB of memory and passes
    the barcodes entry through unchanged as ``barcodes_chunk``.

    Returns:
        ``{'chunks': [...]}``, or the result of ``get_dummy_chunk()`` when
        there are no inputs.

    Raises:
        AssertionError: if either read list is None, or if the chemistry has
            no barcode whitelist.
    """
    assert args.read1s is not None and args.read2s is not None

    # This variant only handles barcoded data.
    assert cr_chem.get_barcode_whitelist(args.chemistry_def) is not None

    chunk_list = [
        {
            'read1_chunk': r1,
            'read2_chunk': r2,
            'barcodes_chunk': barcodes,
            '__mem_gb': 3,
        }
        for r1, r2, barcodes in zip(args.read1s, args.read2s,
                                    args.chunk_barcodes)
    ]

    # Martian rejects an empty chunk list, so substitute a placeholder chunk.
    if not chunk_list:
        return get_dummy_chunk()
    return {'chunks': chunk_list}
def main(args, outs):
    """Build and validate FASTQ chunks for each sample definition.

    Each sample definition is dispatched on its ``fastq_mode`` to
    ``main_bcl_processor`` or ``main_ilmn_bcl2fastq``; any other mode is
    reported through ``martian.throw``. The combined chunk list is checked
    and then ``outs.chunks``, ``outs.chemistry_def`` and
    ``outs.barcode_whitelist`` are written.

    ``martian.exit`` is called when the gem group check fails or when no
    chunks were found.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    outs.chunks = []

    for sdef in args.sample_def:
        mode = sdef['fastq_mode']

        # Pick the chunk builder that matches this FASTQ layout.
        build_chunks = None
        if mode == tk_constants.BCL_PROCESSOR_FASTQ_MODE:
            build_chunks = main_bcl_processor
        elif mode == tk_constants.ILMN_BCL2FASTQ_FASTQ_MODE:
            build_chunks = main_ilmn_bcl2fastq
        else:
            martian.throw("Unrecognized fastq_mode: %s" % mode)

        if build_chunks is None:
            sample_chunks = []
        else:
            sample_chunks = build_chunks(args.sample_id, sdef,
                                         args.chemistry_name,
                                         args.custom_chemistry_def)

        if not sample_chunks:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    if not outs.chunks:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Report the chemistry of the first chunk and its barcode whitelist.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
def main(args, outs):
    """Validate the sample definitions and assemble chunks plus library info.

    Checks the gem groups, resolves the chemistry (custom definition or
    lookup by name), assigns a library id to every sample definition, builds
    chunks through ``setup_chunks``, runs the FASTQ and chemistry checks, and
    writes ``outs.chunks``, ``outs.chemistry_def``,
    ``outs.barcode_whitelist`` and ``outs.library_info``.

    ``martian.exit`` is called with an explanatory message when the gem
    group check fails, when no chemistry name is available, or when no
    chunks were found.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    if args.chemistry_name is None:
        martian.exit(
            "The chemistry was unable to be automatically determined. "
            "This can happen if not enough reads originate from the given "
            "reference. Please verify your choice of reference or explicitly "
            "specify the chemistry via the --chemistry argument."
        )

    # A custom chemistry carries its own definition; anything else is looked
    # up by name.
    uses_custom_chemistry = (
        args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME)
    chemistry = (args.custom_chemistry_def if uses_custom_chemistry
                 else cr_chem.get_chemistry(args.chemistry_name))

    outs.chunks = []

    # One library id per sample definition, in the same order.
    sample_defs = args.sample_def
    fallback_lib_type = (args.default_library_type
                         or lib_constants.DEFAULT_LIBRARY_TYPE)
    library_ids = cr_sample_def.assign_library_ids(sample_defs,
                                                   fallback_lib_type)

    for sdef, lib_id in zip(sample_defs, library_ids):
        spec = cr_fastq.FastqSpec.from_sample_def(sdef)
        group = cr_sample_def.get_gem_group(sdef)
        lib_type = cr_sample_def.get_library_type(sdef) or fallback_lib_type
        rate = cr_sample_def.get_subsample_rate(sdef)

        sample_chunks = setup_chunks(args.sample_id, spec, group, lib_id,
                                     chemistry, lib_type, rate)

        if not sample_chunks:
            # This sample definition matched no FASTQs.
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    if not outs.chunks:
        # Nothing was found for any sample definition.
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    # Validate the FASTQ files, then the chemistry specifications.
    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # The chemistry of the first chunk is reported as the stage output,
    # together with its barcode whitelist.
    outs.chemistry_def = outs.chunks[0]['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)

    # Distinct (gem group, library id, library type) combinations seen across
    # the chunks, in sorted order.
    distinct_libraries = sorted({
        (chunk['gem_group'], chunk['library_id'], chunk['library_type'])
        for chunk in outs.chunks
    })
    outs.library_info = [
        {
            'gem_group': group,
            'library_id': lib_id,
            'library_type': lib_type,
        }
        for group, lib_id, lib_type in distinct_libraries
    ]