def join(args, outs, chunk_defs, chunk_outs):
    """Write the contig info JSON and run ``dlconverter`` to build the .dloupe file.

    Args:
        args: stage arguments; supplies the sample id/description and the
            input paths placed on the converter command line below, plus
            ``reference_path``.
        outs: stage outputs; ``contig_info_json`` is written here and
            ``output_for_dloupe`` is handed to the converter as its output
            path. ``output_for_dloupe`` is reset to None if the converter
            exits with an error.
        chunk_defs: not used by this function.
        chunk_outs: not used by this function.
    """
    contig_info = get_contig_info(args)
    with open(outs.contig_info_json, 'w') as outfile:
        json.dump(contig_info, outfile)

    call = [
        "dlconverter",
        args.sample_id,
        "--output", outs.output_for_dloupe,
        "--description", args.sample_desc,
        "--node-profile-h5", args.normalized_node_profiles,
        "--contig-info-json", outs.contig_info_json,
        "--merged-bed", args.node_cnv_calls,
        "--tree-data", args.tree_data,
        "--tracks", args.tracks,
        "--per-cell-summary", args.per_cell_summary_metrics
    ]

    # Gene annotations are optional: only pass them when the file exists.
    gene_annotation_path = tk_ref.get_loupe_genes(args.reference_path)
    if os.path.exists(gene_annotation_path):
        call.extend(["--gene-annotations", gene_annotation_path])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    martian.log_info("Running dlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("dlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_dloupe = None
        martian.throw("Could not generate .dloupe file: \n%s" % e.output)
def main(args, outs):
    """Run ``crconverter`` to produce the .cloupe file for this sample.

    When ``do_not_make_cloupe(args)`` is true the stage only clears
    ``outs.output_for_cloupe`` and returns.

    Args:
        args: stage arguments; supplies the sample id, pipestance type,
            matrix path, description and the optional metrics /
            aggregation inputs.
        outs: stage outputs; ``output_for_cloupe`` is the converter's output
            path and is reset to None if the converter exits with an error.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix", args.filtered_gene_bc_matrices_h5,
        "--analysis", get_analysis_h5_path(args),
        "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    # Optional inputs are only passed when they are set.
    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Run ``crconverter`` to produce the .cloupe file for this sample.

    When ``do_not_make_cloupe(args)`` is true the stage only clears
    ``outs.output_for_cloupe`` and returns.

    Args:
        args: stage arguments; supplies the sample id, pipestance type,
            matrix path, description and the optional metrics /
            aggregation inputs.
        outs: stage outputs; ``output_for_cloupe`` is the converter's output
            path and is reset to None if the converter exits with an error.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix", args.filtered_gene_bc_matrices_h5,
        "--analysis", get_analysis_h5_path(args),
        "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    # Optional inputs are only passed when they are set.
    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Validate a finished pipestance directory and run ``crconverter`` on it.

    The pipestance type must be "count" or "aggr", the pipestance path must
    exist, an analysis HDF5 file must be present in one of the known
    locations, and the pipestance must not contain the pre-1.2
    ``CELLRANGER_PD`` / ``CELLRANGER_CS`` directories. Each failed check is
    reported through ``martian.exit``.

    Args:
        args: stage arguments (``pipestance_type``, ``pipestance_path``,
            ``sample_id``).
        outs: stage outputs; ``output_for_cloupe`` is the converter's output
            path and is reset to None if the converter exits with an error.
    """
    if args.pipestance_type != "count" and args.pipestance_type != "aggr":
        martian.exit("The type argument must be one of: count, aggr")

    if args.pipestance_type == "count":
        pname = "SC_RNA_COUNTER_CS"
    if args.pipestance_type == "aggr":
        pname = "SC_RNA_AGGREGATOR_CS"

    pipestance_exists = os.path.exists(args.pipestance_path)
    if not pipestance_exists:
        martian.exit("Invalid pipestance path: %s" % args.pipestance_path)

    # check to see if an analysis file exists.  If it doesn't, then
    # this is likely a barnyard sample, and we cannot generate a
    # .loupe file (CELLRANGER-773);
    analysis_h5_path = os.path.join(args.pipestance_path,
                                    "outs/analysis/analysis.h5")

    # 1.2.0 location only
    internal_count_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_COUNTER_CS/SC_RNA_COUNTER/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )
    internal_aggr_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_AGGREGATOR_CS/SC_RNA_AGGREGATOR/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    if not os.path.exists(analysis_h5_path) \
            and not os.path.exists(internal_count_h5_path) \
            and not os.path.exists(internal_aggr_h5_path):
        martian.exit(
            "Could not find single-species analysis HDF5 file. " +
            "Loupe Cell Browser files are not generated for multi-species experiments."
        )

    # has to be 1.2 or higher
    cellranger_pd_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_PD")
    cellranger_cs_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_CS")
    if os.path.exists(cellranger_pd_before_1_2_path) or os.path.exists(
            cellranger_cs_before_1_2_path):
        martian.exit(
            "mkloupe is only supported for Cell Ranger 1.2 and later.")

    call = [
        "crconverter",
        args.sample_id,
        pname,
        "--pipestance", args.pipestance_path,
        "--output", outs.output_for_cloupe
    ]

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Run ``bucket_fastq_by_bc`` on one read chunk.

    Args:
        args: stage arguments; ``chunk`` describes the reads, and
            ``barcode_whitelist``, ``barcode_counts``,
            ``bc_confidence_threshold``, ``max_expected_barcode_errors`` and
            ``buckets`` are forwarded to the tool.
        outs: stage outputs; the directory containing ``outs.default`` is
            used as the tool's output directory.
    """
    chunk = args.chunk
    assert(chunk['reads_interleaved'])

    # NOTE(review): with the assert above active this guard cannot fire; it
    # only takes effect when assertions are stripped (python -O).
    if not chunk['reads_interleaved']:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw("must supply a read1 and read2 when reads_interleave == False")

    out_dir = os.path.dirname(os.path.realpath(outs.default))

    # Optional inputs are passed to the tool as the literal string "none".
    whitelist_path = (BARCODE_LOCATION + "/" + args.barcode_whitelist + ".txt"
                      if args.barcode_whitelist else "none")
    group = chunk["gem_group"] or 1
    bc_reads = chunk["barcode"] or "none"
    bc_counts = args.barcode_counts or "none"
    si_reads = chunk["sample_index"] or "none"
    rg_string = chunk["read_group"] or "none"

    command = [
        'bucket_fastq_by_bc',
        '-reads=' + chunk["read1"],
        '-read_group_string=' + rg_string,
        '-barcodes=' + bc_reads,
        '-barcodeCounts=' + bc_counts,
        '-bcConfidenceThreshold=' + str(args.bc_confidence_threshold),
        '-output_directory=' + out_dir,
        '-sample_index_reads=' + si_reads,
        '-gem_group=' + str(group),
        '-barcode_whitelist=' + whitelist_path,
        '-interleaved=' + str(chunk['reads_interleaved']),
        '-max_expected_barcode_errors=' + str(args.max_expected_barcode_errors),
        '-buckets=' + str(args.buckets),
    ]
    subprocess.check_call(command)
def join(args, outs, chunk_defs, chunk_outs):
    """Assemble the ``crconverter`` command line and build the .cloupe file.

    Clears ``outs.output_for_cloupe`` and returns immediately when
    ``do_not_make_cloupe(args)`` is true. Otherwise writes a
    ``contig_info.json`` helper file, runs the converter, and resets
    ``outs.output_for_cloupe`` to None if the converter exits with an error.

    ``chunk_defs`` and ``chunk_outs`` are not used.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as contig_handle:
        json.dump(get_contig_info(args.reference_path), contig_handle)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = ["crconverter", args.sample_id, args.pipestance_type]
    call += ["--matrix", args.feature_barcode_matrix]
    call += ["--analysis", args.analysis]
    call += ["--output", outs.output_for_cloupe]
    call += ["--description", '"' + args.sample_desc + '"']
    call += ["--peaks", args.peaks]
    call += ["--fragmentsindex", args.fragments_index]
    call += ["--geneannotations", reference.genes]
    call += ["--contiginfo", contig_info_fn]

    # Optional flags are appended only when a value is available.
    optional_flags = (
        ("--metrics", args.metrics_json),
        ("--aggregation", args.aggregation_csv),
        ("--gemgroups", gem_group_index_json),
    )
    for flag, value in optional_flags:
        if value is not None:
            call.extend([flag, value])

    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # the sample desc may be unicode, so the subprocess receives utf-8
    # encoded arguments ...
    unicode_call = [arg.encode('utf-8') for arg in call]

    # ... while the log message is built from the un-encoded 'call', because
    # log_info itself attempts to encode the message (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Run ``barcodeqc`` on every barcode FASTQ file and record the outputs.

    Returns immediately when ``args.run_qc`` is false. Otherwise
    ``outs.summary_chunk`` is set to a dict with 'barcode', 'read1' and
    'read2' lists, and the path of each per-file JSON report is appended to
    the 'barcode' list.

    Args:
        args: stage arguments (``run_qc``, ``barcode_whitelist``,
            ``input_files``, ``file_read_types_map``, ``bc_read_type``,
            ``bc_start_index``, ``bc_length``, ``rc_i2_read``).
        outs: stage outputs; the directory of ``qc_summary`` receives the
            per-file JSON reports.
    """
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    # Keep only the files whose read type carries the barcode.
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]

    outs.summary_chunk = {
        'barcode': [],
        'read1': [],
        'read2': []
    }

    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [
            'barcodeqc', bf.filename, output_json_path,
            "--whitelist", whitelist_path,
            "--bc-start-index", str(args.bc_start_index),
            "--bc-length", str(args.bc_length)]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            # "as" form is valid on Python 2.6+ and Python 3 alike.
            martian.throw("Could not QC barcodes: return code %s" % e.returncode)
        outs.summary_chunk['barcode'].append(output_json_path)
def main(self):
    """Load the stage arguments from metadata and run the requested phase.

    For a 'split' run the module's ``split`` result is recorded and written
    to the 'stage_defs' metadata. For 'main' and 'join' runs the module
    function is executed with the loaded ``outs`` record, which is then
    written back to the 'outs' metadata. Any other run type is reported
    through ``martian.throw``.
    """
    # Load args and retvals from metadata.
    args = martian.Record(self.metadata.read('args'))
    run_type = self._run_type

    if run_type == 'split':
        def run_split():
            return self._record_result(lambda: self._module.split(args))

        self._run(run_split)
        self.metadata.write('stage_defs', self._result)
        return

    outs = martian.Record(self.metadata.read('outs'))

    if run_type == 'main':
        self._run(lambda: self._module.main(args, outs))
    elif run_type == 'join':
        chunk_defs = list(map(martian.Record,
                              self.metadata.read('chunk_defs')))
        chunk_outs = list(map(martian.Record,
                              self.metadata.read('chunk_outs')))

        def run_join():
            return self._module.join(args, outs, chunk_defs, chunk_outs)

        self._run(run_join)
    else:
        martian.throw('Invalid run type %s' % run_type)

    # Write the output as JSON.
    self.metadata.write('outs', outs.items())
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.default``.

    Interleaved chunks pass ``read1`` through unchanged; otherwise both
    ``read1`` and ``read2`` are required and are passed as a list.
    """
    chunk = args.chunk

    if chunk['reads_interleaved']:
        reads = chunk['read1']
    else:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])

    read_aligner = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner
    reference_fasta = tenkit.reference.get_fasta(args.reference_path)

    params = {
        'ref_fasta': reference_fasta,
        'algorithm': args.aligner_method,
    }
    read_aligner.output_alignment(aligner=aligner_name,
                                  aligner_params=params,
                                  num_threads=args.__threads,
                                  sample=args.read_group_sample)
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.output``.

    Interleaved chunks pass ``read1`` through unchanged; otherwise both
    ``read1`` and ``read2`` are required and are passed as a list. The
    chunk's read group string is turned into the read group header used for
    the alignment.
    """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.default = []

    chunk = args.chunk

    if chunk['reads_interleaved']:
        reads = chunk['read1']
    else:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])

    read_aligner = tenkit.align.Aligner(reads, outs.output)
    aligner_name = args.aligner
    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    params = {
        'ref_fasta': reference_fasta,
        'algorithm': args.aligner_method,
    }
    read_aligner.output_alignment(aligner=aligner_name,
                                  aligner_params=params,
                                  num_threads=args.__threads,
                                  read_group_header=rg_header)
def main(args, outs):
    """Run ``barcodeqc`` on every barcode FASTQ file and record the outputs.

    Returns immediately when ``args.run_qc`` is false. Otherwise
    ``outs.qc_summary`` is replaced by a dict with 'barcode', 'read1' and
    'read2' lists, and the path of each per-file JSON report is appended to
    the 'barcode' list.

    Args:
        args: stage arguments (``run_qc``, ``barcode_whitelist``,
            ``input_files``, ``file_read_types_map``, ``bc_read_type``,
            ``bc_start_index``, ``bc_length``, ``rc_i2_read``).
        outs: stage outputs; the directory of the incoming ``qc_summary``
            path receives the per-file JSON reports.
    """
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(
        args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    # Keep only the files whose read type carries the barcode.
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]

    # Note: this is Martian 3 incompatible; revert back to summary_chunk if merging
    # back into master (also applies to additional references to `qc_summary` in the main function)
    #
    # see https://github.com/10XDev/tenkit/commit/2c59c9a24b0e7cd81945544f62ffde7ab632ed42
    outs.qc_summary = {'barcode': [], 'read1': [], 'read2': []}

    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [
            'barcodeqc', bf.filename, output_json_path,
            "--whitelist", whitelist_path,
            "--bc-start-index", str(args.bc_start_index),
            "--bc-length", str(args.bc_length)
        ]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            # "as" form is valid on Python 2.6+ and Python 3 alike.
            martian.throw("Could not QC barcodes: return code %s" %
                          e.returncode)
        # needs to be summary_chunk in Martian 3
        outs.qc_summary['barcode'].append(output_json_path)
def _run(self, cmd): """Run the given command under the currently configured profiler.""" if self.jobinfo.profile_mode == 'mem': profiler = _MemoryProfile() self._run_profiler(cmd, profiler, 'profile_mem_txt') elif self.jobinfo.profile_mode == 'line': profiler = None try: profiler = line_profiler.LineProfiler() except NameError: martian.throw( 'Line-level profiling was requested, but line_profiler was not found.' ) for func in self.funcs: profiler.add_function(func) self._run_profiler(cmd, profiler, 'profile_line_bin') iostr = StringIO() profiler.print_stats(stream=iostr) self.metadata.write_raw('profile_line_txt', iostr.getvalue()) elif self.jobinfo.profile_mode == 'cpu': profiler = cProfile.Profile() self._run_profiler(cmd, profiler, 'profile_cpu_bin') iostr = StringIO() stats = pstats.Stats(profiler, stream=iostr).sort_stats('cumulative') stats.print_stats() self.metadata.write_raw('profile_cpu_txt', iostr.getvalue()) else: if self.jobinfo.profile_mode and self.jobinfo.profile_mode != 'disable': # Give the profiler a little bit of time to attach. time.sleep(0.5) cmd()
def load_alerts():
    """Load and validate ``alarms-supernova.json`` from ``ALARMS_LOCATION``.

    Returns:
        The parsed JSON object, iterated here as a mapping of stage name to
        a list of alert definitions. Every alert is passed through
        ``check_alert``.

    A file that is not valid JSON is reported through ``martian.throw``.
    """
    alerts_file = os.path.join(ALARMS_LOCATION, "alarms-supernova.json")
    # Context manager so the handle is closed even if reading fails.
    with open(alerts_file, "r") as handle:
        json_string = handle.read()

    try:
        alerts = json.loads(json_string)
    except ValueError:
        # json reports malformed input as ValueError (or a subclass); a bare
        # except would also have swallowed KeyboardInterrupt/SystemExit.
        martian.throw("Incorrectly formatted alarms-supernova.json file.")

    # items() exists on both Python 2 and Python 3 dicts.
    for stage, alert_list in alerts.items():
        for alert in alert_list:
            check_alert(stage, alert)
    return alerts
def base_mask(read):
    """Return the base-mask token for one read description.

    ``read`` must provide "read_name" and "read_length". The token is a
    one-letter code followed by the read length: "Y" for reads whose name
    starts with "R"; for names starting with "I" the code is "N", "I" or "Y"
    depending on ``ignore_dual_index``, ``dual_indexed`` and
    ``sample_index_read`` from the enclosing scope. Any other name is
    reported through ``martian.throw``.
    """
    name = read["read_name"]
    prefix = name[0]

    if prefix == "R":
        code = "Y"
    elif prefix == "I":
        is_sample_index = name == sample_index_read
        if ignore_dual_index and not is_sample_index:
            code = "N"
        elif dual_indexed or is_sample_index:
            code = "I"
        else:
            code = "Y"
    else:
        martian.throw("read name was not recognized: %s" % read["read_name"])
        return None

    return code + str(read["read_length"])
def main(args, outs):
    """
    Run the vlconverter executable with inputs that should be available in the outs
    folder at the end of the pipeline run.  This will generate "output_for_vloupe.vloupe"
    in the stage folder.

    Memory usage not expected to be excessive with this (thus no custom split/join
    as of yet); it will need to load a few full files (bam.bai, fasta.fai) into memory.

    If the reference BAM, consensus BAM or contig BAM index is unset or not
    a file, a message is logged and nothing is generated. If the converter
    exits with an error, ``outs.output_for_vloupe`` is reset to None and the
    failure is reported through ``martian.throw``.
    """
    if args.concat_ref_bam is None or not os.path.isfile(args.concat_ref_bam) or \
            args.consensus_bam is None or not os.path.isfile(args.consensus_bam) or \
            args.contig_bam_bai is None or not os.path.isfile(args.contig_bam_bai):
        martian.log_info(
            'One or more bam files missing - cannot make vloupe file')
        return

    call = [
        "vlconverter",
        args.sample_id,
        args.pipestance_type,
        "--output", outs.output_for_vloupe,
        "--reference-bam", args.concat_ref_bam,
        "--reference-bam-index", args.concat_ref_bam_bai,
        "--reference-fasta", args.concat_ref_fasta,
        "--reference-fasta-index", args.concat_ref_fasta_fai,
        "--reference-annotations", args.concat_ref_annotations_json,
        "--clonotypes", args.clonotypes_csv,
        "--consensus-bam", args.consensus_bam,
        "--consensus-bam-index", args.consensus_bam_bai,
        "--consensus-annotations", args.consensus_annotations_json,
        "--consensus-fasta", args.consensus_fasta,
        "--consensus-fasta-index", args.consensus_fasta_fai,
        "--contig-bam-relative-path", args.contig_bam_relative_path,
        "--contig-bam-index", args.contig_bam_bai,
        "--contig-annotations", args.contig_annotations_json,
        "--contig-bed", args.contig_annotations_bed,
        "--contig-fasta", args.contig_fasta,
        "--contig-fasta-index", args.contig_fasta_fai,
        "--description", args.sample_desc
    ]

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running vlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("vlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_vloupe = None
        martian.throw("Could not generate .vloupe file: \n%s" % e.output)
def split(args):
    """Choose per-chunk resources for variant calling and define the chunks.

    The variant-calling mode, caller name, precalled VCF path and GATK jar
    path come from ``tk_io.get_vc_mode``. For the precalled modes the VCF is
    copied into the stage directory and indexed. Unless ``restrict_locus``
    is set, the input BAM is divided into sized chunks over the primary
    contigs, each carrying the memory/thread request and the precalled file.

    Args:
        args: stage arguments (``vc_precalled``, ``variant_mode``,
            ``reference_path``, ``input``, ``restrict_locus``).

    Returns:
        ``{'chunks': [...]}`` with one entry per chunk.

    Raises:
        NotSupportedException: if the variant caller is neither 'freebayes'
            nor 'gatk' (and the mode is not plain "precalled").
    """
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)

    precalled_file = None
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        mem_gb = 8
        threads = 1
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        precalled_file = precalled_file + ".gz"

    if vc_mode != "precalled":
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )
            if not os.path.exists(gatk_path):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_path)
        else:
            # Report the caller that was actually rejected (the message used
            # to show vc_mode); %s also copes with a None caller, which
            # string concatenation would not.
            raise NotSupportedException('Variant caller not supported: %s' %
                                        variant_caller)

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        loci = tk_chunks.get_sized_bam_chunks(args.input,
                                              bam_chunk_size_gb,
                                              contig_whitelist=primary_contigs,
                                              extra_args={
                                                  '__mem_gb': mem_gb,
                                                  '__threads': threads,
                                                  'split_input': precalled_file
                                              })
    else:
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
def write_stage_alerts(stage, path, alerts_file="alerts.list"):
    """Write the alerts configured for ``stage`` to ``path/alerts_file``.

    Each alert is written as a "#" separator line followed by its metric,
    threshold, compare, action and message, one per line.

    Args:
        stage: key to look up in the mapping returned by ``load_alerts``.
        path: output directory; created if it does not exist.
        alerts_file: name of the file to write inside ``path``.

    A stage with no entry is reported through ``martian.throw``.
    """
    alerts = load_alerts()
    out_file = os.path.join(path, alerts_file)

    if not os.path.exists(path):
        os.makedirs(path)

    # The context manager closes the handle even when the stage lookup
    # below fails; previously the handle leaked on that path.
    with open(out_file, "w") as out_handle:
        if stage not in alerts:
            martian.throw("No alerts found for stage %s" % stage)

        for alert in alerts[stage]:
            out_handle.write("#\n")
            out_handle.write(alert["metric"] + "\n")
            out_handle.write(str(alert["threshold"]) + "\n")
            out_handle.write(alert["compare"] + "\n")
            out_handle.write(alert["action"] + "\n")
            out_handle.write(alert["message"] + "\n")
def split(args):
    """Define one chunk per primary contig of the reference.

    Returns an empty chunk list (and an empty join request) when
    ``args.fragments`` is None. Otherwise ``args.peaks`` and
    ``args.cell_barcodes`` must be set; each chunk requests 4 GB and the
    join requests 8 GB.
    """
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    required_inputs = (
        (args.peaks, "peaks BED file expected"),
        (args.cell_barcodes, "cell barcodes CSV file expected"),
    )
    for value, message in required_inputs:
        if value is None:
            martian.throw(message)

    ctg_mgr = ReferenceManager(args.reference_path)
    contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    per_contig = [{'contig': name, '__mem_gb': 4} for name in contigs]
    return {'chunks': per_contig, 'join': {'__mem_gb': 8}}
def main(args, outs):
    """
    Run the vlconverter executable with inputs that should be available in the outs
    folder at the end of the pipeline run.  This will generate "output_for_vloupe.vloupe"
    in the stage folder.

    Memory usage not expected to be excessive with this (thus no custom split/join
    as of yet); it will need to load a few full files (bam.bai, fasta.fai) into memory.

    If the reference BAM, consensus BAM or contig BAM index is unset or not
    a file, a message is logged and nothing is generated. If the converter
    exits with an error, ``outs.output_for_vloupe`` is reset to None and the
    failure is reported through ``martian.throw``.
    """
    # os.path.isfile(None) raises TypeError, so unset inputs are checked
    # explicitly and treated the same as missing files.
    required_files = (args.concat_ref_bam, args.consensus_bam,
                      args.contig_bam_bai)
    if any(path is None or not os.path.isfile(path)
           for path in required_files):
        martian.log_info(
            'One or more bam files missing - cannot make vloupe file')
        return

    call = [
        "vlconverter",
        args.sample_id,
        args.pipestance_type,
        "--output", outs.output_for_vloupe,
        "--reference-bam", args.concat_ref_bam,
        "--reference-bam-index", args.concat_ref_bam_bai,
        "--reference-fasta", args.concat_ref_fasta,
        "--reference-fasta-index", args.concat_ref_fasta_fai,
        "--reference-annotations", args.concat_ref_annotations_json,
        "--clonotypes", args.clonotypes_csv,
        "--consensus-bam", args.consensus_bam,
        "--consensus-bam-index", args.consensus_bam_bai,
        "--consensus-annotations", args.consensus_annotations_json,
        "--consensus-fasta", args.consensus_fasta,
        "--consensus-fasta-index", args.consensus_fasta_fai,
        "--contig-bam-relative-path", args.contig_bam_relative_path,
        "--contig-bam-index", args.contig_bam_bai,
        "--contig-annotations", args.contig_annotations_json,
        "--contig-bed", args.contig_annotations_bed,
        "--contig-fasta", args.contig_fasta,
        "--contig-fasta-index", args.contig_fasta_fai,
        "--description", args.sample_desc
    ]

    martian.log_info("Running vlconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("vlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_vloupe = None
        martian.throw("Could not generate .vloupe file: \n%s" % e.output)
def main(args, outs):
    """Call discordant read-pair loci on one chromosome slice.

    Writes an empty BEDPE to ``outs.sv_calls`` and returns when there is no
    chromosome, no start positions or no barcode whitelist. Otherwise the
    insert-size limits and chimera rates are derived from
    ``args.insert_sizes`` and ``args.basic_summary``, the calls from
    ``tk_readpairs.get_discordant_loci`` are written to ``outs.sv_calls``,
    and the read counts are written as JSON to
    ``outs.discordant_read_counts``.
    """
    nothing_to_call = (args.chrom is None or len(args.starts) == 0
                       or args.barcode_whitelist is None)
    if nothing_to_call:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes, MAX_INSERT_SIZE_PRC)
    if max_insert is None:
        martian.throw('No Q60 reads')

    # This is slightly bigger than the maximum "normal" insert
    min_call_insert, _unused = tk_sv_utils.get_insert_size_info(
        args.insert_sizes, MIN_SV_INSERT_SIZE_PRC)
    min_sv_len = max(args.min_sv_len, min_call_insert)
    martian.log_info('Setting min_sv_len to {}'.format(min_sv_len))

    with open(args.basic_summary, 'r') as summary_file:
        summary = json.load(summary_file)

    # Per-SV-type chimera rates, all built on the far chimera rate.
    far_rate = summary['far_chimera_rate']
    chimera_rates = {
        tk_readpairs.DEL_STR: far_rate,
        tk_readpairs.INV_STR: far_rate + summary['same_dir_chimera_rate'],
        tk_readpairs.TDUP_STR: far_rate + summary['outward_dir_chimera_rate'],
        tk_readpairs.TRANS_STR: far_rate,
    }

    df, read_counts, _ = tk_readpairs.get_discordant_loci(
        args.possorted_bam,
        chrom=str(args.chrom),
        starts=args.starts,
        stops=args.stops,
        min_mapq=args.min_mapq,
        min_insert=0,
        max_insert=max_insert,
        max_merge_range=args.merge_range_factor * max_insert,
        min_sv_len=min_sv_len,
        max_sv_len=args.max_sv_len,
        ins_logsf_fun=ins_logsf_fun,
        min_lr_to_call=args.min_lr_to_call,
        min_reads_to_call=args.min_reads_to_call,
        chimera_rate=chimera_rates,
        reads_as_qual=True)

    # Need to convert to dict because defaultdict doesn't get pickled properly
    for count_kind in ('split', 'pair'):
        read_counts[count_kind] = dict(read_counts[count_kind])

    tk_sv_io.write_sv_df_to_bedpe(df, outs.sv_calls)
    with open(outs.discordant_read_counts, 'w') as counts_file:
        counts_file.write(tenkit.safe_json.safe_jsonify(read_counts))
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.default``.

    Interleaved chunks pass ``read1`` through unchanged; otherwise both
    ``read1`` and ``read2`` are required and are passed as a list. The
    thread count comes from ``martian.get_threads_allocation()``.
    """
    chunk = args.chunk

    if chunk['reads_interleaved']:
        reads = chunk['read1']
    else:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw("must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])

    read_aligner = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner
    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])

    params = {
        'ref_fasta': reference_fasta,
        'algorithm': args.aligner_method,
    }
    read_aligner.output_alignment(aligner=aligner_name,
                                  aligner_params=params,
                                  num_threads=martian.get_threads_allocation(),
                                  read_group_header=rg_header)
def main(args, outs):
    """Inspect a sequencing run directory and record how to process it.

    run_path must be the top-level Illumina flowcell directory.

    Sets ``outs.si_read_type`` from the reads listed in ``RunInfo.xml``,
    ``outs.rc_i2_read`` from ``tk_bcl.get_rta_version`` and
    ``outs.split_by_tile`` from ``_split_by_tile(args)``.
    """
    run_dir = args.run_path
    if not os.path.exists(run_dir):
        martian.throw("Run directory does not exist: %s" % args.run_path)

    read_info, _flowcell = tk_bcl.load_run_info(
        os.path.join(args.run_path, "RunInfo.xml"))
    outs.si_read_type = get_si_read_type(read_info)

    rta_info = tk_bcl.get_rta_version(args.run_path)
    (rta_version, rc_i2_read, bcl_params) = rta_info
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))
    martian.log_info("RC'ing i2 read: %s" % str(rc_i2_read))
    outs.rc_i2_read = rc_i2_read

    by_tile = _split_by_tile(args)
    martian.log_info("Splitting by tile: %s" % str(by_tile))
    outs.split_by_tile = by_tile
def read_sv_bedpe_to_df(bedpe):
    """Read a headerless, tab-separated BEDPE file into a 12-column DataFrame.

    Args:
        bedpe: path or file-like object accepted by ``pandas.read_csv``, or
            None.

    Returns:
        A DataFrame with the columns chrom1, start1, stop1, chrom2, start2,
        stop2, name, qual, strand1, strand2, filters, info. Columns beyond
        the twelfth are dropped. Missing optional columns are filled in:
        ``name`` with 0..n-1, ``qual`` with 1, and the rest with '.'. An
        empty frame is returned when ``bedpe`` is None or ``read_csv``
        raises ValueError.

    A file with fewer than 6 columns is reported through ``martian.throw``.
    """
    col_names = [
        'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2', 'name',
        'qual', 'strand1', 'strand2', 'filters', 'info'
    ]

    if bedpe is None:
        return pd.DataFrame(columns=col_names)

    try:
        df = pd.read_csv(bedpe,
                         sep='\t',
                         header=None,
                         index_col=None,
                         comment='#')
    except ValueError:
        return pd.DataFrame(columns=col_names)

    if df.shape[1] < 6:
        martian.throw('BEDPE file must have at least 6 columns')

    ncols = min(len(col_names), df.shape[1])
    df = df.iloc[:, 0:ncols]
    df.columns = col_names[0:ncols]

    chrom_cols = ['chrom1', 'chrom2']
    pos_cols = ['start1', 'stop1', 'start2', 'stop2']
    df[chrom_cols] = df[chrom_cols].astype(object)
    df[pos_cols] = df[pos_cols].astype(int)

    if 'name' not in df.columns:
        # np.arange takes a scalar stop value; the previous tuple argument
        # raised TypeError whenever the name column was absent.
        df['name'] = np.arange(len(df))

    optional_defaults = (
        ('qual', 1),
        ('strand1', '.'),
        ('strand2', '.'),
        ('filters', '.'),
        ('info', '.'),
    )
    for column, default in optional_defaults:
        if column not in df.columns:
            df[column] = default

    return df
def split(args):
    """Define chunks as (lane, barcode group) pairs for tile-split FASTQs.

    When the FASTQs were not split by tile a single chunk with no lane and
    no barcodes is returned. Otherwise the sorted common barcodes are cut
    into groups of ``BARCODE_GROUP_SIZE`` and one chunk is emitted per group
    and lane, followed by one chunk per lane for
    ``DEMULTIPLEX_INVALID_SAMPLE_INDEX``.
    """
    # if the files have not been split by tile, we're done, just bail
    if not args.split_by_tile:
        return {'chunks': [{'lane': None, 'bcs': []}]}

    # from here forward, assume that we're dealing with FASTQs separated
    # by tile
    tile_pattern = os.path.join(args.demultiplexed_fastq_path, "Tile*",
                                "*.fastq*")
    fastq_paths = glob.glob(tile_pattern)
    if len(fastq_paths) == 0:
        martian.throw("No FASTQ files were found.")

    # find the unique # of lanes there are in all
    lanes = sorted({BclProcessorFastqFile(p).lane for p in fastq_paths})

    # lexicographically sort barcodes (incoming order is in reverse frequency
    # order) in order to spread work around more evenly
    sorted_bcs = sorted(args.common_bcs)

    # Full groups first, then the shorter remainder group (if any); stepping
    # through the list by the group size yields exactly those slices.
    bc_groups = [
        sorted_bcs[start:start + BARCODE_GROUP_SIZE]
        for start in range(0, len(sorted_bcs), BARCODE_GROUP_SIZE)
    ]

    chunks = []
    for bcs in bc_groups:
        for lane in lanes:
            chunks.append({'__mem_gb': CHUNK_MEM_GB, 'lane': lane, 'bcs': bcs})

    # finally, the leftovers (si_X)
    for lane in lanes:
        chunks.append({
            '__mem_gb': CHUNK_MEM_GB,
            'lane': lane,
            'bcs': [DEMULTIPLEX_INVALID_SAMPLE_INDEX]
        })

    return {'chunks': chunks}
def validate_input(args):
    """Check the stage inputs before the main flow path is entered.

    Verifies the gem groups via ``tk_preflight.check_gem_groups``, the
    presence and types of the required ``sample_def`` fields, the
    ``input_mode`` value, and the ``downsample`` settings (exactly one of
    "gigabases" / "subsample_rate", with a rate of at most 1.0).
    """
    def check_key(n, dict_in, name, tys):
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" %
                         (n, name))
        found_type = type(dict_in[name])
        if found_type not in tys:
            martian.exit(
                "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                % (n, name, str(tys), type(dict_in[name])))

    ok, msg = tk_preflight.check_gem_groups(args.sample_def)
    if not ok:
        martian.exit(msg)

    optional_list = [list, type(None)]
    for (idx, sample_item) in enumerate(args.sample_def):
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", optional_list)
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        # The sample-selection field depends on how the FASTQs were made.
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", optional_list)
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", optional_list)

    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    if args.downsample is None:
        return

    has_gigabases = "gigabases" in args.downsample
    has_rate = "subsample_rate" in args.downsample
    assert (has_gigabases or has_rate)
    assert (not (has_gigabases and has_rate))
    if has_rate and args.downsample['subsample_rate'] is not None:
        assert (args.downsample['subsample_rate'] <= 1.0)
def main(args, outs):
    """Build the read chunks for every sample definition.

    Each ``sample_def`` entry is expanded by the chunk builder matching its
    ``fastq_mode``; the combined list is stored in ``outs.chunks`` and then
    checked for FASTQ and chemistry consistency. ``outs.chemistry_def`` is
    taken from the first chunk and ``outs.barcode_whitelist`` is derived
    from it.
    """
    ok, msg = tk_preflight.check_gem_groups(args.sample_def)
    if not ok:
        martian.exit(msg)

    outs.chunks = []

    for sample_def in args.sample_def:
        fastq_mode = sample_def['fastq_mode']

        if fastq_mode == tk_constants.BCL_PROCESSOR_FASTQ_MODE:
            make_chunks = main_bcl_processor
        elif fastq_mode == tk_constants.ILMN_BCL2FASTQ_FASTQ_MODE:
            make_chunks = main_ilmn_bcl2fastq
        else:
            make_chunks = None
            martian.throw("Unrecognized fastq_mode: %s" % fastq_mode)

        if make_chunks is None:
            chunks = []
        else:
            chunks = make_chunks(args.sample_id, sample_def,
                                 args.chemistry_name,
                                 args.custom_chemistry_def)

        if len(chunks) == 0:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += chunks

    if len(outs.chunks) == 0:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Output chemistry and barcode whitelist
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
def check_alert(stage, alert):
    """Validate the structure of one alert definition.

    Args:
        stage: name of the stage the alert belongs to (not used by the
            checks).
        alert: mapping that must contain "action", "metric", "compare",
            "threshold" and "message"; "compare" must be "<" or ">" and
            "threshold" must be an int or a float.

    Problems are printed to stdout where noted and reported through
    ``martian.throw``. Returns None.
    """
    keys = ["action", "metric", "compare", "threshold", "message"]
    for key in keys:
        if key not in alert:
            # Single-argument print calls produce the same output on
            # Python 2 and Python 3.
            print("%s  is missing in " % key)
            print(alert)
            martian.throw("incorrectly formatted alert, see stdout.")

    if alert["compare"] not in ("<", ">"):
        print(alert)
        martian.throw("invalid value for compare in alert")

    # Exact type match on purpose: bool is a subclass of int and must still
    # be rejected, as before.
    if type(alert["threshold"]) not in (int, float):
        martian.throw("%s: invalid type for threshold" %
                      type(alert["threshold"]))
def main(args, outs):
    """Report 'No chunks defined.' through ``martian.throw``.

    This is unconditional; ``args`` and ``outs`` are not used.
    """
    message = 'No chunks defined.'
    martian.throw(message)
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    Steps performed:
      1. Work out which downsampling option of ``args.downsample`` is in use
         ("gigabases", "subsample_rate" or "target_reads"; exactly one).
      2. Validate ``args.input_mode`` and the required fields of every
         ``args.sample_def`` entry.
      3. Require gem_group to be all null or all int; when all null, every
         entry's gem_group is set to 1 (``args.sample_def`` is mutated).
      4. For every entry, locate its FASTQ files, estimate reads/bases with
         ``fastq_data_estimate`` and build one chunk dict per read file
         (BCL_PROCESSOR, interleaved) or per R1/R2 pair (ILMN_BCL2FASTQ).
      5. For the gigabases / target_reads options, derive a global rate from
         the estimates and multiply it into every chunk's 'subsample_rate'.
      6. Set ``outs.chunks`` and ``outs.read_groups``, write a JSON summary to
         the path ``outs.downsample_info`` and run ``check_fastqs``.

    User-facing input problems are reported with ``martian.exit``; an unknown
    input mode with ``martian.throw``; downsample option problems with
    ``assert``.
    """

    def check_key(n, dict_in, name, tys):
        # Report entry `n` if `name` is absent or its value's exact type is
        # not listed in `tys`.
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))
        if not (type(dict_in[name]) in tys):
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s" % (n, name, str(tys), type(dict_in[name])))

    global_subsample_rate = 1.0
    downsample_gigabases = False
    downsample_reads = False
    if args.downsample is not None:
        # make sure that exactly one downsampling option is specified
        options_supplied = 0
        for subsample_key in ["gigabases", "subsample_rate", "target_reads"]:
            if args.downsample.get(subsample_key, None) is not None:
                options_supplied += 1
        assert (options_supplied == 1)

        if args.downsample.get('subsample_rate') is not None:
            global_subsample_rate = args.downsample['subsample_rate']
            assert (global_subsample_rate <= 1.0)
        elif args.downsample.get('target_reads') is not None:
            downsample_reads = True
        else:
            downsample_gigabases = True

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    # Validate the required fields before anything below indexes into the
    # entries, so a malformed sample_def produces the message from check_key
    # rather than a bare KeyError.
    for (idx, sample_item) in enumerate(args.sample_def):
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])

    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0
    total_seq_reads = 0

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # Check if subsample_rate exists in sample_def
        if 'subsample_rate' in read_chunk:
            subsample_rate = global_subsample_rate * read_chunk['subsample_rate']
        else:
            subsample_rate = global_subsample_rate

        # Optional in-read barcode settings, copied onto every chunk built
        # from this entry.
        bc_in_read = {}
        if 'bc_in_read' in read_chunk:
            if read_chunk['bc_in_read'] is not None:
                bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
                bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name
        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r, 'read2': None, 'reads_interleaved': True, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipline")

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r1, 'read2': r2, 'reads_interleaved': False, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))
    martian.log_info(" Predict %d total reads" % total_seq_reads)

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")

    if downsample_gigabases and args.downsample['gigabases'] is not None:
        # Calculate global downsample rate. A zero estimate (e.g. only empty
        # FASTQs) leaves nothing to downsample, so keep everything instead of
        # dividing by zero.
        if total_seq_bases > 0:
            global_subsample_rate = min(1.0, float(args.downsample['gigabases'])*1e9 / float(total_seq_bases))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f" \
            % (float(args.downsample['gigabases']), float(total_seq_bases)/1e9, global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate
    elif downsample_reads:
        # Same zero-estimate guard as for the gigabases option.
        if total_seq_reads > 0:
            global_subsample_rate = min(1.0, float(args.downsample['target_reads'])/float(total_seq_reads))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f M reads, Estimated Input: %.2f M reads, Downsample Rate: %.3f" \
            % (float(args.downsample['target_reads'])/1e6, float(total_seq_reads)/1e6, global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = [rg for rg in read_groups]

    # log info about input vs requested GB
    # first, set defaults
    available_gb = float(total_seq_bases)/1e9
    requested_gb = None
    available_reads = total_seq_reads
    requested_reads = None
    requested_rate = None
    post_downsample_gb = requested_gb
    downsample_succeeded = True

    if args.downsample is not None and args.downsample.get('gigabases') is not None:
        requested_gb = float(args.downsample['gigabases'])
        post_downsample_gb = min(available_gb, requested_gb)
        if available_gb < requested_gb:
            martian.log_info("Downsample requested more GB than was available; will not downsample.")
            downsample_succeeded = False
    elif args.downsample is not None and args.downsample.get('subsample_rate') is not None:
        requested_rate = float(args.downsample['subsample_rate'])
        post_downsample_gb = available_gb * requested_rate
    elif args.downsample is not None and args.downsample.get('target_reads') is not None:
        requested_reads = float(args.downsample['target_reads'])

    downsample_info = {}
    downsample_info['available_gb'] = available_gb
    downsample_info['requested_gb'] = requested_gb
    downsample_info['available_reads'] = available_reads
    downsample_info['requested_reads'] = requested_reads
    downsample_info['requested_rate'] = requested_rate
    downsample_info['post_downsample_gb'] = post_downsample_gb
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
def main(args, outs):
    """Fail unconditionally by calling ``martian.throw``.

    Neither ``args`` nor ``outs`` is read.
    NOTE(review): presumably this stage is only meant to run through its
    chunked entry points -- confirm against the stage definition.
    """
    failure_message = "No chunks defined"
    martian.throw(failure_message)