def add_command(self, run_time, number):
    # Create `number` random jobs of `run_time` cycles each, with random
    # I/O requests per device, and submit them to the simulated OS.
    run_time = int(run_time)
    number = int(number)
    # Map each I/O device name to the event class fired on completion.
    finish_events = {
        "disco": DiskFinishedEvent,
        "leitora1": LeitoraUmFinishedEvent,
        "leitora2": LeitoraDoisFinishedEvent,
        "impressora1": ImpressoraUmFinishedEvent,
        "impressora2": ImpressoraDoisFinishedEvent
    }
    for _ in range(number):
        # Per-job device table: None means "job does not use this device".
        io = {
            "disco": None,
            "leitora1": None,
            "leitora2": None,
            "impressora1": None,
            "impressora2": None
        }
        # Start cycles already taken — shared across ALL devices of this
        # job so no two requests begin on the same cycle.
        last_start_cycles = [1]
        for dev in io.keys():
            io_requests = []
            # 90% chance the job uses this device at all.
            has_device = bool(random.random() < 0.9)
            if not has_device:
                continue
            number_requests = random.randint(1, 5)
            for i in range(number_requests):
                # io_config[dev] presumably holds a (min, max) cycle-count
                # range for this device — TODO confirm against io_config.
                io_cycles = random.randint(*io_config[dev])
                start_cycle = 1
                try:
                    # NOTE(review): for i == 0 the upper bound is negative,
                    # so the first request is always skipped via ValueError;
                    # possibly `(i + 1) * run_time` was intended — confirm.
                    start_cycle = random.randint(
                        last_start_cycles[-1],
                        i * run_time // number_requests - io_cycles)
                    if start_cycle in last_start_cycles:
                        continue
                    last_start_cycles.append(start_cycle)
                except ValueError:
                    # Empty randint range: no room left for a request here.
                    continue
                io_requests.append((start_cycle, io_cycles))
            if len(io_requests):
                io[dev] = Device(dev, io_requests, finish_events[dev])
        job_priority = random.choice(list(JobPriority))
        job_size = random.randint(10, 70)
        new_job = Job(self.job_ids, run_time, job_priority, io, job_size)
        self.job_ids += 1
        self.os.add_job(new_job)
def run(self):
    """Submit each configured job to the cluster at its scheduled time.

    Generator process: for every job config, waits until its submit_time,
    wraps it in a Job and hands it to the cluster; finally marks this
    submitter as destroyed.
    """
    for cfg in self.job_configs:
        # Submission times must not be in the simulated past.
        assert cfg.submit_time >= self.env.now
        yield self.env.timeout(cfg.submit_time - self.env.now)
        # print('a task arrived at time %f' % self.env.now)
        self.cluster.add_job(Job(self.env, cfg))
    self.destroyed = True
def run_arriba(self):
    """Run the Arriba fusion caller on each sample.

    Accepts either a single .bam readset (paired fastqs expected under
    fusions/picard_sam_to_fastq/<sample>) or gzipped paired fastqs.

    Returns:
        list: one concatenated Job per sample.

    Raises:
        Exception: if a sample has more than one readset, or an input
            type other than .bam / .fastq.gz.
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input: fastqs were produced by picard_sam_to_fastq
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: the right mate previously reused ".pair1.fastq.gz",
            # passing read 1 as both mates to Arriba.
            right_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and \
                sample.readsets[0].fastq2.split(".")[-1] == "gz":
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "arriba", sample.name)
        # Arriba is run from inside the per-sample directory, so cd in
        # and back out around the caller job.
        chgdir_job = Job(command="cd " + output_dir)
        back_to_outdir_job = Job(command="cd " + self._output_dir)
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir),
            chgdir_job,
            arriba.run(left_fastq, right_fastq, self._output_dir,
                       output_dir, keep_bam=self.args.keep_bams),
            back_to_outdir_job
        ], name="run_arriba." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def gunzip_fastq(self):
    """
    Gunzip .fastq.gz files or symlink if already uncompressed
    """
    jobs = []
    for readset in self.readsets:
        out_dir = os.path.join("fusions", "gunzip_fastq",
                               readset.sample.name)
        # Guard clause: this pipeline only supports paired-end readsets.
        if readset.run_type != "PAIRED_END":
            raise Exception("Error: run type \"" + readset.run_type +
                            "\" is invalid for readset \"" + readset.name +
                            "\" (should be PAIRED_END)!")
        # Candidate fastq pairs in priority order: explicit fastqs from
        # the readset sheet, then pairs derived from a .bam, then .cram.
        candidates = []
        if readset.fastq1 and readset.fastq2:
            candidates.append([readset.fastq1, readset.fastq2])
        if readset.bam:
            picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                      readset.sample.name)
            candidates.append([
                os.path.join(picard_dir, os.path.basename(
                    re.sub(r"\.bam$", ".pair1.fastq.gz", readset.bam))),
                os.path.join(picard_dir, os.path.basename(
                    re.sub(r"\.bam$", ".pair2.fastq.gz", readset.bam))),
            ])
        if readset.cram:
            picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                      readset.sample.name)
            candidates.append([
                os.path.join(picard_dir,
                             os.path.basename(readset.cram) + ".pair1.fastq.gz"),
                os.path.join(picard_dir,
                             os.path.basename(readset.cram) + ".pair2.fastq.gz"),
            ])
        fastq1, fastq2 = self.select_input_files(candidates)
        jobs.append(concat_jobs(
            [Job(command="mkdir -p " + out_dir),
             gunzip.gunzip_fastq(fastq1, out_dir),
             gunzip.gunzip_fastq(fastq2, out_dir)],
            name="gunzip_fastq." + readset.sample.name + "." + readset.name))
    return jobs
def __init__(self, guid, remote_address, pubkey_xml):
    """Session state for one remote agent.

    Sets up the ECDHE channel from the agent's public key, an empty job
    queue, and immediately queues a 'checkin' job for the agent.
    """
    self.guid = guid
    self.address = remote_address
    # Last payload and last checkin are unknown until the agent talks.
    self.data = None
    self.checkin_time = None
    self.crypto = ECDHE(pubkey_xml)
    self.jobs = Queue()
    # First thing every new session does is ask the agent to check in.
    self.add_job(Job(command=('checkin', '')))
def chimerascan(self):
    """
    Run chimerascan to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "chimerascan", sample.name)
        # NOTE(review): mkdir followed by rm -r leaves the directory
        # absent before the tool runs — presumably chimerascan refuses
        # to write into an existing dir and recreates it; confirm.
        steps = [
            Job(command="mkdir -p " + out_dir),
            Job(command="rm -r " + out_dir),
            chimerascan.run(fastq1, fastq2, out_dir),
        ]
        jobs.append(concat_jobs(steps, name="chimerascan." + sample.name))
    return jobs
def ericscript(self):
    """
    Run EricScript to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "ericscript", sample.name)
        # NOTE(review): mkdir then rm -r leaves the directory absent —
        # presumably EricScript insists on creating it itself; confirm.
        steps = [
            Job(command="mkdir -p " + out_dir),
            Job(command="rm -r " + out_dir),
            ericscript.ericscript(fastq1, fastq2, out_dir,
                                  keep_bam=self.args.keep_bams),
        ]
        jobs.append(concat_jobs(steps, name="ericscript." + sample.name))
    return jobs
def main(hub_id, dataset_id, version):
    """Enqueue a 'verify_partitions' job for the given dataset version.

    Connects to Postgres with default credentials and pushes the job's
    attribute dict onto the pq 'jobs' queue.
    """
    conn = psql.connect('')
    jobs_queue = pq.PQ(conn=conn)['jobs']
    payload = Job(1, 'verify_partitions', {
        'hub_id': hub_id,
        'dataset_id': dataset_id,
        'version': version,
    }).__dict__
    jobs_queue.put(payload)
def _build_jobs(self, response, data, epoch, now, source, ignoreempty = False, discardiffull = False):
    # Parse a getwork-style HTTP response into mining Job objects,
    # expanding one template into many via ntime rolling when allowed.
    #
    # response:      HTTP response object (headers are inspected)
    # data:          raw response body bytes
    # epoch:         job epoch the request was issued under (stale check)
    # now:           timestamp used in the expiry computation
    # source:        label used in log messages
    # ignoreempty:   silently drop empty responses instead of parsing
    # discardiffull: drop jobs when buffer is at target (vs 5x target)
    decoded = data.decode("utf_8")
    if len(decoded) == 0 and ignoreempty:
        self.core.log(self, "Got empty %s response\n" % source, 500)
        return
    decoded = json.loads(decoded)
    data = unhexlify(decoded["result"]["data"].encode("ascii"))
    target = unhexlify(decoded["result"]["target"].encode("ascii"))
    try:
        identifier = int(decoded["result"]["identifier"])
    except:
        identifier = None
    # A changed identifier signals new upstream work: cancel outstanding
    # jobs and hand a probe job to the blockchain for validation.
    if identifier != self.lastidentifier:
        self._cancel_jobs()
        self.lastidentifier = identifier
        self.blockchain.check_job(Job(self.core, self, 0, data, target, True, identifier))
    roll_ntime = 1
    expiry = 60
    isp2pool = False
    headers = response.getheaders()
    for h in headers:
        if h[0].lower() == "x-is-p2pool" and h[1].lower() == "true":
            isp2pool = True
        elif h[0].lower() == "x-roll-ntime" and h[1] and h[1].lower() != "n":
            # Header present and not "N": default to 60 rolls unless an
            # explicit "expire=<seconds>" value overrides it.
            roll_ntime = 60
            parts = h[1].split("=", 1)
            if parts[0].strip().lower() == "expire":
                try:
                    roll_ntime = int(parts[1])
                except:
                    pass
            expiry = roll_ntime
    if isp2pool:
        expiry = 60
    self.stats.supports_rollntime = roll_ntime > 1
    # Discard work from requests issued before the last flush.
    if epoch != self.jobepoch:
        self.core.log(self, "Discarding %d jobs from %s response because request was issued before flush\n" % (roll_ntime, source), 500)
        with self.stats.lock:
            self.stats.jobsreceived += roll_ntime
        return
    # Discard work when the queue is already (over)full; discardiffull
    # tightens the threshold from 5x target down to 1x.
    if self.core.workqueue.count > self.core.workqueue.target * (1 if discardiffull else 5):
        self.core.log(self, "Discarding %d jobs from %s response because work buffer is full\n" % (roll_ntime, source), 500)
        with self.stats.lock:
            self.stats.jobsreceived += roll_ntime
        return
    expiry += now - self.settings.expirymargin
    # Midstate is computed once: rolled jobs differ only in the ntime
    # field (bytes 68..72), which lies beyond the midstate prefix.
    midstate = Job.calculate_midstate(data)
    prefix = data[:68]
    timebase = struct.unpack(">I", data[68:72])[0]
    suffix = data[72:]
    return [Job(self.core, self, expiry, prefix + struct.pack(">I", timebase + i) + suffix, target, midstate, identifier) for i in range(roll_ntime)]
def integrate(self):
    """
    Run Integrate to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        # Integrate consumes the accepted/unmapped bams from tophat2.
        tophat_dir = os.path.join("fusions", "tophat2", sample.name)
        accepted_bam = os.path.join(self.output_dir, tophat_dir,
                                    "accepted_hits.bam")
        unmapped_bam = os.path.join(self.output_dir, tophat_dir,
                                    "unmapped.bam")
        out_dir = os.path.join("fusions", "integrate", sample.name)
        # Run from inside the output dir, then return ("cd -").
        steps = [
            Job(command="mkdir -p " + out_dir),
            Job(command="cd " + out_dir),
            integrate.integrate(accepted_bam, unmapped_bam, out_dir),
            Job(command="cd -"),
        ]
        jobs.append(concat_jobs(steps, name="integrate." + sample.name))
    return jobs
def sleep(self, guid: str, interval: int):
    """
    Set the checkin interval for an agent

    Usage: sleep <guid> <interval> [-h]

    Arguments:
        guid      filter by session's guid
        interval  checkin interval in milliseconds
    """
    # NOTE(review): sessions compare equal to a guid string — presumably
    # the session class implements __eq__ against its guid; confirm.
    for session in (s for s in self.sessions if s == guid):
        session.add_job(Job(command=('sleep', int(interval))))
def fusionmap(self):
    """
    Run FusionMap to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        # The bfx fusionmap script needs the pipeline's top output dir;
        # self._output_dir was assigned from command-line args in
        # pipeline.py.
        top_dir = self._output_dir
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "fusionmap", sample.name)
        steps = [
            Job(command="mkdir -p " + out_dir),
            fusionmap.fusionmap(fastq1, fastq2, out_dir, top_dir),
            # List the 02_RNA* report files, presumably to surface them
            # in the job output.
            Job(command="ls " + out_dir + "/02_RNA*"),
        ]
        jobs.append(concat_jobs(steps, name="fusionmap." + sample.name))
    return jobs
def get_jobs(self):
    """Expand every task into its concrete jobs over the horizon H.

    Job k (0-based) of a task is released at phase + k*period and has an
    absolute deadline of phase + k*period + relative deadline.
    """
    jobs = []
    for task in self.tasks:
        for k in range(task.get_number_of_jobs(self.H)):
            release = task.phase + task.period * k
            due = task.deadline + task.period * k + task.phase
            jobs.append(Job(task=task,
                            name=k + 1,
                            release=release,
                            deadline=due,
                            ex_time=task.ex_time,
                            status=1))
    return jobs
def run_star_seqr(self):
    """
    RNA Fusion Detection and Quantification using STAR
    https://github.com/ExpressionAnalysis/STAR-SEQR

    Accepts a single .bam readset (paired fastqs expected under
    fusions/picard_sam_to_fastq/<sample>) or gzipped paired fastqs.

    Returns:
        list: one concatenated Job per sample.

    Raises:
        Exception: on multiple readsets or unsupported input type.
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: the right mate previously reused ".pair1.fastq.gz",
            # passing read 1 as both mates to STAR-SEQR.
            right_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and \
                sample.readsets[0].fastq2.split(".")[-1] == "gz":
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "star_seqr", sample.name)
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir),
            star_seqr.run(left_fastq, right_fastq, output_dir, sample.name,
                          keep_bam=self.args.keep_bams)
        ], name="run_star_seqr." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def delete_fastqs(self):
    """
    Delete fastqs when all callers' jobs are finished
    """
    jobs = []
    for sample in self.samples:
        # Per-caller result files; each one's existence signals that the
        # corresponding caller finished for this sample.
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # result_file_list = [defuse_result, fusionmap_result, ericscript_result, integrate_result,
        #                     star_seqr_result, arriba_result, star_fusion_result]
        # NOTE(review): only defuse and fusionmap are passed to the delete
        # helper, while job.input_files below waits on ALL callers —
        # confirm the shorter list is intentional.
        result_file_list = [defuse_result, fusionmap_result]
        del_job = delete_fastqs.delete_fastqs(sample.name, result_file_list)
        job = concat_jobs([Job(command="mkdir -p delete_fastqs"), del_job],
                          name="delete_fastqs." + sample.name)
        # job = concat_jobs([
        #     Job(command="mkdir -p delete_fastqs")
        # ], name="delete_fastqs." + sample.name)
        # Scheduling dependency: only run once every caller has produced
        # its result file.
        job.input_files = [
            defuse_result, fusionmap_result, ericscript_result,
            integrate_result, star_seqr_result, arriba_result,
            star_fusion_result, cicero_result
        ]
        jobs.append(job)
    # DELETE BAMS JOB (one across all samples)
    # NOTE(review): uses result_file_list from the LAST loop iteration —
    # confirm this is intended rather than a per-sample/all-samples list.
    del_bams_job = concat_jobs(
        [delete_fastqs.delete_bams(result_file_list, self._output_dir)],
        name="delete_bams")
    jobs.append(del_bams_job)
    return jobs
def tophat2(self):
    """
    Run Tophat2 for Integrate. Determines accepted hits and unmapped
    reads, and outputs corresponding .bam files required as input files
    for the integrate step.
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join(self.output_dir, "fusions", "tophat2",
                               sample.name)
        aligner = tophat2.tophat2(fastq1, fastq2, out_dir)
        jobs.append(concat_jobs(
            [Job(command="mkdir -p " + out_dir), aligner],
            name="tophat2." + sample.name))
    return jobs
def MetaFusion_clinical(self):
    """
    Run MetaFusion.IsoHunter.clinical
    """
    out_dir = os.path.join("fusions", "metafusion_clinical")
    runner = metafusion_clinical.run_metafusion_clinical(
        self._output_dir, self.args.database)
    job = concat_jobs([Job(command="mkdir -p " + out_dir), runner],
                      name="MetaFusion.clinical")
    return [job]
def MetaFusion_IsoHunter(self):
    """
    Run MetaFusion.IsoHunter
    """
    out_dir = os.path.join("fusions", "metafusion_isohunter")
    runner = metafusion_isohunter.run_isohunter_singularity(self._output_dir)
    job = concat_jobs([Job(command="mkdir -p " + out_dir), runner],
                      name="MetaFusion.IsoHunter")
    return [job]
def run(self, guids: List[str]):
    """
    Run a module

    Usage: run <guids>...
           run -h | --help

    Arguments:
        guids  session guids to run modules on

    Options:
        -h, --help   Show dis
    """
    # One job for the currently selected module, published to each guid.
    pending = Job(self.selected)
    for target_guid in guids:
        ipc_server.publish(NEW_JOB, (target_guid, pending.encode()))
def main():
    """Command-line entry point for the word-count MapReduce job.

    Usage: script input_dir output_dir
    """
    # BUGFIX: converted Python 2 `print` statements to the Python 3
    # print() function — the rest of this codebase is Python 3, so the
    # old statements were syntax errors.
    if len(sys.argv) < 3:
        print('usage: %s input_dir output_dir' % sys.argv[0])
        return
    conf = DefaultConfigure()
    job = Job(conf)
    # sic: the Job API method name is misspelled upstream ("splliter").
    job.set_splliter(LineSplitter)
    job.set_mapper(WordCountMapper)
    job.set_mapper_num(4)
    job.set_reducer(WordCountReducer)
    job.set_reducer_num(1)
    job.add_input_path(sys.argv[1])
    job.set_output_path(sys.argv[2])
    print(job.run())
def wrapper():
    """Request handler wrapping *api* with agent status bookkeeping."""
    # Refuse work while the agent is disabled.
    if Agent.get('agent_status') == 'disabled':
        return jsonify({'status': 'disabled'})
    Agent.set('agent_status', 'busy')
    log.info(f'processing request: \n'
             f'{[{k:v} for k,v in request.args.items()]}\n'
             f'role: {api.__name__}')
    try:
        job = Job(request)
        job.set('role', api.__name__)
        log.info(f'job object created with id: {job.job_id}')
        return api(job)
    except Exception as e:
        # NOTE(review): on failure this logs and implicitly returns None,
        # and agent_status is left at 'busy' — confirm that is intended.
        log.info(f'error in job processing: {e}', report=True)
def MetaFusion(self):
    """
    Run MetaFusion
    """
    # NOTE(review): cff_dir_abspath is computed but never used here.
    cff_dir_abspath = os.path.join(self._output_dir, "fusions", "cff")
    out_dir_abspath = os.path.join(self._output_dir, "fusions",
                                   "metafusion")
    runner = metafusion.run_metafusion_singularity(out_dir_abspath)
    # metafusion_job.name = "MetaFusion"
    job = concat_jobs([Job(command="mkdir -p " + out_dir_abspath), runner],
                      name="MetaFusion")
    return [job]
def run(self, guids: List[str]):
    """
    Run a module

    Usage: run <guids>...
           run -h | --help

    Arguments:
        guids  session guids to run modules on

    Options:
        -h, --help   Show dis
    """
    # Queue a fresh Job for the selected module on each requested guid.
    for target_guid in guids:
        self.prompt_session.contexts[1].add_job(
            (target_guid, Job(module=self.selected)))
def main():
    """Poll the 'jobs' queue forever, dispatching each job to its backend."""
    logging.configure()
    conn = psql.connect('')
    queue = pq.PQ(conn=conn)['jobs']
    backends = load_backends(conn.cursor())
    for entry in queue:
        if entry is None:
            # Queue empty: back off briefly before polling again.
            time.sleep(2)
            continue
        job = Job(**entry.data)
        run_job(conn.cursor(), backends[job.backend_id], job)
        conn.commit()
def defuse(self):
    """
    Run Defuse to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "defuse", sample.name)
        caller = defuse.defuse(fastq1, fastq2, out_dir,
                               keep_bam=self.args.keep_bams)
        jobs.append(concat_jobs(
            [Job(command="mkdir -p " + out_dir), caller],
            name="defuse." + sample.name))
    return jobs
def __init__(self, guid, remote_address, pubkey_xml):
    """Per-agent session: crypto channel, job queue, dedicated log file."""
    self.guid = guid
    self.address = remote_address
    self.data = None
    self.checkin_time = None
    self.crypto = ECDHE(pubkey_xml)
    self.jobs = Queue()
    # Dedicated, non-propagating DEBUG logger writing ./logs/<guid>.log.
    self.logger = logging.getLogger(str(guid))
    self.logger.propagate = False
    self.logger.setLevel(logging.DEBUG)
    file_handler = logging.FileHandler(f"./logs/{guid}.log", encoding='UTF-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    self.logger.addHandler(file_handler)
    # Ask the agent to check in as soon as the session exists.
    self.add_job(Job(command=('checkin', '')))
def star_fusion(self):
    """
    Run STAR-Fusion to call gene fusions
    """
    # NOTE(review): hard-coded absolute CTAT library path — consider
    # moving this into the pipeline configuration.
    CTAT_resource_lib = "/hpf/largeprojects/ccmbio/mapostolides/validate_fusion/test_star_star-fusion/GRCh37_v19_CTAT_lib_Feb092018.plug-n-play/ctat_genome_lib_build_dir"
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "star_fusion", sample.name)
        # star_fusion_job = star_fusion.star_fusion(fastq1, fastq2, out_dir, CTAT_resource_lib)
        caller = star_fusion.star_fusion(fastq1, fastq2, CTAT_resource_lib,
                                         out_dir,
                                         keep_bam=self.args.keep_bams)
        jobs.append(concat_jobs(
            [Job(command="mkdir -p " + out_dir), caller],
            name="star_fusion." + sample.name))
    return jobs
def fusion_stats(self):
    """
    Outputs count files and plots about the detected gene fusions.
    """
    cff_dir = os.path.join("fusions", "cff")
    out_dir = os.path.join("fusions", "fusion_stats")
    sampleinfo_file = os.path.relpath(self.args.sampleinfo.name,
                                      self.output_dir)
    steps = [
        Job(command="mkdir -p " + out_dir),
        fusion_stats.fusion_stats(cff_dir, out_dir, sampleinfo_file),
        fusion_stats.generate_category_count_table(cff_dir, out_dir),
        fusion_stats.generate_categories_barplot(fusion_stats_dir=out_dir),
    ]
    return [concat_jobs(steps, name="fusion_stats")]
def convert_fusion_results_to_cff(self):
    """
    Convert fusion results of all 4 gene fusion callers to cff format
    """
    jobs = []
    out_dir = os.path.join("fusions", "cff")
    # All per-sample conversion jobs (plus the mkdir) are concatenated
    # into a single "cff_conversion" job at the end.
    job_list = [Job(command="mkdir -p " + out_dir)]
    sampleinfo_file = os.path.relpath(self.args.sampleinfo.name,
                                      self.output_dir)
    for sample in self.samples:
        # Define result files
        # output_file = os.path.join(output_dir, prefix + "_STAR-SEQR", prefix + "_STAR-SEQR_candidates.txt")
        # star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
        #                                 "out_STAR-SEQR", "out_STAR-SEQR_candidates.txt")
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        # print >> sys.stderr, star_seqr_result
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        # star_fusion_result = os.path.join("fusions", "star_fusion",
        #                                   sample.name, "star-fusion.fusion_predictions.abridged.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # Build tool_results list based on self.tool_list
        result_file_dict = {
            "star_seqr": star_seqr_result,
            "arriba": arriba_result,
            "star_fusion": star_fusion_result,
            "defuse": defuse_result,
            "fusionmap": fusionmap_result,
            "ericscript": ericscript_result,
            "integrate": integrate_result,
            "cicero": cicero_result
        }
        # Only convert output from the callers the user actually enabled.
        tool_results = [(key, result_file_dict[key])
                        for key in result_file_dict.keys()
                        if key in self.tool_list]
        # tool_results = [("star_seqr",star_seqr_result), ("arriba", arriba_result),
        #                 ("star_fusion", star_fusion_result), ("defuse", defuse_result),
        #                 ("fusionmap", fusionmap_result), ("ericscript", ericscript_result),
        #                 ("integrate", integrate_result)]
        # tool_results = [("arriba", arriba_result), ("star_fusion", star_fusion_result),
        #                 ("defuse", defuse_result), ("fusionmap", fusionmap_result),
        #                 ("ericscript", ericscript_result), ("integrate", integrate_result)]
        # determine sample_type
        """
        sample_type = ""
        for contrast in self.contrasts:
            if sample in contrast.controls:
                sample_type = "Normal"
            elif sample in contrast.treatments:
                sample_type = "Tumor"
            if sample_type:
                disease_name = contrast.name
                break
        if not sample_type:
            raise Exception("Error: sample " + sample.name + " not found in design file " + self.args.design.name)
        """
        # convert caller output files to common fusion format(cff)
        for tool, result_file in tool_results:
            job = cff_conversion.cff_convert(sample.name, result_file,
                                             sampleinfo_file, tool, out_dir)
            # Trim trailing whitespace from the generated shell command.
            job.command = job.command.strip()
            job_list.append(job)
    job = concat_jobs(job_list, name="cff_conversion")
    jobs.append(job)
    return jobs
def run_cicero(self):
    """
    Fusion detection specializing in internal tandem duplication (ITD)
    https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02043-x
    https://github.com/stjude/Cicero

    This software runs as a docker application. However, this can also be
    installed manually. As of May 2021, versions 0.2.0, 0.3.0 and 1.4.2
    are available as modules on the HPF.

    Also runs RNApeg, a complementary tool to generate the junctions file
    for use by CICERO. Available on the HPF via RNApeg/20210226 and runs
    as a singularity container.
    """
    jobs = []
    for sample in self.samples:
        # --- Resolve input fastqs ------------------------------------
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input: fastqs come from the picard_sam_to_fastq step.
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                     sample.name)
            bam = sample.readsets[0].bam
            fq1 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            # BUGFIX: fq2 previously reused ".pair1.fastq.gz", feeding
            # read 1 to the aligner twice instead of the proper mate.
            fq2 = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and \
                sample.readsets[0].fastq2.split(".")[-1] == "gz":
            fq1 = sample.readsets[0].fastq1
            fq2 = sample.readsets[0].fastq2
        else:
            raise Exception(
                "Error: only .bam and .fastq.gz inputs allowed")

        # --- Directories ---------------------------------------------
        # Scratch lives under /localhd and is removed automatically when
        # the PBS job ends; the variable stays unevaluated in the qsub
        # script.
        tmp_dir = "/localhd/${PBS_JOBID}"
        trim_dir = os.path.join(tmp_dir, "trimmomatic")
        align_dir = os.path.join(tmp_dir, "star")
        cicero_dir = os.path.join(tmp_dir, "cicero")
        rnapeg_dir = os.path.join(tmp_dir, "rnapeg")
        output_dir = os.path.join("fusions", "cicero", sample.name)

        # --- Files ---------------------------------------------------
        fq1_trimmed = os.path.join(
            trim_dir, "".join([sample.name, ".trimmed.R1.fq.gz"]))
        fq2_trimmed = os.path.join(
            trim_dir, "".join([sample.name, ".trimmed.R2.fq.gz"]))
        fq1_dropped = os.path.join(
            trim_dir, "".join([sample.name, ".filtered.R1.fq.gz"]))
        fq2_dropped = os.path.join(
            trim_dir, "".join([sample.name, ".filtered.R2.fq.gz"]))
        trim_log = os.path.join(trim_dir, "".join([sample.name, ".trim.log"]))
        star_bam = os.path.join(align_dir, "Aligned.sortedByCoord.out.bam")
        dedup_bam = os.path.join(align_dir,
                                 "Aligned.sortedByCoord.dedup.bam")
        dedup_metrics = os.path.join(align_dir,
                                     "Aligned.sortedByCoord.dedup.metrics")
        symlink_bam = os.path.join(cicero_dir, sample.name + ".bam")
        junction_file = os.path.join(
            rnapeg_dir, sample.name + ".bam.junctions.tab.shifted.tab")

        # --- Jobs ----------------------------------------------------
        trim = trimmomatic.trimmomatic(
            fq1, fq2, fq1_trimmed, fq1_dropped, fq2_trimmed, fq2_dropped,
            None, None,
            config.param("trimmomatic", "adapter_fasta", required=False),
            trim_log)
        align = star.align(fq1_trimmed, fq2_trimmed, align_dir,
                           config.param("run_cicero", "genome_build"),
                           rg_id=sample.name,
                           rg_library=sample.name,
                           rg_sample=sample.name,
                           rg_platform="ILLUMINA",
                           sort_bam=True)
        index = samtools.index(star_bam)  # Also indexes for us!
        dedup = picard.mark_duplicates([star_bam], dedup_bam, dedup_metrics)
        # RNApeg — produces the junctions file CICERO consumes.
        rna_peg = Job(
            input_files=[dedup_bam],
            output_files=[junction_file],
            module_entries=[("run_cicero", "module_rnapeg")],
            name="RNApeg",
            command="""ln -s \\\n{idx_file} \\\n{new_idx_file} && \\
ln -s {bamfile} \\\n{new_bamfile} && \\
singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd -B {outpath}:/results \\
$(which rnapeg.sif) RNApeg.sh -b {new_bamfile} \\\n -f {ref} \\\n -r {reflat}"""
            .format(bamfile=dedup_bam,
                    ref=config.param("run_cicero", "reference",
                                     required=True),
                    reflat=config.param("run_cicero", "reflat",
                                        required=True),
                    outpath=rnapeg_dir,
                    idx_file=re.sub(r"\.bam$", ".bai", dedup_bam),
                    new_bamfile=symlink_bam,
                    new_idx_file=symlink_bam + ".bai"))
        # Cicero itself, as a singularity container.
        cicero = Job(
            input_files=[dedup_bam, junction_file],
            output_files=[
                os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                             "final_fusions.txt")
            ],
            module_entries=[("run_cicero", "module_cicero")],
            name="run_cicero" + sample.name,
            command="""singularity exec --cleanenv -B /hpf:/hpf -B /localhd:/localhd \\
$CICERO_PATH/CICERO_1.4.2.sif \\
Cicero.sh -n {threads} -b {bamfile} \\\n -g {genome} \\\n -r {reference} \\\n -j {junction} -o {out_dir}"""
            .format(threads=config.param("run_cicero", "threads",
                                         required=True),
                    bamfile=symlink_bam,
                    genome=config.param("run_cicero", "genome",
                                        required=True),
                    reference=config.param("run_cicero", "cicero_data",
                                           required=True),
                    junction=junction_file,
                    out_dir=cicero_dir))
        # Copy results out of scratch before PBS wipes it.
        save_out = Job(
            input_files=[
                os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                             "final_fusions.txt")
            ],
            output_files=[os.path.join(output_dir, "final_fusions.txt")],
            name="save_cicero_results" + sample.name,
            command="""mv {files_to_keep} {target_dir}""".format(
                files_to_keep=" ".join([
                    junction_file,
                    os.path.join(cicero_dir, "0*.{err,log}"),  # Logs
                    os.path.join(cicero_dir, "CICERO_DATADIR", sample.name,
                                 "*.{txt,frame.tab,html}")  # Result files
                ]),
                target_dir=output_dir))
        # the files in /localhd/ should be removed automatically upon job end
        job_mkdir = Job(
            command="mkdir -p {trim} {align} {cicero} {output} {rnapeg}".
            format(trim=trim_dir,
                   align=align_dir,
                   cicero=cicero_dir,
                   output=output_dir,
                   rnapeg=rnapeg_dir))
        combined_job = concat_jobs([
            job_mkdir, trim, align, index, dedup, rna_peg, cicero, save_out
        ], name="run_cicero." + sample.name)
        # Replace input and output specification so the scheduler tracks
        # only the external fastqs in and the final fusion calls out.
        combined_job._output_files = [
            os.path.join(output_dir, "final_fusions.txt")
        ]
        combined_job.input_files = [fq1, fq2]
        jobs.append(combined_job)
    return jobs