def submit_job(self, func, job_key, args=None, kwargs=None, trigger=None,
               job_id=None, replace_exist=False, filter_key='', filter_value='',
               **trigger_args):
    """
    Submit a job to the master through RPC.
    :type func: str or callable obj or unicode
    :type job_key: str or unicode
    :type args: tuple or list
    :type kwargs: dict
    :type trigger: str or unicode
    :type job_id: str or unicode
    :type replace_exist: bool
    :type trigger_args: dict
    """
    job_key = '%s:%s' % (self.name, job_key)
    # fall back to the worker's timezone if the trigger doesn't provide its own
    # `timezone` configuration
    trigger_args.setdefault('timezone', self.timezone)
    job_in_dict = {
        'id': job_id,
        'func': func,
        'args': args,
        'trigger': create_trigger(trigger, trigger_args) if trigger else None,
        'kwargs': kwargs,
        'filter_key': '%s_%s' % (self.name, filter_key),
        'filter_value': filter_value,
    }
    job = Job(**job_in_dict)
    rpc_client_call('submit_job', Binary(job.serialize()), job_key, job.id,
                    replace_exist)
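A minimal usage sketch for the submission API above, assuming an ElricWorker-style class that exposes submit_job; the class name, its constructor arguments, and the send_report function are illustrative assumptions, not part of the source.

def send_report(address, subject):
    # hypothetical task function executed on the worker side
    print('mailing %s: %s' % (address, subject))

worker = ElricWorker(name='mail_worker', timezone='UTC')  # assumed constructor
# no trigger: the master enqueues the job immediately
worker.submit_job(send_report, 'reports', args=('ops@example.com', 'ping'))
# cron trigger: extra keyword arguments flow into **trigger_args
worker.submit_job(send_report, 'reports', args=('ops@example.com', 'daily'),
                  trigger='cron', job_id='daily-report', replace_exist=True,
                  hour=8, minute=0)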
def _build_jobs(self, response, data, now, ignoreempty=False):
    roll_ntime = 1
    expiry = 60
    isp2pool = False
    headers = response.getheaders()
    for h in headers:
        if h[0].lower() == "x-is-p2pool" and h[1].lower() == "true":
            isp2pool = True
        elif h[0].lower() == "x-roll-ntime" and h[1] and h[1].lower() != "n":
            roll_ntime = 60
            parts = h[1].split("=", 1)
            if parts[0].strip().lower() == "expire":
                try:
                    roll_ntime = int(parts[1])
                except ValueError:
                    pass
            expiry = roll_ntime
    if isp2pool:
        expiry = 60
    self.stats.supports_rollntime = roll_ntime > 1
    response = data.decode("utf_8")
    if len(response) == 0 and ignoreempty:
        return
    response = json.loads(response)
    data = unhexlify(response["result"]["data"].encode("ascii"))
    target = unhexlify(response["result"]["target"].encode("ascii"))
    try:
        identifier = int(response["result"]["identifier"])
    except (KeyError, ValueError, TypeError):
        identifier = None
    midstate = Job.calculate_midstate(data)
    prefix = data[:68]
    timebase = struct.unpack(">I", data[68:72])[0]
    suffix = data[72:]
    return [Job(self.core, self, now + expiry - self.settings.expirymargin,
                prefix + struct.pack(">I", timebase + i) + suffix,
                target, midstate, identifier)
            for i in range(roll_ntime)]
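A small standalone sketch of the ntime-rolling split performed at the end of _build_jobs: the 4-byte big-endian timestamp at byte offset 68 of the block header is bumped once per rolled job, so roll_ntime headers are minted from a single getwork response. The dummy header below is fabricated for illustration.

import struct

header = b"\x00" * 68 + struct.pack(">I", 1700000000) + b"\x00" * 8  # fake 80-byte header
prefix, suffix = header[:68], header[72:]
timebase = struct.unpack(">I", header[68:72])[0]
# three rolled headers, each with the timestamp advanced by one second
rolled = [prefix + struct.pack(">I", timebase + i) + suffix for i in range(3)]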
def submit_job(self, serialized_job, job_key, job_id, replace_exist):
    """
    Receive a submit_job RPC request from a worker.
    :type serialized_job: str or xmlrpclib.Binary
    :type job_key: str
    :type job_id: str
    :type replace_exist: bool
    """
    self.log.debug('client call submit job, id=%s, key=%s' % (job_id, job_key))
    if isinstance(serialized_job, Binary):
        serialized_job = serialized_job.data
    job_in_dict = Job.deserialize_to_dict(serialized_job)
    # if the job doesn't carry a trigger, enqueue it into the job queue immediately
    if not job_in_dict['trigger']:
        self._enqueue_job(job_key, serialized_job)
    # otherwise store the job into the job store first
    else:
        # the job store is shared with the scheduling loop, so guard it with a lock
        with self.jobstore_lock:
            try:
                self.jobstore.add_job(job_id, job_key,
                                      job_in_dict['next_run_time'], serialized_job)
            except JobAlreadyExist:
                if replace_exist:
                    self.jobstore.update_job(job_id, job_key,
                                             job_in_dict['next_run_time'],
                                             serialized_job)
                else:
                    self.log.warning('submit job error: job id %s already exists'
                                     % job_id)
    # wake the scheduler up now that a new job has been stored
    self.wake_up()
def start_new_task(task, user_proc_exit_cb=None, save_buffer=True):
    worker = TornadoWorker.get_instance()
    logging.debug("we have a new job to start: %s" % str(task))
    core = Core.get_instance()
    new_job = Job.create_job(task, core)
    logging.debug(new_job)
    if new_job.task_id not in worker.pool["buffer"]:
        worker.pool["buffer"][new_job.task_id] = []
    task.update(status=JobDataModel.STATUS_RUNNING)
    if user_proc_exit_cb:
        proc_exit_cb = lambda job, exit_code: TornadoWorker.process_finished(
            worker, job, exit_code, user_proc_exit_cb)
    else:
        proc_exit_cb = lambda job, exit_code: TornadoWorker.process_finished(
            worker, job, exit_code)
    if save_buffer:
        read_logs = lambda lines, log_level: TornadoWorker.async_read_logs(
            worker, new_job, lines, log_level)
    else:
        read_logs = lambda lines, log_level: TornadoWorker.async_write_logs2stdout(
            worker, new_job, lines, log_level)
    job_log_manager = LogManager(core)
    new_job.start_job_async(job_log_manager, proc_exit_cb, read_logs)
def add_command(self, run_time, number):
    run_time = int(run_time)
    number = int(number)
    finish_events = {
        "disco": DiskFinishedEvent,
        "leitora1": LeitoraUmFinishedEvent,
        "leitora2": LeitoraDoisFinishedEvent,
        "impressora1": ImpressoraUmFinishedEvent,
        "impressora2": ImpressoraDoisFinishedEvent
    }
    for _ in range(number):
        io = {
            "disco": None,
            "leitora1": None,
            "leitora2": None,
            "impressora1": None,
            "impressora2": None
        }
        last_start_cycles = [1]
        for dev in io.keys():
            io_requests = []
            # each device has a 90% chance of being used by the job
            has_device = random.random() < 0.9
            if not has_device:
                continue
            number_requests = random.randint(1, 5)
            for i in range(number_requests):
                io_cycles = random.randint(*io_config[dev])
                try:
                    start_cycle = random.randint(
                        last_start_cycles[-1],
                        i * run_time // number_requests - io_cycles)
                    if start_cycle in last_start_cycles:
                        continue
                    last_start_cycles.append(start_cycle)
                except ValueError:
                    # empty randint range: skip this request
                    continue
                io_requests.append((start_cycle, io_cycles))
            if len(io_requests):
                io[dev] = Device(dev, io_requests, finish_events[dev])
        job_priority = random.choice(list(JobPriority))
        job_size = random.randint(10, 70)
        new_job = Job(self.job_ids, run_time, job_priority, io, job_size)
        self.job_ids += 1
        self.os.add_job(new_job)
def wrapper():
    if Agent.get('agent_status') == 'disabled':
        return jsonify({'status': 'disabled'})
    Agent.set('agent_status', 'busy')
    log.info(f'processing request: \n'
             f'{[{k: v} for k, v in request.args.items()]}\n'
             f'role: {api.__name__}')
    try:
        job = Job(request)
        job.set('role', api.__name__)
        log.info(f'job object created with id: {job.job_id}')
        return api(job)
    except Exception as e:
        # failures are reported but swallowed, so the route returns None
        log.info(f'error in job processing: {e}', report=True)
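The closure above reads like the body of a request-handling decorator: api is the wrapped endpoint and request is a Flask-style request global. A hedged sketch of what the enclosing decorator might look like; the decorator name and the route usage are assumptions, not from the source.

from functools import wraps

def agent_route(api):
    @wraps(api)
    def wrapper():
        ...  # body as above: gate on agent_status, build a Job, dispatch to api
    return wrapper

# assumed usage: stack @app.route('/tasking') over @agent_route on the handler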
def run_arriba(self):
    """
    Run Arriba to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq", sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            right_fastq = os.path.join(
                self._output_dir, fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(".")[-1] == "gz":
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception("Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "arriba", sample.name)
        # JOBS
        chgdir_job = Job(command="cd " + output_dir)
        back_to_outdir_job = Job(command="cd " + self._output_dir)
        # CONCAT
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir),
            chgdir_job,
            arriba.run(left_fastq, right_fastq, self._output_dir, output_dir,
                       keep_bam=self.args.keep_bams),
            back_to_outdir_job
        ], name="run_arriba." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def run(self):
    for job_config in self.job_configs:
        assert job_config.submit_time >= self.env.now
        yield self.env.timeout(job_config.submit_time - self.env.now)
        job = Job(self.env, job_config)
        # print('a task arrived at time %f' % self.env.now)
        self.cluster.add_job(job)
    self.destroyed = True
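A minimal sketch of driving the generator above inside a simpy simulation. The Broker class name and constructor shape are assumptions inferred from how run() uses self.env, self.cluster, and self.job_configs; the JobConfig stub stands in for the real config objects.

import simpy

class JobConfig:
    # stand-in with only the field run() reads
    def __init__(self, submit_time):
        self.submit_time = submit_time

env = simpy.Environment()
broker = Broker(env, cluster, [JobConfig(0.0), JobConfig(5.0)])  # assumed constructor
env.process(broker.run())  # register the arrival process
env.run()                  # each job reaches cluster.add_job at its submit_time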
def run(self, guids: List[str]):
    """
    Run a module

    Usage: run <guids>...
           run -h | --help

    Arguments:
        guids   session guids to run modules on

    Options:
        -h, --help   Show this help message
    """
    job = Job(self.selected)
    for guid in guids:
        ipc_server.publish(NEW_JOB, (guid, job.encode()))
def gunzip_fastq(self):
    """
    Gunzip .fastq.gz files or symlink if already uncompressed
    """
    jobs = []
    for readset in self.readsets:
        out_dir = os.path.join("fusions", "gunzip_fastq", readset.sample.name)
        # Find input readset FASTQs first from previous trimmomatic job,
        # then from original FASTQs in the readset sheet
        if readset.run_type == "PAIRED_END":
            candidate_input_files = []
            if readset.fastq1 and readset.fastq2:
                candidate_input_files.append([readset.fastq1, readset.fastq2])
            if readset.bam:
                picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                          readset.sample.name)
                candidate_input_files.append([
                    os.path.join(picard_dir, os.path.basename(
                        re.sub(r"\.bam$", ".pair1.fastq.gz", readset.bam))),
                    os.path.join(picard_dir, os.path.basename(
                        re.sub(r"\.bam$", ".pair2.fastq.gz", readset.bam)))
                ])
            if readset.cram:
                picard_dir = os.path.join("fusions", "picard_sam_to_fastq",
                                          readset.sample.name)
                candidate_input_files.append([
                    os.path.join(picard_dir,
                                 os.path.basename(readset.cram) + ".pair1.fastq.gz"),
                    os.path.join(picard_dir,
                                 os.path.basename(readset.cram) + ".pair2.fastq.gz")
                ])
            [fastq1, fastq2] = self.select_input_files(candidate_input_files)
        else:
            raise Exception("Error: run type \"" + readset.run_type +
                            "\" is invalid for readset \"" + readset.name +
                            "\" (should be PAIRED_END)!")
        gunzip1_job = gunzip.gunzip_fastq(fastq1, out_dir)
        gunzip2_job = gunzip.gunzip_fastq(fastq2, out_dir)
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir), gunzip1_job, gunzip2_job],
            name="gunzip_fastq." + readset.sample.name + "." + readset.name)
        jobs.append(job)
    return jobs
def chimerascan(self):
    """
    Run chimerascan to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "chimerascan", sample.name)
        chimerascan_job = chimerascan.run(fastq1, fastq2, out_dir)
        job = concat_jobs([
            Job(command="mkdir -p " + out_dir),
            # remove it again: chimerascan appears to insist on creating its
            # output directory itself
            Job(command="rm -r " + out_dir),
            chimerascan_job
        ], name="chimerascan." + sample.name)
        jobs.append(job)
    return jobs
def __init__(self, guid, remote_address, pubkey_xml):
    self.guid = guid
    self.address = remote_address
    self.data = None
    self.checkin_time = None
    self.crypto = ECDHE(pubkey_xml)
    self.jobs = Queue()
    # every new session starts with a queued check-in job for the agent
    self.add_job(Job(command=('checkin', '')))
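A hedged sketch of how a session object like the one above gets used, based only on the attributes visible here; the Session class name, the guid, the address, and pubkey_xml are placeholders.

session = Session('3c9d...', ('10.0.0.5', 51234), pubkey_xml)  # assumed class name
session.add_job(Job(command=('sleep', 5000)))  # queue additional tasking
next_job = session.jobs.get()                  # handed to the agent on its next check-in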
def _build_jobs(self, response, data, epoch, now, source,
                ignoreempty=False, discardiffull=False):
    decoded = data.decode("utf_8")
    if len(decoded) == 0 and ignoreempty:
        self.core.log(self, "Got empty %s response\n" % source, 500)
        return
    decoded = json.loads(decoded)
    data = unhexlify(decoded["result"]["data"].encode("ascii"))
    target = unhexlify(decoded["result"]["target"].encode("ascii"))
    try:
        identifier = int(decoded["result"]["identifier"])
    except (KeyError, ValueError, TypeError):
        identifier = None
    if identifier != self.lastidentifier:
        self._cancel_jobs()
        self.lastidentifier = identifier
    self.blockchain.check_job(Job(self.core, self, 0, data, target, True, identifier))
    roll_ntime = 1
    expiry = 60
    isp2pool = False
    headers = response.getheaders()
    for h in headers:
        if h[0].lower() == "x-is-p2pool" and h[1].lower() == "true":
            isp2pool = True
        elif h[0].lower() == "x-roll-ntime" and h[1] and h[1].lower() != "n":
            roll_ntime = 60
            parts = h[1].split("=", 1)
            if parts[0].strip().lower() == "expire":
                try:
                    roll_ntime = int(parts[1])
                except ValueError:
                    pass
            expiry = roll_ntime
    if isp2pool:
        expiry = 60
    self.stats.supports_rollntime = roll_ntime > 1
    if epoch != self.jobepoch:
        self.core.log(self, "Discarding %d jobs from %s response because request "
                            "was issued before flush\n" % (roll_ntime, source), 500)
        with self.stats.lock:
            self.stats.jobsreceived += roll_ntime
        return
    if self.core.workqueue.count > self.core.workqueue.target * (1 if discardiffull else 5):
        self.core.log(self, "Discarding %d jobs from %s response because work "
                            "buffer is full\n" % (roll_ntime, source), 500)
        with self.stats.lock:
            self.stats.jobsreceived += roll_ntime
        return
    expiry += now - self.settings.expirymargin
    midstate = Job.calculate_midstate(data)
    prefix = data[:68]
    timebase = struct.unpack(">I", data[68:72])[0]
    suffix = data[72:]
    return [Job(self.core, self, expiry,
                prefix + struct.pack(">I", timebase + i) + suffix,
                target, midstate, identifier)
            for i in range(roll_ntime)]
def main(hub_id, dataset_id, version):
    conn = psql.connect('')
    queue = pq.PQ(conn=conn)['jobs']
    queue.put(Job(1, 'verify_partitions', {
        'hub_id': hub_id,
        'dataset_id': dataset_id,
        'version': version,
    }).__dict__)
def ericscript(self):
    """
    Run EricScript to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "ericscript", sample.name)
        ericscript_job = ericscript.ericscript(fastq1, fastq2, out_dir,
                                               keep_bam=self.args.keep_bams)
        job = concat_jobs([
            Job(command="mkdir -p " + out_dir),
            # remove it again: EricScript appears to create its output directory itself
            Job(command="rm -r " + out_dir),
            ericscript_job
        ], name="ericscript." + sample.name)
        jobs.append(job)
    return jobs
def start(self):
    # self._install_signal_handlers()
    if self.running:
        raise AlreadyRunningException
    self._stopped = False
    self.log.debug('elric worker running..')
    while self.running:
        key, serialized_job = RedisJobQueue.dequeue_any(self.server, self.listen_keys)
        job = Job.deserialize(serialized_job)
        self.log.debug('get job id=[%s] func=[%s] from key %s'
                       % (job.id, job.func, key))
        self.executor.execute_job(job)
def start(self):
    """
    Start the elric master. Select all due jobs from the job store and enqueue
    them into the redis queue, then write the due jobs' updated information
    back to the job store.
    :return:
    """
    if self.running:
        raise AlreadyRunningException
    self._stopped = False
    self.log.debug('elric master start...')
    while True:
        now = datetime.now(self.timezone)
        wait_seconds = None
        with self.jobstore_lock:
            for job_id, job_key, serialized_job in self.jobstore.get_due_jobs(now):
                # enqueue the due job into the redis queue
                self._enqueue_job(job_key, serialized_job)
                # update the job's information, such as next_run_time
                job_in_dict = Job.deserialize_to_dict(serialized_job)
                last_run_time = Job.get_serial_run_times(job_in_dict, now)
                if last_run_time:
                    next_run_time = Job.get_next_trigger_time(job_in_dict,
                                                              last_run_time[-1])
                    if next_run_time:
                        job_in_dict['next_run_time'] = next_run_time
                        self.update_job(job_id, job_key, next_run_time,
                                        Job.dict_to_serialization(job_in_dict))
                    else:
                        # the job has no next run time, so remove it from the job store
                        self.remove_job(job_id=job_id)
            # take the closest run time in the job store as the wake-up time
            closest_run_time = self.jobstore.get_closest_run_time()
        if closest_run_time is not None:
            wait_seconds = max(timedelta_seconds(closest_run_time - now), 0)
            self.log.debug('Next wakeup is due at %s (in %f seconds)'
                           % (closest_run_time, wait_seconds))
        self._event.wait(wait_seconds if wait_seconds is not None
                         else self.MAX_WAIT_TIME)
        self._event.clear()
def get_jobs(self):
    jobs = []
    for task in self.tasks:
        for job_number in range(task.get_number_of_jobs(self.H)):
            start = task.phase + task.period * job_number
            end = task.deadline + task.period * job_number + task.phase
            job = Job(task=task, name=job_number + 1, release=start,
                      deadline=end, ex_time=task.ex_time, status=1)
            jobs.append(job)
    return jobs
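A worked instance of the release/deadline arithmetic above, assuming a periodic task with phase 0, period 10, relative deadline 10, and a hyperperiod H of 30 (so get_number_of_jobs would return 3); the numbers are illustrative only.

phase, period, deadline = 0, 10, 10
for job_number in range(3):                       # H // period jobs
    start = phase + period * job_number           # releases:  0, 10, 20
    end = deadline + period * job_number + phase  # deadlines: 10, 20, 30
    print(job_number + 1, start, end)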
def integrate(self):
    """
    Run Integrate to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        input_dir = os.path.join("fusions", "tophat2", sample.name)
        accepted_bam = os.path.join(self.output_dir, input_dir, "accepted_hits.bam")
        unmapped_bam = os.path.join(self.output_dir, input_dir, "unmapped.bam")
        out_dir = os.path.join("fusions", "integrate", sample.name)
        integrate_job = integrate.integrate(accepted_bam, unmapped_bam, out_dir)
        job = concat_jobs([
            Job(command="mkdir -p " + out_dir),
            Job(command="cd " + out_dir),
            integrate_job,
            Job(command="cd -")
        ], name="integrate." + sample.name)
        jobs.append(job)
    return jobs
def sleep(self, guid: str, interval: int):
    """
    Set the checkin interval for an agent

    Usage: sleep <guid> <interval> [-h]

    Arguments:
        guid       filter by session's guid
        interval   checkin interval in milliseconds
    """
    for session in self.sessions:
        # Session presumably implements __eq__ against its guid string
        if session == guid:
            session.add_job(Job(command=('sleep', int(interval))))
def fusionmap(self):
    """
    Run FusionMap to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        # pass the pipeline's top output dir as input to the bfx fusionmap script;
        # self._output_dir is assigned from command line args in pipeline.py
        top_dir = self._output_dir
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "fusionmap", sample.name)
        fusionmap_job = fusionmap.fusionmap(fastq1, fastq2, out_dir, top_dir)
        job = concat_jobs([
            Job(command="mkdir -p " + out_dir),
            fusionmap_job,
            Job(command="ls " + out_dir + "/02_RNA*")
        ], name="fusionmap." + sample.name)
        jobs.append(job)
    return jobs
def run_star_seqr(self):
    """
    RNA Fusion Detection and Quantification using STAR
    https://github.com/ExpressionAnalysis/STAR-SEQR
    """
    jobs = []
    for sample in self.samples:
        if len(sample.readsets) > 1:
            raise Exception("Error: only one read set per sample allowed")
        if sample.readsets[0].bam:
            # .bam input
            fastq_dir = os.path.join("fusions", "picard_sam_to_fastq", sample.name)
            bam = sample.readsets[0].bam
            left_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair1.fastq.gz", bam)))
            right_fastq = os.path.join(
                fastq_dir,
                os.path.basename(re.sub(r"\.bam$", ".pair2.fastq.gz", bam)))
        elif sample.readsets[0].fastq2 and sample.readsets[0].fastq2.split(".")[-1] == "gz":
            left_fastq = sample.readsets[0].fastq1
            right_fastq = sample.readsets[0].fastq2
        else:
            raise Exception("Error: only .bam and .fastq.gz inputs allowed")
        output_dir = os.path.join("fusions", "star_seqr", sample.name)
        job = concat_jobs([
            Job(command="mkdir -p " + output_dir),
            star_seqr.run(left_fastq, right_fastq, output_dir, sample.name,
                          keep_bam=self.args.keep_bams)
        ], name="run_star_seqr." + sample.name)
        job.samples = [sample]
        jobs.append(job)
    return jobs
def delete_fastqs(self):
    """
    Delete fastqs when all callers' jobs are finished
    """
    jobs = []
    for sample in self.samples:
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # only the defuse and fusionmap results gate the deletion script itself;
        # the full caller list is still declared as input dependencies below
        result_file_list = [defuse_result, fusionmap_result]
        del_job = delete_fastqs.delete_fastqs(sample.name, result_file_list)
        job = concat_jobs([Job(command="mkdir -p delete_fastqs"), del_job],
                          name="delete_fastqs." + sample.name)
        job.input_files = [
            defuse_result, fusionmap_result, ericscript_result, integrate_result,
            star_seqr_result, arriba_result, star_fusion_result, cicero_result
        ]
        jobs.append(job)
    # DELETE BAMS JOB (one across all samples)
    # note: result_file_list here is the list from the last sample iteration
    del_bams_job = concat_jobs(
        [delete_fastqs.delete_bams(result_file_list, self._output_dir)],
        name="delete_bams")
    jobs.append(del_bams_job)
    return jobs
def cancel_task(request, task_id):
    task_id = int(task_id)
    redirect_to = request.GET.get("back", "/")
    try:
        if task_id != 0:
            job = Job.get_job(task_id)
            job.job2canceled()
    except Exception:
        # the task may already be gone; just redirect
        pass
    return HttpResponseRedirect(redirect_to)
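A hedged sketch of how the view above could be wired into a Django URLconf; the URL pattern and module layout are assumptions, not from the source.

# urls.py (illustrative)
from django.urls import path
from .views import cancel_task  # assumed module layout

urlpatterns = [
    path('tasks/<int:task_id>/cancel/', cancel_task, name='cancel-task'),
]
# GET /tasks/42/cancel/?back=/tasks/ cancels job 42, then redirects to /tasks/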
def MetaFusion_clinical(self):
    """
    Run MetaFusion.IsoHunter.clinical
    """
    jobs = []
    out_dir_abspath = self._output_dir
    metafusion_outdir = os.path.join("fusions", "metafusion_clinical")
    metafusion_job = metafusion_clinical.run_metafusion_clinical(
        out_dir_abspath, self.args.database)
    job = concat_jobs(
        [Job(command="mkdir -p " + metafusion_outdir), metafusion_job],
        name="MetaFusion.clinical")
    jobs.append(job)
    return jobs
def MetaFusion_IsoHunter(self):
    """
    Run MetaFusion.IsoHunter
    """
    jobs = []
    out_dir_abspath = self._output_dir
    isohunter_outdir = os.path.join("fusions", "metafusion_isohunter")
    metafusion_job = metafusion_isohunter.run_isohunter_singularity(out_dir_abspath)
    job = concat_jobs(
        [Job(command="mkdir -p " + isohunter_outdir), metafusion_job],
        name="MetaFusion.IsoHunter")
    jobs.append(job)
    return jobs
def tophat2(self):
    """
    Run Tophat2 for Integrate. Determines accepted hits and unmapped reads, and
    outputs corresponding .bam files required as input files for integrate step.
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join(self.output_dir, "fusions", "tophat2", sample.name)
        tophat2_job = tophat2.tophat2(fastq1, fastq2, out_dir)
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir), tophat2_job],
            name="tophat2." + sample.name)
        jobs.append(job)
    return jobs
def MetaFusion(self):
    """
    Run MetaFusion
    """
    jobs = []
    cff_dir_abspath = os.path.join(self._output_dir, "fusions", "cff")
    out_dir_abspath = os.path.join(self._output_dir, "fusions", "metafusion")
    metafusion_job = metafusion.run_metafusion_singularity(out_dir_abspath)
    # metafusion_job.name = "MetaFusion"
    job = concat_jobs(
        [Job(command="mkdir -p " + out_dir_abspath), metafusion_job],
        name="MetaFusion")
    jobs.append(job)
    return jobs
def defuse(self):
    """
    Run Defuse to call gene fusions
    """
    jobs = []
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "defuse", sample.name)
        defuse_job = defuse.defuse(fastq1, fastq2, out_dir,
                                   keep_bam=self.args.keep_bams)
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir), defuse_job],
            name="defuse." + sample.name)
        jobs.append(job)
    return jobs
def main():
    logging.configure()
    conn = psql.connect('')
    queue = pq.PQ(conn=conn)['jobs']
    backends = load_backends(conn.cursor())
    for job_entry in queue:
        # iterating a pq queue yields None when no job arrives before the
        # queue timeout, so idle-wait and poll again
        if job_entry is None:
            time.sleep(2)
            continue
        job = Job(**job_entry.data)
        backend = backends[job.backend_id]
        run_job(conn.cursor(), backend, job)
        conn.commit()
def run(self, guids: List[str]):
    """
    Run a module

    Usage: run <guids>...
           run -h | --help

    Arguments:
        guids   session guids to run modules on

    Options:
        -h, --help   Show this help message
    """
    for guid in guids:
        self.prompt_session.contexts[1].add_job((guid, Job(module=self.selected)))
def __init__(self, guid, remote_address, pubkey_xml):
    self.guid = guid
    self.address = remote_address
    self.data = None
    self.checkin_time = None
    self.crypto = ECDHE(pubkey_xml)
    self.jobs = Queue()

    self.logger = logging.getLogger(str(guid))
    self.logger.propagate = False
    self.logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    fh = logging.FileHandler(f"./logs/{guid}.log", encoding='UTF-8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    self.logger.addHandler(fh)

    self.add_job(Job(command=('checkin', '')))
def star_fusion(self):
    """
    Run STAR-Fusion to call gene fusions
    """
    jobs = []
    CTAT_resource_lib = ("/hpf/largeprojects/ccmbio/mapostolides/validate_fusion/"
                         "test_star_star-fusion/"
                         "GRCh37_v19_CTAT_lib_Feb092018.plug-n-play/"
                         "ctat_genome_lib_build_dir")
    for sample in self.samples:
        fastq1, fastq2 = self.select_input_fastq(sample)
        out_dir = os.path.join("fusions", "star_fusion", sample.name)
        star_fusion_job = star_fusion.star_fusion(
            fastq1, fastq2, CTAT_resource_lib, out_dir,
            keep_bam=self.args.keep_bams)
        job = concat_jobs(
            [Job(command="mkdir -p " + out_dir), star_fusion_job],
            name="star_fusion." + sample.name)
        jobs.append(job)
    return jobs
def submit_job(self, serialized_job, job_key, job_id, replace_exist):
    def exist(key, value):
        # lazily create a per-key filter, then test membership
        with self.filter_lock:
            try:
                return self.filter_list[key].exist(value)
            except KeyError:
                self.filter_list[key] = MemoryFilter()
                return self.filter_list[key].exist(value)

    self.log.debug("client call submit job %s" % job_id)
    if isinstance(serialized_job, Binary):
        serialized_job = serialized_job.data
    job_in_dict = Job.deserialize_to_dict(serialized_job)
    filter_key = job_in_dict['filter_key']
    filter_value = job_in_dict['filter_value']
    if filter_key and filter_value:
        if exist(filter_key, filter_value):
            self.log.debug("%s has been filtered..." % filter_value)
            return False
    if not job_in_dict['trigger']:
        self._enqueue_job(job_key, serialized_job)
    else:
        with self.jobstore_lock:
            try:
                self.jobstore.add_job(job_id, job_key,
                                      job_in_dict['next_run_time'], serialized_job)
            except JobAlreadyExist:
                if replace_exist:
                    self.jobstore.update_job(job_id, job_key,
                                             job_in_dict['next_run_time'],
                                             serialized_job)
                else:
                    self.log.warning('job %s already exists' % job_id)
    self.wake_up()
    return True
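A sketch of the duplicate-filter semantics above from the caller's point of view, assuming MemoryFilter.exist records the value as it checks it; the master object, the job key, and the payload are assumptions for illustration.

payload = Binary(job.serialize())  # job carries filter_key='w1_url', filter_value=url
first = master.submit_job(payload, 'w1:crawl', job.id, False)   # True: enqueued
second = master.submit_job(payload, 'w1:crawl', job.id, False)  # False: filtered out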
def fusion_stats(self):
    """
    Outputs count files and plots about the detected gene fusions.
    """
    jobs = []
    cff_dir = os.path.join("fusions", "cff")
    out_dir = os.path.join("fusions", "fusion_stats")
    sampleinfo_file = os.path.relpath(self.args.sampleinfo.name, self.output_dir)
    fusion_stats_job = fusion_stats.fusion_stats(cff_dir, out_dir, sampleinfo_file)
    category_table_job = fusion_stats.generate_category_count_table(cff_dir, out_dir)
    category_barplot_job = fusion_stats.generate_categories_barplot(
        fusion_stats_dir=out_dir)
    job = concat_jobs([
        Job(command="mkdir -p " + out_dir),
        fusion_stats_job,
        category_table_job,
        category_barplot_job
    ], name="fusion_stats")
    jobs.append(job)
    return jobs
def polling_thread(self):
    try:
        lastshares = []
        errorcount = [0] * (self.device.maximum_multiplier + 1)
        errorweight = [0] * (self.device.maximum_multiplier + 1)
        maxerrorrate = [0] * (self.device.maximum_multiplier + 1)
        errorlimit = 0.05
        errorhysteresis = 0.1
        counter = 0
        while not self.shutdown:
            counter += 1
            # Poll for nonces
            now = time.time()
            nonces = self.device.read_nonces()
            exhausted = False
            with self.wakeup:
                if nonces[0][1] < self.lastnonce:
                    self.lastnonce = nonces[0][1]
                    exhausted = True
            if exhausted:
                self.send("keyspace_exhausted")
            for nonce in nonces:
                if nonce[0] != -self.device.nonce_offset and nonce[0] not in lastshares:
                    if self.job:
                        self.send("nonce_found", time.time(),
                                  struct.pack("<I", nonce[0]))
                    lastshares.append(nonce[0])
                    while len(lastshares) > len(nonces):
                        lastshares.pop(0)
            # Verify proper operation and adjust clocking if necessary
            if now > self.checklockout and self.job:
                errorcount[self.multiplier] *= 0.995
                errorweight[self.multiplier] = errorweight[self.multiplier] * 0.995 + 1
                for nonce in nonces:
                    invalid = True
                    # 0x5be0cd19 is SHA-256's initial state word h7: the device
                    # apparently reports the raw compressor output, so the IV is
                    # added back before comparing against the recomputed hash
                    for offset in (0, 1, -1, 2, -2):
                        data = self.job[:76] + struct.pack("<I", nonce[1] + offset)
                        hashval = Job.calculate_hash(data)
                        if struct.unpack("!I", hashval[-4:])[0] == \
                                (nonce[2] + 0x5be0cd19) & 0xffffffff:
                            invalid = False
                            break
                    if invalid:
                        errorcount[self.multiplier] += 1. / len(nonces)
                certainty = min(1, errorweight[self.multiplier] / 100)
                errorrate = errorcount[self.multiplier] / errorweight[self.multiplier]
                maxerrorrate[self.multiplier] = max(maxerrorrate[self.multiplier],
                                                    errorrate * certainty)
                for i in range(len(maxerrorrate) - 1):
                    if maxerrorrate[i + 1] * i < maxerrorrate[i] * (i + 20):
                        maxerrorrate[i + 1] = maxerrorrate[i] * (1 + 20.0 / i)
                limit = 0
                while limit < self.device.default_multiplier and \
                        maxerrorrate[limit + 1] < errorlimit:
                    limit += 1
                while limit < self.device.maximum_multiplier and \
                        errorweight[limit] > 150 and maxerrorrate[limit + 1] < errorlimit:
                    limit += 1
                multiplier = 0
                best = 0
                for i in range(limit + 1):
                    effective = (i + 1 + (errorhysteresis if i == self.multiplier else 0)) \
                                * (1 - maxerrorrate[i])
                    if effective > best:
                        best = effective
                        multiplier = i
                self._set_multiplier(multiplier)
            if counter >= 10:
                counter = 0
                try:
                    self.send("error_rate",
                              errorcount[self.multiplier] / errorweight[self.multiplier])
                except ZeroDivisionError:
                    pass
            with self.wakeup:
                self.wakeup.wait(self.pollinterval)
    except Exception as e:
        self.error = e
        # Unblock main thread
        self.send("ping")
def main():
    if len(sys.argv) < 3:
        print('usage: %s input_dir output_dir' % sys.argv[0])
        return
    conf = DefaultConfigure()
    job = Job(conf)
    job.set_splliter(LineSplitter)
    job.set_mapper(WordCountMapper)
    job.set_mapper_num(4)
    job.set_reducer(WordCountReducer)
    job.set_reducer_num(1)
    job.add_input_path(sys.argv[1])
    job.set_output_path(sys.argv[2])
    print(job.run())
def convert_fusion_results_to_cff(self):
    """
    Convert fusion results of all gene fusion callers to cff format
    """
    jobs = []
    out_dir = os.path.join("fusions", "cff")
    job_list = [Job(command="mkdir -p " + out_dir)]
    sampleinfo_file = os.path.relpath(self.args.sampleinfo.name, self.output_dir)
    for sample in self.samples:
        # Define result files
        star_seqr_result = os.path.join("fusions", "star_seqr", sample.name,
                                        "out_STAR-SEQR_candidates.txt")
        arriba_result = os.path.join("fusions", "arriba", sample.name,
                                     "fusions.tsv")
        star_fusion_result = os.path.join(
            "fusions", "star_fusion", sample.name,
            "star-fusion.fusion_predictions.abridged.coding_effect.tsv")
        defuse_result = os.path.join("fusions", "defuse", sample.name,
                                     "results.filtered.tsv")
        fusionmap_result = os.path.join("fusions", "fusionmap", sample.name,
                                        "02_RNA.FusionReport.txt")
        ericscript_result = os.path.join("fusions", "ericscript", sample.name,
                                         "fusion.results.filtered.tsv")
        integrate_result = os.path.join("fusions", "integrate", sample.name,
                                        "breakpoints.cov.tsv")
        cicero_result = os.path.join("fusions", "cicero", sample.name,
                                     "final_fusions.txt")
        # Build the tool_results list based on self.tool_list
        result_file_dict = {
            "star_seqr": star_seqr_result,
            "arriba": arriba_result,
            "star_fusion": star_fusion_result,
            "defuse": defuse_result,
            "fusionmap": fusionmap_result,
            "ericscript": ericscript_result,
            "integrate": integrate_result,
            "cicero": cicero_result
        }
        tool_results = [(key, result_file_dict[key])
                        for key in result_file_dict.keys()
                        if key in self.tool_list]
        # determine sample_type (disabled; kept from an earlier design-file based flow)
        """
        sample_type = ""
        for contrast in self.contrasts:
            if sample in contrast.controls:
                sample_type = "Normal"
            elif sample in contrast.treatments:
                sample_type = "Tumor"
            if sample_type:
                disease_name = contrast.name
                break
        if not sample_type:
            raise Exception("Error: sample " + sample.name +
                            " not found in design file " + self.args.design.name)
        """
        # convert each caller's output file to common fusion format (cff)
        for tool, result_file in tool_results:
            job = cff_conversion.cff_convert(sample.name, result_file,
                                             sampleinfo_file, tool, out_dir)
            job.command = job.command.strip()
            job_list.append(job)
    job = concat_jobs(job_list, name="cff_conversion")
    jobs.append(job)
    return jobs