def create_raw_fas_for_clusters_in_bin(self, cids, d, uc, partial_uc): """ Create raw subreads fasta files for clusters in cids. For each cluster k in cids, * Collect raw subreads of zmws associated with cluster k in either uc or partial_uc. cids --- cluster ids d --- MetaSubreadsFastaReader uc --- uc[k] returns fl ccs reads associated with cluster k partial_uc --- partial_uc[k] returns nfl ccs reads associated with cluster k (Liz) for Quiver, subsample down to max 100 (with uc having priority over partial_uc) """ #data_queue = [] #fake_func = lambda x: x for k in cids: # for each cluster k # $root_dir/tmp/?/c{k}/in.raw_with_partial.fa raw_fa = self.raw_fa_of_cluster(k) in_seqids = uc[k] if len(in_seqids) > 100: in_seqids = random.sample(in_seqids, 100) else: in_seqids += random.sample(partial_uc[k], min(len(partial_uc[k]), 100 - len(in_seqids))) # write cluster k's associated raw subreads to raw_fa #data_queue.append([d, in_seqids, raw_fa, True]) write_in_raw_fasta(input_fasta_d=d, in_seqids=in_seqids, out_fa=raw_fa, ignore_keyerror=True)
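# A minimal sketch (hypothetical helper, not part of the original module) of
# the subsampling policy used above: full-length ids in uc[k] take priority,
# and non-full-length ids from partial_uc[k] only top the sample up to the
# cap. Assumes `random` is imported at module level, as the method above
# already requires.
def _subsample_seqids(fl_ids, nfl_ids, cap=100):
    """Return at most `cap` seqids, preferring full-length ids."""
    if len(fl_ids) > cap:
        return random.sample(fl_ids, cap)
    # copy fl_ids so the caller's list is not mutated
    return list(fl_ids) + random.sample(
        nfl_ids, min(len(nfl_ids), cap - len(fl_ids)))
# Example: with 80 fl ids and 50 nfl ids, all 80 fl ids are kept and 20 nfl
# ids are drawn at random; with 150 fl ids, 100 fl ids are drawn and no nfl
# ids are used.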
def submit_quiver_jobs(self, d, uc, partial_uc, refs, keys, start, end,
                       submitted, todo, use_sge, max_sge_jobs,
                       quiver_nproc):
    """Call quiver to polish consensus.
    (1) for each cluster k, obtain unrolled sequences of all reads
        (zmws) belonging to this cluster, and save in
        raw_fa_of_cluster(k)
    (2) for each cluster k, call blasr to align raw_fa_of_cluster to
        the consensus sequence of the cluster and create
        sam_of_cluster.
    (3) Put every 100 clusters into one big bin, and then merge all
        sam_of_cluster files to sam_of_quivered_bin
    (4) Prepare commands including samtoh5, loadPulses, cmph5tools.py
        ... in order to convert sam_of_quivered_bin to
        cmph5_of_quivered_bin.
        * Either write these commands to script_of_quivered_bin and
          qsub all jobs later when scripts of all quivered bins are
          done,
        * Or execute the commands immediately.
    """
    for i in xrange(start, end, 100):
        for k in keys[i:min(end, i + 100)]:
            raw_fa = self.raw_fa_of_cluster(k)
            # write cluster k's associated raw subreads (fl reads in
            # uc[k] plus nfl reads in partial_uc[k]) to raw_fa
            write_in_raw_fasta(input_fasta_d=d,
                               in_seqids=uc[k] + partial_uc[k],
                               out_fa=raw_fa,
                               ignore_keyerror=True)
            # TODO: use a multiprocessing pool, reduce nproc
            blasr_sam_for_quiver(input_fasta=raw_fa,
                                 ref_fasta=refs[k],
                                 out_sam_filename=self.sam_of_cluster(k),
                                 run_cmd=True)
        fname = self.setup_quiver_for_batch(
            cids=keys[i:min(end, i + 100)], refs=refs,
            quiver_nproc=quiver_nproc, return_script=True)
        todo.append(fname)

    if use_sge is not True or max_sge_jobs == 0:  # don't use SGE
        for job in todo:
            elog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".elog")
            olog = op.join(self.quivered_log_dir,
                           op.basename(job) + ".olog")
            msg = "Running quiver job locally: {j} ".format(j=job) + \
                  "1>{olog} 2>{elog}".format(olog=olog, elog=elog)
            self.add_log(msg)
            cmd = "bash " + job + \
                  " 1>{olog} 2>{elog}".format(olog=olog, elog=elog)
            _out, _code, _msg = backticks(cmd)
            if _code != 0:
                errMsg = "Failed to run quiver {j}: ".format(j=job) + _msg
                self.add_log(errMsg, level=logging.ERROR)
                raise RuntimeError(errMsg)
            submitted.append(("local", job))
        todo = []
    else:
        while len(todo) > 0:
            n = min(max_sge_jobs, len(todo))
            # iterate over a slice copy so removing jobs from todo
            # inside the loop is safe
            for job in todo[:n]:
                # ex: Your job 8613116 ("c20to70.sh") has been submitted
                elog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".elog")
                olog = op.join(self.quivered_log_dir,
                               op.basename(job) + ".olog")
                qsub_cmd = "qsub " + \
                           "-pe smp {n} ".format(n=quiver_nproc) + \
                           "-cwd -S /bin/bash -V " + \
                           "-e {elog} ".format(elog=elog) + \
                           "-o {olog} ".format(olog=olog) + \
                           "{job}".format(job=job)
                msg = "Submitting CMD: {cmd}.\n".format(cmd=qsub_cmd)
                self.add_log(msg)
                _out, _code, _msg = backticks(qsub_cmd)
                if _code != 0:
                    errMsg = "Failed to submit CMD {cmd}.".format(
                        cmd=qsub_cmd)
                    self.add_log(errMsg, level=logging.ERROR)
                    raise RuntimeError(errMsg)
                job_id = str(_out).split()[2]
                submitted.append((job_id, job))
                todo.remove(job)
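# A minimal sketch (hypothetical helper, not part of the original module) of
# the job-id parsing used in the SGE branch above. qsub acknowledges a
# submission with a line such as
#     Your job 8613116 ("c20to70.sh") has been submitted
# so the job id is the third whitespace-delimited token, which is what
# `str(_out).split()[2]` extracts.
def _parse_qsub_job_id(qsub_output):
    """Return the SGE job id from qsub's acknowledgement line."""
    tokens = str(qsub_output).split()
    if len(tokens) < 3:
        raise ValueError("Unexpected qsub output: {0!r}".format(qsub_output))
    return tokens[2]
# Example:
#     _parse_qsub_job_id('Your job 8613116 ("c20to70.sh") has been submitted')
# returns '8613116'.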