def concat_valid_sams_and_refs_for_bin(self, cids, refs):
    """
    Merge the per-cluster sam files and reference sequences of a bin
    into one bin-level sam file and one bin-level reference fasta.

    A cluster is skipped (i.e., it is not valid) when either
      (1) its sam file is blank, i.e., it has no alignments
          (rare, but happens), or
      (2) its reference sequence is identical to the sequence of a
          cluster accepted earlier in cids (rare, but happens).

    cids -- ordered, non-empty sequence of cluster ids in the bin; the
            first and last ids select the bin-level output files.
    refs -- indexable by cluster id; refs[cid] is handed to
            get_the_only_fasta_record to read that cluster's
            reference record.

    Return the list of valid cluster ids, in the order they occur in
    cids.
    """
    first, last = cids[0], cids[-1]
    bin_sam_file = self.sam_of_quivered_bin(first, last)
    bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
    bin_range = "{first} and {last}.".format(first=first, last=last)

    self.add_log("Concatenating reference files between " + bin_range)

    kept_sams = []
    kept_cids = []
    known_seqs = set()

    # Reference records are written one by one from python instead of
    # running 'cat ...' on hundreds or even thousands of files, which
    # could hit linux cmd line length limits.
    with open(bin_ref_fa, 'w') as ref_writer:
        for cid in cids:
            sam_fn = self.sam_of_cluster(cid)
            if is_blank_sam(sam_fn):
                self.add_log(
                    "ignoring {0} because no alignments!".format(cid))
                continue

            record = get_the_only_fasta_record(refs[cid])
            name = record.name.strip()
            seq = record.sequence.strip()
            if seq in known_seqs:
                self.add_log(
                    "ignoring {0} because identical sequence!".format(cid))
                continue

            known_seqs.add(seq)
            kept_sams.append(sam_fn)
            kept_cids.append(cid)
            ref_writer.write(">{0}\n{1}\n".format(name, seq))

    if kept_sams:
        self.add_log("Concatenating sam files between " + bin_range)
        # concat valid sam files
        concat_sam(kept_sams, bin_sam_file)
        self.add_log("Concatenation done")
    else:
        self.add_log("No alignments were found for clusters between " +
                     bin_range, level=logging.WARNING)
        # sam files and cluster ids are collected together, so no kept
        # sam file implies no kept cluster id.
        assert not kept_cids

    return kept_cids
def concat_valid_sams_and_refs_for_bin(self, cids, refs):
    """
    Merge the per-cluster sam files and reference sequences of a bin
    into one bin-level sam file and one bin-level reference fasta.

    A cluster is skipped (i.e., it is not valid) when either
      (1) its sam file is blank, i.e., it has no alignments
          (rare, but happens), or
      (2) its reference sequence is identical to the sequence of a
          cluster accepted earlier in cids (rare, but happens).

    cids -- ordered, non-empty sequence of cluster ids in the bin; the
            first and last ids select the bin-level output files.
    refs -- indexable by cluster id; refs[cid] is handed to
            get_the_only_fasta_record to read that cluster's
            reference record.

    Return the list of valid cluster ids, in the order they occur in
    cids.
    """
    first, last = cids[0], cids[-1]
    bin_sam_file = self.sam_of_quivered_bin(first, last)
    bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
    bin_range = "{first} and {last}.".format(first=first, last=last)

    self.add_log("Concatenating reference files between " + bin_range)

    kept_sams = []
    kept_cids = []
    known_seqs = set()

    # Reference records are written one by one from python instead of
    # running 'cat ...' on hundreds or even thousands of files, which
    # could hit linux cmd line length limits.
    with open(bin_ref_fa, 'w') as ref_writer:
        for cid in cids:
            sam_fn = self.sam_of_cluster(cid)
            if is_blank_sam(sam_fn):
                self.add_log(
                    "ignoring {0} because no alignments!".format(cid))
                continue

            record = get_the_only_fasta_record(refs[cid])
            name = record.name.strip()
            seq = record.sequence.strip()
            if seq in known_seqs:
                self.add_log(
                    "ignoring {0} because identical sequence!".format(cid))
                continue

            known_seqs.add(seq)
            kept_sams.append(sam_fn)
            kept_cids.append(cid)
            ref_writer.write(">{0}\n{1}\n".format(name, seq))

    if kept_sams:
        self.add_log("Concatenating sam files between " + bin_range)
        # concat valid sam files
        concat_sam(kept_sams, bin_sam_file)
        self.add_log("Concatenation done")
    else:
        self.add_log("No alignments were found for clusters between " +
                     bin_range, level=logging.WARNING)
        # sam files and cluster ids are collected together, so no kept
        # sam file implies no kept cluster id.
        assert not kept_cids

    return kept_cids
def setup_quiver_for_batch(self, cids, refs, quiver_nproc=2,
                           return_script=True):
    """
    Prepare the inputs and the job script for running quiver on one
    bin of clusters.

    The sam files of all valid clusters are concatenated into the
    bin-level sam file, their reference fasta files are concatenated
    into the bin-level reference fasta, and a bash script holding the
    samtoh5, gzip, loadPulses, cmph5tools.py sort, samtools faidx and
    quiver commands is written for the bin.

    NOTE:
    (1) skip clusters if identical sequences already exists in another
        cluster (rare, but happens)
    (2) skip clusters if the alignment is empty (also rare, but happens)

    cids -- ordered, non-empty sequence of cluster ids in the bin; the
            first and last ids select the bin-level file names.
    refs -- indexable by cluster id; refs[cid] is the path of the
            reference fasta file of cluster cid.
    quiver_nproc -- value passed to quiver's -j option.
    return_script -- if True, return the path of the job script,
            e.g., quivered/c{}to{}.sh, otherwise, return the list of
            cmds. The script is written to disk in both cases.

    Raise RuntimeError if the reference files can not be concatenated.
    """
    # Function-scope import, only needed for the fasta concatenation.
    import shutil

    first, last = cids[0], cids[-1]
    bin_sam_file = self.sam_of_quivered_bin(first, last)
    bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
    bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
    bin_fq = self.fq_of_quivered_bin(first, last)

    # Collect clusters which have alignments and a not-yet-seen
    # reference sequence.
    valid_sam_files = []
    valid_cids = []
    seqs_seen = set()
    for cid in cids:
        fname = self.sam_of_cluster(cid)
        if is_blank_sam(fname):
            self.add_log("ignoring {0} because no alignments!".format(cid))
            continue
        seq = get_the_only_fasta_record(refs[cid]).sequence
        if seq in seqs_seen:
            self.add_log(
                "ignoring {0} because identical sequence!".format(cid))
            continue
        seqs_seen.add(seq)
        valid_sam_files.append(fname)
        valid_cids.append(cid)

    # concat the sam files
    concat_sam(valid_sam_files, bin_sam_file)

    # concat the reference files. Copy them from python rather than
    # running 'cat f1 f2 ... > ref' in a shell: with hundreds or even
    # thousands of files the command can exceed linux cmd line length
    # limits, paths with whitespace or shell metacharacters break it,
    # and with no valid cluster a bare 'cat' would read stdin. Binary
    # mode keeps the output byte-identical to what 'cat' produces.
    try:
        with open(bin_ref_fa, 'wb') as ref_writer:
            for cid in valid_cids:
                with open(refs[cid], 'rb') as ref_reader:
                    shutil.copyfileobj(ref_reader, ref_writer)
    except (IOError, OSError) as e:
        errMsg = "Unable to concatenate reference files between " + \
                 "{first} and {last}.\n".format(first=first, last=last) + \
                 str(e)
        self.add_log(errMsg, level=logging.ERROR)
        raise RuntimeError(errMsg)

    # write the sh script for the conversion, loadPulses, and quiver
    metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
               "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
    cmds = [
        "samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=bin_sam_file, ref=bin_ref_fa, cmph5=bin_cmph5),
        "gzip {sam}".format(sam=bin_sam_file),
        "loadPulses {bas_fofn} ".format(bas_fofn=self.bas_fofn) +
        "{cmph5} ".format(cmph5=bin_cmph5) +
        "-byread -metrics " + ",".join(metrics),
        "cmph5tools.py sort {cmph5}".format(cmph5=bin_cmph5),
        "samtools faidx {ref}".format(ref=bin_ref_fa),
        "quiver {cmph5} ".format(cmph5=bin_cmph5) +
        "-v -j{n} ".format(n=quiver_nproc) +
        "-r {ref} ".format(ref=bin_ref_fa) +
        "-o {fq}".format(fq=bin_fq),
    ]

    bin_sh = self.script_of_quivered_bin(first, last)
    with open(bin_sh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("\n".join(cmds))

    if return_script is True:
        return f.name
    else:
        return cmds
def setup_quiver_for_batch(self, cids, refs, quiver_nproc=2,
                           return_script=True):
    """
    Prepare the inputs and the job script for running quiver on one
    bin of clusters.

    The sam files of all valid clusters are concatenated into the
    bin-level sam file, their reference fasta files are concatenated
    into the bin-level reference fasta, and a bash script holding the
    samtoh5, gzip, loadPulses, cmph5tools.py sort, samtools faidx and
    quiver commands is written for the bin.

    NOTE:
    (1) skip clusters if identical sequences already exists in another
        cluster (rare, but happens)
    (2) skip clusters if the alignment is empty (also rare, but happens)

    cids -- ordered, non-empty sequence of cluster ids in the bin; the
            first and last ids select the bin-level file names.
    refs -- indexable by cluster id; refs[cid] is the path of the
            reference fasta file of cluster cid.
    quiver_nproc -- value passed to quiver's -j option.
    return_script -- if True, return the path of the job script,
            e.g., quivered/c{}to{}.sh, otherwise, return the list of
            cmds. The script is written to disk in both cases.

    Raise RuntimeError if the reference files can not be concatenated.
    """
    # Function-scope import, only needed for the fasta concatenation.
    import shutil

    first, last = cids[0], cids[-1]
    bin_sam_file = self.sam_of_quivered_bin(first, last)
    bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
    bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
    bin_fq = self.fq_of_quivered_bin(first, last)

    # Collect clusters which have alignments and a not-yet-seen
    # reference sequence.
    valid_sam_files = []
    valid_cids = []
    seqs_seen = set()
    for cid in cids:
        fname = self.sam_of_cluster(cid)
        if is_blank_sam(fname):
            self.add_log("ignoring {0} because no alignments!".format(cid))
            continue
        seq = get_the_only_fasta_record(refs[cid]).sequence
        if seq in seqs_seen:
            self.add_log(
                "ignoring {0} because identical sequence!".format(cid))
            continue
        seqs_seen.add(seq)
        valid_sam_files.append(fname)
        valid_cids.append(cid)

    # concat the sam files
    concat_sam(valid_sam_files, bin_sam_file)

    # concat the reference files. Copy them from python rather than
    # running 'cat f1 f2 ... > ref' in a shell: with hundreds or even
    # thousands of files the command can exceed linux cmd line length
    # limits, paths with whitespace or shell metacharacters break it,
    # and with no valid cluster a bare 'cat' would read stdin. Binary
    # mode keeps the output byte-identical to what 'cat' produces.
    try:
        with open(bin_ref_fa, 'wb') as ref_writer:
            for cid in valid_cids:
                with open(refs[cid], 'rb') as ref_reader:
                    shutil.copyfileobj(ref_reader, ref_writer)
    except (IOError, OSError) as e:
        errMsg = "Unable to concatenate reference files between " + \
                 "{first} and {last}.\n".format(first=first, last=last) + \
                 str(e)
        self.add_log(errMsg, level=logging.ERROR)
        raise RuntimeError(errMsg)

    # write the sh script for the conversion, loadPulses, and quiver
    metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
               "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
    cmds = [
        "samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=bin_sam_file, ref=bin_ref_fa, cmph5=bin_cmph5),
        "gzip {sam}".format(sam=bin_sam_file),
        "loadPulses {bas_fofn} ".format(bas_fofn=self.bas_fofn) +
        "{cmph5} ".format(cmph5=bin_cmph5) +
        "-byread -metrics " + ",".join(metrics),
        "cmph5tools.py sort {cmph5}".format(cmph5=bin_cmph5),
        "samtools faidx {ref}".format(ref=bin_ref_fa),
        "quiver {cmph5} ".format(cmph5=bin_cmph5) +
        "-v -j{n} ".format(n=quiver_nproc) +
        "-r {ref} ".format(ref=bin_ref_fa) +
        "-o {fq}".format(fq=bin_fq),
    ]

    bin_sh = self.script_of_quivered_bin(first, last)
    with open(bin_sh, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("\n".join(cmds))

    if return_script is True:
        return f.name
    else:
        return cmds