Esempio n. 1
0
    def concat_valid_sams_and_refs_for_bin(self, cids, refs):
        """
        Build one big sam file and one big reference fasta for a bin by
        merging the per-cluster files of every valid cluster in cids.

        A cluster is skipped (not valid) when either
            (1) its sam file is blank, i.e., it has no alignments
                (rare, but happens), or
            (2) its reference sequence is identical to the sequence of a
                cluster accepted earlier in this bin (rare, but happens).

        cids -- cluster ids of the bin; the first and the last id are
                used to name the output files.
        refs -- indexable by cluster id, giving the reference fasta of
                each cluster.

        Return the list of valid cluster ids, in input order.
        """
        first, last = cids[0], cids[-1]
        # Shared tail of the log messages emitted below.
        span = "{first} and {last}.".format(first=first, last=last)
        merged_sam = self.sam_of_quivered_bin(first, last)
        merged_ref = self.ref_fa_of_quivered_bin(first, last)

        self.add_log("Concatenating reference files between " + span)

        kept_sams, kept_cids = [], []
        known_seqs = set()
        with open(merged_ref, 'w') as ref_out:
            for cid in cids:
                sam_path = self.sam_of_cluster(cid)
                if is_blank_sam(sam_path):
                    self.add_log(
                        "ignoring {0} because no alignments!".format(cid))
                    continue

                record = get_the_only_fasta_record(refs[cid])
                title = record.name.strip()
                bases = record.sequence.strip()
                if bases in known_seqs:
                    self.add_log(
                        "ignoring {0} because identical sequence!".format(
                            cid))
                    continue

                known_seqs.add(bases)
                kept_sams.append(sam_path)
                kept_cids.append(cid)
                # References are written from Python instead of running
                # 'cat' over hundreds or even thousands of files, which
                # could exceed the linux command line length limit.
                ref_out.write(">{0}\n{1}\n".format(title, bases))

        if kept_sams:
            self.add_log("Concatenating sam files between " + span)
            concat_sam(kept_sams, merged_sam)
            self.add_log("Concatenation done")
        else:
            self.add_log(
                "No alignments were found for clusters between " + span,
                level=logging.WARNING)
            # Both lists are filled together, so they must both be empty.
            assert not kept_cids

        return kept_cids
Esempio n. 2
0
    def concat_valid_sams_and_refs_for_bin(self, cids, refs):
        """
        Merge the sam files and the reference sequences of all valid
        clusters of a bin into a single sam file and a single fasta file.

        A cluster is rejected when
            (1) its alignment (sam) file is blank (rare, but happens), or
            (2) an identical reference sequence was already accepted from
                another cluster of this bin (rare, but happens).

        cids -- cluster ids in the bin; cids[0] and cids[-1] determine
                the names of the merged files.
        refs -- indexable by cluster id; each entry is handed to
                get_the_only_fasta_record to read the cluster reference.

        Return valid_cids, the list of accepted cluster ids.
        """
        first, last = cids[0], cids[-1]
        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)

        between = "{first} and {last}.".format(first=first, last=last)
        self.add_log("Concatenating reference files between " + between)

        accepted = []  # (cluster id, sam file) of every valid cluster
        unique_seqs = set()
        with open(bin_ref_fa, 'w') as fa_writer:
            for cid in cids:
                sam_fn = self.sam_of_cluster(cid)
                if is_blank_sam(sam_fn):
                    self.add_log("ignoring {0} because no alignments!".format(cid))
                    continue

                fasta_rec = get_the_only_fasta_record(refs[cid])
                header = fasta_rec.name.strip()
                sequence = fasta_rec.sequence.strip()
                if sequence in unique_seqs:
                    self.add_log("ignoring {0} because identical sequence!".format(cid))
                    continue

                unique_seqs.add(sequence)
                accepted.append((cid, sam_fn))
                # Append the record here rather than 'cat'-ing hundreds or
                # even thousands of files, which would run into linux cmd
                # line length limits.
                fa_writer.write(">{0}\n{1}\n".format(header, sequence))

        valid_cids = [cid for cid, _ in accepted]
        valid_sam_files = [sam_fn for _, sam_fn in accepted]

        if not valid_sam_files:
            self.add_log("No alignments were found for clusters between " +
                         between, level=logging.WARNING)
            assert not valid_cids
            return valid_cids

        self.add_log("Concatenating sam files between " + between)
        concat_sam(valid_sam_files, bin_sam_file)
        self.add_log("Concatenation done")
        return valid_cids
Esempio n. 3
0
    def setup_quiver_for_batch(self,
                               cids,
                               refs,
                               quiver_nproc=2,
                               return_script=True):
        """
        Prepare the merged inputs and the command list needed to run
        quiver on one batch (bin) of clusters.

        NOTE: (1) skip clusters if identical sequences already exists
                  in another cluster (rare, but happens)
              (2) skip clusters if the alignment is empty (also rare,
                  but happens)

        The sam files of the remaining clusters are concatenated into the
        bin sam file, and their reference fasta files are concatenated into
        the bin reference fasta. A bash script with the samtoh5, gzip,
        loadPulses, cmph5tools.py sort, samtools faidx and quiver commands
        is always written, regardless of return_script.

        cids -- non-empty sequence of cluster ids; the first and the last
                id name the files of the bin.
        refs -- indexable by cluster id, giving the path of the reference
                fasta file of each cluster.
        quiver_nproc -- value passed to quiver's -j option.
        return_script -- if it is True, return a job script e.g.,
                quivered/c{}to{}.sh, otherwise, return a list of cmds.

        Raise RuntimeError if the reference files cannot be concatenated.
        """
        # Local import: keeps this method self-contained.
        import shutil

        first, last = cids[0], cids[-1]
        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        # concat the sam files of valid clusters
        valid_sam_files = []
        valid_cids = []
        seqs_seen = set()
        for cid in cids:
            fname = self.sam_of_cluster(cid)
            if is_blank_sam(fname):
                self.add_log("ignoring {0} because no alignments!".format(cid))
                continue
            seq = get_the_only_fasta_record(refs[cid]).sequence
            if seq in seqs_seen:
                self.add_log(
                    "ignoring {0} because identical sequence!".format(cid))
                continue
            seqs_seen.add(seq)
            valid_sam_files.append(fname)
            valid_cids.append(cid)
        concat_sam(valid_sam_files, bin_sam_file)

        # concat the reference files in Python instead of a shell
        # 'cat f1 f2 ... > ref': a bin with hundreds or thousands of
        # clusters can exceed the command line length limit, and with no
        # valid cluster 'cat' would have no operands and read from stdin.
        try:
            with open(bin_ref_fa, 'wb') as ref_writer:
                for cid in valid_cids:
                    with open(refs[cid], 'rb') as ref_reader:
                        shutil.copyfileobj(ref_reader, ref_writer)
        except (IOError, OSError) as e:
            errMsg = "Unable to concatenate reference files between " + \
                "{first} and {last}.\n".format(first=first, last=last) + \
                str(e)
            self.add_log(errMsg, level=logging.ERROR)
            raise RuntimeError(errMsg)

        # write the sh script for the conversion, loadPulses, and quiver
        cmds = []
        cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=bin_sam_file, ref=bin_ref_fa, cmph5=bin_cmph5))
        cmds.append("gzip {sam}".format(sam=bin_sam_file))
        metrics = [
            "QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
            "DeletionTag", "SubstitutionTag", "SubstitutionQV"
        ]
        cmds.append("loadPulses {bas_fofn} ".format(bas_fofn=self.bas_fofn) +
                    "{cmph5} ".format(cmph5=bin_cmph5) + "-byread -metrics " +
                    ",".join(metrics))
        cmds.append("cmph5tools.py sort {cmph5}".format(cmph5=bin_cmph5))
        cmds.append("samtools faidx {ref}".format(ref=bin_ref_fa))
        cmds.append("quiver {cmph5} ".format(cmph5=bin_cmph5) +
                    "-v -j{n} ".format(n=quiver_nproc) +
                    "-r {ref} ".format(ref=bin_ref_fa) +
                    "-o {fq}".format(fq=bin_fq))

        bin_sh = self.script_of_quivered_bin(first, last)

        with open(bin_sh, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write("\n".join(cmds))

        # Only the exact value True selects the script path (as before).
        if return_script is True:
            return bin_sh
        else:
            return cmds
Esempio n. 4
0
    def setup_quiver_for_batch(self, cids, refs, quiver_nproc=2,
            return_script=True):
        """
        Set up a quiver run for one batch (bin) of clusters: merge the
        per-cluster sam and reference files and write the job script.

        NOTE: (1) skip clusters if identical sequences already exists
                  in another cluster (rare, but happens)
              (2) skip clusters if the alignment is empty (also rare,
                  but happens)

        The job script runs, in order: samtoh5, gzip of the bin sam file,
        loadPulses, cmph5tools.py sort, samtools faidx and quiver. It is
        written to disk whether or not return_script is True.

        cids -- non-empty sequence of cluster ids; the first and the last
                id name the files of the bin.
        refs -- indexable by cluster id, giving the path of the reference
                fasta file of each cluster.
        quiver_nproc -- value passed to quiver's -j option.
        return_script -- if it is True, return a job script e.g.,
                quivered/c{}to{}.sh, otherwise, return a list of cmds.

        Raise RuntimeError if the reference files cannot be concatenated.
        """
        # Local import: keeps this method self-contained.
        import shutil

        first, last = cids[0], cids[-1]
        bin_sam_file = self.sam_of_quivered_bin(first, last)
        bin_ref_fa = self.ref_fa_of_quivered_bin(first, last)
        bin_cmph5 = self.cmph5_of_quivered_bin(first, last)
        bin_fq = self.fq_of_quivered_bin(first, last)

        # concat the sam files of valid clusters
        valid_sam_files = []
        valid_cids = []
        seqs_seen = set()
        for cid in cids:
            fname = self.sam_of_cluster(cid)
            if is_blank_sam(fname):
                self.add_log(
                    "ignoring {0} because no alignments!".format(cid))
                continue
            seq = get_the_only_fasta_record(refs[cid]).sequence
            if seq in seqs_seen:
                self.add_log(
                    "ignoring {0} because identical sequence!".format(cid))
                continue
            seqs_seen.add(seq)
            valid_sam_files.append(fname)
            valid_cids.append(cid)
        concat_sam(valid_sam_files, bin_sam_file)

        # concat the reference file without going through the shell:
        # 'cat' with one argument per cluster can exceed the command line
        # length limit for large bins, and 'cat' with no file operands
        # (no valid cluster) would read from stdin instead.
        try:
            with open(bin_ref_fa, 'wb') as ref_writer:
                for cid in valid_cids:
                    with open(refs[cid], 'rb') as ref_reader:
                        shutil.copyfileobj(ref_reader, ref_writer)
        except (IOError, OSError) as e:
            errMsg = "Unable to concatenate reference files between " + \
                "{first} and {last}.\n".format(first=first, last=last) + \
                str(e)
            self.add_log(errMsg, level=logging.ERROR)
            raise RuntimeError(errMsg)

        # write the sh script for the conversion, loadPulses, and quiver
        cmds = []
        cmds.append("samtoh5 {sam} {ref} {cmph5} -smrtTitle".format(
            sam=bin_sam_file, ref=bin_ref_fa, cmph5=bin_cmph5))
        cmds.append("gzip {sam}".format(sam=bin_sam_file))
        metrics = ["QualityValue", "InsertionQV", "MergeQV", "DeletionQV",
                   "DeletionTag", "SubstitutionTag", "SubstitutionQV"]
        cmds.append("loadPulses {bas_fofn} ".format(bas_fofn=self.bas_fofn) +
                    "{cmph5} ".format(cmph5=bin_cmph5) +
                    "-byread -metrics " + ",".join(metrics))
        cmds.append("cmph5tools.py sort {cmph5}".format(cmph5=bin_cmph5))
        cmds.append("samtools faidx {ref}".format(ref=bin_ref_fa))
        cmds.append("quiver {cmph5} ".format(cmph5=bin_cmph5) +
                    "-v -j{n} ".format(n=quiver_nproc) +
                    "-r {ref} ".format(ref=bin_ref_fa) +
                    "-o {fq}".format(fq=bin_fq))

        bin_sh = self.script_of_quivered_bin(first, last)

        with open(bin_sh, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write("\n".join(cmds))

        # Only the exact value True selects the script path (as before).
        if return_script is True:
            return bin_sh
        else:
            return cmds