Example 1
    def convert_to_dazz_fasta(self):
        """
        Convert input fasta/fastq file to daligner-compatible fasta with ids:
        <prefix>/<index>/0_<seqlen>

        Also write out mappings to pickle
        """
        log.debug("Converting %s to daligner compatible fasta %s.",
                  self.input_filename, self.dazz_filename)
        reader = ContigSetReaderWrapper(self.input_filename)

        with FastaWriter(self.dazz_filename) as f:
            i = 1
            for r in reader:
                f.writeRecord(
                    "{p}/{i}/0_{len}".format(p=self.dazz_movie_name,
                                             i=i,
                                             len=len(r.sequence)),
                    r.sequence[:])
                self.dazz_mapping[i] = r.name
                i += 1

        reader.close()

        with open(self.pickle_filename, 'w') as f:
            dump(self.dazz_mapping, f)
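For orientation, the pickle written above maps the 1-based daligner record index back to the original read name. A minimal consumer sketch, assuming a pickle produced by convert_to_dazz_fasta; the file name and helper name are illustrative, not part of the source API:

from pickle import load

def dazz_id_to_original_name(dazz_id, mapping):
    # Dazz ids look like "<prefix>/<index>/0_<seqlen>"; the middle field
    # indexes the pickled mapping back to the original read name.
    return mapping[int(dazz_id.split('/')[1])]

with open('input.dazz.pickle') as f:  # the pickle written above
    mapping = load(f)
name = dazz_id_to_original_name('movie/1/0_1500', mapping)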
Example 2
    def get_primer_ids(self):
        """Return primer ids seen in input FLNC file."""
        primer_ids = set()
        for r in ContigSetReaderWrapper(self.flnc_filename):
            primer_ids.add(self._get_primer_id(r))

        return sorted(primer_ids)
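The `_get_primer_id` helper is not shown in this excerpt. A hypothetical sketch of what it might do, assuming FLNC read names carry a semicolon-delimited `primer=<n>` field in their description; the real implementation in the source may differ:

    def _get_primer_id(self, r):
        """Hypothetical sketch of the helper used above, assuming a read
        name such as
        'movie/zmw/0_100_CCS strand=+;fiveseen=1;primer=1;'."""
        for field in r.name.split()[-1].strip(';').split(';'):
            if field.startswith('primer='):
                return int(field.split('=', 1)[1])
        raise ValueError("No primer annotation in %s" % r.name)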
Example 3
    def run(self):
        """Run"""
        # separate reads and write
        for r in ContigSetReaderWrapper(self.flnc_filename):
            p = self._get_primer_id(r)
            self.handles[p].write(">{0}\n{1}\n".format(r.name, r.sequence[:]))

        assert all([os.stat(x).st_size > 0 for x in self.out_dirs])
Example 4
    def run(self):
        """Run"""
        read_counter_in_each_bin = {b: 0 for b in self.size_bins}

        for r in ContigSetReaderWrapper(self.flnc_filename):
            b = self.size_bins.which_bin_contains(len(r.sequence))
            p = read_counter_in_each_bin[b] % self.size_bins_parts[b]
            read_counter_in_each_bin[b] += 1
            self.handles[(b, p)].write(">{0}\n{1}\n".format(r.name, r.sequence[:]))
Example 5
    def get_size_bins_parts(self, bin_size_kb, bin_manual, max_base_limit_MB):
        """
        Return a dict {SizeBin: number of parts in this SizeBin}.
        """
        # First determine the min-max read length range.
        min_size = sys.maxint + 1
        max_size = 0
        base_in_each_size = defaultdict(int)  # SizeBin -> number of bases
        for r in ContigSetReaderWrapper(self.flnc_filename):
            seqlen = len(r.sequence)
            min_size = min(min_size, seqlen)
            max_size = max(max_size, seqlen)
            b = SizeBin(seqlen / 1000, seqlen / 1000 + 1)
            base_in_each_size[b] += len(r.sequence)

        min_size_kb = min_size / 1000
        max_size_kb = max_size / 1000 + (1 if max_size % 1000 != 0 else 0)

        logging.info("Min read length: %s, %s KB, max read length: %s, %s KB",
                     str(min_size), str(min_size_kb), str(max_size),
                     str(max_size_kb))

        size_bins = None
        if bin_manual is not None and len(bin_manual) > 0:
            if bin_manual[0] > min_size_kb:
                bin_manual.insert(0, min_size_kb)
                logging.warning("bin_manual has been reset to %s kb!",
                                bin_manual)
            if bin_manual[-1] < max_size_kb:
                bin_manual.append(max_size_kb)
                logging.warning("bin_manual has been reset to %s kb!",
                                bin_manual)
            size_bins = SizeBins(bin_manual)
        else:
            size_bins = SizeBins(
                range(min_size_kb, max_size_kb + 1, bin_size_kb))

        logging.info("Read size bins are: %s", str(size_bins))
        size_bins_bases = {b: 0 for b in size_bins}  # SizeBin -> total n of bases in it
        size_bins_parts = {b: 0 for b in size_bins}  # SizeBin -> total n of partitions in it
        if max_base_limit_MB is not None:
            for _b, num_bases in base_in_each_size.iteritems():
                b = size_bins.which_bin_contains(_b)
                size_bins_bases[b] += num_bases

            for b, num_bases in size_bins_bases.iteritems():
                # Number of partitions = ceiling of
                # (num_bases in MB) / max_base_limit_MB.
                size_bins_parts[b] = int((num_bases * 1.0 / 10**6) / max_base_limit_MB) + \
                    (1 if (num_bases * 1.0 / 10**6) % max_base_limit_MB > 0 else 0)

        return size_bins_parts
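`SizeBin` and `SizeBins` come from elsewhere in the package and are not shown; the code above only relies on them behaving as kb intervals with a `which_bin_contains` lookup. A minimal illustrative stand-in under that assumption (not the source implementation):

class SizeBin(object):
    """Illustrative stand-in: a [low, high) read-length bin in kb."""
    def __init__(self, low_kb, high_kb):
        self.low_kb, self.high_kb = low_kb, high_kb
    def __hash__(self):
        return hash((self.low_kb, self.high_kb))
    def __eq__(self, other):
        return (self.low_kb, self.high_kb) == (other.low_kb, other.high_kb)
    def __repr__(self):
        return "(%s-%s kb)" % (self.low_kb, self.high_kb)

class SizeBins(list):
    """Illustrative stand-in: consecutive SizeBin objects built from kb
    boundaries, e.g. SizeBins([0, 2, 4]) -> (0-2 kb), (2-4 kb)."""
    def __init__(self, boundaries_kb):
        boundaries_kb = sorted(boundaries_kb)
        super(SizeBins, self).__init__(
            SizeBin(lo, hi) for lo, hi
            in zip(boundaries_kb[:-1], boundaries_kb[1:]))
    def which_bin_contains(self, item):
        # Accepts a read length in bases or another SizeBin.
        kb = item.low_kb if isinstance(item, SizeBin) else item // 1000
        for b in self:
            if b.low_kb <= kb < b.high_kb:
                return b
        return self[-1]  # clamp anything past the last boundary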
Example 6
    def _updateChimeraInfo(self,
                           suspicious_hits,
                           in_read_fn,
                           out_nc_fn,
                           out_c_fn,
                           primer_report_fn,
                           write_report_header=True):
        """
        in_read_fn --- a fasta of full-length reads or a fasta of
                       non-full-length reads.
        For each read in the in_read_fn FASTA file, detect whether
        it is chimeric or not, and write its annotation to
        primer_report_fn.
        Return:
            (num_nc, num_c, num_nc_bases, num_c_bases)
        """
        logging.debug(
            "Update chimera info for reads in {f} ".format(f=in_read_fn))
        logging.debug(
            "Write primer report to {rpt}".format(rpt=primer_report_fn))

        out_nc_fn_fasta, out_c_fn_fasta = out_nc_fn, out_c_fn
        if out_nc_fn.endswith(".xml"):
            out_nc_fn_fasta = out_nc_fn[:-4] + ".fasta"
        if out_c_fn.endswith(".xml"):
            out_c_fn_fasta = out_c_fn[:-4] + ".fasta"
        num_nc, num_c, num_nc_bases, num_c_bases = 0, 0, 0, 0
        with ContigSetReaderWrapper(in_read_fn) as reader, \
                FastaWriter(out_nc_fn_fasta) as writer, \
                FastaWriter(out_c_fn_fasta) as writer_chimera, \
                open(primer_report_fn, 'w') as reporter:
            if write_report_header:
                reporter.write(ReadAnnotation.header(delimiter=",") + "\n")
            for r in reader:
                # e.g. r.name="movie/zmw/0_100_CCS fiveend=1;threeend=100;"
                readid = r.name.split()[0]
                annotation = ReadAnnotation.fromString(
                    r.name, ignore_polyA=self.ignore_polyA)
                if readid not in suspicious_hits:  # Non-chimeric reads
                    # Primer of a primer-trimmed read cannot be None.
                    # assert(annotation.primer is not None)
                    annotation.chimera = 0
                    num_nc += 1
                    num_nc_bases += len(r.sequence)
                    writer.writeRecord(annotation.toAnnotation(),
                                       r.sequence[:])
                else:  # chimeric reads
                    annotation.chimera = 1
                    num_c += 1
                    num_c_bases += len(r.sequence)
                    writer_chimera.writeRecord(annotation.toAnnotation(),
                                               r.sequence[:])

                reporter.write(annotation.toReportRecord(delimitor=",") + "\n")
        return (num_nc, num_c, num_nc_bases, num_c_bases)
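The per-read annotations consumed above live in the read name itself (see the inline comment: "movie/zmw/0_100_CCS fiveend=1;threeend=100;"). A hypothetical parsing sketch, for illustration only; the real logic is in ReadAnnotation.fromString:

def parse_read_annotations(read_name):
    """Hypothetical helper (not part of the source): split a name like
    'movie/zmw/0_100_CCS fiveend=1;threeend=100;' into an id plus a
    dict of key=value annotations."""
    fields = read_name.split(None, 1)
    readid, annotations = fields[0], {}
    if len(fields) > 1:
        for pair in fields[1].strip().strip(';').split(';'):
            if '=' in pair:
                key, value = pair.split('=', 1)
                annotations[key] = value
    return readid, annotations

# parse_read_annotations("movie/zmw/0_100_CCS fiveend=1;threeend=100;")
# -> ("movie/zmw/0_100_CCS", {"fiveend": "1", "threeend": "100"})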
Example 7
def ice_fa2fq(in_fa, ccs_fofn, out_fq):
    """Convert an input FASTA file to an output FASTQ file,
       reading QVs from the input ccs.h5, ccs.bam or ccs FOFN.
    """
    ccs_fns = get_files_from_file_or_fofn(ccs_fofn)
    fmt = guess_file_format(ccs_fns)

    if fmt == FILE_FORMATS.H5:
        qver = basQVcacher()
        for ccs_fn in ccs_fns:
            qver.add_bash5(ccs_fn)
        bas_handlers = {}
    elif fmt == FILE_FORMATS.BAM:
        qver = BamCollection(*ccs_fns)
    else:
        raise IOError("ice_fa2fq does not support input %s." % ccs_fofn)

    with ContigSetReaderWrapper(in_fa) as reader, \
            FastqWriter(out_fq) as writer:
        for r in reader:
            logging.debug("Getting QVs for {name} ...".format(name=r.name))
            seqid = r.name.split(' ')[0]
            parsed_read_name = _Parsed_Read_Name(seqid)
            if fmt == FILE_FORMATS.H5:
                try:
                    bas_file = qver.bas_files[parsed_read_name.movie][seqid]
                    if bas_file not in bas_handlers:
                        bas_handlers[bas_file] = BasH5Reader(bas_file)
                except KeyError:
                    raise IOError("Could not read {s} from {f}.".format(
                        s=seqid, f=ccs_fofn))
                qvs = get_qv_from_bas_handler(
                    bas_handler=bas_handlers[bas_file],
                    parsed_read_name=parsed_read_name,
                    qv_name="QualityValue")
            elif fmt == FILE_FORMATS.BAM:
                qvs = get_qvs_from_bam(reader=qver,
                                       parsed_read_name=parsed_read_name,
                                       qv_name="QualityValue")
            else:
                assert False

            if len(r.sequence) != len(qvs):
                raise ValueError(
                    "Sequence and QVs of {r} should be the same!".format(
                        r=r.name))
            writer.writeRecord(r.name, r.sequence[:], qvs)

    if fmt == FILE_FORMATS.H5:
        for bas_file, bas_handler in bas_handlers.iteritems():
            logging.debug("Closing {bas_file} ...".format(bas_file=bas_file))
            bas_handler.close()
    elif fmt == FILE_FORMATS.BAM:
        qver.close()
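A minimal usage sketch for ice_fa2fq; all file names are illustrative. A FOFN is a plain-text "file of file names", one path per line:

with open('ccs.fofn', 'w') as f:
    f.write('movie1.ccs.bam\n')
    f.write('movie2.ccs.bam\n')

ice_fa2fq(in_fa='isoseq_flnc.fasta',
          ccs_fofn='ccs.fofn',
          out_fq='isoseq_flnc.fastq')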
Example 8
    def _write_config(self, fasta_filename):
        """Write daligner sensitive config to fasta_filename.sensitive.config."""
        lens = [
            len(r.sequence) for r in ContigSetReaderWrapper(fasta_filename)
        ]
        self.low_cDNA_size = int(np.percentile(lens, 10))
        self.high_cDNA_size = int(np.percentile(lens, 90))

        try:
            with open(fasta_filename + '.sensitive.config', 'w') as f:
                f.write("sensitive={s}\n".format(s=self.sensitive_mode))
                f.write("low={l}\n".format(l=self.low_cDNA_size))
                f.write("high={h}\n".format(h=self.high_cDNA_size))
        except IOError:
            pass  # it's OK not to have write permission
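The config written above is plain key=value lines, so reading it back is straightforward. A sketch with a hypothetical function name (the example values in the comment are invented):

def read_sensitive_config(fasta_filename):
    """Hypothetical reader for the key=value config written by
    _write_config above; not part of the source."""
    config = {}
    with open(fasta_filename + '.sensitive.config') as f:
        for line in f:
            key, _, value = line.strip().partition('=')
            config[key] = value
    return config

# e.g. {'sensitive': 'True', 'low': '1200', 'high': '3400'}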
Example 9
def combine_consensus_isoforms(split_indices, split_files,
                               combined_consensus_isoforms_fa,
                               sample_name):
    """
    Parameters:
      split_indices -- indices of split cluster bins.
      split_files -- consensus isoforms in each split cluster bin.
    """
    assert len(split_indices) == len(split_files)
    writer = FastaWriter(combined_consensus_isoforms_fa)
    for i, split_fn in zip(split_indices, split_files):
        logging.debug("Adding prefix i%s to %s.", str(i), split_fn)
        with ContigSetReaderWrapper(split_fn) as reader:
            for read in reader:
                name = combined_cid_ice_name(name=read.name, cluster_bin_index=i,
                                             sample_name=sample_name)
                writer.writeRecord(name, read.sequence[:])
    writer.close()
    logging.info("Consensus isoforms output combined to:%s",
                 combined_consensus_isoforms_fa)
Example 10
    def split(self, reads_in_first_split=None):
        """Split `input_fasta` into smaller files each containing
        `reads_per_split` reads. Return splitted fasta."""
        split_index = 0
        self.out_fns = []
        writer = FastaWriter(self._out_fn(split_index))
        self.out_fns.append(self._out_fn(split_index))
        if reads_in_first_split is None:
            reads_in_first_split = self.reads_per_split
        with ContigSetReaderWrapper(self.input_fasta) as reader:
            for ridx, r in enumerate(reader):
                # Start a new split file at the end of the first split
                # (after `reads_in_first_split` reads) and thereafter
                # whenever the global read index is a multiple of
                # `reads_per_split`.
                if ridx != 0 and \
                        ((split_index == 0 and ridx == reads_in_first_split) or
                         (split_index > 0 and ridx % self.reads_per_split == 0)):
                    split_index += 1
                    writer.close()
                    writer = FastaWriter(self._out_fn(split_index))
                    self.out_fns.append(self._out_fn(split_index))
                writer.writeRecord(r.name, r.sequence[:])

        writer.close()
        return list(self.out_fns)
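The boundary rule above is subtle: the first output file holds `reads_in_first_split` reads, and every later boundary falls on a multiple of `reads_per_split` of the global read index, so the second file can hold fewer than `reads_per_split` reads. A standalone sketch of the same arithmetic (not part of the source, assuming reads_in_first_split >= 1):

def split_sizes(n_reads, reads_in_first_split, reads_per_split):
    """Return the number of reads per split file under the boundary
    rule used by split() above."""
    sizes, start = [], 0
    boundary = reads_in_first_split
    while start < n_reads:
        end = min(boundary, n_reads)
        sizes.append(end - start)
        start = end
        # Next boundary: the smallest multiple of reads_per_split > end.
        boundary = (end // reads_per_split + 1) * reads_per_split
    return sizes

# split_sizes(25, 5, 10) -> [5, 5, 10, 5]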
Example 11
def build_uc_from_partial(input_fasta,
                          ref_fasta,
                          out_pickle,
                          ccs_fofn=None,
                          done_filename=None,
                          blasr_nproc=12,
                          tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.
    blasr_nproc --- equivalent to blasr --nproc, the number of CPUs to use.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 5 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=blasr_nproc) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)

    if ccs_fofn is None:
        logging.info("Loading probability from model")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        # FIXME this will not work with current CCS bam output, which lacks
        # QV pulse features required - this is handled via a workaround in
        # pbtranscript.tasks.ice_partial
        logging.info("Loading probability from QV in %s", ccs_fofn)
        probqv = ProbFromQV(input_fofn=ccs_fofn, fasta_filename=input_fasta)

    logging.info("Calling blasr_against_ref ...")
    hitItems = blasr_against_ref(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=True,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 ece_penalty=1,
                                 ece_min_len=10,
                                 same_strand_only=False)

    # Maps each isoform (cluster) id to the set of reads which can map
    # to the isoform (converted to lists below).
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)
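Downstream code can load the pickle written above to get the isoform-to-reads mapping. A minimal consumer sketch; the file name is illustrative:

from pickle import load

with open('output.pickle') as f:
    result = load(f)
partial_uc = result['partial_uc']   # {isoform_id: [read_ids]}
nohit = result['nohit']             # set of read ids with no BLASR hit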
Example 12
def build_uc_from_partial_daligner(input_fasta,
                                   ref_fasta,
                                   out_pickle,
                                   ccs_fofn=None,
                                   done_filename=None,
                                   use_finer_qv=False,
                                   cpus=24,
                                   no_qv_or_aln_checking=True,
                                   tmp_dir=None):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    ccs_fofn --- If None, assume no quality value is available,
    otherwise, use QV from ccs_fofn.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts = IceOptions()
    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything locally.
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False,
                            same_strand_only=False,
                            query_converted=False,
                            target_converted=True,
                            dazz_dir=tmp_dir,
                            script_dir=op.join(output_dir, "script"),
                            use_sge=False,
                            sge_opts=None,
                            cpus=cpus)
    runner.run(min_match_len=300,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    if no_qv_or_aln_checking:
        # Not using QVs or alignment checking; this probqv is just a dummy
        # passed to daligner_against_ref and will not be used.
        logging.info("Not using QV for partial_uc. Loading dummy QV.")
        probqv = ProbFromModel(.01, .07, .06)
    else:
        if ccs_fofn is None:
            logging.info("Loading probability from model (0.01,0.07,0.06)")
            probqv = ProbFromModel(.01, .07, .06)
        else:
            start_t = time.time()
            if use_finer_qv:
                probqv = ProbFromQV(input_fofn=ccs_fofn,
                                    fasta_filename=input_fasta)
                logging.info("Loading QVs from %s + %s took %s secs", ccs_fofn,
                             input_fasta,
                             time.time() - start_t)
            else:
                input_fastq = input_fasta[:input_fasta.rfind('.')] + '.fastq'
                logging.info("Converting %s + %s --> %s", input_fasta,
                             ccs_fofn, input_fastq)
                ice_fa2fq(input_fasta, ccs_fofn, input_fastq)
                probqv = ProbFromFastq(input_fastq)
                logging.info("Loading QVs from %s took %s secs", input_fastq,
                             time.time() - start_t)

    logging.info("Calling dalign_against_ref ...")

    # Maps each isoform (cluster) id to the set of reads which can map
    # to the isoform (converted to lists below).
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()
        hitItems = daligner_against_ref(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False,
            sID_starts_with_c=True,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            ece_penalty=1,
            ece_min_len=20,
            same_strand_only=False,
            no_qv_or_aln_checking=no_qv_or_aln_checking)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec", la4ice_filename,
                     str(time.time() - start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0]
                  for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # Remove all the .las and .las.out files.
    runner.clean_run()
Example 13
    def _processPrimers(self, primer_fn, window_size, primer_out_fn,
                        revcmp_primers=False):
        """
        Check and generate primers.
        1. Check primers in primer_fn are in order F0, R0, F1, R1, ...
        Fn, Rn, and lengths are all < k, where k is the primer search
        window length.
           F0  5' NNNNNNNNNN 3'
           R0  3' NNNNNNNNNN 5'
        2. If Ri and Fi are reverse complementarily identical,
        add a polyA tail to 3' of Ri.
        3. For each combo of primers Fi and Ri, save the following to
        primer_out_fn.
           3.1 If revcmp_primers is False,
              >Fi
              Fi_sequence
              >Ri
              revcmp(Ri_sequence)
           3.2 If revcmp_primers is True,
              >Fi
              Fi_sequence
              >Ri
              Ri_sequence
              >Fi_revcmp
              revcmp(Fi_sequence)
              >Ri_revcmp
              revcmp(Ri_sequence)
        4. return primers range(0, n)
        """
        logging.info("Process primers for {case}.".
                     format(case=("finding primers" if not revcmp_primers
                                  else "detecting chimeras")))
        freader = ContigSetReaderWrapper(primer_fn)
        primers = []
        primerComboId = -1
        for i, r in enumerate(freader):
            if i % 2 == 0:
                direction = "F"
                primerComboId += 1
            else:
                direction = "R"
            expectedName = "{d}{n}".format(d=direction, n=primerComboId)

            if r.name != expectedName:
                errMsg = "Primers should be placed in order F0, R0, F1, R1..."
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if len(r.sequence) > window_size:
                errMsg = "Primer {n} has length {l} which is longer than {k}.".\
                    format(n=expectedName, l=len(r.sequence), k=window_size)
                logging.error(errMsg)
                raise ClassifierException(errMsg)

            if direction == "F":
                # Save >Fi and Fi_sequence.
                primers.append([expectedName, r.sequence])
            else:  # direction is "R"
                # fwdF/fwdR is the forward sequence of Fi/Ri
                fwdF, fwdR = primers[-1][1], r.sequence
                # revcmpF/revcmpR is the reverse complement of Fi/Ri
                revcmpF, revcmpR = revcmp(fwdF), revcmp(fwdR)
                # If Fi and Ri are reverse complementarily identical, add a
                # poly-A tail, because we need it to distinguish Fi and Ri.
                if fwdF.find(revcmpR) >= 0 or revcmpR.find(fwdF) >= 0:
                    infoMsg = "Primer F{n}, R{n} ".format(n=primerComboId) + \
                        "are reverse complementarily identical. " + \
                        "Need to add 'AAAA' to 3' to distinguish them."
                    logging.info(infoMsg)
                    if revcmp_primers is False:
                        # Save primer Ri and revcmp(Ri_sequence) + TTTT
                        primers.append([expectedName, revcmpR + "T" * 4])
                    else:  # revcmp_primers is True
                        primers.append([expectedName, "A" * 4 + fwdR])
                        primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                        revcmpF])
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR + "T" * 4])
                else:  # Ri and Fi are not revcmp identical
                    if revcmp_primers is False:
                        # Save >Ri and revcmp(Ri_sequence)
                        primers.append([expectedName, revcmpR])
                    else:
                        # Save >Ri and Ri_sequence
                        primers.append([expectedName, fwdR])
                        # Save >Fi_revcmp and revcmp(Fi_sequence)
                        primers.append(['F{n}_revcmp'.format(n=primerComboId),
                                        revcmpF])
                        # Save >Ri_revcmp and revcmp(Ri_sequence)
                        primers.append(['R{n}_revcmp'.format(n=primerComboId),
                                        revcmpR])
        freader.close()

        # Write Fi and reverse-complemented Ri to primer_out_fn.
        with open(primer_out_fn, 'w') as f:
            for (name, seq) in primers:
                f.write(">{n}\n{s}\n".format(n=name, s=seq))
        return range(0, primerComboId + 1)
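For reference, the primer FASTA consumed by _processPrimers must list primers in the order F0, R0, F1, R1, ..., with every sequence shorter than `window_size`. An illustrative input (the sequences are invented):

>F0
AAGCAGTGGTATCAACGCAGAGTACATGGG
>R0
GTACTCTGCGTTGATACCACTGCTT
>F1
CAGGAAACAGCTATGACC
>R1
TGTAAAACGACGGCCAGT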
Example 14
def build_uc_from_partial_daligner(input_fasta, ref_fasta, out_pickle,
                                   done_filename,
                                   ice_opts,
                                   probqv,
                                   qv_prob_threshold=0.3,
                                   cpus=4,
                                   no_qv_or_aln_checking=False,
                                   tmp_dir=None,
                                   sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using DALIGNER, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.

    tmp_dir - where to save intermediate files such as dazz files.
              if None, write dazz files to the same directory as query/target.
    """
    input_fasta = realpath(input_fasta)
    ref_fasta = realpath(ref_fasta)
    out_pickle = realpath(out_pickle)
    output_dir = op.dirname(out_pickle)

    ice_opts.detect_cDNA_size(ref_fasta)

    # ice_partial is already being called through qsub, so run everything locally.
    runner = DalignerRunner(query_filename=input_fasta,
                            target_filename=ref_fasta,
                            is_FL=False, same_strand_only=False,
                            query_converted=False, target_converted=True,
                            dazz_dir=tmp_dir, script_dir=op.join(output_dir, "script"),
                            use_sge=False, sge_opts=None, cpus=cpus)
    runner.run(min_match_len=ice_opts.min_match_len,
               output_dir=output_dir,
               sensitive_mode=ice_opts.sensitive_mode)

    # Maps each isoform (cluster) id to the set of reads which can map
    # to the isoform (converted to lists below).
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from DALIGNER hits.")

    for la4ice_filename in runner.la4ice_filenames:
        start_t = time.time()

        # full_missed_start/end are passed through but matter less here:
        # these are nFL reads (is_FL=False), so partial alignments are
        # acceptable.
        hitItems = daligner_against_ref2(
            query_dazz_handler=runner.query_dazz_handler,
            target_dazz_handler=runner.target_dazz_handler,
            la4ice_filename=la4ice_filename,
            is_FL=False,
            sID_starts_with_c=sID_starts_with_c,
            qver_get_func=probqv.get_smoothed,
            qvmean_get_func=probqv.get_mean,
            qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ice_opts.ece_penalty,
            ece_min_len=ice_opts.ece_min_len,
            same_strand_only=True,
            no_qv_or_aln_checking=no_qv_or_aln_checking,
            max_missed_start=ice_opts.max_missed_start,
            max_missed_end=ice_opts.max_missed_end,
            full_missed_start=ice_opts.full_missed_start,
            full_missed_end=ice_opts.full_missed_end)
        for h in hitItems:
            if h.ece_arr is not None:
                if h.cID not in partial_uc:
                    partial_uc[h.cID] = set()
                partial_uc[h.cID].add(h.qID)
                seen.add(h.qID)
        logging.info("processing %s took %s sec",
                     la4ice_filename, str(time.time()-start_t))

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)

    # Remove all the .las and .las.out files.
    runner.clean_run()
Example 15
def build_uc_from_partial_blasr(input_fasta, ref_fasta, out_pickle,
                                done_filename,
                                ice_opts,
                                probqv,
                                qv_prob_threshold=0.3,
                                cpus=4,
                                no_qv_or_aln_checking=False,
                                tmp_dir=None,
                                sID_starts_with_c=False):
    """
    Given an input_fasta file of non-full-length (partial) reads and
    (unpolished) consensus isoforms sequences in ref_fasta, align reads to
    consensus isoforms using BLASR, and then build up a mapping between
    consensus isoforms and reads (i.e., assign reads to isoforms).
    Finally, save
        {isoform_id: [read_ids],
         nohit: set(no_hit_read_ids)}
    to an output pickle file.
    """
    input_fasta = _get_fasta_path(realpath(input_fasta))
    m5_file = os.path.basename(input_fasta) + ".blasr"
    if tmp_dir is not None:
        m5_file = op.join(tmp_dir, m5_file)

    out_pickle = realpath(out_pickle)

    cmd = "blasr {i} ".format(i=real_upath(input_fasta)) + \
          "{r} --bestn 100 --nCandidates 200 ".format(r=real_upath(_get_fasta_path(ref_fasta))) + \
          "--nproc {n} -m 5 ".format(n=cpus) + \
          "--maxScore -1000 --minPctIdentity 85 " + \
          "--minAlnLength {a} ".format(a=ice_opts.min_match_len) + \
          "--out {o} ".format(o=real_upath(m5_file)) + \
          "1>/dev/null 2>/dev/null"

    execute(cmd)


    logging.info("Calling blasr_against_ref ...")

    # no need to provide full_missed_start/end for nFLs, since is_FL = False
    hitItems = blasr_against_ref2(output_filename=m5_file,
                                 is_FL=False,
                                 sID_starts_with_c=sID_starts_with_c,
                                 qver_get_func=probqv.get_smoothed,
                                 qvmean_get_func=probqv.get_mean,
                                 qv_prob_threshold=qv_prob_threshold,
                                 ece_penalty=ice_opts.ece_penalty,
                                 ece_min_len=ice_opts.ece_min_len,
                                 max_missed_start=ice_opts.max_missed_start,
                                 max_missed_end=ice_opts.max_missed_end,
                                 full_missed_start=ice_opts.full_missed_start,
                                 full_missed_end=ice_opts.full_missed_end,
                                 same_strand_only=False)


    # Maps each isoform (cluster) id to the set of reads which can map
    # to the isoform (converted to lists below).
    partial_uc = {}
    seen = set()  # reads seen
    logging.info("Building uc from BLASR hits.")
    for h in hitItems:
        if h.ece_arr is not None:
            if h.cID not in partial_uc:
                partial_uc[h.cID] = set()
            partial_uc[h.cID].add(h.qID)
            seen.add(h.qID)

    for k in partial_uc:
        partial_uc[k] = list(partial_uc[k])

    allhits = set(r.name.split()[0] for r in ContigSetReaderWrapper(input_fasta))

    logging.info("Counting reads with no hit.")
    nohit = allhits.difference(seen)

    logging.info("Dumping uc to a pickle: %s.", out_pickle)
    with open(out_pickle, 'w') as f:
        if out_pickle.endswith(".pickle"):
            dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
        elif out_pickle.endswith(".json"):
            f.write(json.dumps({'partial_uc': partial_uc, 'nohit': nohit}))
        else:
            raise IOError("Unrecognized extension: %s" % out_pickle)

    os.remove(m5_file)

    done_filename = realpath(done_filename) if done_filename is not None \
        else out_pickle + '.DONE'
    logging.debug("Creating %s.", done_filename)
    touch(done_filename)