    def test_collapse_fuzzy_junctions(self):
        """Test collapse_fuzzy_junctions, can_merge and compare_fuzzy_junctions."""
        test_name = "collapse_fuzzy_junctions"
        input_gff = op.join(_DAT_DIR_, "input_%s.gff" % test_name)
        input_group = op.join(_DAT_DIR_, "input_%s.group.txt" % test_name)
        output_gff = op.join(_OUT_DIR_, "output_%s.gff" % test_name)
        output_group = op.join(_OUT_DIR_, "output_%s.group.txt" % test_name)

        records = [r for r in CollapseGffReader(input_gff)]
        self.assertEqual(len(records), 4)

        r0, r1, r2, r3 = records
        # comparing r0 and r1
        m = compare_fuzzy_junctions(r0.ref_exons, r1.ref_exons, max_fuzzy_junction=5)
        self.assertEqual(m, "subset")
        self.assertTrue(can_merge(m, r0, r1, allow_extra_5exon=True, max_fuzzy_junction=5))

        # comparing r2 and r3
        m = compare_fuzzy_junctions(r2.ref_exons, r3.ref_exons, max_fuzzy_junction=5)
        self.assertEqual(m, "exact")
        self.assertTrue(can_merge(m, r2, r3, allow_extra_5exon=True, max_fuzzy_junction=5))

        # call collapse_fuzzy_junctions and write fuzzy output.
        collapse_fuzzy_junctions(gff_filename=input_gff,
                                 group_filename=input_group,
                                 fuzzy_gff_filename=output_gff,
                                 fuzzy_group_filename=output_group,
                                 allow_extra_5exon=True,
                                 max_fuzzy_junction=5)

        r4, r5 = [r for r in CollapseGffReader(output_gff)]
        self.assertEqual(r1, r4)
        self.assertEqual(r3, r5)
Example #2
    def test_good_isoform_ids_by_removing_subsets(self):
        """Test good_isoform_ids_by_removing_subsets"""
        all = [r.seqid for r in CollapseGffReader(GFF_FN)]
        good = good_isoform_ids_by_removing_subsets(in_gff_filename=GFF_FN,
                                                    max_fuzzy_junction=5)
        diff = list(set(all) - set(good))
        self.assertEqual(diff, self.expected_diff)
Example #3
    def __init__(self,
                 gff_filename,
                 group_filename,
                 self_prefix=None,
                 max_fuzzy_junction=0):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.max_fuzzy_junction = max_fuzzy_junction

        self.record_d = dict(
            (r.seqid, r) for r in CollapseGffReader(gff_filename))
        self.tree = read_gff_as_interval_tree(
            gff_filename=self.gff_filename)  # chr --> strand -->tree
        # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
        self.group_info = MegaPBTree.read_group(self.group_filename,
                                                self.self_prefix)

        # keep track of gff|group files that have been added.
        self._sample_prefixes = []
        self._group_filenames = []
        self._gff_filenames = []
        self._add_sample_files(gff_filename=gff_filename,
                               group_filename=group_filename,
                               sample_prefix="first_sample")
Example #4
def read_gff_as_interval_tree(gff_filename):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    tree = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    for r in CollapseGffReader(gff_filename):
        tree[r.chr][r.strand].insert(r.start, r.end, r)
    return tree
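The returned structure is keyed by chromosome and then strand, with each leaf an interval tree over transcript coordinates. A minimal usage sketch, assuming read_gff_as_interval_tree above is in scope; the file name and query coordinates are placeholders, not data from this project:

# Placeholder path and coordinates, for illustration only.
tree = read_gff_as_interval_tree(gff_filename="sample.collapsed.gff")
# find() returns every GmapRecord whose interval overlaps the query range
# on the '+' strand of chr1.
for rec in tree["chr1"]["+"].find(100000, 200000):
    print("%s %d-%d" % (rec.seqid, rec.start, rec.end))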
Example #5
    def test_filter_out_subsets(self):
        """Test filter_out_subsets"""
        out_abundance_fn = op.join(_OUT_DIR_, "filter_out_subsets.abundance.txt")
        out_gff_fn = op.join(_OUT_DIR_, "filter_out_subsets.gff")
        out_rep_fn = op.join(_OUT_DIR_, "filter_out_subsets.rep.fastq")
        filter_out_subsets(in_abundance_filename=ABUNDANCE_FN,
                           in_gff_filename=GFF_FN, in_rep_filename=REP_FN,
                           out_abundance_filename=out_abundance_fn,
                           out_gff_filename=out_gff_fn, out_rep_filename=out_rep_fn,
                           max_fuzzy_junction=5)

        all = [r.seqid for r in CollapseGffReader(GFF_FN)]
        expected_good = set(all)-set(self.expected_diff)
        out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
        self.assertEqual(set(out_abundance_ids), expected_good)

        out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
        self.assertEqual(set(out_gff_ids), expected_good)

        out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
        self.assertEqual(set(out_rep_ids), expected_good)
Example #6
    def run_after(self, rtc, output_dir):
        rep_fn = rtc.task.output_files[0]
        gff_fn = rtc.task.output_files[1]
        abundance_fn = rtc.task.output_files[2]
        group_fn = rtc.task.output_files[3]
        read_stat_fn = rtc.task.output_files[4]
        from pbcore.io import FastqReader
        from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader, ReadStatReader
        self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
        self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
        self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
        self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
        self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)
Example #7
    def add_sample(self, gff_filename, group_filename, sample_prefix, o_gff_fn,
                   o_group_fn, o_mega_fn):
        """Add one more sample to this MagaPBTree object.
        Read gff file to get collapsed isoforms from new sample,
        combine with existing collapsed isoforms and update tree.
        """
        self._add_sample_files(gff_filename=gff_filename,
                               group_filename=group_filename,
                               sample_prefix=sample_prefix)

        # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if
        # both not None)
        combined = []
        unmatched_recs = list(self.record_d.keys())

        for r in CollapseGffReader(gff_filename):
            match_rec = self.match_record_to_tree(r)
            if match_rec is not None:  # found a match! put longer of r1/r2 in
                combined.append((match_rec, r))
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except ValueError:
                    pass  # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {
            '+': ClusterTree(0, 0),
            '-': ClusterTree(0, 0)
        })
        for i, (r1, r2) in enumerate(combined):
            if r2 is None or (r1 is not None
                              and r1.end - r1.start > r2.end - r2.start):
                final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
            else:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename,
                                       sample_prefix, o_gff_fn, o_group_fn,
                                       o_mega_fn)
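A hedged sketch of building a MegaPBTree from one sample and then merging a second, based only on the __init__ and add_sample signatures shown above; every path and prefix below is a placeholder:

# Placeholder file names; not part of this project.
mega = MegaPBTree(gff_filename="sample1.collapsed.gff",
                  group_filename="sample1.collapsed.group.txt",
                  self_prefix="sample1",
                  max_fuzzy_junction=5)
# Merge a second sample's collapsed isoforms and write combined outputs.
mega.add_sample(gff_filename="sample2.collapsed.gff",
                group_filename="sample2.collapsed.group.txt",
                sample_prefix="sample2",
                o_gff_fn="combined.gff",
                o_group_fn="combined.group.txt",
                o_mega_fn="combined.mega_info.txt")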
Example #8
    def test_filter_by_count(self):
        """Test filter_by_count"""
        out_abundance_fn = op.join(_OUT_DIR_, "filter_by_count.abundance.txt")
        out_gff_fn = op.join(_OUT_DIR_, "filter_by_count.gff")
        out_rep_fn = op.join(_OUT_DIR_, "filter_by_count.rep.fastq")
        filter_by_count(in_group_filename=GROUP_FN, in_abundance_filename=ABUNDANCE_FN,
                        in_gff_filename=GFF_FN, in_rep_filename=REP_FN,
                        out_abundance_filename=out_abundance_fn,
                        out_gff_filename=out_gff_fn,
                        out_rep_filename=out_rep_fn,
                        min_count=20)

        out_abundance_ids = [r.pbid for r in AbundanceReader(out_abundance_fn)]
        self.assertEqual(out_abundance_ids, self.expected_good)

        out_gff_ids = [r.seqid for r in CollapseGffReader(out_gff_fn)]
        self.assertEqual(out_gff_ids, self.expected_good)

        out_rep_ids = [r.name.split('|')[0] for r in FastqReader(out_rep_fn)]
        self.assertEqual(out_rep_ids, self.expected_good)
Example #9
def good_isoform_ids_by_removing_subsets(in_gff_filename, max_fuzzy_junction):
    """Return a list of collapsed isoforms ids by removing isoforms which
    are a subset of any other isoform.
    Parameters:
      in_gff_filename -- input collapsed gff file
    """
    recs_dict = defaultdict(lambda: [])

    with CollapseGffReader(in_gff_filename) as gff_reader:
        for r in gff_reader:
            assert r.seqid.startswith('PB.')
            recs_dict[int(r.seqid.split('.')[1])].append(r)

    good = []
    # iterate loci in sorted numeric order so output ids are deterministic
    for k in sorted(recs_dict.keys()):
        recs = recs_dict[k]
        remove_subset_isoforms_from_list(recs,
                                         max_fuzzy_junction=max_fuzzy_junction)
        for r in recs:
            good.append(r.seqid)

    return good
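The returned ids can then be handed to write_good_collapsed_isoforms (next example) to keep only the non-subset isoforms, which is roughly what the filter_out_subsets test above exercises. A sketch, with all paths hypothetical:

# Placeholder paths for illustration only.
good = good_isoform_ids_by_removing_subsets(in_gff_filename="in.collapsed.gff",
                                            max_fuzzy_junction=5)
write_good_collapsed_isoforms(in_abundance_filename="in.abundance.txt",
                              in_gff_filename="in.collapsed.gff",
                              in_rep_filename="in.rep.fastq",
                              out_abundance_filename="out.abundance.txt",
                              out_gff_filename="out.collapsed.gff",
                              out_rep_filename="out.rep.fastq",
                              good=set(good))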
Example #10
def write_good_collapsed_isoforms(in_abundance_filename, in_gff_filename,
                                  in_rep_filename, out_abundance_filename,
                                  out_gff_filename, out_rep_filename, good):
    """Write good collapsed isoforms."""
    in_suffix = parse_ds_filename(in_rep_filename)[1]
    out_suffix = parse_ds_filename(out_rep_filename)[1]
    if in_suffix != out_suffix:
        raise ValueError("Format of input %s and output %s must match." %
                         (in_rep_filename, out_rep_filename))
    if in_suffix not in ("fasta", "fastq"):
        raise ValueError(
            "Format of input %s and output %s must be either FASTA or FASTQ." %
            (in_rep_filename, out_rep_filename))

    # First, read the input gff and write the good gff records.
    with CollapseGffWriter(out_gff_filename) as gff_writer:
        for r in CollapseGffReader(in_gff_filename):
            if r.seqid in good:
                gff_writer.writeRecord(r)

    # next read rep fasta/fastq, and write good rep fasta/fastq record.
    rep_reader = FastaReader(in_rep_filename) if in_suffix == "fasta" \
                 else FastqReader(in_rep_filename)
    rep_writer = FastaWriter(out_rep_filename) if in_suffix == "fasta" \
                 else FastqWriter(out_rep_filename)
    for r in rep_reader:
        # r.name e.g., PB.1.1|PB.1.1:10712-11643(+)|i0_HQ_sample18ba5d|c1543/f8p1/465
        if r.name.split('|')[0] in good:
            rep_writer.writeRecord(r)

    # finally write abundance info of good records.
    with AbundanceReader(in_abundance_filename) as a_reader, \
        AbundanceWriter(out_abundance_filename, comments=a_reader.comments) as a_writer:
        for r in a_reader:
            if r.pbid in good:
                a_writer.writeRecord(r)
def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If is FASTA file -- then always pick the longest one
    If is FASTQ file -- then
          If pick_least_err_instead is True, pick the one w/ least number of expected base errors
          Else, pick the longest one
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq")
                               or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be an indexed FASTA, or contain exactly one FASTQ file
                raise IOError(
                    "%s must contain either indexed FASTA files or "
                    "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = 9999999
        err = 9999999
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors from Phred qualities: sum(10^(-Q/10))
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
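When pick_least_err_instead is used in pick_rep above, the score for each FASTQ member is the expected number of base errors implied by its Phred qualities, i.e. the sum over bases of 10^(-Q/10). A tiny self-contained illustration with arbitrary quality values:

# Arbitrary Phred quality values, for illustration only.
qualities = [30, 20, 40, 10]
# Each Phred score Q corresponds to an error probability of 10**(-Q/10);
# summing these gives the expected number of base errors in the read.
expected_err = sum(10 ** -(q / 10.0) for q in qualities)
print(expected_err)  # 0.1111 expected errors over these four bases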
def collapse_fuzzy_junctions(gff_filename, group_filename, fuzzy_gff_filename,
                             fuzzy_group_filename, allow_extra_5exon,
                             max_fuzzy_junction):
    """
    Collapses those transcripts in gff_filename which have fuzzy junctions.
    Returns fuzzy_match

    Parameters:
      gff_filename -- input unfuzzy gff filename
      group_filename -- input unfuzzy group filename
      fuzzy_gff_filename -- output gff filename in which transcripts with fuzzy
                            junctions are further collapsed.
      fuzzy_group_filename -- output group filename
      allow_extra_5exon -- whether or not to allow extra 5' exons
      max_fuzzy_junction -- maximum differences allowed to call two junctions a match
    """

    d = {}  # seqid --> GmapRecord
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(
        lambda: [])  # seqid --> [seqid of fuzzy match GmapRecords]
    for r in CollapseGffReader(gff_filename):
        # r : a GmapRecord which represents a transcript and its associated exons.
        d[r.seqid] = r
        has_match = False
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            # Compare r with r2 and get the match pattern: exact, super, subset, partial or nonmatch
            m = compare_fuzzy_junctions(r.ref_exons,
                                        r2.ref_exons,
                                        max_fuzzy_junction=max_fuzzy_junction)
            if can_merge(m,
                         r,
                         r2,
                         allow_extra_5exon=allow_extra_5exon,
                         max_fuzzy_junction=max_fuzzy_junction):
                logging.debug("Collapsing fuzzy transcript %s to %s", r.seqid,
                              r2.seqid)
                fuzzy_match[r2.seqid].append(r.seqid)  # collapse r to r2
                has_match = True
                break
        if not has_match:
            logging.debug("No fuzzy transcript found for %s", r.seqid)
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    # Get group info from input group_filename
    group_info = {
        group.name: group.members
        for group in GroupReader(group_filename)
    }

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    # sort PB.<locus>.<isoform> ids numerically
    keys = sorted(fuzzy_match.keys(),
                  key=lambda x: [int(i) for i in x.split('.')[1:]])

    fuzzy_gff_writer = CollapseGffWriter(fuzzy_gff_filename)
    fuzzy_group_writer = GroupWriter(fuzzy_group_filename)
    for k in keys:  # Iterates over each group of fuzzy match GmapRecords
        all_members = []
        # Assume the first GmapRecord is the best to represent this fuzzy match GmapRecords group
        best_pbid = fuzzy_match[k][0]  # e.g., PB.1.1
        if best_pbid not in group_info:
            raise ValueError("Could not find %s in Group file %s" %
                             (best_pbid, group_filename))
        best_size, best_num_exons = len(group_info[best_pbid]), len(
            d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        # continue to look for a better representative
        for pbid in fuzzy_match[k][1:]:
            if pbid not in group_info:
                raise ValueError("Could not find %s in Group file %s" %
                                 (pbid, group_filename))
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        # Write the best GmapRecord of the group to fuzzy_gff_filename
        fuzzy_gff_writer.writeRecord(d[best_pbid])
        # Write all members of the group to fuzzy_group_filename
        fuzzy_group_writer.writeRecord(GroupRecord(best_pbid, all_members))
    fuzzy_gff_writer.close()
    fuzzy_group_writer.close()

    return fuzzy_match
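A minimal end-to-end sketch of driving collapse_fuzzy_junctions, mirroring the test at the top of this page; file names are placeholders, and the outputs can be read back with CollapseGffReader and GroupReader as above:

# Placeholder paths for illustration only.
fuzzy_match = collapse_fuzzy_junctions(gff_filename="in.collapsed.gff",
                                       group_filename="in.collapsed.group.txt",
                                       fuzzy_gff_filename="out.fuzzy.gff",
                                       fuzzy_group_filename="out.fuzzy.group.txt",
                                       allow_extra_5exon=True,
                                       max_fuzzy_junction=5)
# fuzzy_match maps each anchor transcript to itself plus any transcripts
# that were collapsed onto it.
for kept in sorted(fuzzy_match):
    print("%s <- %s" % (kept, ",".join(fuzzy_match[kept])))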