Example 1
from pbtranscript.io import GroupReader

def read_group_file(group_filename, is_cid=True, sample_prefixes=None):
    """
    Map each partitioned result to its final collapsed cluster ID (ex: PB.1.1).
    A partitioned result can be either an ICE cluster (ex: i1_c123) or a CCS read.

    Return: dict of seq_or_ice_cluster --> collapsed cluster ID
    """
    cid_info = {}  # ex: i1 --> c123 --> PB.1.1, or None --> c123 --> PB.1.1
    if sample_prefixes is not None:
        for sample_prefix in sample_prefixes:
            cid_info[sample_prefix] = {}
    else:
        cid_info[None] = {}

    reader = GroupReader(group_filename)
    for group in reader:
        pbid, members = group.name, group.members
        for cid in members:
            # ex: cid is 'i1_c123/f3p0/123' or
            # 'm131116_014707_42141_c100591062550000001823103405221462_s1_p0/93278/31_1189_CCS'
            if sample_prefixes is None:
                if is_cid:
                    cid = cid.split('/')[0]
                cid_info[None][cid] = pbid
            else:
                if any(
                        cid.startswith(sample_prefix + '|')
                        for sample_prefix in sample_prefixes):
                    sample_prefix, cid = cid.split('|', 1)
                    if is_cid:
                        cid = cid.split('/')[0]
                    cid_info[sample_prefix][cid] = pbid
    reader.close()
    return cid_info
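A minimal usage sketch (the file name and sample prefixes below are hypothetical):

# Hypothetical usage: look up the collapsed cluster ID for an ICE cluster.
cid_info = read_group_file("combined.group.txt", is_cid=True,
                           sample_prefixes=["SampleA", "SampleB"])
print(cid_info["SampleA"].get("i1_c123"))  # e.g. 'PB.1.1'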
Example 2
from pbtranscript.io import GroupReader, AbundanceReader
# Note: SampleIsoformName is also required; its import path is not shown here.

def good_isoform_ids_by_count(in_group_filename, in_abundance_filename,
                              min_count):
    """Return a list of collapsed isoforms ids whose supportive FL
    count >= min_count.
    Parameters:
      in_group_filename -- group file of collapsed isoforms
      in_abundance_filename -- abundance file of collapsed isoforms
      min_count -- min number of supportive FL reads to be 'good'
    """
    # read group file: track the max FL / nFL count among each group's members
    # (only the FL counts are used by the abundance filter below)
    group_max_count_fl = {}
    group_max_count_nfl = {}
    with GroupReader(in_group_filename) as g_reader:
        for g in g_reader:
            pbid, members = g.name, g.members
            group_max_count_fl[pbid] = 0
            group_max_count_nfl[pbid] = 0
            for m in members:
                s = SampleIsoformName.fromString(m)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid],
                                               s.num_fl)
                group_max_count_nfl[pbid] = max(group_max_count_nfl[pbid],
                                                s.num_nfl)

    # read abundance to decide good collapsed isoforms based on count
    good = [
        r.pbid for r in AbundanceReader(in_abundance_filename)
        if r.count_fl >= min_count and group_max_count_fl[r.pbid] >= min_count
    ]

    return good
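A minimal usage sketch (hypothetical file names), keeping isoforms supported by at least two FL reads:

# Hypothetical usage: filter collapsed isoforms by FL read support.
good_ids = good_isoform_ids_by_count("collapsed.group.txt",
                                     "collapsed.abundance.txt",
                                     min_count=2)
print("%d isoforms pass the FL count filter" % len(good_ids))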
Example 3
from pbtranscript.io import GroupReader

def read_group(group_filename, group_prefix):
    """Read a group file into a dict, optionally tagging members with group_prefix.

    If group_prefix is None: return {group.pbid --> group.members};
    else: return {group.pbid --> [group_prefix + '|' + m for m in group.members]}.
    """
    return {
        group.name: group.members
        for group in GroupReader(group_filename, group_prefix)
    }
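A minimal usage sketch (hypothetical file name and prefix):

# Hypothetical usage: load groups, tagging each member with a sample prefix.
groups = read_group("combined.group.txt", "SampleA")
# e.g. groups["PB.1.1"] --> ["SampleA|i1_c123/f3p0/123", ...]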
Example 4
def run_after(self, rtc, output_dir):
    rep_fn = rtc.task.output_files[0]
    gff_fn = rtc.task.output_files[1]
    abundance_fn = rtc.task.output_files[2]
    group_fn = rtc.task.output_files[3]
    read_stat_fn = rtc.task.output_files[4]
    from pbcore.io import FastqReader
    from pbtranscript.io import CollapseGffReader, AbundanceReader, GroupReader, ReadStatReader
    self.assertEqual(len([r for r in FastqReader(rep_fn)]), 65)
    self.assertEqual(len([r for r in CollapseGffReader(gff_fn)]), 65)
    self.assertEqual(len([r for r in AbundanceReader(abundance_fn)]), 65)
    self.assertEqual(len([r for r in GroupReader(group_fn)]), 86)
    self.assertEqual(len([r for r in ReadStatReader(read_stat_fn)]), 10873)
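The repeated len([r for r in Reader(fn)]) pattern can be written without building a list; a sketch, assuming each reader supports the context-manager protocol the way GroupReader does in the examples above:

def count_records(reader_cls, fn):
    # Count records by iterating once, without materializing a list.
    with reader_cls(fn) as reader:
        return sum(1 for _ in reader)

# e.g. self.assertEqual(count_records(GroupReader, group_fn), 86)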
Example 5
def pick_rep(isoform_filename,
             gff_filename,
             group_filename,
             output_filename,
             pick_least_err_instead=False,
             bad_gff_filename=None):
    """
    For each group of collapsed sam records, select the representative record.

    If the input is a FASTA file, always pick the longest sequence.
    If the input is a FASTQ file:
        if pick_least_err_instead is True, pick the sequence with the fewest
        expected base errors; otherwise, pick the longest one.
    """
    fd = None
    is_fq = False
    dummy_prefix, _suffix = parse_ds_filename(isoform_filename)
    if _suffix == "fasta":
        fd = FastaRandomReader(isoform_filename)
    elif _suffix == "fastq":
        fd = FastqRandomReader(isoform_filename)
        is_fq = True
    elif _suffix == "contigset.xml":
        fd = ContigSet(isoform_filename)
        _fns = fd.toExternalFiles()
        if len(_fns) == 1 and (_fns[0].endswith(".fq")
                               or _fns[0].endswith(".fastq")):
            fd = FastqRandomReader(_fns[0])
            is_fq = True
        else:
            if not fd.isIndexed:
                # Must be an indexed FASTA, or contain exactly one FASTQ file
                raise IOError(
                    "%s must contain either indexed FASTA files or "
                    "exactly one FASTQ file!" % isoform_filename)
    else:
        raise IOError("Unable to recognize file type of %s." %
                      isoform_filename)

    fa_out_fn, fq_out_fn, ds_out_fn = None, None, None

    _prefix, _suffix = parse_ds_filename(output_filename)
    if _suffix == "fasta":
        fa_out_fn = output_filename
    elif _suffix == "fastq":
        if not is_fq:
            raise ValueError("Input file %s is not FASTQ while output is." %
                             isoform_filename)
        else:
            fq_out_fn = output_filename
    elif _suffix == "contigset.xml":  # output is contigset.xml
        ds_out_fn = output_filename
        fa_out_fn = _prefix + ".fasta"
        if is_fq:
            fq_out_fn = _prefix + ".fastq"
    else:
        raise IOError("Unable to recognize file type of %s." % output_filename)

    fa_writer = FastaWriter(fa_out_fn) if fa_out_fn is not None else None
    fq_writer = FastqWriter(fq_out_fn) if fq_out_fn is not None else None

    coords = {}
    for r in CollapseGffReader(gff_filename):
        tid = r.transcript_id
        coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                r.strand)

    if bad_gff_filename is not None:
        for r in CollapseGffReader(bad_gff_filename):
            tid = r.transcript_id
            coords[tid] = "{0}:{1}-{2}({3})".format(r.seqid, r.start, r.end,
                                                    r.strand)

    for group in GroupReader(group_filename):
        pb_id, members = group.name, group.members
        if pb_id not in coords:
            raise ValueError("Could not find %s in %s and %s" %
                             (pb_id, gff_filename, bad_gff_filename))
        #logging.info("Picking representative sequence for %s", pb_id)
        best_id = None
        best_seq = None
        best_qual = None
        best_err = float('inf')
        err = float('inf')
        max_len = 0

        for x in members:
            if is_fq and pick_least_err_instead:
                # expected number of base errors from Phred QVs: sum of 10^(-Q/10)
                err = sum(10**-(i / 10.) for i in fd[x].quality)
            if (is_fq and pick_least_err_instead and err < best_err) or \
               ((not is_fq or not pick_least_err_instead) and len(fd[x].sequence) >= max_len):
                best_id = x
                best_seq = fd[x].sequence
                if is_fq:
                    best_qual = fd[x].quality
                    best_err = err
                max_len = len(fd[x].sequence)

        _id_ = "{0}|{1}|{2}".format(pb_id, coords[pb_id], best_id)
        _seq_ = best_seq
        if fq_writer is not None:
            fq_writer.writeRecord(_id_, _seq_, best_qual)
        if fa_writer is not None:
            fa_writer.writeRecord(_id_, _seq_)

    if fa_writer is not None:
        fa_writer.close()
    if fq_writer is not None:
        fq_writer.close()
    if ds_out_fn is not None:
        as_contigset(fa_out_fn, ds_out_fn)
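A minimal usage sketch (hypothetical file names), choosing each representative by fewest expected base errors:

# Hypothetical usage: write one representative FASTQ record per collapsed group.
pick_rep(isoform_filename="isoforms.fastq",
         gff_filename="collapsed.good.gff",
         group_filename="collapsed.group.txt",
         output_filename="collapsed.rep.fastq",
         pick_least_err_instead=True,
         bad_gff_filename="collapsed.bad.gff")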
Example 6
def collapse_fuzzy_junctions(gff_filename, group_filename, fuzzy_gff_filename,
                             fuzzy_group_filename, allow_extra_5exon,
                             max_fuzzy_junction):
    """
    Collapses those transcripts in gff_filename which have fuzzy junctions.
    Returns fuzzy_match, a dict mapping each representative seqid to the
    seqids collapsed into it.

    Parameters:
      gff_filename -- input unfuzzy gff filename
      group_filename -- input unfuzzy group filename
      fuzzy_gff_filename -- output gff filename in which transcripts with fuzzy
                            junctions are further collapsed.
      fuzzy_group_filename -- output group filename
      allow_extra_5exon -- whether or not to allow an extra 5' exon
      max_fuzzy_junction -- maximum allowed junction difference for two
                            exons to be considered a match
    """

    d = {}  # seqid --> GmapRecord
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(list)  # seqid --> [seqids of fuzzy-match GmapRecords]
    for r in CollapseGffReader(gff_filename):
        # r : a GmapRecord which represents a transcript and its associated exons.
        d[r.seqid] = r
        has_match = False
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            # Compare r1 with r2 and get match pattern, exact, super, subset, partial or nonmatch
            m = compare_fuzzy_junctions(r.ref_exons,
                                        r2.ref_exons,
                                        max_fuzzy_junction=max_fuzzy_junction)
            if can_merge(m,
                         r,
                         r2,
                         allow_extra_5exon=allow_extra_5exon,
                         max_fuzzy_junction=max_fuzzy_junction):
                logging.debug("Collapsing fuzzy transcript %s to %s", r.seqid,
                              r2.seqid)
                fuzzy_match[r2.seqid].append(r.seqid)  # collapse r to r2
                has_match = True
                break
        if not has_match:
            logging.debug("No fuzzy transcript found for %s", r.seqid)
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    # Get group info from input group_filename
    group_info = {
        group.name: group.members
        for group in GroupReader(group_filename)
    }

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(),
                  key=lambda x: [int(p) for p in x.split('.')[1:]])

    fuzzy_gff_writer = CollapseGffWriter(fuzzy_gff_filename)
    fuzzy_group_writer = GroupWriter(fuzzy_group_filename)
    for k in keys:  # Iterates over each group of fuzzy match GmapRecords
        all_members = []
        # Assume the first GmapRecord is the best to represent this fuzzy match GmapRecords group
        best_pbid = fuzzy_match[k][0]  # e.g., PB.1.1
        if best_pbid not in group_info:
            raise ValueError("Could not find %s in Group file %s" %
                             (best_pbid, group_filename))
        # Use the FL count as size, consistent with the comparison below
        best_size = get_fl_from_id(group_info[best_pbid])
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        # Continue to look for a better representative among the rest
        for pbid in fuzzy_match[k][1:]:
            if pbid not in group_info:
                raise ValueError("Could not find %s in Group file %s" %
                                 (pbid, group_filename))
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        # Write the best GmapRecord of the group to fuzzy_gff_filename
        fuzzy_gff_writer.writeRecord(d[best_pbid])
        # Write all members of the group to fuzzy_group_filename
        fuzzy_group_writer.writeRecord(GroupRecord(best_pbid, all_members))
    fuzzy_gff_writer.close()
    fuzzy_group_writer.close()

    return fuzzy_match
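A minimal usage sketch (hypothetical file names), merging transcripts whose junctions differ by at most 5 bp:

# Hypothetical usage: collapse fuzzy junctions and inspect the merge map.
fuzzy_match = collapse_fuzzy_junctions(
    gff_filename="collapsed.gff",
    group_filename="collapsed.group.txt",
    fuzzy_gff_filename="collapsed.fuzzy.gff",
    fuzzy_group_filename="collapsed.fuzzy.group.txt",
    allow_extra_5exon=True,
    max_fuzzy_junction=5)
for rep, members in fuzzy_match.items():
    print(rep, "<-", members)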