def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix):
        combined = [] # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = list(self.record_d.keys())  # work on a copy so matched seqids can be removed below

        for r in GFF.collapseGFFReader(gff_filename):
            match_rec = self.match_record_to_tree(r)
            if match_rec is not None:  # found a match! put longer of r1/r2 in
                combined.append((match_rec, r))
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except ValueError:
                    pass # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1,r2) in enumerate(combined):
            if r2 is None or (r1 is not None and r1.end-r1.start > r2.end-r2.start):
                final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
            else:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix)
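
The loci re-clustering above relies on bx-python's ClusterTree. Below is a minimal sketch of how that container behaves, assuming ClusterTree(max_dist, min_intervals) semantics and the insert()/getregions() calls this code uses; the coordinates are made up for illustration.

from bx.intervals.cluster import ClusterTree

ct = ClusterTree(0, 0)        # (0, 0): any overlapping intervals are grouped into one region
ct.insert(100, 500, 0)        # transcript index 0
ct.insert(450, 900, 1)        # overlaps index 0 -> same locus
ct.insert(2000, 2500, 2)      # disjoint -> its own locus
regions = ct.getregions()     # expected: roughly [(100, 900, [0, 1]), (2000, 2500, [2])]
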
    def __init__(self, gff_filename, group_filename, self_prefix=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        self.tree = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree

        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix) # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        self.tree = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree

        #print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        #raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix) # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
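
A rough usage sketch of how these pieces chain together when merging two samples; the file names and prefixes below are placeholders, not taken from the original code.

t = MegaPBTree('sample1.collapsed.gff', 'sample1.group.txt',
               internal_fuzzy_max_dist=5, self_prefix='sample1')
t.add_sample('sample2.collapsed.gff', 'sample2.group.txt',
             sample_prefix='sample2', output_prefix='sample1_2.merged')
# write_cluster_tree_as_gff() (not shown in this excerpt) is expected to emit the
# merged records under output_prefix
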
Example 6
def filter_by_count(input_prefix, output_prefix, min_count):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    # read group
    group_max_count_fl = {}
    group_max_count_p = {}
    f = open(group_filename)
    for line in f:
        #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
        pbid, members = line.strip().split('\t')
        group_max_count_fl[pbid] = 0
        group_max_count_p[pbid] = 0
        members = members.split(',')
        for m in members:
            tmp = m.split('|')[1].split('/')[1] #ex: tmp = f30p16
            fl_count, p_count = tmp.split('p')
            fl_count = int(fl_count[1:])
            p_count = int(p_count)
            group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
            group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    # (debug) dump the parsed abundance records
    for k, v in d.iteritems():
        print k, v
    f.close()

    # group_max_count_p NOT used for now (the >= 0 test below is always true)
    good = filter(lambda x: int(d[x]['count_fl']) >= min_count and
                            group_max_count_fl[x] >= min_count and
                            group_max_count_p[x] >= 0, d)

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()


    # write output rep.fq
    f = FastqWriter(output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
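
A hypothetical invocation, assuming the <prefix>.group.txt / .abundance.txt / .gff / .rep.fq inputs were produced by an upstream collapse step; the prefixes are placeholders.

# keep only isoforms whose FL count reaches 2 in both the abundance table and the group file
filter_by_count('hq.collapsed', 'hq.collapsed.min_fl_2', min_count=2)
# outputs written: hq.collapsed.min_fl_2.gff, .rep.fq and .abundance.txt
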
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix")
    parser.add_argument("output_prefix", help="Output prefix")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(args.output_prefix + '.gff', 'w')
    keys = recs.keys()
    keys.sort()
    for k in keys:  # iterate loci in sorted order
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    # (debug) dump the parsed abundance records
    for k, v in d.iteritems():
        print k, v
    f.close()

    # write output rep.fq
    f = FastqWriter(args.output_prefix + '.rep.fq')
    for r in FastqReader(rep_filename):
        if r.name.split('|')[0] in good:
            f.writeRecord(r)
    f.close()

    # write output to .abundance.txt
    f = open(args.output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()
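
The fragment stops at main(); when run as a script it would presumably be invoked through the usual guard:

if __name__ == "__main__":
    main()
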
Example 8
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # reached only if m is 'subset' or 'super' AND allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = fuzzy_match.keys()
    keys.sort(key=lambda x: map(int, x.split('.')[1:]))
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        # seed with the first member; per the comment above, ties on exon count are broken by FL count
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
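
A hypothetical call with placeholder file names; the function writes <gff_filename>.fuzzy and <group_filename>.fuzzy next to the inputs and returns the representative-to-members map.

fuzzy_match = collapse_fuzzy_junctions(
    'hq.collapsed.gff', 'hq.collapsed.group.txt',
    allow_extra_5exon=True, internal_fuzzy_max_dist=5)
# fuzzy_match: representative pbid -> list of pbids merged into it
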
def read_gff_as_interval_tree(self):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    for r in GFF.collapseGFFReader(self.gff_filename):
        self.tree[r.chr][r.strand].insert(r.start, r.end, r)
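
Once loaded this way, overlap queries are plain find() calls on the per-chromosome, per-strand trees. A small illustrative lookup; the chromosome name and coordinates are made up, and t is assumed to be a MegaPBTree instance built as in the earlier sketch.

hits = t.tree['chr1']['+'].find(10000, 25000)   # records overlapping 10000-25000 on chr1, + strand
seqids = [rec.seqid for rec in hits]            # each hit is the full collapsed GFF record inserted above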