Code example #1
def scrub_sample_GFFs(sample_dirs, gff_filename, count_filename,
                      group_filename, fastq_filename, output_prefix, tree):

    for sample_name, d in sample_dirs.items():
        outf = open(os.path.join(d, output_prefix + '.gff.tmp'), 'w')
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                # single-exon records have no junctions to scrub; write as-is
                GFF.write_collapseGFF_format(outf, r)
                continue

            new_ref_exons = scrub_ref_exons(r, tree)
            if new_ref_exons is None:
                print("No changes made due to error:",
                      r.seqid,
                      file=sys.stderr)
            else:
                r.ref_exons = new_ref_exons
            GFF.write_collapseGFF_format(outf, r)
        outf.close()
        cleanup_scrubbed_files_redundancy(outf.name,
                                          os.path.join(d, group_filename),
                                          os.path.join(d, count_filename),
                                          os.path.join(d, fastq_filename) if fastq_filename is not None else None,
                                          os.path.join(d, output_prefix))
Code example #2
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename, rep_type = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:  # iterate in sorted order; looping over `recs` itself ignored the sort
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa'),
             'w')
    for r in SeqIO.parse(open(rep_filename), rep_type):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, rep_type)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa'), file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
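
Several of these examples call read_count_file without showing it. The sketch below is a hypothetical reconstruction, inferred from how its two return values are used here (Code example #28 inlines the same logic): the returned dict maps pbid to its abundance row, and count_header is the literal '#'-prefixed header block that gets echoed into the output file.

from csv import DictReader

def read_count_file(count_filename):
    """Hypothetical sketch of the helper assumed by the examples (not in the excerpts)."""
    count_header = ''
    with open(count_filename) as f:
        while True:
            cur = f.tell()
            line = f.readline()
            if not line.startswith('#'):
                f.seek(cur)  # rewind to the first data line
                break
            count_header += line
        d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    return d, count_header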
Code example #3
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)  # ceiling division: records per split file

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    return result_prefixes, split_files
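
chain_split_file emits records cluster by cluster, so each split file holds whole genomic loci rather than arbitrary slices of the input. A minimal sketch of the ClusterTree behavior it relies on, assuming bx-python's bx.intervals.cluster API:

from bx.intervals.cluster import ClusterTree

# ClusterTree(0, 0) merges overlapping intervals into regions;
# getregions() yields one (start, end, [inserted ids]) per merged region.
ct = ClusterTree(0, 0)
ct.insert(100, 200, 0)
ct.insert(150, 300, 1)
ct.insert(500, 600, 2)
print(ct.getregions())  # expected: [(100, 300, [0, 1]), (500, 600, [2])]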
Code example #4
def write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq=False):
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {'+':ClusterTree(0,0), '-':ClusterTree(0,0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i,match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(list(tree_keys_numeric)) + sorted(list(tree_keys_alpha))

    f_gff = open(final_prefix+'.gff', 'w')
    f_info = open(final_prefix+'.mega_info.txt', 'w')
    writer_info = DictWriter(f_info, fieldnames=['superPBID', ref_name, addon_name], delimiter='\t')
    writer_info.writeheader()
    f_group = open(final_prefix+'.group.txt', 'w')
    if use_fq:
        f_fq = open(final_prefix+'.rep.fq', 'w')
    # walk the combined gff (tree) by sorted chromosome, then strand ('+' before '-')

    new_group_info = {}

    pb_i = 0

    for _chr in tree_keys:
        # remember to convert potential integer chromosome keys back to strings now that we've sorted them!
        _chr = str(_chr)
        for _strand in ('+', '-'):
            for _start,_end,_indices in tree[_chr][_strand].getregions():
                # further sort these records by (start, end, num_exons)
                _indices.sort(key=lambda i: (rec_list[i].rec.start, rec_list[i].rec.end, len(rec_list[i].rec.ref_exons)))
                pb_i += 1
                for pb_j, recs_index in enumerate(_indices):
                    pbgene = "PB.{0}".format(pb_i)
                    pbid = "PB.{0}.{1}".format(pb_i, pb_j + 1)
                    match_rec = rec_list[recs_index]
                    new_group_info[pbid] = match_rec.members
                    match_rec.rec.seqid = pbid
                    match_rec.rec.geneid = pbgene
                    GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                    writer_info.writerow({'superPBID': pbid, ref_name: match_rec.ref_id, addon_name: match_rec.addon_id})
                    f_group.write("{0}\t{1}\n".format(pbid, ",".join(match_rec.members)))
                    if use_fq:
                        match_rec.seqrec.id = pbid
                        match_rec.seqrec.description = ''
                        SeqIO.write(match_rec.seqrec, f_fq, 'fastq')
    f_gff.close()
    f_info.close()
    f_group.close()
    if use_fq:
        f_fq.close()
    return new_group_info
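
The .group.txt lines written here use the '&lt;pbid&gt;&lt;TAB&gt;&lt;member1&gt;,&lt;member2&gt;,...' layout that MegaPBTree.read_group consumes elsewhere in these examples. A hypothetical stand-in reader, assuming no sample prefixing (Code example #24 inlines the identical parse):

def read_group_file(group_filename):
    """Hypothetical equivalent of sp.MegaPBTree.read_group(group_filename, None)."""
    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = members.split(',')
    return group_info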
Code example #5
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set()) # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None: handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())  # snapshot: the dict is mutated in the loop
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print("Ignore {0} because unmapped.".format(r.qID), file=sys.stderr)
            continue
        m = rex_pbid.match(r.qID)
        if m is not None: pbid = m.group(1)
        else: pbid = r.qID
        # convert SAM record to GFF record type
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid), file=sys.stderr)
            continue  # avoid the KeyError the lookup below would raise
        groups_to_write_in = set()
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
Code example #6
def regroup_gff(pooled_gff,
                demux_count_file,
                output_prefix,
                out_group_dict,
                in_fafq=None):
    """
    :param pooled_gff: GFF file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set())  # pbid --> set of tissues it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open(
                "{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq),
                'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        pbid = r.seqid
        if pbid not in in_tissue:
            print(
                "WARNING: {0} does not belong to any group indicated by outgroup_dict"
                .format(pbid),
                file=sys.stderr)
            continue  # avoid the KeyError the lookup below would raise
        groups_to_write_in = set()
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
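
Both regroup functions expect the demux count file to be a plain CSV with an 'id' column plus one column per barcode; a positive count marks the pbid as present in that barcode. A toy run of the same parsing logic on a made-up two-transcript table:

from csv import DictReader
from io import StringIO

demo = StringIO("id,EM1,EM2,END\nPB.1.1,3,0,1\nPB.1.2,0,0,0\n")
in_tissue = {}
for r in DictReader(demo, delimiter=','):
    in_tissue[r['id']] = set(k for k, v in r.items() if k != 'id' and int(v) > 0)
print(in_tissue)  # -> {'PB.1.1': {'EM1', 'END'}, 'PB.1.2': set()}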
Code example #7
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:  # iterate in sorted order; looping over `recs` itself ignored the sort
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Code example #8
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [
        fusion_id
        for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    f = open(count_filename)
    for i in range(14):
        f.readline()  # skip the header lines
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(
            set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [
            r.id.split('|')[0]
            for r in SeqIO.parse(open(fastq_filename), 'fastq')
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
Code example #9
File: chain_samples.py  Project: Magdoll/cDNA_Cupcake
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:
        # advance through the headers which start with #
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
Code example #10
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'
    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename),'fastq')])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(pbids1) != len(pbids3):
        print("The number of PBID records in the files disagree! Sanity check failed.", file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)), file=sys.stderr)
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
Code example #11
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = set(self.record_d_fusion.keys())  # a set: .keys() has no .remove() in Python 3

        for _id, records in GFF.collapseGFFFusionReader(gff_filename):
            match_seqid = self.match_fusion_record(records)
            if match_seqid is not None:
                combined.append((self.record_d_fusion[match_seqid], records))
                try:
                    unmatched_recs.remove(match_seqid)
                except KeyError:
                    pass # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, records))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d_fusion[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1s,r2s) in enumerate(combined):
            if r2s is None or (r1s is not None and r1s[0].end-r1s[0].start > r2s[0].end-r2s[0].start):
                final_tree[r1s[0].chr][r1s[0].strand].insert(r1s[0].start, r1s[0].end, i)
            else:
                final_tree[r2s[0].chr][r2s[0].strand].insert(r2s[0].start, r2s[0].end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
Code example #12
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (<matches to r2 or None>, r2)
        unmatched_recs = set(self.record_d.keys())

        for r in GFF.collapseGFFReader(gff_filename):
            match_rec_list = list(self.match_record_to_tree(r))  # avoid shadowing the loop variable `r`
            if len(match_rec_list) > 0:  # found match(es)! put longer of r1/r2 in
                combined.append((match_rec_list, r))
                for match_rec in match_rec_list:
                    try:
                        unmatched_recs.remove(match_rec.seqid)
                    except KeyError:
                        pass # already deleted, OK, this can happen
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append(([self.record_d[seqid]], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1s,r2) in enumerate(combined):
            if r1s is None:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
            else:
                if r2 is not None:
                    rep = find_representative_in_iso_list(r1s + [r2])
                else:
                    rep = find_representative_in_iso_list(r1s)
                final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
Code example #13
    def __init__(self,
                 gff_filename,
                 group_filename,
                 internal_fuzzy_max_dist=0,
                 self_prefix=None,
                 allow_5merge=False,
                 fastq_filename=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.allow_5merge = allow_5merge
        self.record_d = dict(
            (r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        #sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {
            '+': IntervalTree(),
            '-': IntervalTree()
        })  # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)

        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(
            self.group_filename,
            self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
Code example #14
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:
        # advance through the headers which start with #
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(
            set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [
            r.id.split('|')[0]
            for r in SeqIO.parse(open(fastq_filename), 'fastq')
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
Code example #15
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = set(self.record_d.keys())  # a set: .keys() has no .remove() in Python 3

        for r in GFF.collapseGFFReader(gff_filename):
            match_rec = self.match_record_to_tree(r)
            if match_rec is not None:  # found a match! put longer of r1/r2 in
                combined.append((match_rec, r))
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except KeyError:
                    pass # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1,r2) in enumerate(combined):
            if r2 is None or (r1 is not None and r1.end-r1.start > r2.end-r2.start):
                final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
            else:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
Code example #16
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()  # close the GFF handle before `f` is reused below

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Code example #17
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()  # close the GFF handle before `f` is reused below

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Code example #18
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filenames = [(input_prefix + '.rep.fq', 'fastq'), (input_prefix + '.rep.fastq', 'fastq'),
                     (input_prefix + '.rep.fa', 'fasta'), (input_prefix + '.rep.fasta', 'fasta')]

    rep_filename = None
    rep_type = None
    for x, ftype in rep_filenames:  # `ftype`, not `type`, to avoid shadowing the built-in
        if os.path.exists(x):
            rep_filename = x
            rep_type = ftype

    if rep_filename is None:
        print(
            "Expected to find input fasta or fastq files {0}.rep.fa or {0}.rep.fq. Not found. Abort!"
            .format(input_prefix),
            file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename),
              file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename),
              file=sys.stderr)
        sys.exit(-1)

    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename), rep_type)])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(
            pbids1) != len(pbids3):
        print(
            "The number of PBID records in the files disagree! Sanity check failed.",
            file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)),
              file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)),
              file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)),
              file=sys.stderr)
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename, rep_type
Code example #19
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, fastq_filename=None, fusion_max_dist=10):
        """
        Differences with non-fusion MegaPBTree:

        1. allow_5merge is always FALSE. Not a parameter.
        2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions
        """
        super(MegaPBTreeFusion, self).__init__(gff_filename, group_filename, internal_fuzzy_max_dist, self_prefix, False, fastq_filename)

        self.fusion_max_dist = fusion_max_dist

        # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
        self.record_d_fusion = dict((fusion_id, records) for fusion_id,records in GFF.collapseGFFFusionReader(gff_filename))
Code example #20
def make_fake_genome(genome_filename, gff_filename, ref_chr, ref_start, ref_end, ref_strand, output_prefix, output_name, genome_d=None):
    if genome_d is None:
        print("Reading genome file {0}...".format(genome_filename), file=sys.stderr)
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename),'fasta'))
    else:
        d = genome_d

    print("Reading GFF file {0}...".format(gff_filename), file=sys.stderr)
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if r.chr==ref_chr and r.strand==ref_strand and \
                (ref_start <= r.start < r.end <= ref_end) \
            and len(r.ref_exons) > 1:
            print("Adding {0} to fake genome.".format(r.seqid), file=sys.stderr)
            good.append(r)

    if len(good) == 0:
        print("Did not find any transcripts strictly within {0}:{1}-{2} on strand {3}. Abort!".format(\
            ref_chr, ref_start, ref_end, ref_strand), file=sys.stderr)
        sys.exit(-1)

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(e.start-extra_bp_around_junctions, e.end+extra_bp_around_junctions, 1)

    regions = [(a,b) for (a,b,junk) in c.getregions()]
    regions[0] = (regions[0][0]-__padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1]+__padding_before_after__)

    with open(output_prefix+'.fasta', 'w') as f:
        f.write(">" + output_name + "\n")
        for a,b in regions:
            f.write(str(d[ref_chr][a:b].seq))  # every record in `good` is on ref_chr
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix+'.mapping.txt', 'w') as f:
        i = 0
        for a,b in regions:
            for j in range(a, b):
                f.write("{0},{1},{2}\n".format(i, ref_chr, j))
                i += 1

    with open(output_prefix+'.pbids.txt', 'w') as f:
        f.write("\n".join(r.seqid for r in good)+'\n')

    print("Output written to {0}.fasta, {0}.mapping.txt, {0}.pbids.txt.".format(output_prefix), file=sys.stderr)
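
Each line of the .mapping.txt written above is '&lt;0-based fake-genome position&gt;,&lt;ref chrom&gt;,&lt;0-based ref position&gt;'. A minimal sketch of reading it back to lift fake-genome coordinates onto the reference (this loader is assumed, not part of the excerpts):

def load_fake_genome_mapping(mapping_filename):
    """Hypothetical reader for make_fake_genome's <prefix>.mapping.txt;
    index the returned list with a fake-genome position to get (ref_chr, ref_pos)."""
    mapping = []
    with open(mapping_filename) as f:
        for line in f:
            _i, ref_chr, j = line.strip().split(',')
            mapping.append((ref_chr, int(j)))
    return mapping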
Code example #21
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, allow_5merge=False, fastq_filename=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.allow_5merge = allow_5merge
        self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        #sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)


        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix) # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
Code example #22
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [fusion_id for fusion_id,rs in GFF.collapseGFFFusionReader(gff_filename)]
    f = open(count_filename)
    for i in range(14): f.readline() # skip the header lines
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1))>0 or len(set(ids2).difference(ids3))>0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4))>0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
Code example #23
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome.

    Return: dict of {cogent path} --> list of gmapRecord; set of mapped genome contigs

    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    d = defaultdict(lambda: [])
    contigs_seen = set()

    if not os.path.exists(filename):
        return {}, set()

    try:
        for r in GFF.gmapGFFReader(filename):
            d[r.seqid].append(r)
            contigs_seen.add(r.chr)
    except IndexError:
        pass
    return dict(d), contigs_seen
Code example #24
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist,
                             max_5_diff=1000, max_3_diff=100):  # passed explicitly; the excerpt read these from a global `args` (defaults assumed)
    def get_fl_from_id(members):
        try:
            # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
            return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)
        except ValueError:
            return 0

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1 #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist, max_5_diff=max_5_diff, max_3_diff=max_3_diff)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])
    f_gff = open(gff_filename+'.fuzzy', 'w')
    f_group = open(group_filename+'.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            # note: get_fl_from_id only works on IsoSeq1 and 2 ID formats, will return 0 if IsoSeq3 format or other
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
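
get_fl_from_id above depends on the IsoSeq1/2 member-ID layout shown in its comment: the FL count is the digits after 'f' in the '/f&lt;FL&gt;p&lt;non-FL&gt;/' field. Worked through on the example ID from the code:

_id = "13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178"
fl_field = _id.split('/')[1]                 # 'f1p0'
fl_count = int(fl_field.split('p')[0][1:])   # 'f1' -> 1 full-length read
print(fl_count)
# IDs that don't follow this layout make int() raise ValueError,
# which get_fl_from_id converts into a count of 0.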
Code example #25
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in seqs or r.seqid.split('|')[0] in seqs or (partial_ok and any(r.seqid.startswith(x) for x in seqs)):
            GFF.write_collapseGFF_format(sys.stdout, r)
Code example #26
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return
    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.gff')
    reader = GFF.gmapGFFReader(filename)
    for r in reader:
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.gff'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome2 + '.gff'))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info),
                                      len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1,
        ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
        len(contig_genome2), cov2, acc2, has_chimeric2,
        ",".join(contig_genome2)))

    in_aligned_to_genome1 = os.path.join(dirname,
                                         'in.trimmed.fa.' + genome1 + '.gff')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.items():
        contigs = [x.chr for x in v]
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage
                                                            for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname,
                                               ",".join(contigs), acc))

        if seqid not in d3:
            f2.write("NA\t0\tNA\tNA\n")
        else:
            scaffolds = [x.chr for x in d3[seqid]]
            cov = sum(x.coverage for x in d3[seqid])
            acc = sum(x.identity * x.coverage for x in d3[seqid]) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds),
                                                   len(scaffolds), cov, acc))
Code example #27
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: filtered.microexon)")
    parser.add_argument(
        "--micro_exon_size",
        type=int,
        default=12,
        help="Filter away microexons < micro_exon_size (default: 12bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered.microexon'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        for r in xxx:
            min_exon_size = min(e.end - e.start for e in r.ref_exons)
            if min_exon_size > args.micro_exon_size:  # use the CLI threshold instead of a hardcoded 12
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Code example #28
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'],
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Code example #29
    def read_gff_as_interval_tree(self):
        """
        Read a collapsed GFF file into an IntervalTree
        """
        for r in GFF.collapseGFFReader(self.gff_filename):
            self.tree[r.chr][r.strand].insert(r.start, r.end, r)
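
The tree filled here is one IntervalTree per (chromosome, strand); other examples query it with .find(start, end) to collect candidate overlapping records (see Code example #24). A small sketch, assuming bx-python's bx.intervals.intersection API:

from bx.intervals.intersection import IntervalTree

t = IntervalTree()
t.insert(100, 200, 'PB.1.1')   # (start, end, payload)
t.insert(150, 400, 'PB.1.2')
print(t.find(180, 190))        # expected: payloads of every overlapping interval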
Code example #30
def summarize_junctions(sample_dirs, sample_names, gff_filename, output_prefix, genome_d=None, junction_known=None):
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)

    """
    junc_by_chr_strand = defaultdict(lambda: defaultdict(lambda: [])) # (chr,strand) --> (donor,acceptor) --> samples it show up in (more than once possible)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1: continue # ignore single exon transcripts
            for i in range(n-1):
                donor = r.ref_exons[i].end-1 # make it 0-based
                accep = r.ref_exons[i+1].start # start is already 0-based
                junc_by_chr_strand[r.chr, r.strand][donor, accep].append(sample_name)

    # write junction report
    f1 = open(output_prefix+'.junction.bed', 'w')
    f1.write("track name=junctions description=\"{0}\" useScore=1\n".format(output_prefix))

    JUNC_DETAIL_FIELDS = ['chr', 'left', 'right', 'strand', 'num_transcript', 'num_sample', 'genome', 'annotation', 'label']


    with open(output_prefix+'.junction_detail.txt', 'w') as f:
        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter='\t')
        writer.writeheader()
        keys = sorted(junc_by_chr_strand.keys())
        for _chr, _strand in keys:
            v = junc_by_chr_strand[_chr, _strand]
            v_keys = sorted(v.keys())
            labels = cluster_junctions(v_keys)
            for i,(_donor, _accep) in enumerate(v_keys):
                rec = {'chr': _chr,
                       'left': _donor,
                       'right': _accep,
                       'strand': _strand,
                       'num_transcript': len(v[_donor,_accep]),
                       'num_sample': len(set(v[_donor,_accep]))}
                f1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(_chr, _donor, _accep+1, output_prefix, len(v[_donor,_accep]), _strand))
                # if genome is given, write acceptor-donor site
                if genome_d is None or _chr not in genome_d:
                    rec['genome'] = 'NA'
                else:
                    up, down = genome_d[_chr][_donor+1:_donor+3], genome_d[_chr][_accep-2:_accep]
                    if _strand == '+':
                        rec['genome'] = "{0}-{1}".format(str(up.seq).upper(), str(down.seq).upper())
                    else:
                        rec['genome'] = "{0}-{1}".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper())
                # if annotation is given, check if matches with annotation
                if junction_known is None:
                    rec['annotation'] = 'NA'
                else:
                    if (_chr, _strand) in junction_known and (_donor, _accep) in junction_known[_chr, _strand]:
                        rec['annotation'] = 'Y'
                        #f.write("Y\t")
                    else:
                        rec['annotation'] = 'N'
                        #f.write("N\t")
                rec['label'] = "{c}_{s}_{lab}".format(c=_chr, s=_strand, lab=labels[i])
                writer.writerow(rec)
                #f.write("{c}_{s}_{lab}\n".format(c=_chr, s=_strand, lab=labels[i]))
    f1.close()

    return junc_by_chr_strand
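
As a quick orientation, a minimal driver for summarize_junctions could look like the sketch below; the directory layout, per-sample GFF name, and output prefix are assumptions for illustration only.

# Hypothetical usage sketch for summarize_junctions (all names illustrative).
from collections import OrderedDict

sample_dirs = OrderedDict([('sampleA', 'runs/sampleA'), ('sampleB', 'runs/sampleB')])
sample_names = list(sample_dirs.keys())

junc = summarize_junctions(sample_dirs, sample_names,
                           'touse.collapsed.gff',  # assumed per-sample GFF file name
                           'all_samples',          # output prefix
                           genome_d=None,          # no genome dict: 'genome' column stays 'NA'
                           junction_known=None)    # no annotation: 'annotation' column stays 'NA'
# junc maps (chr, strand) -> (donor, acceptor) -> list of sample names it was seen in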
コード例 #33
0
def combine_split_chained_results(output_prefixes, final_prefix, ref_gff,
                                  ref_group, ref_name, ref_fq, addon_gff,
                                  addon_group, addon_name, addon_fq):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so we can properly assign a unified superPBID
    3. write out the unified result
    4. delete the split files
    """

    # sanity check files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = 'tmp_' + o + '.gff'
        mega_file = 'tmp_' + o + '.mega_info.txt'
        group_file = 'tmp_' + o + '.group.txt'
        if not os.path.exists(gff_file) or not os.path.exists(
                mega_file) or not os.path.exists(group_file):
            print(
                "Expects to see {0},{1},{2} but one or more files are missing! Abort!"
                .format(gff_file, mega_file, group_file),
                file=sys.stderr)
            sys.exit(-1)
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = dict((r.id.split('|')[0], r)
                           for r in SeqIO.parse(open(ref_fq), 'fastq'))
        addon_fq_dict = dict((r.id.split('|')[0], r)
                             for r in SeqIO.parse(open(addon_fq), 'fastq'))

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter='\t'):
            if r[ref_name] != 'NA':
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != 'NA':
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a list of match records (ref_id, addon_id, representative record, combined group info); ref_id or addon_id may be 'NA', but not both
    rec_list = []
    d_ref = dict((r.seqid, r) for r in GFF.collapseGFFReader(ref_gff))
    d_addon = dict((r.seqid, r) for r in GFF.collapseGFFReader(addon_gff))

    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(
                sp.MatchRecord(ref_id=ref_id,
                               addon_id='NA',
                               rec=d_ref[ref_id],
                               members=ref_group_info[ref_id],
                               seqrec=ref_fq_dict[ref_id] if use_fq else None))
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r1,
                            members=ref_group_info[ref_id] +
                            addon_group_info[addon_id],
                            seqrec=ref_fq_dict[ref_id] if use_fq else None))
                else:
                    rec_list.append(
                        sp.MatchRecord(ref_id=ref_id,
                                       addon_id=addon_id,
                                       rec=r2,
                                       members=ref_group_info[ref_id] +
                                       addon_group_info[addon_id],
                                       seqrec=addon_fq_dict[addon_id]
                                       if use_fq else None))
    for addon_id in split_unmatched:
        rec_list.append(
            sp.MatchRecord(ref_id='NA',
                           addon_id=addon_id,
                           rec=d_addon[addon_id],
                           members=addon_group_info[addon_id],
                           seqrec=addon_fq_dict[addon_id] if use_fq else None))

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name,
                                   addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        os.remove(gff_file)
        os.remove(group_file)
        os.remove(mega_file)
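
For intuition, the .mega_info.txt bookkeeping above boils down to a small dict transformation; here is a toy rendition with hypothetical column names and IDs.

# Toy illustration of the mega_info / split_unmatched bookkeeping (names hypothetical).
rows = [
    {'ref': 'PB.1.1', 'addon.0': 'PB.2.1'},  # ref record matched by an addon record
    {'ref': 'PB.1.2', 'addon.0': 'NA'},      # ref record with no addon match
    {'ref': 'NA',     'addon.0': 'PB.2.9'},  # addon-only record
]
mega_info, split_unmatched = {}, set()
for r in rows:
    if r['ref'] != 'NA':
        mega_info.setdefault(r['ref'], [])
        if r['addon.0'] != 'NA':
            mega_info[r['ref']].append(r['addon.0'])
    else:
        split_unmatched.add(r['addon.0'])
assert mega_info == {'PB.1.1': ['PB.2.1'], 'PB.1.2': []}
assert split_unmatched == {'PB.2.9'}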
コード例 #34
0
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)  # ceiling division
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            # note: because the records are organized by (chrom, strand),
            # we may not end up using all the chunks; e.g. if all records are on the same locus, everything is written to one split file
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
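
Combined with combine_split_chained_results above, the whole split/chain/combine round trip might be driven as follows; every file name here is an assumption for illustration.

# Hypothetical end-to-end use of the two helpers above.
import os

prefixes, split_files = chain_split_file(
    'ref.gff', 'ref.group.txt', 'ref',
    'addon.gff', 'addon.group.txt', 'addon',
    fuzzy_junction=5, allow_5merge=False, max_3_diff=100,
    n_chunks=4)
# merges the per-chunk results and deletes the tmp_*.gff/.group.txt/.mega_info.txt files
combine_split_chained_results(prefixes, 'combined', 'ref.gff', 'ref.group.txt',
                              'ref', None,  # ref_fq=None: FASTQ handling skipped
                              'addon.gff', 'addon.group.txt', 'addon', None)
# the .split files created by chain_split_file are not removed by either helper
for split_gff, split_group in split_files:
    os.remove(split_gff)
    os.remove(split_group)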
コード例 #35
0
def cleanup_scrubbed_files_redundancy(gff_filename, group_filename,
                                      count_filename, fastq_filename,
                                      output_prefix):

    junction_seen = defaultdict(lambda: defaultdict(lambda: []))  # (chr,strand) -> junction string -> list of records sharing those junctions
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = str(r.start) + ',' + str(r.end)
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                str(r.ref_exons[i].end) + ',' + str(r.ref_exons[i + 1].start)
                for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    outf = open(output_prefix + '.gff', 'w')
    outf2 = open(output_prefix + '.merged_ids.txt', 'w')
    merged = {}
    keys = list(junction_seen.keys())
    keys.sort()
    for k in keys:
        for bunch in junction_seen[k].values():
            if len(bunch) == 1:  # just one record, write it out
                r = bunch[0]
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [r.seqid]
            else:
                # find the representative
                r = bunch[0]
                for r2 in bunch[1:]:
                    if r2.end - r2.start > r.end - r.start:
                        r = r2
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [x.seqid for x in bunch]
            outf2.write("{0}\t{1}\n".format(r.seqid,
                                            ",".join(merged[r.seqid])))
    outf.close()
    outf2.close()

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    outf = open(output_prefix + '.abundance.txt', 'w')
    outf.write(count_header)
    writer = DictWriter(outf, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    # numeric fields to sum across merged records (the fieldnames above, minus 'pbid')
    fields_to_add = ['count_fl', 'count_nfl', 'count_nfl_amb', 'norm_fl', 'norm_nfl', 'norm_nfl_amb']
    for pbid, bunch in merged.items():
        # combine the counts
        r = count_d[bunch[0]]
        r['pbid'] = pbid
        for field in fields_to_add:
            r[field] = float(r[field])
        for _id in bunch[1:]:
            for field in fields_to_add:
                r[field] += float(count_d[_id][field])
        writer.writerow(r)
    outf.close()

    group_info = read_group_file(group_filename)
    # write out group file
    outf = open(output_prefix + '.group.txt', 'w')
    for pbid, bunch in merged.items():
        # combine the groups
        g = [group_info[bunch[0]]]
        for _id in bunch[1:]:
            g.append(group_info[_id])
        outf.write("{0}\t{1}\n".format(pbid, ",".join(g)))
    outf.close()

    # write out fastq file if present
    if fastq_filename is not None:
        outf = open(output_prefix + '.rep.fq', 'w')
        for r in SeqIO.parse(open(fastq_filename), 'fastq'):
            if r.id.split('|')[0] in merged or r.id in merged:
                SeqIO.write(r, outf, 'fastq')
        outf.close()

    print(
        "scrubbed files written: {0}.gff, {0}.group.txt, {0}.abundance.txt, {0}.merged_ids.txt"
        .format(output_prefix),
        file=sys.stderr)
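
The redundancy key above is nothing more than the interior junction coordinates joined into a string, so two records collapse only if every internal junction agrees exactly; a toy check with made-up coordinates makes the idea concrete.

# Toy check of the junction-string key (coordinates hypothetical).
class Exon:
    def __init__(self, start, end):
        self.start, self.end = start, end

ref_exons = [Exon(100, 200), Exon(300, 400), Exon(500, 600)]
n = len(ref_exons)
junc_str = ",".join(str(ref_exons[i].end) + ',' + str(ref_exons[i + 1].start)
                    for i in range(n - 1))
assert junc_str == "200,300,400,500"
# records sharing junc_str differ only at their outer 5'/3' ends; the longest is kept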
コード例 #36
0
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1] #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d if int(d[x]['count_fl']) >= min_count and (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()


    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
           SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
コード例 #37
0
def filter_by_count(input_prefix,
                    output_prefix,
                    min_count,
                    dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid],
                                               fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [
        x for x in d if int(d[x]['count_fl']) >= min_count and (
            dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:",
          output_prefix + '.abundance.txt',
          file=sys.stderr)
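
Both versions of filter_by_count above rely on the member IDs encoding read counts in an f<FL>p<partial> field; a small standalone parser (a hypothetical helper that collapses the two branches) illustrates the format.

# Hypothetical helper showing the member-ID count format, e.g.
# 'i0HQ_54b0ca|c58773/f30p16/700' -> 30 full-length reads, 16 partial reads.
def parse_fl_p(member):
    tmp = member.split('|')[-1].split('/')[1]  # e.g. 'f30p16'
    fl_count, p_count = tmp.split('p')
    return int(fl_count[1:]), int(p_count)

assert parse_fl_p('i0HQ_54b0ca|c58773/f30p16/700') == (30, 16)
assert parse_fl_p('c58773/f30p16/700') == (30, 16)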
コード例 #38
0
def read_gff_as_interval_tree(self):
    """
    Read a collapsed GFF file into an IntervalTree
    """
    for r in GFF.collapseGFFReader(self.gff_filename):
        self.tree[r.chr][r.strand].insert(r.start, r.end, r)
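
This method assumes the owning class has already prepared self.tree as a chromosome-and-strand keyed collection of interval trees; a sketch of that setup, assuming the bx-python IntervalTree used in the other examples here, would be:

# Assumed shape of self.tree (sketch only; the real class builds this elsewhere).
from collections import defaultdict
from bx.intervals.intersection import IntervalTree

tree = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})
# once read_gff_as_interval_tree() has filled it, overlapping isoforms are
# retrieved with: hits = tree[chrom][strand].find(start, end)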
コード例 #39
0
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # execution continues past here only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2-1].end  # sandwich within the same matching exon, mirroring the '+' branch
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])  # FL count, matching the tie-breaking rule above
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
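
A hypothetical invocation of collapse_fuzzy_junctions, with illustrative file names: merge isoforms whose junctions agree within 5 bp, allowing an extra 5' exon on the longer isoform.

fuzzy = collapse_fuzzy_junctions('sample.collapsed.gff',
                                 'sample.collapsed.group.txt',
                                 allow_extra_5exon=True,
                                 internal_fuzzy_max_dist=5)
# fuzzy maps each cluster's first-seen seqid to all seqids merged with it;
# the collapsed results land in sample.collapsed.gff.fuzzy and
# sample.collapsed.group.txt.fuzzy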
コード例 #40
0
#!/usr/bin/env python
import cupcake.io.GFF as GFF
from collections import defaultdict
import numpy as np

d = defaultdict(lambda: [])
for r in GFF.collapseGFFReader(
        'hq_isoforms.fastq.no5merge.collapsed.filtered.gff'):
    d[r.seqid.split('.')[1]].append(r.seqid)

p = np.array([len(v) for v in d.values()])

print("Number of loci:", len(d))
print("Number of isoforms:", sum(p))
print("Avg. number of isoforms per locus:", np.mean(p))
f = open('hq_isoforms.fastq.no5merge.collapsed.filtered.isoform_per_loci.txt',
         'w')
f.write("loci\tnum_isoform\n")
for k, v in d.items():
    f.write("PB.{0}\t{1}\n".format(k, len(v)))
f.close()
コード例 #42
0
def summarize_junctions(sample_dirs, sample_names, gff_filename, output_prefix, genome_d=None, junction_known=None):
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)

    """
    junc_by_chr_strand = defaultdict(lambda: defaultdict(lambda: [])) # (chr,strand) --> (donor,acceptor) --> samples it show up in (more than once possible)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1: continue # ignore single exon transcripts
            for i in range(n-1):
                donor = r.ref_exons[i].end-1 # make it 0-based
                accep = r.ref_exons[i+1].start # start is already 0-based
                junc_by_chr_strand[r.chr, r.strand][donor, accep].append(sample_name)

    # write junction report
    f1 = open(output_prefix+'.junction.bed', 'w')
    f1.write("track name=junctions description=\"{0}\" useScore=1\n".format(output_prefix))

    JUNC_DETAIL_FIELDS = ['chr', 'left', 'right', 'strand', 'num_transcript', 'num_sample', 'genome', 'annotation', 'label']


    with open(output_prefix+'.junction_detail.txt', 'w') as f:
        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter='\t')
        writer.writeheader()
        keys = list(junc_by_chr_strand.keys())
        keys.sort()
        for _chr, _strand in keys:
            v = junc_by_chr_strand[_chr, _strand]
            v_keys = list(v.keys())
            v_keys.sort()
            labels = cluster_junctions(v_keys)
            for i,(_donor, _accep) in enumerate(v_keys):
                rec = {'chr': _chr,
                       'left': _donor,
                       'right': _accep,
                       'strand': _strand,
                       'num_transcript': len(v[_donor,_accep]),
                       'num_sample': len(set(v[_donor,_accep]))}
                #f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t".format(_chr, _donor, _accep, _strand, len(v[_donor,_accep]), len(set(v[_donor,_accep]))))
                f1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(_chr, _donor, _accep+1, output_prefix, len(v[_donor,_accep]), _strand))
                # if genome is given, write acceptor-donor site
                if genome_d is None or _chr not in genome_d:
                    rec['genome'] = 'NA'
                    #f.write("NA\t")
                else:
                    up, down = genome_d[_chr][_donor+1:_donor+3], genome_d[_chr][_accep-2:_accep]
                    if _strand == '+':
                        rec['genome'] = "{0}-{1}".format(str(up.seq).upper(), str(down.seq).upper())
                        #f.write("{0}-{1}\t".format(str(up.seq).upper(), str(down.seq).upper()))
                    else:
                        rec['genome'] = "{0}-{1}".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper())
                        #f.write("{0}-{1}\t".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper()))
                # if annotation is given, check if matches with annotation
                if junction_known is None:
                    rec['annotation'] = 'NA'
                    #f.write("NA\n")
                else:
                    if (_chr, _strand) in junction_known and (_donor, _accep) in junction_known[_chr, _strand]:
                        rec['annotation'] = 'Y'
                        #f.write("Y\t")
                    else:
                        rec['annotation'] = 'N'
                        #f.write("N\t")
                rec['label'] = "{c}_{s}_{lab}".format(c=_chr, s=_strand, lab=labels[i])
                writer.writerow(rec)
                #f.write("{c}_{s}_{lab}\n".format(c=_chr, s=_strand, lab=labels[i]))
    f1.close()

    return junc_by_chr_strand