Example 1
def scrub_sample_GFFs(sample_dirs, gff_filename, count_filename,
                      group_filename, fastq_filename, output_prefix, tree):

    for sample_name, d in sample_dirs.items():
        outf = open(os.path.join(d, output_prefix + '.gff.tmp'), 'w')
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                GFF.write_collapseGFF_format(outf, r)
                continue  # single-exon records have no junctions to scrub; write as-is

            new_ref_exons = scrub_ref_exons(r, tree)
            if new_ref_exons is None:
                print("No changes made due to error:",
                      r.seqid,
                      file=sys.stderr)
            else:
                #print "before:", r.ref_exons
                #print "after :", new_ref_exons
                r.ref_exons = new_ref_exons
            GFF.write_collapseGFF_format(outf, r)
        outf.close()
        cleanup_scrubbed_files_redundancy(
            outf.name,
            os.path.join(d, group_filename),
            os.path.join(d, count_filename),
            os.path.join(d, fastq_filename) if fastq_filename is not None else None,
            os.path.join(d, output_prefix))
    def __init__(self,
                 gff_filename,
                 group_filename,
                 internal_fuzzy_max_dist=0,
                 self_prefix=None,
                 allow_5merge=False,
                 fastq_filename=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.allow_5merge = allow_5merge
        self.record_d = dict(
            (r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        #sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {
            '+': IntervalTree(),
            '-': IntervalTree()
        })  # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)

        #print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        #raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(
            self.group_filename,
            self.self_prefix)  # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print >> sys.stderr, "Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(\
        group_filename, gff_filename, count_filename)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:
        # advance through the headers which start with #
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:  # first non-# line seen, or EOF
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [
            r.id.split('|')[0]
            for r in SeqIO.parse(open(fastq_filename), 'fastq')
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (<matches to r2 or None>, r2)
        unmatched_recs = set(self.record_d.keys())

        for r in GFF.collapseGFFReader(gff_filename):
            match_rec_list = list(self.match_record_to_tree(r))  # avoid shadowing the loop variable r
            if len(match_rec_list) > 0:  # found match(es)! put longer of r1/r2 in
                #if len(match_rec_list) > 1: pdb.set_trace()  #DEBUG
                combined.append((match_rec_list, r))
                for match_rec in match_rec_list:
                    try:
                        unmatched_recs.remove(match_rec.seqid)
                    except KeyError:
                        pass # already deleted, OK, this can happen
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append(([self.record_d[seqid]], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1s,r2) in enumerate(combined):
            if r1s is None:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)
            else:
                if r2 is not None:
                    rep = find_representative_in_iso_list(r1s + [r2])
                else:
                    rep = find_representative_in_iso_list(r1s)
                final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
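A minimal usage sketch for the two methods above, assuming they sit on the same MegaPBTree class shown in these snippets; the sample names and file paths are illustrative, not taken from the source.

tree = MegaPBTree('sample1.collapsed.gff', 'sample1.collapsed.group.txt',
                  internal_fuzzy_max_dist=5, self_prefix='Sample1',
                  allow_5merge=False, fastq_filename='sample1.rep.fq')
# chain a second sample onto the tree; matched isoforms keep the longer representative
tree.add_sample('sample2.collapsed.gff', 'sample2.collapsed.group.txt',
                sample_prefix='Sample2', output_prefix='tmp_Sample2',
                fastq_filename='sample2.rep.fq')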
Example 5
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print >> sys.stderr, "Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(\
        group_filename, gff_filename, count_filename)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    for i in range(14):
        f.readline()  # skip past the fixed 14-line header
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [
            r.id.split('|')[0]
            for r in SeqIO.parse(open(fastq_filename), 'fastq')
        ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
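Example 1 skips the count-file header by peeking at each line, whereas this variant hard-codes 14 header lines. A small standalone sketch of the peek-based approach, assuming only that header lines start with '#':

def skip_count_header(f):
    """Advance an open abundance/count file past its '#'-prefixed header lines."""
    while True:
        pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):  # first data line, or '' at EOF
            f.seek(pos)
            return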
Example 6
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'
    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename),'fastq')])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(pbids1) != len(pbids3):
        print("The number of PBID records in the files disagree! Sanity check failed.", file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)), file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)), file=sys.stderr)
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
Example 7
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files
    :return: raise Exception if sanity check failed
    """
    print >> sys.stderr, "Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(\
        group_filename, gff_filename, count_filename)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [r.seqid for r in GFF.collapseGFFReader(gff_filename)]
    f = open(count_filename)
    while True:
        # advance through the headers which start with #
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:  # first non-# seen or EOF
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = list(self.record_d.keys())  # list() so .remove() works under Python 3

        for r in GFF.collapseGFFReader(gff_filename):
            match_rec = self.match_record_to_tree(r)
            if match_rec is not None:  # found a match! put longer of r1/r2 in
                combined.append((match_rec, r))
                try:
                    unmatched_recs.remove(match_rec.seqid)
                except ValueError:
                    pass # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, r))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1,r2) in enumerate(combined):
            if r2 is None or (r1 is not None and r1.end-r1.start > r2.end-r2.start):
                final_tree[r1.chr][r1.strand].insert(r1.start, r1.end, i)
            else:
                final_tree[r2.chr][r2.strand].insert(r2.start, r2.end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
Example 10
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename, rep_type = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa'),
             'w')
    for r in SeqIO.parse(open(rep_filename), rep_type):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, rep_type)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", rep_filename, file=sys.stderr)
    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
Example 11
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    return result_prefixes, split_files
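The chunk_size expression above is just ceiling division; a tiny worked example with illustrative numbers:

n, n_chunks = 103, 4
chunk_size = (n // n_chunks) + (n % n_chunks > 0)   # 25 + True -> 26
assert chunk_size == -(-n // n_chunks)              # equivalent ceil-division idiom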
Example 12
def regroup_gff(pooled_gff,
                demux_count_file,
                output_prefix,
                out_group_dict,
                in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to be long in  (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set())  # pbid --> list of tissue it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open(
                "{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq),
                'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by outgroup_dict".format(pbid),
                  file=sys.stderr)
            continue  # skip records with no demux counts to avoid a KeyError below
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
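An illustrative sketch of the inputs regroup_gff expects, based only on how the function reads them above; the barcode names, groups and paths are made up:

# demux_count_file is comma-delimited with an 'id' column plus one column per barcode, e.g.
#   id,EM1,EM2,END
#   PB.1.1,3,0,5
out_group_dict = {'EM1': 'EM', 'EM2': 'EM', 'END': 'END'}   # barcode --> output group
# regroup_gff('pooled.collapsed.gff', 'demux.fl_count.csv', 'demux_out',
#             out_group_dict, in_fafq='pooled.rep.fq')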
Example 13
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
Example 14
def sanity_check_collapse_input(input_prefix):
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of records agree among the three
    """
    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filenames = [(input_prefix + '.rep.fq', 'fastq'), (input_prefix + '.rep.fastq', 'fastq'), \
                     (input_prefix + '.rep.fa', 'fasta'), (input_prefix + '.rep.fasta', 'fasta')]

    rep_filename = None
    rep_type = None
    for x, type in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = type

    if rep_filename is None:
        print(
            "Expected to find input fasta or fastq files {0}.rep.fa or {0}.rep.fq. Not found. Abort!"
            .format(input_prefix),
            file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename),
              file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename),
              file=sys.stderr)
        sys.exit(-1)

    pbids1 = set([r.id for r in SeqIO.parse(open(rep_filename), rep_type)])
    pbids2 = set([r.seqid for r in GFF.collapseGFFReader(gff_filename)])
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if len(pbids1) != len(pbids2) or len(pbids2) != len(pbids3) or len(
            pbids1) != len(pbids3):
        print(
            "The number of PBID records in the files disagree! Sanity check failed.",
            file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(rep_filename, len(pbids1)),
              file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(gff_filename, len(pbids2)),
              file=sys.stderr)
        print("# of PBIDs in {0}: {1}".format(count_filename, len(pbids3)),
              file=sys.stderr)
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename, rep_type
Example 15
def make_fake_genome(genome_filename, gff_filename, ref_chr, ref_start, ref_end, ref_strand, output_prefix, output_name, genome_d=None):
    if genome_d is None:
        print("Reading genome file {0}...".format(genome_filename), file=sys.stderr)
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename),'fasta'))
    else:
        d = genome_d

    print("Reading GFF file {0}...".format(gff_filename), file=sys.stderr)
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if r.chr==ref_chr and r.strand==ref_strand and \
                (ref_start <= r.start < r.end <= ref_end) \
            and len(r.ref_exons) > 1:
            print("Adding {0} to fake genome.".format(r.seqid), file=sys.stderr)
            good.append(r)

    if len(good) == 0:
        print("Did not find any transcripts strictly within {0}:{1}-{2} on strand {3}. Abort!".format(\
            ref_chr, ref_start, ref_end, ref_strand), file=sys.stderr)
        sys.exit(-1)

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(e.start-extra_bp_around_junctions, e.end+extra_bp_around_junctions, 1)

    regions = [(a,b) for (a,b,junk) in c.getregions()]
    regions[0] = (regions[0][0]-__padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1]+__padding_before_after__)

    with open(output_prefix + '.fasta', 'w') as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[ref_chr][a:b].seq))  # all retained records are on ref_chr
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix+'.mapping.txt', 'w') as f:
        i = 0
        for a,b in regions:
            for j in range(a, b):
                f.write("{0},{1},{2}\n".format(i, ref_chr, j))
                i += 1

    with open(output_prefix+'.pbids.txt', 'w') as f:
        f.write("\n".join(r.seqid for r in good)+'\n')

    print("Output written to {0}.fasta, {0}.mapping.txt, {0}.pbids.txt.".format(output_prefix), file=sys.stderr)
Example 16
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, allow_5merge=False, fastq_filename=None):
        self.gff_filename = gff_filename
        self.group_filename = group_filename
        self.self_prefix = self_prefix
        self.internal_fuzzy_max_dist = internal_fuzzy_max_dist
        self.allow_5merge = allow_5merge
        self.record_d = dict((r.seqid, r) for r in GFF.collapseGFFReader(gff_filename))
        #sanity_check_seqids(self.record_d.keys()) # sanity check all IDs look like PB.1.2
        self.tree = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
        self.fastq_dict = None
        if fastq_filename is not None:
            self.fastq_dict = MegaPBTree.read_fastq_to_dict(fastq_filename)


        #print >> sys.stderr, "self.internal_fuzzy_max_dist is", internal_fuzzy_max_dist
        #raw_input()
        self.read_gff_as_interval_tree()
        self.group_info = MegaPBTree.read_group(self.group_filename, self.self_prefix) # ex: PB.1.1 --> [ RatHeart|i3_c123.... ]
Example 18
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        try:
            # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
            return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)
        except ValueError:
            return 0

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1 #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist, max_5_diff=args.max_5_diff, max_3_diff=args.max_3_diff)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])
    f_gff = open(gff_filename+'.fuzzy', 'w')
    f_group = open(group_filename+'.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            # note: get_fl_from_id only works on IsoSeq1 and 2 ID formats, will return 0 if IsoSeq3 format or other
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
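The sort key above orders PB IDs numerically by locus and then isoform index rather than lexicographically; a quick illustrative check:

ids = ['PB.10.1', 'PB.2.3', 'PB.2.10']
ids.sort(key=lambda x: [int(p) for p in x.split('.')[1:]])
assert ids == ['PB.2.3', 'PB.2.10', 'PB.10.1']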
Example 20
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in seqs or r.seqid.split('|')[0] in seqs or (partial_ok and any(r.seqid.startswith(x) for x in seqs)):
            GFF.write_collapseGFF_format(sys.stdout, r)
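get_gff_from_list writes to sys.stdout; a hedged usage sketch that captures the filtered records into a file instead (the paths are illustrative):

import sys

with open('subset.gff', 'w') as out:
    old_stdout, sys.stdout = sys.stdout, out
    try:
        get_gff_from_list('all.collapsed.gff', 'wanted_pbids.txt', partial_ok=True)
    finally:
        sys.stdout = old_stdout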
Example 21
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: filtered.microexon)")
    parser.add_argument(
        "--micro_exon_size",
        type=int,
        default=12,
        help="Filter away microexons < micro_exon_size (default: 12bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered.microexon'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        for r in xxx:
            min_exon_size = min(e.end - e.start for e in r.ref_exons)
            if min_exon_size > args.micro_exon_size:  # keep only records whose smallest exon exceeds the cutoff
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
Example 22
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1] #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d if int(d[x]['count_fl']) >= min_count and
            (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()


    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.abundance.txt'
    def read_gff_as_interval_tree(self):
        """
        Read a collapsed GFF file into an IntervalTree
        """
        for r in GFF.collapseGFFReader(self.gff_filename):
            self.tree[r.chr][r.strand].insert(r.start, r.end, r)
Example 24
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            # note: because we are limited by how the records are organized by (chrom, strand),
            # we may not end up using all the chunks; e.g. if all records are on the same locus, everything is written to one split file
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
Example 25
def cleanup_scrubbed_files_redundancy(gff_filename, group_filename,
                                      count_filename, fastq_filename,
                                      output_prefix):

    junction_seen = defaultdict(lambda: defaultdict(lambda: []))  # key (chr,strand) -> dict of (series of junctions) -> records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = str(r.start) + ',' + str(r.end)
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                str(r.ref_exons[i].end) + ',' + str(r.ref_exons[i + 1].start)
                for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    outf = open(output_prefix + '.gff', 'w')
    outf2 = open(output_prefix + '.merged_ids.txt', 'w')
    merged = {}
    keys = list(junction_seen.keys())
    keys.sort()
    for k in keys:
        for bunch in junction_seen[k].values():
            if len(bunch) == 1:  # just one record, write it out
                r = bunch[0]
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [r.seqid]
            else:
                # find the representative
                r = bunch[0]
                for r2 in bunch[1:]:
                    if r2.end - r2.start > r.end - r.start:
                        r = r2
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [x.seqid for x in bunch]
            outf2.write("{0}\t{1}\n".format(r.seqid,
                                            ",".join(merged[r.seqid])))
    outf.close()
    outf2.close()

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    outf = open(output_prefix + '.abundance.txt', 'w')
    outf.write(count_header)
    writer = DictWriter(outf, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for pbid, bunch in merged.items():
        # combine the counts
        r = count_d[bunch[0]]
        r['pbid'] = pbid
        for field in fields_to_add:
            r[field] = float(r[field])
        for _id in bunch[1:]:
            for field in fields_to_add:
                r[field] += float(count_d[_id][field])
        writer.writerow(r)
    outf.close()

    group_info = read_group_file(group_filename)
    # write out group file
    outf = open(output_prefix + '.group.txt', 'w')
    for pbid, bunch in merged.items():
        # combine the groups
        g = [group_info[bunch[0]]]
        for _id in bunch[1:]:
            g.append(group_info[_id])
        outf.write("{0}\t{1}\n".format(pbid, ",".join(g)))
    outf.close()

    # write out fastq file if present
    if fastq_filename is not None:
        outf = open(output_prefix + '.rep.fq', 'w')
        for r in SeqIO.parse(open(fastq_filename), 'fastq'):
            if r.id.split('|')[0] in merged or r.id in merged:
                SeqIO.write(r, outf, 'fastq')
        outf.close()

    print(
        "scrubbed files written: {0}.gff, {0}.group.txt, {0}.abundance.txt, {0}.merged_ids.txt"
        .format(output_prefix),
        file=sys.stderr)
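A worked example of the junction key built above for a multi-exon record; the exon coordinates are illustrative. Records that share the exact same internal junctions collapse onto the same key regardless of their outer start/end:

exon_bounds = [(100, 200), (300, 400), (500, 600)]   # (start, end) of each exon
junc_str = ",".join(str(exon_bounds[i][1]) + ',' + str(exon_bounds[i + 1][0])
                    for i in range(len(exon_bounds) - 1))
assert junc_str == "200,300,400,500"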
#!/usr/bin/env python
import cupcake.io.GFF as GFF
from collections import defaultdict
import numpy as np

d = defaultdict(lambda: [])
for r in GFF.collapseGFFReader(
        'hq_isoforms.fastq.no5merge.collapsed.filtered.gff'):
    d[r.seqid.split('.')[1]].append(r.seqid)

p = np.array([len(v) for v in d.values()])

print("Number of loci:", len(d))
print("Number of isoforms:", sum(p))
print("Avg. number of isoforms per locus:", np.mean(p))
f = open('hq_isoforms.fastq.no5merge.collapsed.filtered.isoform_per_loci.txt',
         'w')
f.write("loci\tnum_isoform\n")
for k, v in d.items():
    f.write("PB.{0}\t{1}\n".format(k, len(v)))
f.close()
Example 27
def filter_by_count(input_prefix,
                    output_prefix,
                    min_count,
                    dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid],
                                               fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [
        x for x in d if int(d[x]['count_fl']) >= min_count and (
            dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:",
          output_prefix + '.abundance.txt',
          file=sys.stderr)
Example 28
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    print >> sys.stderr, "Output written to:", output_prefix + '.rep.fq'
    print >> sys.stderr, "Output written to:", output_prefix + '.gff'
    def read_gff_as_interval_tree(self):
        """
        Read a collapsed GFF file into an IntervalTree
        """
        for r in GFF.collapseGFFReader(self.gff_filename):
            self.tree[r.chr][r.strand].insert(r.start, r.end, r)
Example 30
def combine_split_chained_results(output_prefixes, final_prefix, ref_gff,
                                  ref_group, ref_name, ref_fq, addon_gff,
                                  addon_group, addon_name, addon_fq):
    """
    Each <output_prefix> will have .gff, .group.txt, .mega_info.txt.
    There should be NO overlap between the split files, so clean merge should be possible!

    1. read the .gff files, record the group and mega (id-map) info
    2. sort the total records so can properly put on a unified superPBID
    3. write out the unified result
    4. delete the split files
    """

    # sanity check files are all there
    split_files = []  # tuple of (gff, group, mega)
    for ref_name, o in output_prefixes:
        gff_file = 'tmp_' + o + '.gff'
        mega_file = 'tmp_' + o + '.mega_info.txt'
        group_file = 'tmp_' + o + '.group.txt'
        if not os.path.exists(gff_file) or not os.path.exists(
                mega_file) or not os.path.exists(group_file):
            print(
                "Expects to see {0},{1},{2} but one or more files are missing! Abort!"
                .format(gff_file, mega_file, group_file),
                file=sys.stderr)
            sys.exit(-1)
        split_files.append((ref_name, o, gff_file, group_file, mega_file))

    use_fq = False
    if ref_fq is not None and addon_fq is not None:
        use_fq = True
        ref_fq_dict = dict((r.id.split('|')[0], r)
                           for r in SeqIO.parse(open(ref_fq), 'fastq'))
        addon_fq_dict = dict((r.id.split('|')[0], r)
                             for r in SeqIO.parse(open(addon_fq), 'fastq'))

    mega_info = {}  # ref id -> list of matching query_id, or empty list
    split_unmatched = set()

    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        for r in DictReader(open(mega_file), delimiter='\t'):
            if r[ref_name] != 'NA':
                if r[ref_name] not in mega_info:
                    mega_info[r[ref_name]] = []
                if r[split_name] != 'NA':
                    mega_info[r[ref_name]].append(r[split_name])
            else:  # ref is NA, non-ref is not NA
                split_unmatched.add(r[split_name])

    # make a rec list of matches of (ref_id, addon_id, representative record, combined group info) where rec_ref or ref_addon could be None, but not both
    rec_list = []
    d_ref = dict((r.seqid, r) for r in GFF.collapseGFFReader(ref_gff))
    d_addon = dict((r.seqid, r) for r in GFF.collapseGFFReader(addon_gff))

    ref_group_info = sp.MegaPBTree.read_group(ref_group, None)
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)

    for ref_id, matches in mega_info.items():
        if len(matches) == 0:
            rec_list.append(
                sp.MatchRecord(ref_id=ref_id,
                               addon_id='NA',
                               rec=d_ref[ref_id],
                               members=ref_group_info[ref_id],
                               seqrec=ref_fq_dict[ref_id] if use_fq else None))
        else:
            for addon_id in matches:
                r1 = d_ref[ref_id]
                r2 = d_addon[addon_id]
                if (r1.end - r1.start) > (r2.end - r2.start):
                    rec_list.append(
                        sp.MatchRecord(
                            ref_id=ref_id,
                            addon_id=addon_id,
                            rec=r1,
                            members=ref_group_info[ref_id] +
                            addon_group_info[addon_id],
                            seqrec=ref_fq_dict[ref_id] if use_fq else None))
                else:
                    rec_list.append(
                        sp.MatchRecord(ref_id=ref_id,
                                       addon_id=addon_id,
                                       rec=r2,
                                       members=ref_group_info[ref_id] +
                                       addon_group_info[addon_id],
                                       seqrec=addon_fq_dict[addon_id]
                                       if use_fq else None))
    for addon_id in split_unmatched:
        rec_list.append(
            sp.MatchRecord(ref_id='NA',
                           addon_id=addon_id,
                           rec=d_addon[addon_id],
                           members=addon_group_info[addon_id],
                           seqrec=addon_fq_dict[addon_id] if use_fq else None))

    sp.write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name,
                                   addon_name, use_fq)
    for (ref_name, split_name, gff_file, group_file, mega_file) in split_files:
        os.remove(gff_file)
        os.remove(group_file)
        os.remove(mega_file)
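
The matching loop above keeps, for each ref/addon pair, whichever record spans the larger genomic interval, while the combined group always contains the members of both. A minimal, self-contained sketch of that rule, using a hypothetical namedtuple stand-in rather than the real cupcake record classes:

from collections import namedtuple

# Hypothetical stand-in for a collapsed GFF record; only the fields the rule needs.
SimpleRec = namedtuple('SimpleRec', ['seqid', 'start', 'end'])

def pick_representative(r1, r2, members1, members2):
    # keep whichever record spans the larger genomic interval; the merged group
    # always contains the members of both inputs
    rec = r1 if (r1.end - r1.start) > (r2.end - r2.start) else r2
    return rec, members1 + members2

ref = SimpleRec('PB.1.1', 1000, 5000)
addon = SimpleRec('PB.1.2', 1200, 4500)
rec, members = pick_representative(ref, addon, ['readA'], ['readB', 'readC'])
print(rec.seqid, members)  # PB.1.1 ['readA', 'readB', 'readC']
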
Example #31
0
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # continue below only if (a) m is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match, key=lambda x: [int(part) for part in x.split('.')[1:]])
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid = fuzzy_match[k][0]
        best_size = get_fl_from_id(group_info[best_pbid])  # FL count, to match the tie-break below
        best_num_exons = len(d[best_pbid].ref_exons)
        all_members += group_info[best_pbid]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
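
The member IDs parsed by get_fl_from_id above are assumed to follow the ...|c<cluster>/f<FL>p<partial>/<length> pattern shown in its comment, so the full-length (FL) read count is the integer between 'f' and 'p'. A quick standalone check of that parse (the second ID below is made up):

members = ['13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178',
           '13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139598/f3p1/201']
fl_total = sum(int(m.split('/')[1].split('p')[0][1:]) for m in members)
print(fl_total)  # 1 + 3 = 4 full-length reads
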
#!/usr/bin/env python
import cupcake.io.GFF as GFF
from collections import defaultdict
import numpy as np

d = defaultdict(lambda: [])
for r in GFF.collapseGFFReader('hq_isoforms.fastq.no5merge.collapsed.filtered.gff'):
    d[r.seqid.split('.')[1]].append(r.seqid)
    
p = np.array([len(v) for v in d.values()])

print("Number of loci:", len(d))
print("Number of isoforms:", sum(p))
print("Avg. number of isoforms per locus:", np.mean(p))
f = open('hq_isoforms.fastq.no5merge.collapsed.filtered.isoform_per_loci.txt', 'w')
f.write("loci\tnum_isoform\n")
for k, v in d.items(): f.write("PB.{0}\t{1}\n".format(k, len(v)))
f.close()
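
This script relies on the PacBio isoform ID convention PB.<locus>.<isoform>, so splitting the seqid on '.' and taking index 1 yields the locus number that isoforms are grouped by. A tiny illustration with a made-up ID:

seqid = 'PB.12.3'               # hypothetical isoform ID: locus 12, isoform 3
locus = seqid.split('.')[1]
print('PB.{0}'.format(locus))   # PB.12 -- the locus key the isoforms are counted under
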
Example #33
0
def summarize_junctions(sample_dirs, sample_names, gff_filename, output_prefix, genome_d=None, junction_known=None):
    """
    1. for each sample, read all the GFF, store the junction information (both 0-based)

    """
    junc_by_chr_strand = defaultdict(lambda: defaultdict(lambda: [])) # (chr,strand) --> (donor,acceptor) --> samples it show up in (more than once possible)

    for sample_name, d in sample_dirs.items():
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1: continue # ignore single exon transcripts
            for i in range(n-1):
                donor = r.ref_exons[i].end-1 # make it 0-based
                accep = r.ref_exons[i+1].start # start is already 0-based
                junc_by_chr_strand[r.chr, r.strand][donor, accep].append(sample_name)

    # write junction report
    f1 = open(output_prefix+'.junction.bed', 'w')
    f1.write("track name=junctions description=\"{0}\" useScore=1\n".format(output_prefix))

    JUNC_DETAIL_FIELDS = ['chr', 'left', 'right', 'strand', 'num_transcript', 'num_sample', 'genome', 'annotation', 'label']


    with open(output_prefix+'.junction_detail.txt', 'w') as f:
        writer = DictWriter(f, JUNC_DETAIL_FIELDS, delimiter='\t')
        writer.writeheader()
        keys = list(junc_by_chr_strand.keys())
        keys.sort()
        for _chr, _strand in keys:
            v = junc_by_chr_strand[_chr, _strand]
            v_keys = list(v.keys())
            v_keys.sort()
            labels = cluster_junctions(v_keys)
            for i,(_donor, _accep) in enumerate(v_keys):
                rec = {'chr': _chr,
                       'left': _donor,
                       'right': _accep,
                       'strand': _strand,
                       'num_transcript': len(v[_donor,_accep]),
                       'num_sample': len(set(v[_donor,_accep]))}
                #f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t".format(_chr, _donor, _accep, _strand, len(v[_donor,_accep]), len(set(v[_donor,_accep]))))
                f1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(_chr, _donor, _accep+1, output_prefix, len(v[_donor,_accep]), _strand))
                # if the genome is given, record the donor-acceptor splice-site dinucleotides
                if genome_d is None or _chr not in genome_d:
                    rec['genome'] = 'NA'
                    #f.write("NA\t")
                else:
                    up, down = genome_d[_chr][_donor+1:_donor+3], genome_d[_chr][_accep-2:_accep]
                    if _strand == '+':
                        rec['genome'] = "{0}-{1}".format(str(up.seq).upper(), str(down.seq).upper())
                        #f.write("{0}-{1}\t".format(str(up.seq).upper(), str(down.seq).upper()))
                    else:
                        rec['genome'] = "{0}-{1}".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper())
                        #f.write("{0}-{1}\t".format(str(down.reverse_complement().seq).upper(), str(up.reverse_complement().seq).upper()))
                # if an annotation is given, check whether the junction matches it
                if junction_known is None:
                    rec['annotation'] = 'NA'
                    #f.write("NA\n")
                else:
                    if (_chr, _strand) in junction_known and (_donor, _accep) in junction_known[_chr, _strand]:
                        rec['annotation'] = 'Y'
                        #f.write("Y\t")
                    else:
                        rec['annotation'] = 'N'
                        #f.write("N\t")
                rec['label'] = "{c}_{s}_{lab}".format(c=_chr, s=_strand, lab=labels[i])
                writer.writerow(rec)
                #f.write("{c}_{s}_{lab}\n".format(c=_chr, s=_strand, lab=labels[i]))
    f1.close()

    return junc_by_chr_strand
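
summarize_junctions derives each junction from adjacent exons: the donor is taken as the last exonic base of the upstream exon (end - 1, assuming exclusive exon ends) and the acceptor as the 0-based start of the downstream exon; the BED line then spans donor to acceptor + 1. A minimal sketch with hypothetical Exon stand-ins, not the real GFF exon objects:

from collections import namedtuple

Exon = namedtuple('Exon', ['start', 'end'])   # assumed: 0-based start, exclusive end
exons = [Exon(100, 200), Exon(300, 400)]      # hypothetical two-exon transcript

donor = exons[0].end - 1    # 199: last exonic base of the upstream exon (0-based)
accep = exons[1].start      # 300: first exonic base of the downstream exon (0-based)
print(donor, accep + 1)     # 199 301 -- the BED start/end columns written for this junction
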