def main(cons_filename, in_filename):
    run_gmap(cons_filename)
    run_gmap(in_filename)

    cons = GFF.gmapGFFReader(cons_filename+'.gff').next()
    good, bad = 0, []
    for r in GFF.gmapGFFReader(in_filename+'.gff'):
        if compare_gff(cons, r): good += 1
        else: bad.append(r.seqid)
    
    print "{0}/{1} agree with consensus".format(good, good+len(bad))
    print "Disagreement:"
    for x in bad: print x
    return good*1./(good+len(bad))
def main(gmap_filename):
    """
    Filter GMAP records by coverage (>= 95%) and per-exon score (>= 80),
    cluster the survivors by genomic region, greedily merge compatible
    overlapping records, and write the result in UCSC GFF format to
    <gmap_filename>.cov95score80_consolidated_UCSC.gff.
    """
    from collections import defaultdict
    output = []
    # per-chromosome interval tree; stored values are indices into `output`
    CT = defaultdict(lambda: ClusterTree(0,0))
    for r in GFF.gmapGFFReader(gmap_filename):
        # keep only well-aligned records: high coverage and every exon score good
        if r.coverage >= 95. and all(x>=80 for x in r.scores):
            output.append(r)
            i = len(output)-1
            CT[r.chr].insert(r.start, r.end, i)

    f = open(gmap_filename+'.cov95score80_consolidated_UCSC.gff', 'w')
    for ct in CT.itervalues():
        for a,b,record_indices in ct.getregions():
            records = [output[ind] for ind in record_indices]
            # continuously merge potentially compatible records;
            # only records whose start overlaps records[i] can merge with it
            i = 0
            while i < len(records)-1:
                j = i + 1
                while j < len(records) and records[j].start < records[i].end:
                    if consolidate(records[i], records[j]) is None: j += 1
                    else: records.pop(j) # j is merged into i, delete
                i += 1
            for r in records:
                GFF.write_GFF_UCSCformat(f, r)

    f.close()
def test_random_subset_gcon(records, iterations=10, size=10):
    """
    Repeatedly draw a random subset of `size` records, build a consensus
    with ice_pbdagcon.py, align it with GMAP, and yield the first GMAP
    record of each consensus alignment.
    """
    for _ in xrange(iterations):
        subset = random.sample(records, size)
        with open("tmp.fa", "w") as fa_out:
            for rec in subset:
                fa_out.write(">{0}\n{1}\n".format(rec.id, rec.seq))
        # remove stale outputs left over from a previous iteration
        for stale in ("tmp_cons.fa", "tmp_cons.fa.gff"):
            if os.path.exists(stale):
                os.remove(stale)
        os.system("ice_pbdagcon.py tmp.fa tmp_cons tmp_cons --nproc 12")
        run_gmap("tmp_cons.fa")
        yield GFF.gmapGFFReader("tmp_cons.fa.gff").next()
def process_contig(dirname, f):
    """
    Summarize GMAP alignments found in <dirname>/aloha2.fa.cuttlefish.gff,
    writing one summary line per path plus per-alignment detail lines to
    the open file handle `f`.
    """
    # keyed on r.seqid below; the original comment says "path_number" --
    # presumably the seqid encodes the path number (TODO confirm)
    d = defaultdict(lambda: []) # path_number --> (start, end, scaffold)
    reader = GFF.gmapGFFReader(os.path.join(dirname,'aloha2.fa.cuttlefish.gff'))
    for r in reader:
        # orient query coordinates by strand so s is the 5' query position
        if r.strand == '+': s,e = r.seq_exons[0].start, r.seq_exons[-1].end
        else: s,e = r.seq_exons[-1].start, r.seq_exons[0].end
        d[r.seqid].append((s, e, r.chr, r.start, r.end))

    if len(d) == 0: return

    for path_i, x in d.iteritems():
        x.sort(key=lambda x: x[0])  # sort alignments by query start
        xx = [_chr for s,e,_chr,_chr_s,_chr_e in x]
        f.write("{0}\t{1}\t{2}\n".format(dirname, path_i, ",".join(xx)))
        for s,e,_chr,_chr_s,_chr_e in x:
            f.write("#{0}:{1}-{2}\t{3}:{4}-{5}\n".format(path_i,s,e,_chr,_chr_s,_chr_e))
# Example #5
def process_contig(dirname, f):
    """
    Summarize GMAP alignments found in <dirname>/aloha2.fa.cuttlefish.gff,
    writing one summary line per path plus per-alignment detail lines to
    the open file handle `f`.
    """
    per_path = defaultdict(lambda: [])  # seqid --> (start, end, scaffold, ...)
    gff_path = os.path.join(dirname, 'aloha2.fa.cuttlefish.gff')
    for rec in GFF.gmapGFFReader(gff_path):
        # orient query coordinates by strand
        if rec.strand == '+':
            qs, qe = rec.seq_exons[0].start, rec.seq_exons[-1].end
        else:
            qs, qe = rec.seq_exons[-1].start, rec.seq_exons[0].end
        per_path[rec.seqid].append((qs, qe, rec.chr, rec.start, rec.end))

    if len(per_path) == 0:
        return

    for path_i, hits in per_path.iteritems():
        hits.sort(key=lambda t: t[0])  # sort alignments by query start
        scaffolds = [c for _qs, _qe, c, _cs, _ce in hits]
        f.write("{0}\t{1}\t{2}\n".format(dirname, path_i, ",".join(scaffolds)))
        for qs, qe, c, cs, ce in hits:
            f.write("#{0}:{1}-{2}\t{3}:{4}-{5}\n".format(
                path_i, qs, qe, c, cs, ce))
# Example #6
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome.

    Return: dict of {cogent path} --> list of gmapRecord; set of mapped genome contigs

    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    # nothing to read: behave as if the alignment produced no records
    if not os.path.exists(filename):
        return {}, set()

    by_path = defaultdict(lambda: [])
    contigs = set()
    try:
        for rec in GFF.gmapGFFReader(filename):
            by_path[rec.seqid].append(rec)
            contigs.add(rec.chr)
    except IndexError:
        # truncated/malformed GFF -- keep whatever parsed so far
        pass
    return dict(by_path), contigs
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome.

    Return: dict of {cogent path} --> list of gmapRecord; set of mapped genome contigs

    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    # a missing file is treated as "no alignments", not an error
    if not os.path.exists(filename):
        return {}, set()

    result = defaultdict(lambda: [])
    seen_contigs = set()
    reader = GFF.gmapGFFReader(filename)
    try:
        for r in reader:
            result[r.seqid].append(r)
            seen_contigs.add(r.chr)
    except IndexError:
        pass  # malformed record mid-file: return what was read
    return dict(result), seen_contigs
def main(gmap_filename, fasta_filename):
    """
    Given a GMAP output (.gff) compare the aligned start/end
    to Gencode annotations (transcript & polyA)

    Need the original fasta to get sequence length

    Writes <gmap_filename>.summary, one line per record passing the
    coverage filter. NOTE(review): `min_coverage` is a module-level
    global not defined in this block -- confirm it exists at call time.
    """
    seqlen_dict = dict([(r.id,len(r.seq)) for r in SeqIO.parse(open(fasta_filename),'fasta')])
    gtf_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.annotation.gtf'
    gtfA_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.polyAs.gtf'

    gtf = GFF.GTF(gtf_f)
    gtfA = GFF.polyAGFF(gtfA_f)

    f = open(gmap_filename+'.summary', 'w')
    f.write("ID\thit5_exon\thit5_dist\thit5_id\thit3_exon\thit3_dist\thit3_id\thitA_dist\n")
    reader = GFF.gmapGFFReader(gmap_filename)
    while True:
        try:
            r = reader.next()
        except AssertionError: #ignore bad gmap output
            continue
        except StopIteration:
            break
        except Exception: # was a bare except; narrowed so Ctrl-C still works
            continue
        if r.coverage < min_coverage: continue
        # IMPORTANT! if r.start/r.end is not complete, extend it!
        r_start_corrected = r.start - r.seq_exons[0].start
        r_end_corrected = r.end + (seqlen_dict[r.seqid] - r.seq_exons[-1].end)
        # BUGFIX: the corrected coordinates were computed but never used;
        # the validations were called with the uncorrected r.start/r.end
        hit5, hit3 = validate_53seen(gtf, r.chr, r_start_corrected, r_end_corrected, r.strand)
        hitA = validate_polyA(gtfA, r.chr, r_start_corrected, r_end_corrected, r.strand)
        f.write("{id}\t{e5}\t{d5}\t{i5}\t{e3}\t{d3}\t{i3}\t{dA}\n".format(\
                id=r.seqid, e5=hit5[0], d5=hit5[1], i5=hit5[2], e3=hit3[0], d3=hit3[1], i3=hit3[2], dA=hitA))

    f.close()
# Example #9
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)

    Writes a per-directory summary line to `f1` and a per-sequence line
    to `f2` (both are open file handles). Silently returns when the
    directory has no COGENT.DONE marker.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return
    seq_info = defaultdict(lambda: [])  # seqid --> list of gmap records
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.gff')
    reader = GFF.gmapGFFReader(filename)
    for r in reader:
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.gff'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome2 + '.gff'))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info),
                                      len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1,
        ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
        len(contig_genome2), cov2, acc2, has_chimeric2,
        ",".join(contig_genome2)))

    # optional: raw input aligned directly to genome1
    in_aligned_to_genome1 = os.path.join(dirname,
                                         'in.trimmed.fa.' + genome1 + '.gff')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.iteritems():
        contigs = [x.chr for x in v]
        # coverage-weighted mean identity over all alignments of this seq
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage
                                                            for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname,
                                               ",".join(contigs), acc))

        if not seqid in d3:
            f2.write("NA\t0\tNA\tNA\n")
        else:
            scaffolds = [x.chr for x in d3[seqid]]
            cov = sum(x.coverage for x in d3[seqid])
            acc = sum(x.identity * x.coverage for x in d3[seqid]) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds),
                                                   len(scaffolds), cov, acc))
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)

    Writes a per-directory summary line to `f1` and a per-sequence line
    to `f2` (both are open file handles). Silently returns when the
    directory has no COGENT.DONE marker.
    """
    if not os.path.exists(os.path.join(dirname, "COGENT.DONE")):
        return
    seq_info = defaultdict(lambda: [])  # seqid --> list of gmap records
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, "in.trimmed.fa.cogent2.gff")
    reader = GFF.gmapGFFReader(filename)
    for r in reader:
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, "in.fa")), "fasta"):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, "cogent2.fa." + genome1 + ".gff")
    )
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, "cogent2.fa." + genome2 + ".gff")
    )

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write(
        "{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
            len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)
        )
    )
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write(
        "{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
            len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)
        )
    )

    # optional: raw input aligned directly to genome1
    in_aligned_to_genome1 = os.path.join(dirname, "in.trimmed.fa." + genome1 + ".gff")
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.iteritems():
        contigs = [x.chr for x in v]
        # coverage-weighted mean identity over all alignments of this seq
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))

        if not seqid in d3:
            f2.write("NA\t0\tNA\tNA\n")
        else:
            scaffolds = [x.chr for x in d3[seqid]]
            cov = sum(x.coverage for x in d3[seqid])
            acc = sum(x.identity * x.coverage for x in d3[seqid]) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds), len(scaffolds), cov, acc))
#!/usr/bin/env python
import os, sys
import GFF

# Usage: <script> <input.gff>
# Rewrites the GMAP GFF in collapsed-GFF format next to the input file.
in_filename = sys.argv[1]  # renamed: `input` shadowed the builtin
dot = in_filename.rfind('.')
# BUGFIX: with no '.' in the name, rfind returns -1 and the old slice
# silently dropped the last character; append the suffix instead.
stem = in_filename[:dot] if dot != -1 else in_filename
out_filename = stem + '.collapsed.gff'

f = open(out_filename, 'w')
for r in GFF.gmapGFFReader(in_filename):
    GFF.write_collapseGFF_format(f, r)
f.close()