Python read_bed_transcripts Beispiele, pita.io.read_bed_transcripts Python Beispiele

Beispiel #1

0

Datei anzeigen

def models_to_compare():
    """Load the two test annotations and the expected comparison table.

    Returns:
        A tuple ``(t1, t2, result)``. ``t1`` and ``t2`` are the values
        returned by ``read_bed_transcripts`` for the two BED fixtures.
        ``result`` is a nested dict built from ``ann1_vs_ann2.txt``:
        first column -> second column -> list of floats parsed from the
        remaining tab-separated columns.
    """
    from pita.io import read_bed_transcripts

    # NOTE(review): closing the files here assumes read_bed_transcripts
    # reads its input eagerly and returns a materialised collection
    # (other callers pass its result to len()) -- confirm.
    with open("tests/data/annotation1.bed") as f1:
        t1 = read_bed_transcripts(f1)
    with open("tests/data/annotation2.bed") as f2:
        t2 = read_bed_transcripts(f2)

    result = {}
    with open("tests/data/ann1_vs_ann2.txt") as f:
        f.readline()  # skip the header line
        for line in f:
            vals = line.strip().split("\t")
            result.setdefault(vals[0], {})[vals[1]] = [
                float(x) for x in vals[2:]
            ]

    return t1, t2, result

Beispiel #2

0

Datei anzeigen

def test_long_exon_filter(db, t1, t2):
    """Check that the long-exon filter leaves models of 3 and 5 exons.

    Args:
        db: annotation database object exposing ``add_transcript``.
        t1: path to the first BED file; its transcripts are prefixed "t1|".
        t2: path to the second BED file; its transcripts are prefixed "t2|".
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    # Load both tracks with the same code path; the context manager makes
    # sure each BED file is closed once its transcripts have been added.
    for prefix, path in (("t1", t1), ("t2", t2)):
        with open(path) as handle:
            for tname, source, exons in read_bed_transcripts(handle):
                db.add_transcript(
                    "{0}{1}{2}".format(prefix, "|", tname), source, exons)

    c = DbCollection(db, [], chrom="chr1")
    c.filter_long(l=500, evidence=1)

    models = list(c.get_best_variants([]))

    assert [3, 5] == sorted([len(m) for m in models])

Beispiel #3

0

Datei anzeigen

Datei: test_compare_annotation.py Projekt: Jorisvansteenbrugge/pita

def models_to_compare():
    """Load the two test annotations and the expected comparison table.

    Returns:
        A tuple ``(t1, t2, result)``. ``t1`` and ``t2`` are the values
        returned by ``read_bed_transcripts`` for the two BED fixtures.
        ``result`` is a nested dict built from ``ann1_vs_ann2.txt``:
        first column -> second column -> list of floats parsed from the
        remaining tab-separated columns.
    """
    from pita.io import read_bed_transcripts

    # NOTE(review): closing the files here assumes read_bed_transcripts
    # reads its input eagerly and returns a materialised collection
    # (other callers pass its result to len()) -- confirm.
    with open("tests/data/annotation1.bed") as f1:
        t1 = read_bed_transcripts(f1)
    with open("tests/data/annotation2.bed") as f2:
        t2 = read_bed_transcripts(f2)

    result = {}
    with open("tests/data/ann1_vs_ann2.txt") as f:
        f.readline()  # skip the header line
        for line in f:
            vals = line.strip().split("\t")
            result.setdefault(vals[0], {})[vals[1]] = [
                float(x) for x in vals[2:]
            ]

    return t1, t2, result

Beispiel #4

0

Datei anzeigen

Datei: test_collection.py Projekt: simonvh/pita

def test_long_exon_filter(db, t1, t2):
    """Check that the long-exon filter leaves models of 1, 3 and 5 exons.

    Args:
        db: annotation database object exposing ``add_transcript``.
        t1: path to the first BED file; its transcripts are prefixed "t1|".
        t2: path to the second BED file; its transcripts are prefixed "t2|".
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    # Load both tracks with the same code path; the context manager makes
    # sure each BED file is closed once its transcripts have been added.
    for prefix, path in (("t1", t1), ("t2", t2)):
        with open(path) as handle:
            for tname, source, exons in read_bed_transcripts(handle):
                db.add_transcript(
                    "{0}{1}{2}".format(prefix, "|", tname), source, exons)

    c = DbCollection(db, chrom="chr1")
    c.filter_long(evidence=1)

    # Flatten the clusters of connected models into one list of models.
    models = []
    for cluster in c.get_connected_models():
        models.extend(cluster)

    assert [1, 3, 5] == sorted([len(m) for m in models])

Beispiel #5

0

Datei anzeigen

Datei: test_genome_index.py Projekt: simonvh/pita

def collection(db):
    """Fill ``db`` from the scaffold_54 BED fixture and wrap it.

    Every transcript is added under the name ``"test:::" + tname``.

    Args:
        db: annotation database object exposing ``add_transcript``.

    Returns:
        A ``DbCollection`` built on ``db``.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    bed = "tests/data/scaffold_54_genes.bed"
    # Keep the file open only while its transcripts are being read.
    with open(bed) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript(
                "{0}{1}{2}".format("test", ":::", tname), source, exons)

    return DbCollection(db)

Beispiel #6

0

Datei anzeigen

Datei: test_get_overlapping_models.py Projekt: Jorisvansteenbrugge/pita

def test_get_overlapping_models(db, bedfile):
    """Check that 72 overlapping models are found on JGIv7b.000000226.

    Args:
        db: annotation database object exposing ``add_transcript`` and
            ``get_exons``.
        bedfile: path to the BED file to load; transcripts are added under
            the name ``"test:::" + tname``.
    """
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    # Keep the file open only while its transcripts are being read.
    with open(bedfile) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript(
                "{0}{1}{2}".format("test", ":::", tname), source, exons)

    exons = db.get_exons("JGIv7b.000000226")

    assert 72 == len(get_overlapping_models(exons))

Beispiel #7

0

Datei anzeigen

def collection(db):
    """Fill ``db`` from the scaffold_54 BED fixture and wrap it.

    Every transcript is added under the name ``"test:::" + tname``.

    Args:
        db: annotation database object exposing ``add_transcript``.

    Returns:
        A ``DbCollection`` built on ``db`` with an empty second argument.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts

    bed = "tests/data/scaffold_54_genes.bed"
    # Keep the file open only while its transcripts are being read.
    with open(bed) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript(
                "{0}{1}{2}".format("test", ":::", tname), source, exons)

    return DbCollection(db, [])

Beispiel #8

0

Datei anzeigen

Datei: test_get_overlapping_models.py Projekt: Jorisvansteenbrugge/pita

def test_get_overlapping_models(db, bedfile):
    """Check that 72 overlapping models are found on JGIv7b.000000226.

    Args:
        db: annotation database object exposing ``add_transcript`` and
            ``get_exons``.
        bedfile: path to the BED file to load; transcripts are added under
            the name ``"test:::" + tname``.
    """
    from pita.io import read_bed_transcripts
    from pita.util import get_overlapping_models

    # Keep the file open only while its transcripts are being read.
    with open(bedfile) as handle:
        for tname, source, exons in read_bed_transcripts(handle, "test", 0):
            db.add_transcript(
                "{0}{1}{2}".format("test", ":::", tname), source, exons)

    exons = db.get_exons("JGIv7b.000000226")

    assert 72 == len(get_overlapping_models(exons))

Beispiel #9

0

Datei anzeigen

def load_chrom_data(conn, new, chrom, anno_files, data, index=None):
    """Load annotation and read/splice statistics for one chromosome.

    Args:
        conn: forwarded to ``AnnotationDb`` as ``conn``.
        new: forwarded to ``AnnotationDb`` as ``new``.
        chrom: chromosome name; only tabix records for it are read.
        anno_files: iterable of ``(name, fname, tabix_file, ftype,
            min_exons)`` tuples. ``ftype`` must be "bed", "gff", "gtf" or
            "gff3".
        data: iterable of ``(name, fname, span, extend)`` tuples. A ``span``
            of "splice" loads splice statistics; anything else loads read
            statistics.
        index: forwarded to ``AnnotationDb`` as ``index``.

    Raises:
        ValueError: if an annotation file containing ``chrom`` has an
            unsupported ``ftype``.
        Any other exception is logged with its traceback and re-raised.
    """
    logger = logging.getLogger("pita")

    try:
        # Read annotation files
        db = AnnotationDb(index=index, conn=conn, new=new)
        logger.debug("%s %s", chrom, id(db.session))
        logger.info("Reading annotation for %s", chrom)
        for name, fname, tabix_file, ftype, min_exons in anno_files:
            logger.info("Reading annotation from %s", fname)
            tabixfile = pysam.Tabixfile(tabix_file)
            try:
                if chrom in tabixfile.contigs:
                    if ftype == "bed":
                        reader = read_bed_transcripts
                    elif ftype in ("gff", "gtf", "gff3"):
                        reader = read_gff_transcripts
                    else:
                        # Previously an unknown type either raised a
                        # NameError or silently re-used the iterator of
                        # the preceding annotation file.
                        raise ValueError(
                            "Unsupported annotation file type "
                            "'{0}' for {1}".format(ftype, fname))

                    fobj = TabixIteratorAsFile(tabixfile.fetch(chrom))
                    it = reader(fobj, fname, min_exons=min_exons, merge=10)
                    for tname, source, exons in it:
                        db.add_transcript(
                            "{0}{1}{2}".format(name, SEP, tname),
                            source, exons)
                    del fobj
            finally:
                # Close the tabix handle even when reading fails.
                tabixfile.close()
            del tabixfile

        logger.info("Loading data for %s", chrom)

        for name, fname, span, extend in data:
            if span == "splice":
                logger.info("Reading splice data %s from %s", name, fname)
                db.get_splice_statistics(chrom, fname, name)
            else:
                logger.info("Reading BAM data %s from %s", name, fname)
                db.get_read_statistics(chrom,
                                       fname,
                                       name=name,
                                       span=span,
                                       extend=extend,
                                       nreads=None)

    except:
        # Log with traceback, then re-raise so the caller still sees it.
        logger.exception("Error on %s", chrom)
        raise

Beispiel #10

0

Datei anzeigen

Datei: test_collection.py Projekt: simonvh/pita

def test_variants(db, variant_track):
    """Check node cuts and the best length-weighted variant of a model.

    Args:
        db: annotation database object exposing ``add_transcript``.
        variant_track: path to the BED file to load; transcripts are added
            under the name ``"t1|" + tname``.
    """
    from pita.dbcollection import DbCollection
    from pita.io import read_bed_transcripts
    from pita.util import model_to_bed

    # Keep the file open only while its transcripts are being read.
    with open(variant_track) as handle:
        for tname, source, exons in read_bed_transcripts(handle):
            db.add_transcript(
                "{0}{1}{2}".format("t1", "|", tname), source, exons)
    c = DbCollection(db)

    # First model of the first cluster of connected models.
    best_model = list(c.get_connected_models())[0][0]
    cuts = [str(e) for e in c.get_node_cuts(best_model)]
    assert ["chr1:800+900", "chr1:1400+1500"] == cuts

    best_variant = c.get_best_variant(
        best_model, [{"weight": 1, "type": "length", "name": "length"}])
    s = [str(s) for s in best_variant]
    assert [
        "chr1:100+200",
        "chr1:400+700",
        "chr1:800+900",
        "chr1:1000+1300",
        "chr1:1400+1500",
        "chr1:1600+1900",
        "chr1:2000+2100",
    ] == s

Beispiel #11

0

Datei anzeigen

Datei: model.py Projekt: Jorisvansteenbrugge/pita

def load_chrom_data(conn, new, chrom, anno_files, data, index=None):
    """Load annotation and read/splice statistics for one chromosome.

    Args:
        conn: forwarded to ``AnnotationDb`` as ``conn``.
        new: forwarded to ``AnnotationDb`` as ``new``.
        chrom: chromosome name; only tabix records for it are read.
        anno_files: iterable of ``(name, fname, tabix_file, ftype,
            min_exons)`` tuples. ``ftype`` must be "bed", "gff", "gtf" or
            "gff3".
        data: iterable of ``(name, fname, span, extend)`` tuples. A ``span``
            of "splice" loads splice statistics; anything else loads read
            statistics.
        index: forwarded to ``AnnotationDb`` as ``index``.

    Raises:
        ValueError: if an annotation file containing ``chrom`` has an
            unsupported ``ftype``.
        Any other exception is logged with its traceback and re-raised.
    """
    logger = logging.getLogger("pita")

    try:
        # Read annotation files
        db = AnnotationDb(index=index, conn=conn, new=new)
        logger.debug("%s %s", chrom, id(db.session))
        logger.info("Reading annotation for %s", chrom)
        for name, fname, tabix_file, ftype, min_exons in anno_files:
            logger.info("Reading annotation from %s", fname)
            tabixfile = pysam.Tabixfile(tabix_file)
            try:
                if chrom in tabixfile.contigs:
                    if ftype == "bed":
                        reader = read_bed_transcripts
                    elif ftype in ("gff", "gtf", "gff3"):
                        reader = read_gff_transcripts
                    else:
                        # Previously an unknown type either raised a
                        # NameError or silently re-used the iterator of
                        # the preceding annotation file.
                        raise ValueError(
                            "Unsupported annotation file type "
                            "'{0}' for {1}".format(ftype, fname))

                    fobj = TabixIteratorAsFile(tabixfile.fetch(chrom))
                    it = reader(fobj, fname, min_exons=min_exons, merge=10)
                    for tname, source, exons in it:
                        db.add_transcript(
                            "{0}{1}{2}".format(name, SEP, tname),
                            source, exons)
                    del fobj
            finally:
                # Close the tabix handle even when reading fails.
                tabixfile.close()
            del tabixfile

        logger.info("Loading data for %s", chrom)

        for name, fname, span, extend in data:
            if span == "splice":
                logger.info("Reading splice data %s from %s", name, fname)
                db.get_splice_statistics(chrom, fname, name)
            else:
                logger.info("Reading BAM data %s from %s", name, fname)
                db.get_read_statistics(chrom, fname, name=name, span=span,
                                       extend=extend, nreads=None)

    except:
        # Log with traceback, then re-raise so the caller still sees it.
        logger.exception("Error on %s", chrom)
        raise

Beispiel #12

0

Datei anzeigen

Datei: utr.py Projekt: simonvh/pita

def call_utr(inbed, bamfiles, utr5=False, utr3=True):
    """
    Call 3' UTR for all genes in a BED12 file based on RNA-seq reads 
    in BAM files.

    NOTE(review): this listing stops after the per-file region filtering
    step; nothing is returned in the part shown here.

    Args:
        inbed: path of the BED file with the gene models.
        bamfiles: sequence of BAM file paths; the header is taken from the
            first one.
        utr5: not referenced in the part of the function shown here.
        utr3: not referenced in the part of the function shown here.

    Returns:
        None when the BED file yields no transcripts.
    """
    
    # Load genes in BED file
    transcripts = read_bed_transcripts(open(inbed))
    
    # No genes
    if len(transcripts) == 0:
        return 

    # Map "<second '|'-separated field of the name>_" to the exon list
    # (element 2 of each transcript tuple).
    td = dict([(t[0].split("|")[1] + "_", t[2]) for t in transcripts])
    
    # Commented-out earlier attempt at a different key scheme for
    # scaffold names:
    #td = {}
    #for t in transcripts:
    #    if "scaffold" not in t[0]:
    #        td[t[0].split("_")[1]+"_"] = t[2]
    #    else:
    #        inter = t[0].split(":")
    #        scafName = "_".join(inter[0].split("_")[2:4])
    #        pos = inter[1].split("_")[0]
    #        td[scafName+":"+pos+"_"] = t[2]

    # Create a BED6 file with exons, used to determine UTR boundaries 
    sys.stderr.write("Preparing temporary BED files\n")
    exonbed = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    bed2exonbed(inbed, exonbed.name)

    # Determine boundaries using bedtools
    genes = pybedtools.BedTool(inbed)
    exons = pybedtools.BedTool(exonbed.name)
    
    # Receives one extended-exon line per record returned by closest().
    tmp = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    
    EXTEND = 10000
    sys.stderr.write("Determining gene boundaries determined by closest gene\n")
    for x in genes.closest(exons, D="a", io=True, iu=True):
        # x[3] is used as the key into td built above.
        transcript = td[x[3]]
        
        # Extend to closest exon or EXTEND, whichever is closer
        # (the last field of x is read as the distance; negative values
        # leave the default EXTEND in place).
        extend = EXTEND
        if (int(x[-1]) >= 0) and (int(x[-1]) < extend):
            extend = int(x[-1])
        
        # The last element of the first exon is compared with "+"
        # (presumably the strand -- confirm). The selected exon is
        # modified in place, so td is changed as well.
        if transcript[0][-1] == "+":
            first = transcript[-1]
            first[2] += extend
        else:
            # transcript[-0] is the same element as transcript[0].
            first = transcript[-0]
            first[1] -= extend
    
            # Clamp the start coordinate at zero.
            if first[1] < 0:
                first[1] = 0
       
        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            first[0],
            first[1],
            first[2],
            x[3],
            0,
            first[3]
            ))
        
    # Flush so the external commands below see the complete file.
    tmp.flush()
    
    tmpsam = NamedTemporaryFile(prefix="pita.", suffix=".sam")
    tmpbam = NamedTemporaryFile(prefix="pita.")
    
    # Retrieve header from first BAM file
    sp.call("samtools view -H {} > {}".format(bamfiles[0], tmpsam.name), shell=True)
    
    # Filter all BAM files for the specific regions. This runs much faster
    # than running bedtools coverage on all individual BAM files
    tmp_check = NamedTemporaryFile(prefix="pita.", suffix=".bam")
    cmd = "samtools view -L {} {} > {}"
    sys.stderr.write("Merging bam files\n")
    for bamfile in bamfiles:
        try:
            # The filtered output is appended to the SAM file only when
            # samtools exits successfully; otherwise the file is skipped.
            sp.check_call(cmd.format(tmp.name, bamfile, tmp_check.name), shell=True)
            sp.call("cat {} >> {}".format(tmp_check.name, tmpsam.name), shell=True)
        except sp.CalledProcessError as e:
            sys.stderr.write("Error in file {}, skipping:\n".format(bamfile))
            sys.stderr.write("{}\n".format(e))
    
    tmp_check.close()

Beispiel #13

0

Datei anzeigen

def compare_annotation_files(fname1, fname2):
    """Compare two BED annotations, print per-cluster scores, then exit.

    Output is tab-separated lines on stdout:
      * for every transcript of ``fname2`` that is missing from the
        comparison result or has an empty entry: first exon's field 0 and
        field 1, last exon's field 2, and the constant 1;
      * for every cluster of ``fname1`` transcripts: chrom, minimal start,
        maximal end, and the mean over the cluster's transcripts of the
        minimum ``1 - <second score>``.

    The function ends by calling ``sys.exit()``.

    Args:
        fname1: path of the first BED file.
        fname2: path of the second BED file.
    """
    # Both files are closed as soon as the comparison has been computed.
    with open(fname1) as file1, open(fname2) as file2:
        data1 = read_bed_transcripts(file1)
        data2 = read_bed_transcripts(file2)
        result = compare_annotation(data1, data2)

    for t2, x, exons in data2:
        if t2 not in result or result[t2] == {}:
            print("{0}\t{1}\t{2}\t{3}".format(exons[0][0], exons[0][1],
                                              exons[-1][2], 1))

    # cluster: file-1 transcript -> file-1 transcripts sharing a file-2 hit
    # t2_cluster: file-1 transcript -> file-2 transcripts seen with it
    cluster = {}
    t2_cluster = {}
    for transcript2, d in result.items():
        for transcript1 in d.keys():
            if transcript1 not in cluster:
                cluster[transcript1] = list(d.keys())
            else:
                for t in d.keys():
                    if t not in cluster[transcript1]:
                        cluster[transcript1].append(t)
            for t1 in cluster[transcript1]:
                t2_cluster.setdefault(t1, []).append(transcript2)

    # Deduplicate the member lists.
    clusters = set([tuple(x) for x in cluster.values()])

    loc = {}
    for name, x, exons in data1:
        loc[name] = [exons[0][0], exons[0][1], exons[-1][2]]

    # A separate loop name keeps the ``cluster`` dict above intact.
    for members in clusters:
        t2s = set(t2_cluster[members[0]])
        min_acs = []
        for transcript1 in members:
            acs = []
            for transcript2 in t2s:
                if transcript1 in result[transcript2]:
                    # Only the second of the three scores is used.
                    _first, second, _third = result[transcript2][transcript1]
                    acs.append(1 - second)
            min_acs.append(np.min(acs))
        chrom = loc[members[0]][0]
        start = np.min([loc[t][1] for t in members])
        end = np.max([loc[t][2] for t in members])
        print("{0}\t{1}\t{2}\t{3}".format(chrom, start, end,
                                          np.mean(min_acs)))

    sys.exit()

    # NOTE(review): everything below is unreachable because of the
    # sys.exit() call above; kept so the original intent is not lost.
    for k, v in result.items():
        if len(v.keys()) > 0:
            print("{0}\t{1}\t{2}".format(loc[k], k,
                                         1 - np.max(list(v.values()))))
        else:
            print("{0}\t{1}\t{2}".format(loc[k], k, 1.0))

Beispiel #14

0

Datei anzeigen

Datei: compare.py Projekt: Jorisvansteenbrugge/pita

def compare_annotation_files(fname1, fname2):
    """Compare two BED annotations, print per-cluster scores, then exit.

    Output is tab-separated lines on stdout:
      * for every transcript of ``fname2`` that is missing from the
        comparison result or has an empty entry: first exon's field 0 and
        field 1, last exon's field 2, and the constant 1;
      * for every cluster of ``fname1`` transcripts: chrom, minimal start,
        maximal end, and the mean over the cluster's transcripts of the
        minimum ``1 - <second score>``.

    The function ends by calling ``sys.exit()``.

    Args:
        fname1: path of the first BED file.
        fname2: path of the second BED file.
    """
    # Both files are closed as soon as the comparison has been computed.
    with open(fname1) as file1, open(fname2) as file2:
        data1 = read_bed_transcripts(file1)
        data2 = read_bed_transcripts(file2)
        result = compare_annotation(data1, data2)

    for t2, x, exons in data2:
        if t2 not in result or result[t2] == {}:
            print("{0}\t{1}\t{2}\t{3}".format(exons[0][0], exons[0][1],
                                              exons[-1][2], 1))

    # cluster: file-1 transcript -> file-1 transcripts sharing a file-2 hit
    # t2_cluster: file-1 transcript -> file-2 transcripts seen with it
    cluster = {}
    t2_cluster = {}
    for transcript2, d in result.items():
        for transcript1 in d.keys():
            if transcript1 not in cluster:
                cluster[transcript1] = list(d.keys())
            else:
                for t in d.keys():
                    if t not in cluster[transcript1]:
                        cluster[transcript1].append(t)
            for t1 in cluster[transcript1]:
                t2_cluster.setdefault(t1, []).append(transcript2)

    # Deduplicate the member lists.
    clusters = set([tuple(x) for x in cluster.values()])

    loc = {}
    for name, x, exons in data1:
        loc[name] = [exons[0][0], exons[0][1], exons[-1][2]]

    # A separate loop name keeps the ``cluster`` dict above intact.
    for members in clusters:
        t2s = set(t2_cluster[members[0]])
        min_acs = []
        for transcript1 in members:
            acs = []
            for transcript2 in t2s:
                if transcript1 in result[transcript2]:
                    # Only the second of the three scores is used.
                    _first, second, _third = result[transcript2][transcript1]
                    acs.append(1 - second)
            min_acs.append(np.min(acs))
        chrom = loc[members[0]][0]
        start = np.min([loc[t][1] for t in members])
        end = np.max([loc[t][2] for t in members])
        print("{0}\t{1}\t{2}\t{3}".format(chrom, start, end,
                                          np.mean(min_acs)))

    sys.exit()

    # NOTE(review): everything below is unreachable because of the
    # sys.exit() call above; kept so the original intent is not lost.
    for k, v in result.items():
        if len(v.keys()) > 0:
            print("{0}\t{1}\t{2}".format(loc[k], k,
                                         1 - np.max(list(v.values()))))
        else:
            print("{0}\t{1}\t{2}".format(loc[k], k, 1.0))

Beispiel #15

0

Datei anzeigen

Datei: utr.py Projekt: Jorisvansteenbrugge/pita

def call_utr(inbed, bamfiles, utr5=False, utr3=True):
    """
    Call 3' UTR for all genes in a BED12 file based on RNA-seq reads
    in BAM files.

    Steps: extend the terminal exon of every gene towards the closest
    exon (at most 10000 bp), collect the reads of all BAM files in those
    regions into one sorted, indexed BAM file, compute per-base coverage
    with bedtools and pass each region's coverage to ``call_cpt``.

    Args:
        inbed: path of the BED file with the gene models.
        bamfiles: sequence of BAM file paths; the header is taken from the
            first one. A file for which samtools fails is skipped.
        utr5: not used by this function.
        utr3: not used by this function.

    Returns:
        None when the BED file yields no transcripts, otherwise a dict
        mapping the region name (column 4 of the coverage output) to the
        truthy result of ``call_cpt``.
    """

    # Load genes in BED file; len() below shows the result is a
    # materialised collection, so the file can be closed right away.
    with open(inbed) as bedfile:
        transcripts = read_bed_transcripts(bedfile)

    # No genes
    if len(transcripts) == 0:
        return

    # Map "<second '|'-separated field of the name>_" to the exon list.
    # NOTE(review): an earlier, commented-out attempt used a different
    # key scheme for scaffold names; the "|" split may not suit such
    # names -- confirm.
    td = dict([(t[0].split("|")[1] + "_", t[2]) for t in transcripts])

    # Create a BED6 file with exons, used to determine UTR boundaries
    sys.stderr.write("Preparing temporary BED files\n")
    exonbed = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    bed2exonbed(inbed, exonbed.name)

    # Determine boundaries using bedtools
    genes = pybedtools.BedTool(inbed)
    exons = pybedtools.BedTool(exonbed.name)

    tmp = NamedTemporaryFile(prefix="pita.", suffix=".bed")

    EXTEND = 10000
    sys.stderr.write(
        "Determining gene boundaries determined by closest gene\n")
    for x in genes.closest(exons, D="a", io=True, iu=True):
        transcript = td[x[3]]

        # Extend to closest exon or EXTEND, whichever is closer
        extend = EXTEND
        distance = int(x[-1])
        if 0 <= distance < extend:
            extend = distance

        # The last element of the first exon is compared with "+"
        # (presumably the strand -- confirm). The exon is modified in
        # place.
        if transcript[0][-1] == "+":
            first = transcript[-1]
            first[2] += extend
        else:
            first = transcript[0]
            first[1] -= extend

            if first[1] < 0:
                first[1] = 0

        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(first[0], first[1],
                                                    first[2], x[3], 0,
                                                    first[3]))

    # Flush so the external commands below see the complete file.
    tmp.flush()

    tmpsam = NamedTemporaryFile(prefix="pita.", suffix=".sam")
    tmpbam = NamedTemporaryFile(prefix="pita.")

    # Retrieve header from first BAM file
    sp.call("samtools view -H {} > {}".format(bamfiles[0], tmpsam.name),
            shell=True)

    # Filter all BAM files for the specific regions. This runs much faster
    # than running bedtools coverage on all individual BAM files
    tmp_check = NamedTemporaryFile(prefix="pita.", suffix=".bam")
    cmd = "samtools view -L {} {} > {}"
    sys.stderr.write("Merging bam files\n")
    for bamfile in bamfiles:
        try:
            sp.check_call(cmd.format(tmp.name, bamfile, tmp_check.name),
                          shell=True)
            sp.call("cat {} >> {}".format(tmp_check.name, tmpsam.name),
                    shell=True)
        except sp.CalledProcessError as e:
            sys.stderr.write("Error in file {}, skipping:\n".format(bamfile))
            sys.stderr.write("{}\n".format(e))

    tmp_check.close()

    # Create sorted and indexed bam
    cmd = "samtools view -Sb {} | samtools sort -m 4G - {}"
    sp.call(cmd.format(tmpsam.name, tmpbam.name), shell=True)
    sp.call("samtools index {}.bam".format(tmpbam.name), shell=True)

    # Close and remove temporary SAM file
    tmpsam.close()

    sys.stderr.write("Calculating coverage\n")
    cmd = "bedtools coverage -abam {} -b {} -d -split "

    p = sp.Popen(cmd.format(tmpbam.name + ".bam", tmp.name),
                 shell=True,
                 stdout=sp.PIPE,
                 bufsize=1)

    sys.stderr.write("Calling UTRs\n")

    # Number of BAM files. The in-loop call used to pass len(bamfile),
    # i.e. the length of the last file *name*, unlike the final call.
    nbam = len(bamfiles)

    data = []
    current = [None]
    utr = {}
    for line in iter(p.stdout.readline, b''):
        vals = line.strip().split("\t")
        if vals[3] != current[0]:
            # A new region starts: finish the previous one first.
            if len(data) > 0:
                result = call_cpt(current[1], current[2], current[3], data,
                                  nbam)
                if result:
                    utr[current[0]] = result
            data = []
            current = [vals[3], int(vals[1]), int(vals[2]), vals[5]]
        data.append(int(vals[7]))
    # Handle the last region, which no name change terminates.
    if current[0]:
        result = call_cpt(current[1], current[2], current[3], data, nbam)
        if result:
            utr[current[0]] = result

    for fname in [tmpbam.name + ".bam", tmpbam.name + ".bam.bai"]:
        if os.path.exists(fname):
            os.unlink(fname)

    tmpbam.close()
    tmp.close()
    exonbed.close()

    return utr

Beispiel #16

0

Datei anzeigen

Datei: utr.py Projekt: Jorisvansteenbrugge/pita

def call_utr(inbed, bamfiles, utr5=False, utr3=True):
    """
    Call 3' UTR for all genes in a BED12 file based on RNA-seq reads
    in BAM files.

    Steps: extend the terminal exon of every gene towards the closest
    exon (at most 10000 bp), collect the reads of all BAM files in those
    regions into one sorted, indexed BAM file, compute per-base coverage
    with bedtools and pass each region's coverage to ``call_cpt``.

    Args:
        inbed: path of the BED file with the gene models.
        bamfiles: sequence of BAM file paths; the header is taken from the
            first one. A file for which samtools fails is skipped.
        utr5: not used by this function.
        utr3: not used by this function.

    Returns:
        None when the BED file yields no transcripts, otherwise a dict
        mapping the region name (column 4 of the coverage output) to the
        truthy result of ``call_cpt``.
    """

    # Load genes in BED file; len() below shows the result is a
    # materialised collection, so the file can be closed right away.
    with open(inbed) as bedfile:
        transcripts = read_bed_transcripts(bedfile)

    # No genes
    if len(transcripts) == 0:
        return

    # Map "<second '|'-separated field of the name>_" to the exon list.
    # NOTE(review): an earlier, commented-out attempt used a different
    # key scheme for scaffold names; the "|" split may not suit such
    # names -- confirm.
    td = dict([(t[0].split("|")[1] + "_", t[2]) for t in transcripts])

    # Create a BED6 file with exons, used to determine UTR boundaries
    sys.stderr.write("Preparing temporary BED files\n")
    exonbed = NamedTemporaryFile(prefix="pita.", suffix=".bed")
    bed2exonbed(inbed, exonbed.name)

    # Determine boundaries using bedtools
    genes = pybedtools.BedTool(inbed)
    exons = pybedtools.BedTool(exonbed.name)

    tmp = NamedTemporaryFile(prefix="pita.", suffix=".bed")

    EXTEND = 10000
    sys.stderr.write("Determining gene boundaries determined by closest gene\n")
    for x in genes.closest(exons, D="a", io=True, iu=True):
        transcript = td[x[3]]

        # Extend to closest exon or EXTEND, whichever is closer
        extend = EXTEND
        distance = int(x[-1])
        if 0 <= distance < extend:
            extend = distance

        # The last element of the first exon is compared with "+"
        # (presumably the strand -- confirm). The exon is modified in
        # place.
        if transcript[0][-1] == "+":
            first = transcript[-1]
            first[2] += extend
        else:
            first = transcript[0]
            first[1] -= extend

            if first[1] < 0:
                first[1] = 0

        tmp.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
            first[0],
            first[1],
            first[2],
            x[3],
            0,
            first[3]
            ))

    # Flush so the external commands below see the complete file.
    tmp.flush()

    tmpsam = NamedTemporaryFile(prefix="pita.", suffix=".sam")
    tmpbam = NamedTemporaryFile(prefix="pita.")

    # Retrieve header from first BAM file
    sp.call("samtools view -H {} > {}".format(bamfiles[0], tmpsam.name), shell=True)

    # Filter all BAM files for the specific regions. This runs much faster
    # than running bedtools coverage on all individual BAM files
    tmp_check = NamedTemporaryFile(prefix="pita.", suffix=".bam")
    cmd = "samtools view -L {} {} > {}"
    sys.stderr.write("Merging bam files\n")
    for bamfile in bamfiles:
        try:
            sp.check_call(cmd.format(tmp.name, bamfile, tmp_check.name), shell=True)
            sp.call("cat {} >> {}".format(tmp_check.name, tmpsam.name), shell=True)
        except sp.CalledProcessError as e:
            sys.stderr.write("Error in file {}, skipping:\n".format(bamfile))
            sys.stderr.write("{}\n".format(e))

    tmp_check.close()

    # Create sorted and indexed bam
    cmd = "samtools view -Sb {} | samtools sort -m 4G - {}"
    sp.call(cmd.format(tmpsam.name, tmpbam.name), shell=True)
    sp.call("samtools index {}.bam".format(tmpbam.name), shell=True)

    # Close and remove temporary SAM file
    tmpsam.close()

    sys.stderr.write("Calculating coverage\n")
    cmd = "bedtools coverage -abam {} -b {} -d -split "

    p = sp.Popen(cmd.format(tmpbam.name + ".bam", tmp.name), shell=True, stdout=sp.PIPE, bufsize=1)

    sys.stderr.write("Calling UTRs\n")

    # Number of BAM files. The in-loop call used to pass len(bamfile),
    # i.e. the length of the last file *name*, unlike the final call.
    nbam = len(bamfiles)

    data = []
    current = [None]
    utr = {}
    for line in iter(p.stdout.readline, b''):
        vals = line.strip().split("\t")
        if vals[3] != current[0]:
            # A new region starts: finish the previous one first.
            if len(data) > 0:
                result = call_cpt(current[1], current[2], current[3], data, nbam)
                if result:
                    utr[current[0]] = result
            data = []
            current = [vals[3], int(vals[1]), int(vals[2]), vals[5]]
        data.append(int(vals[7]))
    # Handle the last region, which no name change terminates.
    if current[0]:
        result = call_cpt(current[1], current[2], current[3], data, nbam)
        if result:
            utr[current[0]] = result

    for fname in [tmpbam.name + ".bam", tmpbam.name + ".bam.bai"]:
        if os.path.exists(fname):
            os.unlink(fname)

    tmpbam.close()
    tmp.close()
    exonbed.close()

    return utr