Code example #1
File: Intervals_test.py Project: Charlie-George/cgat
 def testNoOverlap(self):
     """test empty input."""
     self.assertEqual(
         Intervals.truncate([(0, 5), (10, 15)], [(5, 10)]), [(0, 5), (10, 15)])
     self.assertEqual(
         Intervals.truncate([(5, 10)], [(0, 5), (10, 15)]), [(5, 10)])
     self.assertEqual(
         Intervals.truncate([(0, 5), (5, 10)], [(10, 15)]), [(0, 5), (5, 10)])
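These tests assume half-open (start, end) tuples, so (0, 5) and (5, 10) touch but do not overlap. A minimal sketch of the subtraction semantics the tests expect (a hypothetical truncate_sketch, not the CGAT implementation):

def truncate_sketch(a, b):
    # remove from each half-open interval in `a` any part covered by `b`
    result = []
    for start, end in a:
        segments = [(start, end)]
        for bstart, bend in b:
            clipped = []
            for s, e in segments:
                # keep the parts of (s, e) falling outside (bstart, bend)
                if bstart > s:
                    clipped.append((s, min(e, bstart)))
                if bend < e:
                    clipped.append((max(s, bend), e))
            segments = [(s, e) for s, e in clipped if s < e]
        result.extend(segments)
    return result

assert truncate_sketch([(0, 5), (10, 15)], [(5, 10)]) == [(0, 5), (10, 15)]
assert truncate_sketch([(0, 5)], [(0, 3)]) == [(3, 5)]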
Code example #2
File: regions2gff.py Project: siping/cgat
    def processChunk( contig, regions ):
        if contig is None: return
        
        start = 0
        end = contigs[contig]

        regions = Intervals.combineIntervals( regions )
        for xstart, xend in Intervals.complementIntervals( regions, start, end ):
            locations.append( ("intergenic", "intergenic", contig, "+", xstart, xend, ".") )
Code example #3
File: Intervals_test.py Project: Charlie-George/cgat
 def testMultiple(self):
     """test empty input."""
     self.assertEqual(
         Intervals.intersect([(0, 5), (10, 15)], [(0, 5)]), [(0, 5)])
     self.assertEqual(
         Intervals.intersect([(0, 5), (10, 15)], [(0, 10)]), [(0, 5)])
     self.assertEqual(
         Intervals.intersect([(0, 5), (10, 15)], [(0, 15)]), [(0, 5), (10, 15)])
     self.assertEqual(
         Intervals.intersect([(0, 5), (5, 10)], [(0, 10)]), [(0, 5), (5, 10)])
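A minimal sketch of the intersection semantics these tests expect (a hypothetical intersect_sketch over half-open tuples, not the CGAT implementation):

def intersect_sketch(a, b):
    # clip every interval in a against every interval in b,
    # keeping the non-empty half-open overlaps
    result = []
    for astart, aend in a:
        for bstart, bend in b:
            start, end = max(astart, bstart), min(aend, bend)
            if start < end:
                result.append((start, end))
    return sorted(result)

assert intersect_sketch([(0, 5), (10, 15)], [(0, 15)]) == [(0, 5), (10, 15)]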
Code example #4
File: Intervals_test.py Project: Charlie-George/cgat
 def testSingle(self):
     """test empty input."""
     self.assertEqual(Intervals.truncate([(0, 5)], [(0, 5)]), [])
     self.assertEqual(Intervals.truncate([(0, 5)], [(0, 3)]), [(3, 5)])
     self.assertEqual(Intervals.truncate([(0, 3)], [(0, 5)]), [])
     self.assertEqual(Intervals.truncate([(0, 5)], [(3, 5)]), [(0, 3)])
     self.assertEqual(Intervals.truncate([(3, 5)], [(0, 5)]), [])
     self.assertEqual(Intervals.truncate([(5, 10)], [(5, 10)]), [])
     self.assertEqual(Intervals.truncate([(5, 10)], [(5, 20)]), [])
     self.assertEqual(Intervals.truncate([(5, 10)], [(0, 10)]), [])
     self.assertEqual(Intervals.truncate([(5, 10)], [(0, 10)]), [])
     self.assertEqual(Intervals.truncate([(5, 10)], [(0, 20)]), [])
Code example #5
File: bed2table.py Project: siping/cgat
    def count( self, bed ):
        '''update internal counts.'''

        results = []
        for track in self.tracks:
            try:
                overlaps = [ (x[0],x[1]) for x in self.index[track][bed.contig].find( bed.start, bed.end ) ]
            except KeyError:
                overlaps = []

            results.append( (len(overlaps), 
                             Intervals.calculateOverlap( [(bed.start, bed.end),],
                                                         Intervals.combine( overlaps ) ) ) )

        self.data = results
Code example #6
File: beds2beds.py Project: Charlie-George/cgat
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals 
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in data_per_contig.keys():
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in data_per_contig.iteritems():
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
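Intervals.combine is the merge step of the algorithm described above. A minimal sketch of such a merge (hypothetical combine_sketch; whether merely touching intervals are merged is an assumption here, not verified against the CGAT code):

def combine_sketch(intervals):
    # sort by start, then absorb each interval into the previous one
    # whenever it begins before (or at) the current end
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

assert combine_sketch([(10, 15), (0, 5), (3, 8)]) == [(0, 8), (10, 15)]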
Code example #7
File: transcript_regions.py Project: sudlab/iCLIPlib
def UTR3(transcript):
    
    exons = GTF.asRanges(transcript, "exon")
    cds = GTF.asRanges(transcript, "CDS")

    if len(cds) == 0:
        return list()
    
    utrs = Intervals.truncate(exons, cds)

    if transcript[0].strand == "+":
        utr3 = [exon for exon in utrs
                if exon[0] >= cds[-1][1]]
    else:
        utr3 = [exon for exon in utrs
                if exon[1] <= cds[0][0]]

    for e in transcript:
        if e.feature == "exon":
            template_exon = e
            break
            
    returned_exons = []     
    for e in utr3:
        gtf = GTF.Entry().fromGTF(template_exon)
        gtf.start = e[0]
        gtf.end = e[1]
        returned_exons.append(gtf)
        
    return returned_exons
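A worked sketch of the computation above for a hypothetical plus-strand transcript:

# exons and CDS as half-open tuples
exons = [(0, 100), (200, 300)]
cds = [(50, 100), (200, 250)]
# Intervals.truncate(exons, cds) removes the coding parts: [(0, 50), (250, 300)]
# keeping only fragments starting at or after the CDS end (250) then
# leaves the 3' UTR on the plus strand: [(250, 300)]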
Code example #8
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)

    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in list(data_per_contig.keys()):
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in sorted(data_per_contig.items()):
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
Code example #9
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if len(chunk) == 0:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        intervals = [(lcontig - end, lcontig - start)
                     for start, end in intervals]
        intervals.reverse()

    s = [
        fasta.getSequence(contig, strand, start, end)
        for start, end in intervals
    ]

    return "".join(s)
Code example #10
File: GTF.py Project: CGATOxford/cgat
def toIntronIntervals(chunk):
    """convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    """
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand, chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
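Intervals.complement here returns the gaps between the merged exons, i.e. the introns in forward-strand coordinates. A minimal sketch of that gap computation (hypothetical complement_sketch, assuming sorted non-overlapping input such as Intervals.combine produces):

def complement_sketch(intervals):
    # collect the gap between each pair of adjacent intervals
    gaps = []
    for (_, left_end), (right_start, _) in zip(intervals, intervals[1:]):
        if left_end < right_start:
            gaps.append((left_end, right_start))
    return gaps

# two exons at (0, 100) and (200, 300) imply one intron at (100, 200)
assert complement_sketch([(0, 100), (200, 300)]) == [(100, 200)]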
Code example #11
File: GTF.py Project: prasoonnema/cgat
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if len(chunk) == 0:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        intervals = [(lcontig - end, lcontig - start) for start, end in intervals]
        intervals.reverse()

    s = [fasta.getSequence(contig, strand, start, end) for start, end in intervals]

    return "".join(s)
Code example #12
File: gff2gff.py Project: yangjl/cgat
def cropGFF(gffs, options):
    """crop intervals in gff file."""

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    other_gffs = GTF.iterator(IOTools.openFile(options.crop, "r"))
    cropper = GTF.readAsIntervals(other_gffs)
    ntotal = 0
    for contig in cropper.keys():
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:
            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:
                l = end - start
                a = numpy.ones(l)
                for i in overlaps:
                    s = max(0, i.start - start)
                    e = min(l, i.end - start)
                    a[s:e] = 0

                segments = Intervals.fromArray(a)

                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    options.stdout.write("%s\n" % gff)

                continue

        noutput += 1
        options.stdout.write("%s\n" % gff)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i\n" %
            (ninput, noutput, ncropped, ndeleted))
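A worked sketch of the array-based cropping above: the feature window is turned into an array of ones, overlaps are zeroed, and Intervals.fromArray recovers the surviving segments in window coordinates (the positions here are hypothetical):

import numpy

start, end = 100, 110
a = numpy.ones(end - start)
a[104 - start:107 - start] = 0  # zero out an overlap at (104, 107)
# Intervals.fromArray(a) -> [(0, 3), (7, 10)] in window coordinates;
# adding `start` back yields the cropped features (100, 103) and (107, 110).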
Code example #13
File: gtf2gff.py Project: CGATOxford/cgat
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max([x.end for x in transcript])
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = (min(lcontig, max(0, interval[0])), min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1
            x += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" % (ngenes, ntranscripts, nregulons))
Code example #14
File: gtf2gff.py Project: yangjl/cgat
def annotateRegulons( iterator, fasta, tss, options ):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator( iterator )

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand( gene[0][0].strand )
        lcontig = fasta.getLength( gene[0][0].contig )
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min( [x.start for x in transcript ] ), max( [x.end for x in transcript ] )
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = ( min( lcontig, max( 0, interval[0] ) ),
                         min( lcontig, max( 0, interval[1] ) ) )
            
            regulons.append( interval )
            transcript_ids.append( transcript[0].transcript_id )

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have changed)
            regulons = Intervals.combine( regulons )
            transcript_ids = ["%i" % (x+1) for x in range(len(regulons) )]
            
        gtf = GTF.Entry()
        gtf.fromGTF( gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id )
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write( "%s\n" % str(gtf) )
            nregulons += 1
            x += 1

    E.info( "ngenes=%i, ntranscripts=%i, nregulons=%i" % (ngenes, ntranscripts, nregulons) )
Code example #15
File: gff2gff.py Project: Charlie-George/cgat
def cropGFF(gffs, options):
    """crop intervals in gff file."""

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    other_gffs = GTF.iterator(IOTools.openFile(options.crop, "r"))
    cropper = GTF.readAsIntervals(other_gffs)
    ntotal = 0
    for contig in cropper.keys():
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:
            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:
                l = end - start
                a = numpy.ones(l)
                for i in overlaps:
                    s = max(0, i.start - start)
                    e = min(l, i.end - start)
                    a[s:e] = 0

                segments = Intervals.fromArray(a)

                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    options.stdout.write("%s\n" % gff)

                continue

        noutput += 1
        options.stdout.write("%s\n" % gff)

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i\n" % (
            ninput, noutput, ncropped, ndeleted))
Code example #16
    def count(self, bed):
        '''update internal counts.'''

        results = []
        for track in self.tracks:
            try:
                overlaps = [(x[0], x[1])
                            for x in self.index[track][bed.contig].find(bed.start, bed.end)]
            except KeyError:
                overlaps = []

            results.append((len(overlaps),
                            Intervals.calculateOverlap(
                                [(bed.start, bed.end), ],
                                Intervals.combine(overlaps))))

        self.data = results
Code example #17
File: GTF.py Project: prasoonnema/cgat
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature."""
    for gffs in gff_iterator:
        intervals = [(x.start, x.end) for x in gffs if x.feature == feature]
        intervals = Intervals.combine(intervals)
        t = sum((x[1] - x[0] for x in intervals))
        if t >= min_length:
            yield gffs
Code example #19
File: regions2gff.py Project: siping/cgat
    def processChunk( gene_id, contig, strand, frame, regions ):
        if gene_id is None: return
        
        start = min( map( lambda x: x[0], regions ) )
        end = max( map( lambda x: x[1], regions ) )

        intervals = Intervals.complementIntervals( regions, start, end )
        for start, end in intervals:
            locations.append( (gene_id, gene_id, contig, strand, start, end, frame ) )
Code example #20
def FilterEliminateOverlappingTranscripts(exons, filter_exons,
                                          eliminated_predictions, contig_sizes,
                                          options):
    """eliminate predictions that overlap or span a positive set of transcripts.
    """

    eliminated = []

    # convert list of filter exons into a list of ranges.
    filter_ranges = getRangesFromExons(
        filter_exons,
        both_strands=options.filter_remove_spanning_both_strands,
        contig_sizes=contig_sizes)

    for k, r in filter_ranges.items():
        filter_ranges[k] = Intervals.combineIntervals(map(lambda x: x[:2], r))

    exon_ranges = getRangesFromExons(exons, both_strands=False)

    # and now go through exons and delete transcripts whose
    # exons overlap one of the forbidden ranges
    for k, ee in exon_ranges.items():

        if k not in filter_ranges:
            continue

        ff = filter_ranges[k]
        ee.sort()

        # set exon index e and filter index f
        # (both are indices in sorted lists)
        e, f = 0, 0

        while e < len(ee):

            efrom, eto, id = ee[e]

            # increment filter, such that its extent
            # is larger than current range ee[e] to test.
            while f < len(ff) and ff[f][1] < efrom:
                f += 1
            if f == len(ff):
                break

            if eto < ff[f][0]:
                # no overlap
                pass
            else:
                options.stdout.write("%s\t%s\n" %
                                     (id, "eliminated: filtered by %s:%i:%i" %
                                      (k, ff[f][0], ff[f][1])))
                eliminated_predictions[id] = 0
                eliminated.append((id, "f"))

            e += 1

    return eliminated
Code example #21
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk
                                   if x.feature == "exon"])
    return Intervals.complement(intervals)
Code example #22
def FilterEliminateOverlappingTranscripts(
        exons, filter_exons,
        eliminated_predictions, contig_sizes, options):
    """eliminate predictions that overlap or span a positive set of transcripts.
    """

    eliminated = []

    # convert list of filter exons into a list of ranges.
    filter_ranges = getRangesFromExons(
        filter_exons,
        both_strands=options.filter_remove_spanning_both_strands,
        contig_sizes=contig_sizes)

    for k, r in filter_ranges.items():
        filter_ranges[k] = Intervals.combineIntervals(map(lambda x: x[:2], r))

    exon_ranges = getRangesFromExons(exons,
                                     both_strands=False)

    # and now go through exons and delete transcripts whose
    # exons overlap one of the forbidden ranges
    for k, ee in exon_ranges.items():

        if k not in filter_ranges:
            continue

        ff = filter_ranges[k]
        ee.sort()

        # set exon index e and filter index f
        # (both are indices in sorted lists)
        e, f = 0, 0

        while e < len(ee):

            efrom, eto, id = ee[e]

            # increment filter, such that its extent
            # is larger than current range ee[e] to test.
            while f < len(ff) and ff[f][1] < efrom:
                f += 1
            if f == len(ff):
                break

            if eto < ff[f][0]:
                # no overlap
                pass
            else:
                options.stdout.write(
                    "%s\t%s\n" % (id, "eliminated: filtered by %s:%i:%i" % (k, ff[f][0], ff[f][1])))
                eliminated_predictions[id] = 0
                eliminated.append((id, "f"))

            e += 1

    return eliminated
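The elimination loop above is a two-pointer sweep over two sorted lists. A stripped-down sketch of the same overlap test on plain tuples (hypothetical overlapping_ids_sketch):

def overlapping_ids_sketch(exons, filter_ranges):
    # exons: sorted (start, end, id); filter_ranges: sorted (start, end)
    hits, f = [], 0
    for efrom, eto, eid in exons:
        # advance the filter pointer past ranges ending before this exon
        while f < len(filter_ranges) and filter_ranges[f][1] < efrom:
            f += 1
        if f == len(filter_ranges):
            break
        if eto >= filter_ranges[f][0]:
            hits.append(eid)
    return hits

assert overlapping_ids_sketch([(0, 5, "a"), (10, 20, "b")], [(12, 15)]) == ["b"]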
Code example #23
def get_windows(pvalues, window_size, threshold):

    # positions are closed intervals; +1 makes the windows half-open
    windows = [(pos-window_size, pos+window_size+1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)
    windows_min_p = [pvalues.ix[float(start):float(end-1)].min()
                    for start, end in merged_windows]
    return zip(merged_windows, windows_min_p)
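A worked sketch of the windowing, assuming window_size=2 and p-values indexed at positions 10, 12 and 20:

windows = [(pos - 2, pos + 2 + 1) for pos in (10, 12, 20)]
# -> [(8, 13), (10, 15), (18, 23)]; Intervals.combine merges the first two
# into (8, 15), and the minimum p-value is then taken per merged window.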
Code example #24
    def processChunk(gene_id, contig, strand, frame, regions):
        if gene_id is None: return

        start = min(map(lambda x: x[0], regions))
        end = max(map(lambda x: x[1], regions))

        intervals = Intervals.complementIntervals(regions, start, end)
        for start, end in intervals:
            locations.append(
                (gene_id, gene_id, contig, strand, start, end, frame))
Code example #25
File: GTF.py Project: Charlie-George/cgat
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = chunk[
        0].contig, chunk[0].strand, chunk[0].transcript_id
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end)
                                   for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
Code example #26
 def testSingle(self):
     """test empty input."""
     self.assertEqual(Intervals.intersect([(0, 5)], [(0, 5)]), [(0, 5)])
     self.assertEqual(Intervals.intersect([(0, 5)], [(0, 3)]), [(0, 3)])
     self.assertEqual(Intervals.intersect([(0, 3)], [(0, 5)]), [(0, 3)])
     self.assertEqual(Intervals.intersect([(0, 5)], [(3, 5)]), [(3, 5)])
     self.assertEqual(Intervals.intersect([(3, 5)], [(0, 5)]), [(3, 5)])
     self.assertEqual(Intervals.intersect([(5, 10)], [(5, 20)]), [(5, 10)])
     self.assertEqual(Intervals.intersect([(5, 10)], [(0, 20)]), [(5, 10)])
Code example #27
File: gtf2gff.py Project: gsc0107/cgat
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if the tts is directly at the start/end of the contig, the
            # tts will be within an exon; otherwise, it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append(
                    (min(ma, lcontig - options.promotor),
                     min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
Code example #28
File: gtf2gff.py Project: gsc0107/cgat
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.items():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))
Code example #29
File: gtf2gff.py Project: Charlie-George/cgat
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.iteritems():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))
Code example #30
File: gtf2gff.py Project: Charlie-George/cgat
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if the tts is directly at the start/end of the contig, the
            # tts will be within an exon; otherwise, it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append(
                    (min(ma, lcontig - options.promotor),
                     min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
Code example #31
def get_windows(pvalues, window_size, threshold):

    # positions are closed intervals; +1 makes the windows half-open
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)
    windows_min_p = [
        pvalues.ix[float(start):float(end - 1)].min()
        for start, end in merged_windows
    ]
    return zip(merged_windows, windows_min_p)
Code example #32
def findRetainedIntrons(infile, outfile):

    outf = IOTools.openFile(outfile, "w")

    for gene in GTF.gene_iterator(GTF.iterator(IOTools.openFile(infile))):

        gene_out = []
        introns_out = []

        # now find if any of the transcripts are retained intron
        # versions of any of the others
        for first, second in itertools.product(gene, gene):

            first = sorted(
                [entry for entry in first if entry.feature == "exon"],
                key=lambda x: x.start)
            second = sorted(
                [entry for entry in second if entry.feature == "exon"],
                key=lambda x: x.start)

            first_introns = set(GTF.toIntronIntervals(first))
            second_introns = set(GTF.toIntronIntervals(second))

            if len(first_introns-second_introns) > 0 and \
               len(second_introns-first_introns) == 0:
                novel_introns = list(first_introns - second_introns)

                def _filterIntron(intron):
                    return intron[0] > second[0].start and \
                        intron[1] < second[-1].end

                novel_introns = list(filter(_filterIntron, novel_introns))

                if len(novel_introns) > 0:
                    gene_out.extend(first)

                for intron in novel_introns:
                    introns_out.append(intron)

        introns_out = Intervals.combine(introns_out)
        template = gene[0][0]
        template.feature = "exon"
        for gff in introns_out:
            entry = GTF.Entry().copy(template)
            entry.start = gff[0]
            entry.end = gff[1]
            outf.write("%s\n" % str(entry))
Code example #33
File: GTF.py Project: santayana/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, basestring):
        gg = filter(lambda x: x.feature == feature, gffs)
    elif feature:
        gg = filter(lambda x: x.feature in feature, gffs)
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
Code example #34
File: GTF.py Project: prasoonnema/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, basestring):
        gg = filter(lambda x: x.feature == feature, gffs)
    elif feature:
        gg = filter(lambda x: x.feature in feature, gffs)
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
Code example #35
File: GTF.py Project: CGATOxford/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
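A usage sketch for asRanges, using a hypothetical minimal stand-in for GTF.Entry that carries only the attributes the function reads:

import collections

Entry = collections.namedtuple("Entry", "feature start end")

records = [Entry("exon", 0, 100), Entry("exon", 90, 150), Entry("CDS", 20, 80)]
# asRanges(records, "exon") keeps only the exon entries and merges the
# overlapping (0, 100) and (90, 150) into a single (0, 150) range.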
Code example #37
File: gff_decorate.py Project: Q-KIM/cgat
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end) 
    that are in third codon position.
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # adjust istart so that stepping in threes yields third codon
        # positions within the window
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0
        if frame != 0:
            istart -= (3 - frame)
        istart += 2

        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
Code example #38
File: gff2table.py Project: kathrinjansen/cgat
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end) 
    that are in third codon position.
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # adjust istart so that stepping in threes yields third codon
        # positions within the window
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0
        if frame != 0:
            istart -= (3 - frame)
        istart += 2

        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
Code example #39
File: gff2stats.py Project: jmadzo/cgat
    def __str__(self):

        single_exon_transcripts = 0
        exons_per_transcript = []
        intron_sizes = []
        transcript_lengths = []
        exon_sizes = []

        for x in self.counts_exons_per_transcript.values():

            x.sort()
            x = Intervals.combine(x)
            transcript_lengths.append(x[-1][1] - x[0][0])

            exons_per_transcript.append(len(x))

            for start, end in x:
                exon_sizes.append(end - start)

            if len(x) == 1:
                single_exon_transcripts += 1
                continue

            last_end = x[0][1]
            for start, end in x[1:]:
                intron_sizes.append(start - last_end)
                last_end = end

        return "\t".join(
            map(
                str,
                (
                    len(self.counts_gene_ids),
                    len(self.counts_transcript_ids),
                    single_exon_transcripts,
                    Stats.Summary(exons_per_transcript),
                    Stats.Summary(exon_sizes),
                    Stats.Summary(intron_sizes),
                    Stats.Summary(transcript_lengths),
                ),
            )
        )
Code example #40
File: gtf2gtf.py Project: Q-KIM/cgat
def find_retained_introns(gene):
    '''Given a bundle of transcripts, find intervals matching retained
    introns. A retained intron is defined as an interval from an exon/intron
    boundary to the next where both boundaries are in the same exon of another
    transcript'''

    intron_intervals = [GTF.toIntronIntervals(transcript)
                        for transcript in gene]
    intron_intervals = list(set(
        itertools.chain.from_iterable(intron_intervals)))
    intron_intervals.sort()

    for transcript in gene:
        exons = iter(sorted(GTF.asRanges(transcript)))
        introns = iter(intron_intervals)
        retained_introns = []
        try:
            intron = introns.next()
            exon = exons.next()
            while True:

                if exon[1] < intron[0]:

                    exon = exons.next()
                    continue

                if intron[0] >= exon[0] and intron[1] <= exon[1]:
                    E.debug("exon %s of transcript %s contains intron %s" %
                            (exon, transcript[0].transcript_id, intron))
                    retained_introns.append(intron)
                intron = introns.next()
        except StopIteration:
            pass

        retained_introns = Intervals.combine(retained_introns)

        for intron in retained_introns:
            entry = GTF.Entry()
            entry = entry.copy(transcript[0])
            entry.start = intron[0]
            entry.end = intron[1]
            yield entry
Code example #41
    def __str__(self):

        single_exon_transcripts = 0
        exons_per_transcript = []
        intron_sizes = []
        transcript_lengths = []
        exon_sizes = []

        for x in self.counts_exons_per_transcript.values():

            x.sort()
            x = Intervals.combine(x)
            transcript_lengths.append(x[-1][1] - x[0][0])

            exons_per_transcript.append(len(x))

            for start, end in x:
                exon_sizes.append(end - start)

            if len(x) == 1:
                single_exon_transcripts += 1
                continue

            last_end = x[0][1]
            for start, end in x[1:]:
                intron_sizes.append(start - last_end)
                last_end = end

        return "\t".join(
            map(str, (
                len(self.counts_gene_ids),
                len(self.counts_transcript_ids),
                single_exon_transcripts,
                Stats.Summary(exons_per_transcript),
                Stats.Summary(exon_sizes),
                Stats.Summary(intron_sizes),
                Stats.Summary(transcript_lengths),
            )))
Code example #42
 def testHalfEmpty(self):
     """test empty input."""
     self.assertEqual(Intervals.intersect([(0, 5)], []), [])
     self.assertEqual(Intervals.intersect([], [(0, 5)]), [])
Code example #43
 def testEmpty(self):
     """test empty input."""
     self.assertEqual(Intervals.intersect([], []), [])
Code example #44
 def testEmpty(self):
     """test empty input."""
     self.assertEqual(Intervals.truncate([], []), [])
Code example #45
 def testArray2(self):
     """test longer array."""
     a = [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
     self.assertEqual(Intervals.fromArray(a), [(0, 3), (6, 9), (12, 15)])
     self.assertEqual(Intervals.fromArray([not x for x in a]), [(3, 6),
                                                                (9, 12)])
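A minimal sketch of the run-length extraction this test expects (hypothetical from_array_sketch returning half-open runs of truthy values, not the CGAT implementation):

def from_array_sketch(a):
    intervals, start = [], None
    for i, value in enumerate(a):
        if value and start is None:
            start = i  # a run of truthy values begins
        elif not value and start is not None:
            intervals.append((start, i))  # the run ends before position i
            start = None
    if start is not None:
        intervals.append((start, len(a)))  # a run extends to the array end
    return intervals

assert from_array_sketch([1, 1, 1, 0, 0, 0, 1, 1, 1]) == [(0, 3), (6, 9)]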
Code example #46
File: gff2table.py Project: kathrinjansen/cgat
def transform_overlap(start, end, intervals_with_gff):
    """transform: overlap of intervals in x with y."""
    y = Intervals.combineIntervals([(x[0], x[1]) for x in intervals_with_gff])
    return Intervals.pruneIntervals(y, start, end)
Code example #47
File: gff_decorate.py Project: Q-KIM/cgat
def transform_complement(start, end, intervals_with_gff):
    y = Intervals.combineIntervals(
        map(lambda x: (x[0], x[1]), intervals_with_gff))
    return Intervals.complementIntervals(y, start, end)
Code example #48
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--feature",
                      dest="feature",
                      type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw",
                      dest="unstranded_wig",
                      type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw",
                      dest="plus_wig",
                      type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw",
                      dest="minus_wig",
                      type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed",
                      dest="bedfile",
                      type="string",
                      help="tabix indexed bed file with tag counts"),
    parser.add_option("-c",
                      "--use-centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":

        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]

        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded_wig")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)

        intron_counts = intron_counts.sum()

        if options.feature == "exon":

            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([
            gene_id, transcript_id, exon_id,
            str(float(exon_counts)),
            str(intron_counts)
        ])

    options.stdout.write("\t".join([
        "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"
    ]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
Code example #49
File: gff2fasta.py Project: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            # skip regardless of verbosity level
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # prepend to the first sequence segment
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
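
For reference, the mask lookup in the snippet above relies on bx-python's
interval intersector. A minimal sketch of that pattern, using hypothetical
mask coordinates (the Intersecter, Interval and find calls mirror the ones
used above):

import bx.intervals.intersection

# build an intersector over two hypothetical masked regions
intersector = bx.intervals.intersection.Intersecter()
for start, end in [(10, 20), (50, 60)]:
    intersector.add_interval(bx.intervals.Interval(start, end))

# find() returns the stored intervals that overlap the query range
print([(x.start, x.end) for x in intersector.find(15, 55)])
# expected: [(10, 20), (50, 60)]
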
Code example #50
0
File: gff2table.py Project: kathrinjansen/cgat
def transform_complement(start, end, intervals_with_gff):
    y = Intervals.combineIntervals([(x[0], x[1]) for x in intervals_with_gff])
    return Intervals.complementIntervals(y, start, end)
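
A quick sketch of what transform_complement returns, assuming that
Intervals.combineIntervals merges overlapping intervals and
Intervals.complementIntervals yields the gaps within [start, end), as the
calls above suggest. The rows below are hypothetical:

# (start, end, gff) tuples; the gff payload is ignored by the function
rows = [(10, 20, None), (15, 30, None), (40, 50, None)]

# combineIntervals -> [(10, 30), (40, 50)]
# complementIntervals(..., 0, 60) -> [(0, 10), (30, 40), (50, 60)]
print(transform_complement(0, 60, rows))
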
Code example #51
0
File: gff2fasta.py Project: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge-adjacent", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes."
        " [default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'."
        " If set to the empty string, all entries are output "
        "[%default].")

    parser.add_option(
        "-f", "--maskregions-bed-file", dest="filename_masks",
        type="string", metavar="gff",
        help="mask sequences with regions given in gff file "
        "[%default].")

    parser.add_option(
        "--remove-masked-regions", dest="remove_masked_regions",
        action="store_true",
        help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-interval-length", dest="min_length", type="int",
        help="set minimum length for sequences output "
        "[%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output "
        "[%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--extend-with", dest="extend_with", type="string",
        help="extend using base [default=%default]")

    parser.add_option(
        "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker [%default].")

    parser.add_option(
        "--fold-at", dest="fold_at", type="int",
        help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator yields groups (lists) of features. For GTF input, all
    # features in a group share the same transcript ID.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise "unimplemented"

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write("# skipped because fully masked: "
                                             "%s: regions=%s masks=%s\n" %
                                             (name,
                                              str([(x.start,
                                                    x.end) for x in chunk]),
                                              masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            # skip regardless of verbosity level
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # prepend to the first sequence segment
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i+n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" % (name,
                                                     contig,
                                                     strand,
                                                     ";".join(
                                                         ["%i-%i" %
                                                          x for x in out]),
                                                     seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
Code example #52
0
File: psl2map.py Project: siping/cgat
    def processChunk( query_id, matches ):
        """process a set of matches from query_id"""

        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        global nremoved_regions, nqueries_removed_region
        global outfile_empty
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentory matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue
                
            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * math.pow( options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue
            
            new_matches.append(match)

        matches = new_matches

        # filter matches        
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return
        
        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                # remove matches whose gap counts exceed the configured maxima
                if options.threshold_max_query_gaps and match.mQueryNGapsCounts > options.threshold_max_query_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and match.mQueryNGapsBases > options.threshold_max_query_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue
                
                new_matches.append( match )
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return

        ## Remove queries overlapping a forbidden region. The full query
        ## is removed if any of its matches falls within a forbidden
        ## region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find( match.mSbjctFrom, match.mSbjctTo )
                if found and not options.keep_forbidden:
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write( "%s\toverlap with forbidden region\n" % query_id )
            return 

        ## check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)
            
        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append( (match.mQueryFrom, match.mQueryTo) )
        
        rest = Intervals.complement( intervals, 0, match.mQueryLength )
        
        query_coverage = 100.0 * (match.mQueryLength - sum( map( lambda x: x[1] - x[0], rest) ) ) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append( query_id )
        elif  query_coverage > options.threshold_good_query_coverage:
            well_matched.append( query_id )
        else:
            partially_matched.append( query_id )

        aggregate_coverages.append( query_coverage )

        ## select matches to output
        matches, msg = selectMatches( query_id, matches, options, queries_fasta )

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write( "%s\n" %\
                                              "\t".join( map(str, (
                                match.mQueryId, match.mSbjctId, 
                                match.strand,
                                "%5.2f" % match.mQueryCoverage,
                                "%5.2f" % match.mSbjctCoverage,
                                "%5.2f" % match.mPid,
                                match.mQueryLength,
                                match.mSbjctLength,
                                match.mQueryFrom, match.mQueryTo,
                                match.mSbjctFrom, match.mSbjctTo,
                                ",".join( map(str,match.mBlockSizes) ),
                                ",".join( map(str,match.mQueryBlockStarts)),
                                ",".join( map(str,match.mSbjctBlockStarts)), 
                                ))))
                elif options.output_format == "psl":
                    options.stdout.write( str(match) + "\n" )

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write( "%s\tno matches selected: %s\n" % (query_id, msg) )
            nempty += 1
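
The query-coverage computation above can be checked in isolation. A small
sketch with hypothetical match coordinates, assuming Intervals.complement
returns the uncovered gaps between the given bounds, as its use above
suggests:

# two matches covering parts of a 100 bp query
intervals = [(0, 40), (60, 90)]
rest = Intervals.complement(intervals, 0, 100)       # [(40, 60), (90, 100)]
uncovered = sum(end - start for start, end in rest)  # 30
query_coverage = 100.0 * (100 - uncovered) / 100
print(query_coverage)  # 70.0
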
Code example #53
0
 def testNoOverlap(self):
     """test empty input."""
     self.assertEqual(Intervals.intersect([(0, 5), (10, 15)], [(5, 10)]),
                      [])
     self.assertEqual(Intervals.intersect([(5, 10)], [(0, 5), (10, 15)]),
                      [])
Code example #54
0
 def testEmpty(self):
     """test empty input."""
     self.assertEqual(Intervals.fromArray([]), [])
Code example #55
0
 def testArray1(self):
     """test simple array."""
     a = [1, 1, 1, 0, 0, 0, 1, 1, 1]
     self.assertEqual(Intervals.fromArray(a), [(0, 3), (6, 9)])
     self.assertEqual(Intervals.fromArray([not x for x in a]), [(3, 6)])
Code example #56
0
File: gtf2gff.py Project: Charlie-George/cgat
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1
                continue

            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in xrange(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
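
The flank construction in annotateGenes walks outwards in fixed-size
windows. A worked sketch with hypothetical values (flank=300,
increment=100, gene body [1000, 2000)):

start, end, increment = 1000, 2000, 100
upstream, downstream = [], []
for _ in range(0, 300, increment):
    upstream.append((start - increment, start))
    start -= increment
    downstream.append((end, end + increment))
    end += increment
print(upstream)    # [(900, 1000), (800, 900), (700, 800)]
print(downstream)  # [(2000, 2100), (2100, 2200), (2200, 2300)]
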
Code example #57
0
File: gtf2fasta.py Project: lesheng/cgat
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes), sum(contig_sizes.values()) * array.array("c").itemsize))
          # AString.AString( "a").itemsize ))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        annotations[contig] = array.array("c", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig],
                        intervals,
                        is_positive,
                        code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig],
                        UTR5,
                        is_positive,
                        "u")

            addIntrons(annotations[contig],
                       UTR5,
                       is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig],
                        UTR3,
                        is_positive,
                        "v")

            addIntrons(annotations[contig],
                       UTR3,
                       is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig],
                       cds,
                       is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)
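
The negative-strand branch above mirrors forward coordinates into
reverse-strand space via lcontig - x. A minimal worked example with a
hypothetical 100 bp contig:

lcontig = 100
cds = [(10, 20), (30, 45)]  # hypothetical forward-strand coordinates, sorted

# mirror each interval and reverse the order, as the snippets above do
reverse = [(lcontig - end, lcontig - start) for start, end in cds[::-1]]
print(reverse)  # [(55, 70), (80, 90)]
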
Code example #58
0
File: gff2psl.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Code example #59
0
File: bed2bed.py Project: Q-KIM/cgat
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    The score gives the number of intervals that have been merged.
    """

    if remove_inconsistent and by_name:
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        last = next(iterator)

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]

        for bed in iterator:

            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:

                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] != bed.name)):

                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in to_join:
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1
  
            merged = True
            while merged:
                
                joined = []
                not_joined = []
                merged = False
                
                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1, intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []
                
            to_join = sorted(to_join, key=lambda x: int(x.start))
            
            # keep only entries created from merging at least min_intervals
            # intervals
            
            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:
                        
            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
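
The resolve_blocks branch above repeatedly merges any two entries whose
blocks overlap. The core interval operation can be sketched on its own,
assuming Intervals.calculateOverlap returns the number of overlapping bases
and Intervals.combine merges an interval list, as their uses above suggest:

intervals1 = [(0, 10), (20, 30)]  # hypothetical blocks of entry 1
intervals2 = [(25, 40)]           # hypothetical blocks of entry 2

if Intervals.calculateOverlap(intervals1, intervals2) > 0:
    merged = Intervals.combine(intervals1 + intervals2)
    print(merged)  # [(0, 10), (20, 40)]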