Ejemplo n.º 1
0
Archivo: GTF.py Proyecto: yangjl/cgat
def toSequence(chunk, fasta):
    """Concatenate the genomic sequence covered by a list of gff features.

    Overlapping features are merged and their sequences joined in
    transcript order on either strand.  All features must lie on the
    same contig and strand.
    """
    if not chunk:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    merged = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # mirror coordinates so retrieval runs 5'->3' along the transcript
        merged = [(lcontig - e, lcontig - s) for s, e in merged]
        merged.reverse()

    pieces = []
    for begin, stop in merged:
        pieces.append(fasta.getSequence(contig, strand, begin, stop))

    return "".join(pieces)
Ejemplo n.º 2
0
Archivo: GTF.py Proyecto: yangjl/cgat
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature."""
    for gffs in gff_iterator:
        # merge overlapping spans of the requested feature before summing
        exons = Intervals.combine(
            [(x.start, x.end) for x in gffs if x.feature == feature])
        total = sum(end - start for start, end in exons)
        if total >= min_length:
            yield gffs
Ejemplo n.º 3
0
def toSequence( chunk, fasta ):
    """Build one sequence from a list of gff features.

    Overlapping regions are merged; the pieces are retrieved and joined
    in transcript order, honouring the strand of the first feature.
    """
    if len(chunk) == 0:
        return ""

    contig = chunk[0].contig
    strand = chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    regions = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    is_forward = Genomics.IsPositiveStrand(strand)

    if not is_forward:
        # flip to reverse-strand coordinates and reverse the order so
        # that concatenation follows the transcript 5'->3'
        regions = [(lcontig - end, lcontig - start)
                   for start, end in regions]
        regions = regions[::-1]

    return "".join(fasta.getSequence(contig, strand, start, end)
                   for start, end in regions)
Ejemplo n.º 4
0
def iterator_min_feature_length( gff_iterator, min_length, feature="exon" ):
    """select only those genes with a minimum length of a given feature."""
    for gffs in gff_iterator:
        matching = [(x.start, x.end) for x in gffs if x.feature == feature]
        # total length of the merged, non-overlapping feature intervals
        length = 0
        for start, end in Intervals.combine(matching):
            length += end - start
        if length >= min_length:
            yield gffs
Ejemplo n.º 5
0
Archivo: GTF.py Proyecto: yangjl/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.

    ``feature`` may be a single feature name, a collection of feature
    names, or None to accept every feature.
    """
    # isinstance(..., str) replaces the Python-2-only
    # `type(feature) == types.StringType` check: on Python 2
    # `types.StringType is str`, so behavior is identical, and the
    # code no longer breaks on Python 3 where StringType was removed.
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
Ejemplo n.º 6
0
def toIntronIntervals( chunk ):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates.

    Returns a list of (start, end) intervals.
    '''
    if len(chunk) == 0:
        return []

    contig = chunk[0].contig
    strand = chunk[0].strand
    transcript_id = chunk[0].transcript_id

    # the per-element asserts below enforce the single-transcript
    # contract; the dead `t = set(...)` of the original has been removed.
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine( [ (x.start, x.end) for x in chunk ] )
    # the gaps between the merged exon intervals are the introns
    return Intervals.complement( intervals )
Ejemplo n.º 7
0
def asRanges( gffs, feature = None ):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.

    ``feature`` may be a single feature name, a collection of feature
    names, or None to accept every feature.
    """
    # isinstance(..., str) replaces the Python-2-only
    # `type(feature) == types.StringType`: equivalent on Python 2
    # (StringType is str) and still valid on Python 3.
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    return Intervals.combine([(g.start, g.end) for g in gg])
Ejemplo n.º 8
0
Archivo: GTF.py Proyecto: yangjl/cgat
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates.

    Returns a list of (start, end) intervals.
    '''
    if len(chunk) == 0:
        return []

    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand,
                                     chunk[0].transcript_id)

    # enforce the single-transcript contract; the unused
    # `t = set(...)` from the original has been dropped as dead code.
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    # gaps between merged exons are the introns
    return Intervals.complement(intervals)
Ejemplo n.º 9
0
    def processFamily( family_id, family_intervals, all_intervals, min_length_domain, query_length ):
        """Append DomainMatch records for one family to *all_intervals*,
        skipping regions that are both short and low-coverage."""
        if not family_intervals:
            return

        if options.combine_overlaps:
            regions = Intervals.combine(
                [(x.mStart, x.mEnd) for x in family_intervals])
        else:
            regions = family_intervals

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)

        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write("# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))
Ejemplo n.º 10
0
    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Collect DomainMatch entries for one family, dropping fragments
        that are both below the length and coverage thresholds."""
        if not family_intervals:
            return

        spans = family_intervals
        if options.combine_overlaps:
            spans = Intervals.combine(
                [(iv.mStart, iv.mEnd) for iv in family_intervals])

        ## note: this is overall pid, not per region.
        top_pid = max([iv.mPid for iv in family_intervals])

        for begin, stop in spans:
            coverage = 100.0 * (stop - begin) / query_length
            too_small = stop - begin < min_length_domain
            if too_small and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, begin, stop, coverage))
                continue

            all_intervals.append(DomainMatch(top_pid, begin, stop, family_id))
Ejemplo n.º 11
0
def buildSCOPDomains( infiles, outfile ):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.

    :param infiles: tuple ``(linksfile, fastafile)``. *linksfile* holds
        tab-separated domain-to-sequence alignments; *fastafile* holds
        domain sequences whose title lines carry the SCOP classification.
    :param outfile: output filename for a table with columns
        nid, start, end, family.
    '''
    
    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    # title lines are "<id> <classification> <description>"
    # NOTE(review): pattern is not a raw string; it works because \S and
    # \s are not recognized string escapes, but r'...' would be cleaner.
    rx = re.compile('(\S+)\s(\S+)\s(.*)' )
    # map: domain id -> (SCOP classification string, sequence length)
    id2class = {}
    with IOTools.openFile( fastafile ) as inf:
        for x in FastaIterator.iterate( inf ):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence) )
            
    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile( linksfile ) as inf:
        # map: target sequence id (nid) -> [(superfamily key, start, end), ...]
        nid2domains = collections.defaultdict( list )
        # NOTE(review): ndomains is never incremented or read below
        ndomains = 0
        for line in inf:
            # skip the header line and comments
            if line.startswith('query_nid'): continue
            if line.startswith('#'): continue
            counter.links += 1
            
            domain_id, nid, evalue, domain_start, domain_end, sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()
            
            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(int, \
                                                                       ( nid, domain_start, domain_end, sbjct_start, sbjct_end ))

            family, length = id2class[domain_id]

            # SCOP classification is dot-separated: class.fold.superfamily.family
            cls, fold, superfamily, family = family.split('.')
            if cls not in classes: continue
            # drop fragments: alignment must cover >= min_coverage of the domain
            if float(domain_end - domain_start) / length < min_coverage: continue
            counter.unmerged_domains += 1
            # build a fixed-width, groupable superfamily key, e.g. '00a001002'
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append( (superfamily, sbjct_start, sbjct_end ) )

        counter.sequences = len(nid2domains)

    E.info( 'merging %i domains in %i sequences' % (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile( outfile, 'w' )
    outf.write('nid\tstart\tend\tfamily\n')
    # per sequence, merge overlapping domains within each superfamily
    # NOTE(review): itertools.groupby groups *consecutive* equal keys only;
    # this assumes entries for one superfamily are adjacent in dd - confirm
    # the input ordering, otherwise dd needs sorting by key first.
    for nid, dd in sorted(nid2domains.iteritems()):
        for family, domains in itertools.groupby( dd, key = lambda x: x[0] ):
            unmerged_domains = [ (x[1],x[2]) for x in domains ]
            merged_domains = Intervals.combine( unmerged_domains )
            for start, end in merged_domains:
                counter.domains += 1
                outf.write( '%i\t%i\t%i\t%s\n' % (nid, start, end, family ) )
    outf.close()

    E.info( counter )
Ejemplo n.º 12
0
    def Finalize( self ):
        """process each sequence and fine tune domain boundaries.
        adds singletons as well.

        For every sequence (nid) the domain alignments are grouped by
        family and, optionally, overlapping intervals of one family are
        merged before being written out.  Regions covered by neither a
        domain nor (if repeat filtering is on) a repeat are emitted as
        new singleton families.
        """

        nids = self.mTableNids.GetAllNids()        

        # report table sizes before processing
        if self.mLogLevel >= 1:
            print "--> at the beginning: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> in %s: %i" % (self.mTableNameSource, len(nids))
            sys.stdout.flush()

        self.OpenOutfiles()
        nsingletons = 0

        known_families = set(self.mTableFamilies.GetAllClasses())
        
        for nid in nids:

            # repeats mask regions from singleton detection further below
            if self.mFilterRepeats:
                repeats = self.mTableDomainsCore.GetDomainBoundaries( nid )
            else:
                repeats = None
            
            domains = list(self.mTableDomains.GetDomainBoundaries(nid))
            length = self.mTableNrdb.GetLength( nid )

            # sorting the (family, from, to) tuples groups each family's
            # domains consecutively for the change-detection loop below
            domains.sort()
            all_intervalls = []
            last_family = None
            for family, domain_from, domain_to in domains:

                if last_family != family:
                    # flush the intervals collected for the previous family
                    if last_family:
                        if self.mCombineOverlaps:
                            i = Intervals.combine( family_intervalls )
                        else:
                            i = family_intervalls
                            
                        all_intervalls += i
                        self.WriteIntervals( last_family, nid, i, repeats)

                    family_intervalls = []
                    
                last_family = family
                family_intervalls.append( (domain_from, domain_to) )

            # flush the last family (the loop above only flushes on a
            # family change)
            if last_family:

                if self.mCombineOverlaps:
                    i = Intervals.combine( family_intervalls )
                else:
                    i = family_intervalls
                    
                all_intervalls += i                
                self.WriteIntervals( last_family, nid, i, repeats)

            # remove all domains that overlap with repeats by adding the repeats
            if self.mFilterRepeats:

                for rfamily, rfrom, rto in repeats:
                    all_intervalls.append( (rfrom, rto) )
                    
            # add singletons: regions of 1..length not covered by any
            # interval collected above
            i = Intervals.complement( all_intervalls, 1, length)

            if self.mLogLevel > 3:
                print "nid=%i" % nid, all_intervalls, repeats, domains, i

            for first_res, last_res in i:
                # NOTE(review): strictly greater-than, so a region of exactly
                # mMinSingletonLength is skipped - confirm this is intended
                if last_res-first_res > self.mMinSingletonLength:
                    
                    new_family = self.mTableFamilies.GetNewFamily( known_families )
                    self.WriteNewSingleton( new_family, nid, first_res, last_res )
                    nsingletons += 1
                    known_families.add( new_family )
            
        self.CloseOutfiles()

        self.Load()

        # report table sizes after loading the new assignments
        if self.mLogLevel >= 1:
            print "--> at the end: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> singletons added: %i" % nsingletons
            sys.stdout.flush()