Ejemplo n.º 1
0
def toIntronIntervals( chunk ):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an AssertionError if features from more than one
    transcript, contig or strand are submitted.

    Note that coordinates will still be forward strand coordinates.

    Returns a list of (start, end) intron intervals (the complement of
    the combined exon intervals); empty input yields an empty list.
    '''
    if not chunk:
        return []

    # fix: the set of transcript ids was previously computed but never
    # used; use it for the "one transcript only" check directly.
    transcript_ids = set(x.transcript_id for x in chunk)
    assert len(transcript_ids) == 1, "more than one transcript submitted"

    contig, strand = chunk[0].contig, chunk[0].strand
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    # merge overlapping exons, then the gaps between them are the introns
    intervals = Intervals.combine( [ (x.start, x.end) for x in chunk ] )
    return Intervals.complement( intervals )
Ejemplo n.º 2
0
Archivo: GTF.py Proyecto: yangjl/cgat
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates
    '''
    if not chunk:
        return []

    t = {x.transcript_id for x in chunk}

    first = chunk[0]
    contig = first.contig
    strand = first.strand
    transcript_id = first.transcript_id

    # every feature must agree with the first one
    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."
        assert feature.transcript_id == transcript_id, "more than one transcript submitted"

    exons = [(feature.start, feature.end) for feature in chunk]
    return Intervals.complement(Intervals.combine(exons))
Ejemplo n.º 3
0
def mapDomains( query_id, matches, map_nid2domains, new_family_id, options ):
    """map domains onto query_id.

    Domains annotated on matched database sequences (keyed by nid in
    map_nid2domains) are projected onto the query via each match's
    sbjct-to-query alignment.  Per-family intervals are then combined,
    filtered by size/coverage, reduced to a best-scoring set of
    non-overlapping domains and - if options.add_singletons is set -
    complemented with singleton domains for unannotated stretches.

    Returns a tuple (new_family_id, mapped_intervals, singletons);
    new_family_id is incremented once per singleton created.
    """

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id )
        
        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match) )
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain) )                        
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid )
                    
    mapped_domains = []
    
    # lightweight record for a domain interval projected onto the query
    class DomainMatch:
        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__ (self ):
            return "\t".join(map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    # project every domain on every matched sequence onto the query
    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains: continue

        match.buildAlignment()
        
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            
            # skip domains entirely outside the aligned sbjct region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue
            
            # presumably RIGHT/LEFT pick the nearest aligned position
            # when the exact residue is unaligned - see alignlib docs
            start = ali.mapRowToCol( domain.mStart, alignlib.RIGHT )
            end   = ali.mapRowToCol( domain.mEnd, alignlib.LEFT )
            assert start >= 0 and end <= query_length, "warning: unmapped coordinates: %i-%i" % (start,end)
            mapped_domains.append( DomainMatch(match.mPid, start, end, domain.mFamily) )

    # NOTE(review): query_length is bound only inside the loop above; if
    # matches is empty, this write and the min() below raise NameError -
    # presumably callers guarantee at least one match. TODO confirm.
    if options.loglevel >= 1:
        options.stdlog.write( "# nid=%s, length=%i, mapped domains=%i\n" % (query_id, query_length, len(mapped_domains) ) )

    last_family = None

    ## sort by matches by family (Python 2 cmp-style sort)
    mapped_domains.sort( lambda x, y: cmp( x.mFamily, y.mFamily ))

    ##########################################################
    ##########################################################
    ##########################################################
    ## combine matches from different sources

    # flush one family's intervals into all_intervals, optionally
    # combining overlaps and dropping intervals that are both short
    # and low-coverage
    def processFamily( family_id, family_intervals, all_intervals, min_length_domain, query_length ):

        if not family_intervals: return

        if options.combine_overlaps:
            i = Intervals.combine( map( lambda x: (x.mStart, x.mEnd), family_intervals) )
        else:
            i = family_intervals

        ## note: this is overall pid, not per region.
        best_pid = max( map(lambda x: x.mPid, family_intervals) )
        for start, end in i:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write("# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" % (family_id, start, end, coverage))
                continue

            all_intervals.append( DomainMatch( best_pid, start, end, family_id ) )

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min( options.min_length_domain, query_length - 10 )

    # group consecutive mapped domains by family (relies on the sort above)
    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily( last_family, family_intervals, all_intervals, min_length_domain, query_length )
            family_intervals = []

        last_family = domain.mFamily
        family_intervals.append( domain )

    # flush the final family
    processFamily( last_family, family_intervals, all_intervals, min_length_domain, query_length )
                                  
    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## pick the best domains: greedy selection by pid * length,
    ## keeping only intervals that do not overlap already-kept ones
    all_intervals.sort( lambda x, y: cmp( x.mPid * float(x.mEnd-x.mStart), 
                                          y.mPid * float(y.mEnd - y.mStart)) )
    all_intervals.reverse()
    
    new_intervals = []
    for domain in all_intervals:
        
        overlap = Intervals.calculateOverlap( map( lambda x: (x.mStart,x.mEnd), new_intervals),
                                              [(domain.mStart,domain.mEnd)] )
            
        if overlap > 0:
            continue
        
        new_intervals.append( domain )
        
    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## add singletons: uncovered stretches become new families
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement( 
            map( lambda x: (x.mStart, x.mEnd), all_intervals), 
            0, query_length)

        for first_res, last_res in all_singletons:
            if last_res-first_res > options.min_length_singletons:
                singletons.append( Domain( 0, first_res, last_res, new_family_id ) )
                new_family_id += 1
            
    return new_family_id, all_intervals, singletons
Ejemplo n.º 4
0
    def processChunk( query_id, matches ):
        """process all matches for one query sequence.

        Buckets matches by query coverage, computes overall coverage of
        the query, maps domains via mapDomains() and writes one
        tab-separated line per mapped domain/singleton to
        options.stdout.  Updates the module-level counters listed in the
        global statements below.
        """
        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid 
        global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []
        for match in matches:

            # drop matches below the percent-identity threshold
            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            ## check for full length matches
            query_coverage = 100.0 * (match.mQueryTo - match.mQueryFrom) / match.mQueryLength

            # NOTE(review): the second test is "if", not "elif", so a
            # full-length match is also appended to good/partial -
            # possibly intended to be elif. TODO confirm.
            if query_coverage >= 99.9:
                full_matches.append(match)
            if query_coverage > options.threshold_min_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1
        else:
            nskipped += 1
            return
            
        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append( (match.mQueryFrom, match.mQueryTo) )
        
        # NOTE(review): 'match' here is the loop variable leaking from
        # the loop above, i.e. the last retained match; all matches for
        # one query presumably share mQueryLength. TODO confirm.
        rest = Intervals.complement( intervals, 0, match.mQueryLength )
        
        query_coverage = 100.0 * (match.mQueryLength - sum( map( lambda x: x[1] - x[0], rest) ) ) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append( query_id )
        elif  query_coverage > options.threshold_min_query_coverage:
            well_matched.append( query_id )
        else:
            partially_matched.append( query_id )

        aggregate_coverages.append( query_coverage )
        
        new_family_id, mapped_domains, singletons = mapDomains( query_id, matches, map_nid2domains, new_family_id, options )
        
        if len(mapped_domains) > 0:
            nmapped_sequences += 1
        else:
            nmapped_empty += 1
        nmapped_domains += len(mapped_domains)

        mapped_coverage = 100.0 * sum( map( lambda x: x.mEnd - x.mStart, mapped_domains ) ) / match.mQueryLength
        mapped_coverages.append( mapped_coverage )
        
        # one output line per mapped domain, then per singleton
        for domain in mapped_domains:
            options.stdout.write( "\t".join( map( str, (query_id, domain.mStart, domain.mEnd, domain.mFamily) ) ) + "\n" )

        for domain in singletons:
            options.stdout.write( "\t".join( map( str, (query_id, domain.mStart, domain.mEnd, domain.mFamily) ) ) + "\n" )

        noutput += 1
Ejemplo n.º 5
0
    def Finalize( self ):
        """process each sequence and fine tune domain boundaries.
        adds singletons as well.

        For every nid: per-family domain intervals are (optionally)
        combined, written out via WriteIntervals(), and the uncovered
        remainder of the sequence is emitted as new singleton families
        via WriteNewSingleton().  Output files are opened/closed around
        the loop and loaded at the end.
        """

        nids = self.mTableNids.GetAllNids()        

        if self.mLogLevel >= 1:
            print "--> at the beginning: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> in %s: %i" % (self.mTableNameSource, len(nids))
            sys.stdout.flush()

        self.OpenOutfiles()
        nsingletons = 0

        known_families = set(self.mTableFamilies.GetAllClasses())
        
        for nid in nids:

            if self.mFilterRepeats:
                repeats = self.mTableDomainsCore.GetDomainBoundaries( nid )
            else:
                repeats = None
            
            domains = list(self.mTableDomains.GetDomainBoundaries(nid))
            length = self.mTableNrdb.GetLength( nid )

            # sorting groups identical families together, so the loop
            # below can flush one family at a time
            domains.sort()
            all_intervalls = []
            last_family = None
            for family, domain_from, domain_to in domains:

                if last_family != family:
                    # flush the previous family's intervals
                    if last_family:
                        if self.mCombineOverlaps:
                            i = Intervals.combine( family_intervalls )
                        else:
                            i = family_intervalls
                            
                        all_intervalls += i
                        self.WriteIntervals( last_family, nid, i, repeats)

                    family_intervalls = []
                    
                last_family = family
                family_intervalls.append( (domain_from, domain_to) )

            # flush the final family (duplicate of the block above)
            if last_family:

                if self.mCombineOverlaps:
                    i = Intervals.combine( family_intervalls )
                else:
                    i = family_intervalls
                    
                all_intervalls += i                
                self.WriteIntervals( last_family, nid, i, repeats)

            # remove all domains that overlap with repeats by adding the repeats
            if self.mFilterRepeats:

                for rfamily, rfrom, rto in repeats:
                    all_intervalls.append( (rfrom, rto) )
                    
            # add singletons: complement over [1, length] gives the
            # stretches not covered by any domain or repeat
            i = Intervals.complement( all_intervalls, 1, length)

            if self.mLogLevel > 3:
                print "nid=%i" % nid, all_intervalls, repeats, domains, i

            for first_res, last_res in i:
                if last_res-first_res > self.mMinSingletonLength:
                    
                    new_family = self.mTableFamilies.GetNewFamily( known_families )
                    self.WriteNewSingleton( new_family, nid, first_res, last_res )
                    nsingletons += 1
                    known_families.add( new_family )
            
        self.CloseOutfiles()

        self.Load()

        if self.mLogLevel >= 1:
            print "--> at the end: "
            print "--> domains: %i" % (self.mTableFamilies.RowCount())
            print "--> alignments: %i" % (self.mTableDomains.RowCount())
            print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
            print "--> singletons added: %i" % nsingletons
            sys.stdout.flush()
Ejemplo n.º 6
0
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """map domains onto query_id.

    Domains annotated on matched database sequences (keyed by nid in
    map_nid2domains) are projected onto the query via each match's
    sbjct-to-query alignment.  Per-family intervals are then combined,
    filtered by size/coverage, reduced to a best-scoring set of
    non-overlapping domains and - if options.add_singletons is set -
    complemented with singleton domains for unannotated stretches.

    Returns a tuple (new_family_id, mapped_intervals, singletons);
    new_family_id is incremented once per singleton created.
    """

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match))
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain))
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid)

    mapped_domains = []

    # lightweight record for a domain interval projected onto the query
    class DomainMatch:
        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    # project every domain on every matched sequence onto the query
    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains: continue

        match.buildAlignment()

        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:

            # skip domains entirely outside the aligned sbjct region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom(
            ):
                continue

            # presumably RIGHT/LEFT pick the nearest aligned position
            # when the exact residue is unaligned - see alignlib docs
            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, "warning: unmapped coordinates: %i-%i" % (
                start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    # NOTE(review): query_length is bound only inside the loop above; if
    # matches is empty, this write (and min() below) raises NameError -
    # presumably callers guarantee at least one match. TODO confirm.
    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    last_family = None

    ## sort by matches by family (Python 2 cmp-style sort)
    mapped_domains.sort(lambda x, y: cmp(x.mFamily, y.mFamily))

    ##########################################################
    ##########################################################
    ##########################################################
    ## combine matches from different sources

    # flush one family's intervals into all_intervals, optionally
    # combining overlaps and dropping intervals that are both short
    # and low-coverage
    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):

        if not family_intervals: return

        if options.combine_overlaps:
            i = Intervals.combine(
                map(lambda x: (x.mStart, x.mEnd), family_intervals))
        else:
            i = family_intervals

        ## note: this is overall pid, not per region.
        best_pid = max(map(lambda x: x.mPid, family_intervals))
        for start, end in i:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min(options.min_length_domain, query_length - 10)

    # group consecutive mapped domains by family (relies on the sort above)
    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily(last_family, family_intervals, all_intervals,
                          min_length_domain, query_length)
            family_intervals = []

        last_family = domain.mFamily
        family_intervals.append(domain)

    # flush the final family
    processFamily(last_family, family_intervals, all_intervals,
                  min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## pick the best domains: greedy selection by pid * length,
    ## keeping only intervals that do not overlap already-kept ones
    all_intervals.sort(lambda x, y: cmp(x.mPid * float(x.mEnd - x.mStart),
                                        y.mPid * float(y.mEnd - y.mStart)))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:

        overlap = Intervals.calculateOverlap(
            map(lambda x: (x.mStart, x.mEnd), new_intervals),
            [(domain.mStart, domain.mEnd)])

        if overlap > 0:
            continue

        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ##########################################################
    ##########################################################
    ## add singletons: uncovered stretches become new families
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            map(lambda x: (x.mStart, x.mEnd), all_intervals), 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                singletons.append(Domain(0, first_res, last_res,
                                         new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
Ejemplo n.º 7
0
    def processChunk(query_id, matches):
        """process all matches for one query sequence.

        Buckets matches by query coverage, computes overall coverage of
        the query, maps domains via mapDomains() and writes one
        tab-separated line per mapped domain/singleton to
        options.stdout.  Updates the module-level counters listed in the
        global statements below.
        """
        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid
        global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []
        for match in matches:

            # drop matches below the percent-identity threshold
            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            ## check for full length matches
            query_coverage = 100.0 * (match.mQueryTo -
                                      match.mQueryFrom) / match.mQueryLength

            # NOTE(review): the second test is "if", not "elif", so a
            # full-length match is also appended to good/partial -
            # possibly intended to be elif. TODO confirm.
            if query_coverage >= 99.9:
                full_matches.append(match)
            if query_coverage > options.threshold_min_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1
        else:
            nskipped += 1
            return

        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        # NOTE(review): 'match' here is the loop variable leaking from
        # the loop above, i.e. the last retained match; all matches for
        # one query presumably share mQueryLength. TODO confirm.
        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * (match.mQueryLength - sum(
            map(lambda x: x[1] - x[0], rest))) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_min_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        new_family_id, mapped_domains, singletons = mapDomains(
            query_id, matches, map_nid2domains, new_family_id, options)

        if len(mapped_domains) > 0:
            nmapped_sequences += 1
        else:
            nmapped_empty += 1
        nmapped_domains += len(mapped_domains)

        mapped_coverage = 100.0 * sum(
            map(lambda x: x.mEnd - x.mStart,
                mapped_domains)) / match.mQueryLength
        mapped_coverages.append(mapped_coverage)

        # one output line per mapped domain, then per singleton
        for domain in mapped_domains:
            options.stdout.write("\t".join(
                map(str, (query_id, domain.mStart, domain.mEnd,
                          domain.mFamily))) + "\n")

        for domain in singletons:
            options.stdout.write("\t".join(
                map(str, (query_id, domain.mStart, domain.mEnd,
                          domain.mFamily))) + "\n")

        noutput += 1