Ejemplo n.º 1
0
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """Map domains annotated on matched sequences onto the query.

    For every match whose ``mNid`` has an entry in ``map_nid2domains``, the
    coordinates of each overlapping domain are translated through
    ``match.mMapSbjct2Query``.  The mapped domains are then grouped by
    family, optionally merged, filtered by size/coverage and finally reduced
    to a set of non-overlapping domains, preferring those with the largest
    ``pid * length`` score.

    Arguments
    ---------
    query_id :
        Identifier of the query; only used in log messages.
    matches :
        Sequence of match objects.  Read attributes: ``mNid``, ``mPid``,
        ``mQueryLength`` and (after ``buildAlignment()`` has been called)
        ``mMapSbjct2Query``.
    map_nid2domains :
        Mapping of nid to a collection of domain objects providing
        ``mStart``, ``mEnd`` and ``mFamily``.
    new_family_id :
        First family id to hand out to singleton regions; incremented by
        one for every singleton created.
    options :
        Object providing ``loglevel``, ``stdlog``, ``combine_overlaps``,
        ``min_coverage``, ``min_length_domain``, ``add_singletons`` and
        ``min_length_singletons``.

    Returns
    -------
    tuple
        ``(new_family_id, domains, singletons)`` where ``domains`` is the
        list of selected, non-overlapping domain matches and ``singletons``
        the list of uncovered regions (empty unless
        ``options.add_singletons`` is set).  If ``matches`` is empty the
        query length is unknown and ``(new_family_id, [], [])`` is returned.

    Raises
    ------
    AssertionError
        If a mapped domain lies outside of ``0..query_length``.
    """
    # local import keeps this function self-contained
    from itertools import groupby

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match))
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain))
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid)

    class DomainMatch:
        """A domain of family ``family`` located at ``start``-``end``."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    ##########################################################
    ## map domains via the alignment of each match
    mapped_domains = []
    # stays None if there are no matches at all
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # skip domains outside of the aligned region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, \
                "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # no matches: the query length is unknown, so there is nothing to
        # map and no singleton can be computed either.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ## sort matches by family (stable, so the input order within a
    ## family is kept)
    mapped_domains.sort(key=lambda x: x.mFamily)

    ##########################################################
    ## combine matches from different sources

    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Append the accepted regions of one family to all_intervals."""

        if not family_intervals:
            return

        # always work on (start, end) tuples so that both branches can be
        # unpacked in the loop below
        regions = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            regions = Intervals.combine(regions)

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)
        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            # a region is only dropped if it is both short and covers
            # little of the query
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    all_intervals = []
    # for short queries the minimum domain length is capped relative to
    # the query length
    min_length_domain = min(options.min_length_domain, query_length - 10)

    for family_id, members in groupby(mapped_domains, key=lambda x: x.mFamily):
        processFamily(family_id, list(members), all_intervals,
                      min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains: highest pid * length first. Sorting
    ## ascending and reversing keeps the tie order of the original
    ## cmp-based implementation.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])

        # greedy selection: drop anything overlapping a better domain
        if overlap > 0:
            continue

        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons: regions of the query not covered by any domain
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): Domain is not defined in this function; it
                # is presumably available at module level - confirm.
                singletons.append(
                    Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
Ejemplo n.º 2
0
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """Map domains annotated on matched sequences onto the query.

    Each match whose ``mNid`` is a key of ``map_nid2domains`` contributes
    its overlapping domains, with coordinates translated through
    ``match.mMapSbjct2Query``.  Mapped domains are grouped per family,
    optionally merged, filtered by size/coverage and then greedily reduced
    to non-overlapping domains in order of decreasing ``pid * length``.

    Arguments
    ---------
    query_id :
        Identifier of the query; only used in log messages.
    matches :
        Sequence of match objects.  Read attributes: ``mNid``, ``mPid``,
        ``mQueryLength`` and (after ``buildAlignment()`` has been called)
        ``mMapSbjct2Query``.
    map_nid2domains :
        Mapping of nid to a collection of domain objects providing
        ``mStart``, ``mEnd`` and ``mFamily``.
    new_family_id :
        First family id to hand out to singleton regions; incremented by
        one for every singleton created.
    options :
        Object providing ``loglevel``, ``stdlog``, ``combine_overlaps``,
        ``min_coverage``, ``min_length_domain``, ``add_singletons`` and
        ``min_length_singletons``.

    Returns
    -------
    tuple
        ``(new_family_id, domains, singletons)``: the updated family id
        counter, the selected non-overlapping domain matches and the
        uncovered regions (empty unless ``options.add_singletons`` is
        set).  Without any matches the query length is unknown and
        ``(new_family_id, [], [])`` is returned.

    Raises
    ------
    AssertionError
        If a mapped domain lies outside of ``0..query_length``.
    """
    # local import keeps this function self-contained
    from itertools import groupby

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

        if options.loglevel >= 3:
            for match in matches:
                options.stdlog.write("# match=%s\n" % str(match))
                nid = match.mNid
                if nid in map_nid2domains:
                    for domain in map_nid2domains[nid]:
                        options.stdlog.write("# domain=%s\n" % str(domain))
                else:
                    options.stdlog.write("# no domains for nid %s\n" % nid)

    class DomainMatch:
        """A domain of family ``family`` located at ``start``-``end``."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    ##########################################################
    ## map domains via the alignment of each match
    mapped_domains = []
    # remains None when the loop below never runs
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # ignore domains that do not reach into the aligned region
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, \
                "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # without matches there is no query length, hence neither domains
        # nor singletons can be reported.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ## sort matches by family; the sort is stable, so members of a family
    ## stay in input order
    mapped_domains.sort(key=lambda x: x.mFamily)

    ##########################################################
    ## combine matches from different sources

    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """Append the accepted regions of one family to all_intervals."""

        if not family_intervals:
            return

        # use (start, end) tuples in both branches; the loop below
        # unpacks them
        regions = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            regions = Intervals.combine(regions)

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)
        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            # only regions failing both the length and the coverage
            # threshold are discarded
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                        % (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    all_intervals = []
    # cap the minimum domain length for short queries
    min_length_domain = min(options.min_length_domain, query_length - 10)

    for family_id, members in groupby(mapped_domains, key=lambda x: x.mFamily):
        processFamily(family_id, list(members), all_intervals,
                      min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains: largest pid * length first. An ascending
    ## sort followed by reverse() reproduces the tie order of the former
    ## cmp-based sort.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])

        # greedy selection: skip domains overlapping an accepted one
        if overlap > 0:
            continue

        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons: stretches of the query without any domain
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): Domain is not defined in this function; it
                # is presumably provided at module level - confirm.
                singletons.append(
                    Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons