def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    All entries of *chunk* must share contig, strand and transcript_id;
    an AssertionError is raised otherwise (note that assert statements
    are skipped when python runs with -O).

    The (start, end) pairs of all entries are merged with
    Intervals.combine and the result of Intervals.complement on the
    merged intervals is returned. An empty *chunk* yields an empty list.

    Note that coordinates will still be forward strand coordinates
    '''
    if not chunk:
        return []

    # the first entry is the reference every other entry is compared to
    first = chunk[0]
    contig, strand, transcript_id = first.contig, first.strand, first.transcript_id

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    return Intervals.complement(intervals)
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Every entry of *chunk* has to lie on the same contig and strand and
    carry the same transcript_id. A violation raises an AssertionError
    (assert statements are not executed under python -O).

    The (start, end) pairs of the entries are merged via
    Intervals.combine; the return value is whatever
    Intervals.complement yields for the merged intervals. An empty
    *chunk* returns an empty list.

    Note that coordinates will still be forward strand coordinates
    '''
    if not chunk:
        return []

    # compare every entry against the first one
    reference = chunk[0]
    contig = reference.contig
    strand = reference.strand
    transcript_id = reference.transcript_id

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    return Intervals.complement(intervals)
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """map domains onto query_id.

    For every match whose ``mNid`` has an entry in *map_nid2domains*, the
    domains of that nid are projected onto the query through the match
    alignment (``match.mMapSbjct2Query``). The projected domains are then
    grouped by family, optionally merged (``options.combine_overlaps``),
    filtered by size, and finally reduced to a set of non-overlapping
    domains, preferring those with the largest ``pid * length``.

    If ``options.add_singletons`` is set, uncovered stretches longer than
    ``options.min_length_singletons`` are returned as singletons, each
    receiving a fresh family id starting at *new_family_id*.

    Returns a tuple ``(new_family_id, domains, singletons)`` where
    *new_family_id* is the next unused family id. If *matches* is empty
    the query length is unknown and ``(new_family_id, [], [])`` is
    returned.
    """

    class DomainMatch:
        """a domain in query coordinates together with the pid of its match."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

    if options.loglevel >= 3:
        for match in matches:
            options.stdlog.write("# match=%s\n" % str(match))
            nid = match.mNid
            if nid in map_nid2domains:
                for domain in map_nid2domains[nid]:
                    options.stdlog.write("# domain=%s\n" % str(domain))
            else:
                options.stdlog.write("# no domains for nid %s\n" % nid)

    mapped_domains = []
    # stays None if there is no match at all
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # skip domains lying outside the aligned part of the sbjct
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, "warning: unmapped coordinates: %i-%i" % (start, end)
            mapped_domains.append(DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # without a match the query length is unknown: nothing can be
        # mapped and no singletons can be derived.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ## sort by matches by family
    mapped_domains.sort(key=lambda x: x.mFamily)

    ##########################################################
    ## combine matches from different sources
    def processFamily(family_id, family_intervals, all_intervals, min_length_domain, query_length):
        """append the accepted regions of one family to all_intervals."""
        if not family_intervals:
            return

        # always work on (start, end) tuples so that the loop below can
        # unpack them whether or not overlaps are combined.
        regions = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            regions = Intervals.combine(regions)

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)

        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write("# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" %
                                         (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min(options.min_length_domain, query_length - 10)

    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily(last_family, family_intervals, all_intervals, min_length_domain, query_length)
            family_intervals = []
            last_family = domain.mFamily
        family_intervals.append(domain)

    processFamily(last_family, family_intervals, all_intervals, min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains
    # ascending stable sort followed by reverse() (rather than
    # reverse=True) keeps the order of ties as it was before.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap([(x.mStart, x.mEnd) for x in new_intervals],
                                             [(domain.mStart, domain.mEnd)])
        if overlap > 0:
            continue
        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" % (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement([(x.mStart, x.mEnd) for x in all_intervals],
                                              0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): Domain is not defined inside this function;
                # presumably a module-level class - confirm.
                singletons.append(Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
def processChunk(query_id, matches):
    """process all matches of a single query.

    Matches below ``options.threshold_min_pid`` are counted and dropped.
    The remaining ones are sorted into full / good / partial matches by
    their query coverage. The aggregate coverage of the query is
    recorded, domains are mapped onto the query with mapDomains() and
    both the mapped domains and the singletons are written to
    ``options.stdout`` as tab-separated lines.

    All counters and result lists are module-level globals.
    """
    global ninput, noutput, nskipped
    global nfull_matches, npartial_matches, ngood_matches
    global nremoved_pid
    global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

    ninput += 1

    full_matches, good_matches, partial_matches = [], [], []

    for hit in matches:
        if hit.mPid < options.threshold_min_pid:
            nremoved_pid += 1
            continue

        ## check for full length matches
        hit_coverage = 100.0 * (hit.mQueryTo - hit.mQueryFrom) / hit.mQueryLength
        if hit_coverage >= 99.9:
            full_matches.append(hit)

        # this test is independent of the one above: a full length
        # match is recorded as good or partial match as well.
        if hit_coverage > options.threshold_min_query_coverage:
            good_matches.append(hit)
        else:
            partial_matches.append(hit)

    if not (full_matches or good_matches or partial_matches):
        nskipped += 1
        return

    if full_matches:
        nfull_matches += 1
    elif good_matches:
        ngood_matches += 1
    else:
        npartial_matches += 1

    ## compute coverage of sequence with matches
    accepted = full_matches + good_matches + partial_matches
    intervals = [(x.mQueryFrom, x.mQueryTo) for x in accepted]

    # the query length is read from the last accepted match
    match = accepted[-1]
    rest = Intervals.complement(intervals, 0, match.mQueryLength)
    uncovered = sum(gap[1] - gap[0] for gap in rest)
    query_coverage = 100.0 * (match.mQueryLength - uncovered) / match.mQueryLength

    if query_coverage >= 99.9:
        fully_matched.append(query_id)
    elif query_coverage > options.threshold_min_query_coverage:
        well_matched.append(query_id)
    else:
        partially_matched.append(query_id)

    aggregate_coverages.append(query_coverage)

    # NOTE(review): the unfiltered matches are handed to mapDomains and
    # nsingletons is declared global but never updated - confirm that
    # both are intended.
    new_family_id, mapped_domains, singletons = mapDomains(
        query_id, matches, map_nid2domains, new_family_id, options)

    if len(mapped_domains) > 0:
        nmapped_sequences += 1
    else:
        nmapped_empty += 1
    nmapped_domains += len(mapped_domains)

    mapped_residues = sum(d.mEnd - d.mStart for d in mapped_domains)
    mapped_coverages.append(100.0 * mapped_residues / match.mQueryLength)

    # mapped domains first, then singletons, in the same output format
    for group in (mapped_domains, singletons):
        for domain in group:
            fields = (query_id, domain.mStart, domain.mEnd, domain.mFamily)
            options.stdout.write("\t".join(map(str, fields)) + "\n")

    noutput += 1
def Finalize(self):
    """process each sequence and fine tune domain boundaries.

    For every nid the domain boundaries are read, grouped by family,
    optionally merged (``self.mCombineOverlaps``) and written with
    ``self.WriteIntervals``. Regions longer than
    ``self.mMinSingletonLength`` that are covered neither by a domain
    nor (if ``self.mFilterRepeats`` is set) by a repeat are written as
    new singleton families.

    adds singletons as well.
    """
    nids = self.mTableNids.GetAllNids()

    if self.mLogLevel >= 1:
        print("--> at the beginning: ")
        print("--> domains: %i" % (self.mTableFamilies.RowCount()))
        print("--> alignments: %i" % (self.mTableDomains.RowCount()))
        print("--> nids: %i" % len(self.mTableDomains.GetAllNids()))
        print("--> in %s: %i" % (self.mTableNameSource, len(nids)))
        sys.stdout.flush()

    self.OpenOutfiles()
    nsingletons = 0

    known_families = set(self.mTableFamilies.GetAllClasses())

    def flushFamily(family, nid, family_intervalls, all_intervalls, repeats):
        """merge (if requested), record and write the intervals of one family."""
        if self.mCombineOverlaps:
            i = Intervals.combine(family_intervalls)
        else:
            i = family_intervalls
        all_intervalls += i
        self.WriteIntervals(family, nid, i, repeats)

    for nid in nids:

        if self.mFilterRepeats:
            repeats = self.mTableDomainsCore.GetDomainBoundaries(nid)
        else:
            repeats = None

        domains = list(self.mTableDomains.GetDomainBoundaries(nid))
        length = self.mTableNrdb.GetLength(nid)

        # sorting brings all entries of a family next to each other
        domains.sort()
        all_intervalls = []
        family_intervalls = []
        last_family = None

        for family, domain_from, domain_to in domains:
            if last_family != family:
                # compare against the None sentinel so that a falsy
                # family id (for example 0) is not silently dropped.
                if last_family is not None:
                    flushFamily(last_family, nid, family_intervalls, all_intervalls, repeats)
                family_intervalls = []
                last_family = family

            family_intervalls.append((domain_from, domain_to))

        if last_family is not None:
            flushFamily(last_family, nid, family_intervalls, all_intervalls, repeats)

        # remove all domains that overlap with repeats by adding the repeats
        if self.mFilterRepeats:
            for rfamily, rfrom, rto in repeats:
                all_intervalls.append((rfrom, rto))

        # add singletons
        i = Intervals.complement(all_intervalls, 1, length)

        if self.mLogLevel > 3:
            print("nid=%i %s %s %s %s" % (nid, all_intervalls, repeats, domains, i))

        for first_res, last_res in i:
            if last_res - first_res > self.mMinSingletonLength:
                new_family = self.mTableFamilies.GetNewFamily(known_families)
                self.WriteNewSingleton(new_family, nid, first_res, last_res)
                nsingletons += 1
                known_families.add(new_family)

    self.CloseOutfiles()
    self.Load()

    if self.mLogLevel >= 1:
        print("--> at the end: ")
        print("--> domains: %i" % (self.mTableFamilies.RowCount()))
        print("--> alignments: %i" % (self.mTableDomains.RowCount()))
        print("--> nids: %i" % len(self.mTableDomains.GetAllNids()))
        print("--> singletons added: %i" % nsingletons)
        sys.stdout.flush()
def mapDomains(query_id, matches, map_nid2domains, new_family_id, options):
    """map domains onto query_id.

    Domains annotated on the matched sequences (looked up by
    ``match.mNid`` in *map_nid2domains*) are transferred to query
    coordinates using ``match.mMapSbjct2Query``. Per family the mapped
    regions are optionally merged (``options.combine_overlaps``) and
    regions that are both shorter than the minimum domain length and
    below ``options.min_coverage`` are discarded. Of the remaining
    regions a non-overlapping subset is chosen greedily in order of
    decreasing ``pid * length``.

    With ``options.add_singletons`` the stretches not covered by a
    chosen domain and longer than ``options.min_length_singletons``
    are returned as singletons with new family ids counted up from
    *new_family_id*.

    Returns ``(new_family_id, domains, singletons)``; *new_family_id*
    is the next free family id. For empty *matches* the query length
    cannot be determined and ``(new_family_id, [], [])`` is returned.
    """

    class DomainMatch:
        """a mapped domain: pid of the match, query start/end and family."""

        def __init__(self, pid, start, end, family):
            self.mPid = pid
            self.mStart = start
            self.mEnd = end
            self.mFamily = family

        def __str__(self):
            return "\t".join(
                map(str, (self.mPid, self.mStart, self.mEnd, self.mFamily)))

    if options.loglevel >= 1:
        options.stdlog.write("# attempting to map domains for %s\n" % query_id)

    if options.loglevel >= 3:
        for match in matches:
            options.stdlog.write("# match=%s\n" % str(match))
            nid = match.mNid
            if nid in map_nid2domains:
                for domain in map_nid2domains[nid]:
                    options.stdlog.write("# domain=%s\n" % str(domain))
            else:
                options.stdlog.write("# no domains for nid %s\n" % nid)

    mapped_domains = []
    # only set once at least one match has been seen
    query_length = None

    for match in matches:
        nid = match.mNid
        query_length = match.mQueryLength
        if nid not in map_nid2domains:
            continue

        match.buildAlignment()
        ali = match.mMapSbjct2Query

        for domain in map_nid2domains[nid]:
            # domains outside of the aligned sbjct region can not be mapped
            if domain.mStart >= ali.getRowTo() or domain.mEnd < ali.getRowFrom():
                continue

            start = ali.mapRowToCol(domain.mStart, alignlib.RIGHT)
            end = ali.mapRowToCol(domain.mEnd, alignlib.LEFT)
            assert start >= 0 and end <= query_length, "warning: unmapped coordinates: %i-%i" % (
                start, end)
            mapped_domains.append(
                DomainMatch(match.mPid, start, end, domain.mFamily))

    if query_length is None:
        # no matches: the query length is unknown, so neither domains
        # nor singletons can be reported.
        return new_family_id, [], []

    if options.loglevel >= 1:
        options.stdlog.write("# nid=%s, length=%i, mapped domains=%i\n" %
                             (query_id, query_length, len(mapped_domains)))

    ## sort by matches by family
    mapped_domains.sort(key=lambda x: x.mFamily)

    ##########################################################
    ## combine matches from different sources
    def processFamily(family_id, family_intervals, all_intervals,
                      min_length_domain, query_length):
        """add the regions of one family that pass the size filter to all_intervals."""
        if not family_intervals:
            return

        # convert to (start, end) tuples first; the loop below unpacks
        # tuples in both the combined and the uncombined case.
        regions = [(x.mStart, x.mEnd) for x in family_intervals]
        if options.combine_overlaps:
            regions = Intervals.combine(regions)

        ## note: this is overall pid, not per region.
        best_pid = max(x.mPid for x in family_intervals)

        for start, end in regions:
            coverage = 100.0 * (end - start) / query_length
            if end - start < min_length_domain and coverage < options.min_coverage:
                if options.loglevel >= 3:
                    options.stdlog.write(
                        "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n" %
                        (family_id, start, end, coverage))
                continue

            all_intervals.append(DomainMatch(best_pid, start, end, family_id))

    last_family = None
    family_intervals = []
    all_intervals = []
    min_length_domain = min(options.min_length_domain, query_length - 10)

    for domain in mapped_domains:
        if last_family != domain.mFamily:
            processFamily(last_family, family_intervals, all_intervals,
                          min_length_domain, query_length)
            family_intervals = []
            last_family = domain.mFamily
        family_intervals.append(domain)

    processFamily(last_family, family_intervals, all_intervals,
                  min_length_domain, query_length)

    if options.loglevel >= 2:
        options.stdlog.write("# %s: before filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## pick the best domains
    # sort ascending and reverse afterwards instead of reverse=True:
    # this keeps the previous ordering of equally scored domains.
    all_intervals.sort(key=lambda x: x.mPid * float(x.mEnd - x.mStart))
    all_intervals.reverse()

    new_intervals = []
    for domain in all_intervals:
        overlap = Intervals.calculateOverlap(
            [(x.mStart, x.mEnd) for x in new_intervals],
            [(domain.mStart, domain.mEnd)])
        if overlap > 0:
            continue
        new_intervals.append(domain)

    all_intervals = new_intervals

    if options.loglevel >= 2:
        options.stdlog.write("# %s: after filtering: %i domains\n" %
                             (query_id, len(all_intervals)))
        for d in all_intervals:
            options.stdlog.write("# %s\n" % str(d))

    ##########################################################
    ## add singletons
    singletons = []

    if options.add_singletons:
        all_singletons = Intervals.complement(
            [(x.mStart, x.mEnd) for x in all_intervals], 0, query_length)

        for first_res, last_res in all_singletons:
            if last_res - first_res > options.min_length_singletons:
                # NOTE(review): Domain is not defined inside this function;
                # presumably a module-level class - confirm.
                singletons.append(Domain(0, first_res, last_res, new_family_id))
                new_family_id += 1

    return new_family_id, all_intervals, singletons
def processChunk(query_id, matches):
    """collect statistics for one query and output its mapped domains.

    Matches with a pid below ``options.threshold_min_pid`` are counted
    in ``nremoved_pid`` and ignored. The others are classified by query
    coverage into full, good and partial matches. If none is left the
    query is counted as skipped. Otherwise the aggregate coverage of
    the query is stored, mapDomains() is called and the resulting
    domains and singletons are written to ``options.stdout``, one
    tab-separated line per domain.

    Counters and result lists live at module level.
    """
    global ninput, noutput, nskipped
    global nfull_matches, npartial_matches, ngood_matches
    global nremoved_pid
    global new_family_id, nsingletons, nmapped_domains, nmapped_sequences, nmapped_empty

    ninput += 1

    full_matches = []
    good_matches = []
    partial_matches = []

    for candidate in matches:
        if candidate.mPid < options.threshold_min_pid:
            nremoved_pid += 1
            continue

        ## check for full length matches
        aligned = candidate.mQueryTo - candidate.mQueryFrom
        candidate_coverage = 100.0 * aligned / candidate.mQueryLength
        if candidate_coverage >= 99.9:
            full_matches.append(candidate)

        # separate test (not an elif of the one above): full length
        # matches end up in the good or partial list, too.
        if candidate_coverage > options.threshold_min_query_coverage:
            good_matches.append(candidate)
        else:
            partial_matches.append(candidate)

    if full_matches:
        nfull_matches += 1
    elif good_matches:
        ngood_matches += 1
    elif partial_matches:
        npartial_matches += 1
    else:
        nskipped += 1
        return

    ## compute coverage of sequence with matches
    kept = full_matches + good_matches + partial_matches
    intervals = [(x.mQueryFrom, x.mQueryTo) for x in kept]

    # the last kept match supplies the query length
    match = kept[-1]
    rest = Intervals.complement(intervals, 0, match.mQueryLength)
    unmatched = sum(last - first for first, last in rest)
    query_coverage = 100.0 * (match.mQueryLength - unmatched) / match.mQueryLength

    if query_coverage >= 99.9:
        fully_matched.append(query_id)
    elif query_coverage > options.threshold_min_query_coverage:
        well_matched.append(query_id)
    else:
        partially_matched.append(query_id)

    aggregate_coverages.append(query_coverage)

    # NOTE(review): mapDomains receives the unfiltered matches and
    # nsingletons is declared global but never modified here - confirm
    # that both are intended.
    new_family_id, mapped_domains, singletons = mapDomains(
        query_id, matches, map_nid2domains, new_family_id, options)

    if len(mapped_domains) > 0:
        nmapped_sequences += 1
    else:
        nmapped_empty += 1
    nmapped_domains += len(mapped_domains)

    covered = sum(d.mEnd - d.mStart for d in mapped_domains)
    mapped_coverage = 100.0 * covered / match.mQueryLength
    mapped_coverages.append(mapped_coverage)

    def writeDomain(domain):
        """write one domain as a tab-separated line to options.stdout."""
        columns = (query_id, domain.mStart, domain.mEnd, domain.mFamily)
        options.stdout.write("\t".join(map(str, columns)) + "\n")

    for domain in mapped_domains:
        writeDomain(domain)

    for domain in singletons:
        writeDomain(domain)

    noutput += 1