def toSequence(chunk, fasta):
    """Concatenate a list of gff features into a single sequence.

    All features must lie on one contig and one strand; overlapping
    regions are merged, and the pieces are emitted in transcription
    order on either strand.
    """
    if not chunk:
        return ""

    contig = chunk[0].contig
    strand = chunk[0].strand
    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    merged = Intervals.combine([(feature.start, feature.end)
                                for feature in chunk])
    lcontig = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # convert to reverse-strand coordinates and keep 5'->3' order
        merged = [(lcontig - e, lcontig - s) for s, e in reversed(merged)]

    pieces = []
    for s, e in merged:
        pieces.append(fasta.getSequence(contig, strand, s, e))
    return "".join(pieces)
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature.

    Yields each chunk from *gff_iterator* whose combined (merged)
    *feature* intervals span at least *min_length* bases.
    """
    for chunk in gff_iterator:
        merged = Intervals.combine(
            [(entry.start, entry.end)
             for entry in chunk if entry.feature == feature])
        total = sum(end - start for start, end in merged)
        if total >= min_length:
            yield chunk
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if not chunk:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand
    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    spans = Intervals.combine([(entry.start, entry.end) for entry in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # mirror intervals into reverse-strand coordinates; reversing
        # the list restores 5'->3' concatenation order
        spans = [(contig_length - end, contig_length - start)
                 for start, end in spans]
        spans.reverse()

    return "".join(fasta.getSequence(contig, strand, start, end)
                   for start, end in spans)
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature.

    A gene passes if its merged intervals of type *feature* cover at
    least *min_length* bases in total.
    """
    for gffs in gff_iterator:
        covered = Intervals.combine(
            [(g.start, g.end) for g in gffs if g.feature == feature])
        if sum(e - s for s, e in covered) >= min_length:
            yield gffs
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Arguments:
        gffs: list of gff entries.
        feature: if a string, keep only entries whose feature equals it;
            if another true value (e.g. a collection of feature names),
            keep entries whose feature is contained in it; if None/falsy,
            use all entries.

    Overlapping intervals are merged. The returned intervals are sorted.
    """
    # isinstance() replaces the type(x) == types.StringType anti-pattern;
    # it also accepts str subclasses.
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]
    return Intervals.combine([(g.start, g.end) for g in gg])
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron
    coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates.

    Returns the complement of the merged exon intervals, i.e. the
    introns.
    '''
    if len(chunk) == 0:
        return []
    contig = chunk[0].contig
    strand = chunk[0].strand
    transcript_id = chunk[0].transcript_id
    # the per-element asserts below enforce the single-transcript
    # contract; the old unused "t = set(...)" local has been removed
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, \
            "more than one transcript submitted"
    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    return Intervals.complement(intervals)
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Arguments:
        gffs: list of gff entries.
        feature: a string (match feature exactly), a true collection
            (match feature by membership), or None/falsy (take all).

    Overlapping intervals are merged. The returned intervals are sorted.
    """
    # isinstance() replaces the type(x) == types.StringType anti-pattern;
    # it also accepts str subclasses.
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]
    return Intervals.combine([(g.start, g.end) for g in gg])
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron
    coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates.

    Returns the complement of the merged exon intervals, i.e. the
    introns.
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (
        chunk[0].contig, chunk[0].strand, chunk[0].transcript_id)
    # the per-element asserts below enforce the single-transcript
    # contract; the old unused "t = set(...)" local has been removed
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, \
            "more than one transcript submitted"
    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    return Intervals.complement(intervals)
def processFamily(family_id, family_intervals, all_intervals,
                  min_length_domain, query_length):
    """Append DomainMatch records for *family_id* to *all_intervals*.

    Regions that are both shorter than *min_length_domain* and below
    options.min_coverage percent of the query are skipped.  Reads the
    module-level ``options`` for configuration and logging.
    """
    if not family_intervals:
        return

    if options.combine_overlaps:
        regions = Intervals.combine(
            [(m.mStart, m.mEnd) for m in family_intervals])
    else:
        regions = family_intervals

    # note: this is overall pid, not per region.
    best_pid = max(m.mPid for m in family_intervals)

    for start, end in regions:
        coverage = 100.0 * (end - start) / query_length
        too_small = (end - start < min_length_domain
                     and coverage < options.min_coverage)
        if too_small:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                    % (family_id, start, end, coverage))
            continue
        all_intervals.append(DomainMatch(best_pid, start, end, family_id))
def processFamily(family_id, family_intervals, all_intervals,
                  min_length_domain, query_length):
    """Collect DomainMatch entries for one family into *all_intervals*.

    A region is dropped only when it is shorter than *min_length_domain*
    AND covers less than options.min_coverage percent of the query.
    """
    if not family_intervals:
        return

    if options.combine_overlaps:
        spans = Intervals.combine(
            [(hit.mStart, hit.mEnd) for hit in family_intervals])
    else:
        spans = family_intervals

    # note: this is overall pid, not per region.
    best_pid = max(hit.mPid for hit in family_intervals)

    for start, end in spans:
        coverage = 100.0 * (end - start) / query_length
        if end - start < min_length_domain and coverage < options.min_coverage:
            if options.loglevel >= 3:
                options.stdlog.write(
                    "# ignoring domain because too small: %s:%i-%i = cov=%5.2f\n"
                    % (family_id, start, end, coverage))
        else:
            all_intervals.append(
                DomainMatch(best_pid, start, end, family_id))
def buildSCOPDomains(infiles, outfile):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90% of its
      length.
    * domains overlapping on the same sequence with the same
      superfamily classification are merged.
    '''
    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    # fasta title format: "<domain id> <scop class> <description>"
    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')

    # domain id -> (scop classification string, domain sequence length)
    id2class = {}
    with IOTools.openFile(fastafile) as inf:
        for x in FastaIterator.iterate(inf):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence))

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile(linksfile) as inf:
        nid2domains = collections.defaultdict(list)
        for line in inf:
            # skip header and comment lines
            if line.startswith('query_nid'):
                continue
            if line.startswith('#'):
                continue
            counter.links += 1

            domain_id, nid, evalue, domain_start, domain_end, \
                sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, (nid, domain_start, domain_end, sbjct_start, sbjct_end))

            family, length = id2class[domain_id]
            cls, fold, superfamily, family = family.split('.')
            if cls not in classes:
                continue
            # remove fragments: require min_coverage of the domain length
            if float(domain_end - domain_start) / length < min_coverage:
                continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append((superfamily, sbjct_start, sbjct_end))

        counter.sequences = len(nid2domains)

    E.info('merging %i domains in %i sequences' %
           (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile(outfile, 'w')
    outf.write('nid\tstart\tend\tfamily\n')
    for nid, dd in sorted(nid2domains.iteritems()):
        # BUGFIX: itertools.groupby only groups *adjacent* entries with
        # equal keys, so the domain list must be sorted by superfamily
        # first - otherwise domains of the same superfamily separated by
        # another family were never merged.
        for family, domains in itertools.groupby(sorted(dd),
                                                 key=lambda x: x[0]):
            unmerged_domains = [(x[1], x[2]) for x in domains]
            merged_domains = Intervals.combine(unmerged_domains)
            for start, end in merged_domains:
                counter.domains += 1
                outf.write('%i\t%i\t%i\t%s\n' % (nid, start, end, family))
    outf.close()

    E.info(counter)
def Finalize( self ):
    """process each sequence and fine tune domain boundaries.

    adds singletons as well.

    For every nid: groups its sorted domain assignments by family,
    optionally merges overlapping intervals per family, writes the
    per-family intervals, and finally writes any sufficiently long
    uncovered stretch as a new singleton family.
    """
    nids = self.mTableNids.GetAllNids()
    if self.mLogLevel >= 1:
        print "--> at the beginning: "
        print "--> domains: %i" % (self.mTableFamilies.RowCount())
        print "--> alignments: %i" % (self.mTableDomains.RowCount())
        print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
        print "--> in %s: %i" % (self.mTableNameSource, len(nids))
        sys.stdout.flush()
    self.OpenOutfiles()
    nsingletons = 0
    # track used family ids so newly created singleton families are unique
    known_families = set(self.mTableFamilies.GetAllClasses())
    for nid in nids:
        if self.mFilterRepeats:
            repeats = self.mTableDomainsCore.GetDomainBoundaries( nid )
        else:
            repeats = None
        domains = list(self.mTableDomains.GetDomainBoundaries(nid))
        length = self.mTableNrdb.GetLength( nid )
        # sorting by (family, from, to) makes entries of one family adjacent,
        # so the loop below can flush whenever the family changes
        domains.sort()
        all_intervalls = []
        last_family = None
        for family, domain_from, domain_to in domains:
            if last_family != family:
                if last_family:
                    # flush the previous family's accumulated intervals
                    if self.mCombineOverlaps:
                        i = Intervals.combine( family_intervalls )
                    else:
                        i = family_intervalls
                    all_intervalls += i
                    self.WriteIntervals( last_family, nid, i, repeats)
                family_intervalls = []
                last_family = family
            family_intervalls.append( (domain_from, domain_to) )
        # flush the final family (loop above only flushes on a change)
        if last_family:
            if self.mCombineOverlaps:
                i = Intervals.combine( family_intervalls )
            else:
                i = family_intervalls
            all_intervalls += i
            self.WriteIntervals( last_family, nid, i, repeats)
        # remove all domains that overlap with repeats by adding the repeats
        if self.mFilterRepeats:
            for rfamily, rfrom, rto in repeats:
                all_intervalls.append( (rfrom, rto) )
        # add singletons: every uncovered stretch longer than
        # mMinSingletonLength becomes a new family of its own
        i = Intervals.complement( all_intervalls, 1, length)
        if self.mLogLevel > 3:
            print "nid=%i" % nid, all_intervalls, repeats, domains, i
        for first_res, last_res in i:
            if last_res-first_res > self.mMinSingletonLength:
                new_family = self.mTableFamilies.GetNewFamily( known_families )
                self.WriteNewSingleton( new_family, nid, first_res, last_res )
                nsingletons += 1
                known_families.add( new_family )
    self.CloseOutfiles()
    self.Load()
    if self.mLogLevel >= 1:
        print "--> at the end: "
        print "--> domains: %i" % (self.mTableFamilies.RowCount())
        print "--> alignments: %i" % (self.mTableDomains.RowCount())
        print "--> nids: %i" % len(self.mTableDomains.GetAllNids())
        print "--> singletons added: %i" % nsingletons
        sys.stdout.flush()