def rvscomp(self):
    """Reverse-complement this object's sequence in place.

    ``self.seq`` is passed through ``rvs_comp_str`` and the result is stored
    back on ``self.seq``. Nothing is returned.
    """
    flipped = rvs_comp_str(self.seq)
    self.seq = flipped
#!/usr/bin/python
"""Print the reverse complement of a sequence.

The sequence is taken from the command-line arguments (all arguments are
concatenated with no separator) or, when no arguments are given, read in
full from standard input. Trailing newlines are stripped before the sequence
is handed to ``rvs_comp_str`` and the result is printed to standard output.
"""
import sys
import string  # no longer needed by this script; import retained as in the original
from AshworthUtil import rvs_comp_str

if len(sys.argv) > 1:
    # str.join is the portable spelling: string.join() does not exist in
    # Python 3. Joining with '' glues the arguments together unchanged.
    sequence = ''.join(sys.argv[1:])
else:
    sequence = sys.stdin.read()

# print(...) with a single argument behaves identically on Python 2 and 3.
print(rvs_comp_str(sequence.rstrip('\n')))
def makeprobes(self, fastafiles):
    """Tile probes over the regions of every sequence loaded from ``fastafiles``.

    The FASTA files are loaded into a ``FastaSeqs`` and its summary is printed.
    If ``self.regions`` is empty, a single "noid" region running from 1 to
    ``len(seq)`` is created for each sequence on both strands; otherwise any
    missing ``(sequence name, strand)`` key is filled with an empty "noid"
    region set so that the lookups further down do not fail.

    For each sequence, windows of ``self.probelength`` characters are taken
    ``self.probelength + self.gap`` apart:

    * "+" strand: regions are visited in reverse sorted order and each region
      is walked from ``region.end - self.probelength + self.posoffset`` down
      towards ``region.start``.
    * "-" strand: regions are visited in sorted order and each region is
      walked from ``region.start + self.negoffset`` upwards; the stored probe
      sequence is the reverse complement of the window.

    Windows matching the ``opt.mask`` regular expression are skipped. Walking
    a region stops as soon as the window falls outside the sequence or one of
    the ``self.probespergene`` / ``self.probesperregion`` caps (only active
    when > 0) has been reached. Each ``Probe`` created is appended to
    ``self.probes``. Nothing is returned.

    Args:
        fastafiles: passed unchanged to ``FastaSeqs.loadseqs``.
    """
    fasta = FastaSeqs()
    fasta.loadseqs(fastafiles)
    # print(...) with a single argument works on both Python 2 and Python 3.
    print(fasta.summarize())

    strands = ("+", "-")
    if self.regions == {}:
        for seq in fasta.seqs.values():
            # create region spanning whole seq
            seq_length = len(seq)
            for strand in strands:
                whole = RegionSet("noid")
                whole.add(Region(seq.name, strand, 1, seq_length))
                self.regions[(seq.name, strand)] = {"noid": whole}
    else:
        # prevent lookup errors
        for seq in fasta.seqs.values():
            for strand in strands:
                # `in` replaces dict.has_key(), which was removed in Python 3
                if (seq.name, strand) not in self.regions:
                    self.regions[(seq.name, strand)] = {"noid": RegionSet()}

    step = self.probelength + self.gap

    def window_in_sequence(cursor, seqlen):
        """True when the window starting at 0-based ``cursor`` is accepted."""
        # NOTE(review): a window ending exactly on the last character
        # (cursor + probelength == seqlen) is rejected, as in the original
        # code - confirm whether that is intended.
        return cursor >= 0 and cursor + self.probelength < seqlen

    def quota_reached(made_for_id, made_for_region):
        """True once an enabled (> 0) per-id or per-region cap has been met."""
        if self.probespergene > 0 and made_for_id >= self.probespergene:
            return True
        return self.probesperregion > 0 and made_for_region >= self.probesperregion

    def add_probe(seq, seqabbv, cursor, strand):
        """Append the probe at ``cursor`` unless masked; return 1 if added, else 0."""
        window = seq.seq[cursor:cursor + self.probelength]
        if re.search(opt.mask, window):
            # skip probes containing masked sequence
            return 0
        # note: strings 0-indexed (i), genome sequences 1-indexed (i+1)
        probe = Probe(
            start=cursor + 1,
            strand=strand,
            name="%s_%i_%s" % (seqabbv, cursor + 1, strand),
            seq=window if strand == "+" else rvs_comp_str(window),
            parent=seq.name,
        )
        self.probes.append(probe)
        return 1

    for seq in fasta.seqs.values():
        seqlen = len(seq)
        sys.stderr.write("%s\n" % seq.name)
        # probe-name prefix: first 8 characters of the name, underscores removed
        seqabbv = re.sub("_", "", seq.name[:8])

        # the probes are created starting from the 3' end of the region.
        # This makes it easy to create '3'-biased' probe sets of 'n' probes

        # forward strand: start with last region (exon), create probes from
        # 3' end until nprobes reached
        for regionset in self.regions[(seq.name, "+")].values():
            nprobes_id = 0
            for region in reversed(sorted(regionset.regions)):
                nprobes_region = 0
                cursor = region.end - self.probelength + self.posoffset
                while cursor >= region.start:
                    if not window_in_sequence(cursor, seqlen):
                        break
                    if quota_reached(nprobes_id, nprobes_region):
                        break
                    made = add_probe(seq, seqabbv, cursor, "+")
                    nprobes_id += made
                    nprobes_region += made
                    cursor -= step

        # reverse strand: walk each region upwards from its start
        for regionset in self.regions[(seq.name, "-")].values():
            nprobes_id = 0
            for region in sorted(regionset.regions):
                nprobes_region = 0
                cursor = region.start + self.negoffset
                while cursor < region.end - self.probelength:
                    if not window_in_sequence(cursor, seqlen):
                        break
                    if quota_reached(nprobes_id, nprobes_region):
                        break
                    made = add_probe(seq, seqabbv, cursor, "-")
                    nprobes_id += made
                    nprobes_region += made
                    cursor += step
def makeprobes(self,fastafiles):
    """Create probes along the regions of each sequence in ``fastafiles``.

    Loads the FASTA files with ``FastaSeqs``, prints the loader's summary,
    makes sure ``self.regions`` has an entry for every (sequence name, strand)
    pair, then appends ``Probe`` objects to ``self.probes``.

    When ``self.regions`` is empty, each sequence gets one "noid" region from
    1 to ``len(seq)`` on both strands. Otherwise only the missing
    (name, strand) keys are filled, each with an empty "noid" ``RegionSet``.

    Probes are ``self.probelength`` long and consecutive probe starts are
    ``self.probelength + self.gap`` apart. On the "+" strand regions are
    taken in reverse sorted order and walked downwards from
    ``region.end - self.probelength + self.posoffset``; on the "-" strand
    regions are taken in sorted order and walked upwards from
    ``region.start + self.negoffset``, and the probe sequence is reverse
    complemented. Windows matching ``opt.mask`` produce no probe. A region
    walk ends when the window leaves the sequence or when
    ``self.probespergene`` / ``self.probesperregion`` (ignored unless > 0)
    is reached.

    Args:
        fastafiles: handed unchanged to ``FastaSeqs.loadseqs``.

    Returns:
        None.
    """
    loaded = FastaSeqs()
    loaded.loadseqs(fastafiles)
    # Parenthesised single-argument print is valid on Python 2 and Python 3.
    print(loaded.summarize())

    if self.regions == {}:
        for seq in loaded.seqs.values():
            # create region spanning whole seq
            full_length = len(seq)
            for d in ['+', '-']:
                self.regions[(seq.name, d)] = {}
                spanning = RegionSet('noid')
                spanning.add(Region(seq.name, d, 1, full_length))
                self.regions[(seq.name, d)]['noid'] = spanning
    else:
        # prevent lookup errors
        for seq in loaded.seqs.values():
            for d in ['+', '-']:
                key = (seq.name, d)
                # membership test instead of dict.has_key() (gone in Python 3)
                if key not in self.regions:
                    self.regions[key] = {'noid': RegionSet()}

    for seq in loaded.seqs.values():
        seqlen = len(seq)
        sys.stderr.write('%s\n' % seq.name)
        # slicing already clamps to the string length, so [:8] is sufficient
        seqabbv = re.sub('_', '', seq.name[:8])

        # the probes are created starting from the 3' end of the region.
        # This makes it easy to create '3'-biased' probe sets of 'n' probes

        # forward strand
        for regionset in self.regions[(seq.name, '+')].values():
            # start with last region (exon), create probes from 3' end until
            # nprobes reached
            nprobes_id = 0
            for region in reversed(sorted(regionset.regions)):
                nprobes_region = 0
                cursor = region.end - self.probelength + self.posoffset
                # note: strings 0-indexed (i), genome sequences 1-indexed (i+1)
                while cursor >= region.start:
                    # NOTE(review): '>=' also rejects a window that ends exactly
                    # at the end of the sequence - confirm this is intended.
                    if cursor < 0 or cursor + self.probelength >= seqlen:
                        break
                    if self.probespergene > 0 and nprobes_id >= self.probespergene:
                        break
                    if self.probesperregion > 0 and nprobes_region >= self.probesperregion:
                        break
                    probeseq = seq.seq[cursor:cursor + self.probelength]
                    if not re.search(opt.mask, probeseq):
                        # only unmasked windows become probes and count
                        # towards the caps
                        probename = '%s_%i_+' % (seqabbv, cursor + 1)
                        self.probes.append(Probe(
                            start=cursor + 1,
                            strand='+',
                            name=probename,
                            seq=probeseq,
                            parent=seq.name,
                        ))
                        nprobes_id += 1
                        nprobes_region += 1
                    # advance even past masked windows so the walk terminates
                    cursor -= self.probelength + self.gap

        # reverse strand
        for regionset in self.regions[(seq.name, '-')].values():
            nprobes_id = 0
            for region in sorted(regionset.regions):
                nprobes_region = 0
                cursor = region.start + self.negoffset
                while cursor < region.end - self.probelength:
                    if cursor < 0 or cursor + self.probelength >= seqlen:
                        break
                    if self.probespergene > 0 and nprobes_id >= self.probespergene:
                        break
                    if self.probesperregion > 0 and nprobes_region >= self.probesperregion:
                        break
                    probeseq = seq.seq[cursor:cursor + self.probelength]
                    if not re.search(opt.mask, probeseq):
                        probename = '%s_%i_-' % (seqabbv, cursor + 1)
                        self.probes.append(Probe(
                            start=cursor + 1,
                            strand='-',
                            name=probename,
                            # minus-strand probes store the reverse complement
                            seq=rvs_comp_str(probeseq),
                            parent=seq.name,
                        ))
                        nprobes_id += 1
                        nprobes_region += 1
                    cursor += self.probelength + self.gap
def countsites(self, seq, length):
    """Count every window of ``length`` characters in ``seq`` on both strands.

    For each window, the entry in ``self.sites`` for the window itself and the
    entry for its reverse complement (via ``rvs_comp_str``) are each
    incremented by one. Nothing is returned.

    Args:
        seq: sequence to scan; must support ``len`` and slicing.
        length: size of each window (site).

    Note:
        ``self.sites`` is updated with ``+= 1`` and never initialised here, so
        it presumably supplies a default of 0 for unseen keys (e.g. a
        defaultdict or Counter) - confirm against the owning class.
    """
    # A sequence of N characters contains N - length + 1 full windows; the
    # "+ 1" makes sure the final window, starting at N - length, is counted.
    for i in range(len(seq) - length + 1):
        subseq = seq[i:i + length]
        self.sites[subseq] += 1
        self.sites[rvs_comp_str(subseq)] += 1
def countsites(self,seq,length):
    """Tally all sites of size ``length`` found in ``seq``, on both strands.

    Every window of ``length`` consecutive characters adds one to its own
    entry in ``self.sites`` and one to the entry of its reverse complement
    (computed with ``rvs_comp_str``). Returns None.

    Args:
        seq: sliceable sequence to scan.
        length: number of characters per site.

    Note:
        ``self.sites`` is only ever incremented here, so unseen keys
        presumably default to 0 (defaultdict/Counter-like) - verify in the
        owning class.
    """
    # The last full window starts at len(seq) - length, so that start index
    # has to be included in the scan (inclusive upper bound).
    last_start = len(seq) - length
    start = 0
    while start <= last_start:
        site = seq[start:start + length]
        self.sites[site] += 1
        self.sites[rvs_comp_str(site)] += 1
        start += 1
if not parent in genes: genes[parent] = Gene() genes[parent]['seq'] = seq genes[parent]['cds'] = [] genes[parent]['cds'].append((start, end, strand)) seqfile = sys.argv[2] src = FastaSeqs() src.loadseqs([seqfile]) outp = [] for id, gene in genes.items(): seq = gene['seq'] if not seq in src.seqs: msg('%s not found in source!' % seq) sys.exit() if 'cds' in gene: fullcds = [] rvs = False for start, end, strand in sorted(gene['cds'], key=lambda x: x[0]): ss = src.seqs[seq].seq[(start - 1):end] if strand == '-': ss = rvs_comp_str(ss) rvs = True fullcds.append(ss) if rvs: fullcds.reverse() outp.append('>%s_%s\n%s' % (id, 'cds', ''.join(fullcds))) print('\n'.join(outp))