def _buildMatchVersions(self, miRNA_seq):
    """
    Build the seed-match sequence for every entry of the module-level
    ``_seedModels`` mapping.

    Each model is indexed as ``[start, stop]`` or ``[start, stop, 'A1']``:

    * two fields  -> the match is the reverse complement (bioDefs.revComp)
      of ``miRNA_seq[start:stop]`` with every 'U' replaced by 'T'.
    * three fields -> same as above, after which the third field is unpacked
      into ``(nuc, pos)`` and ``nuc`` is written at index ``-pos`` of the
      match string.

    Returns a dict keyed by seed type. Each value is a one-element list
    holding the match string, or an empty list when the model has neither
    two nor three fields.
    """
    versions = {}

    for seedType in _seedModels:
        model = _seedModels[seedType]
        fieldCount = len(model)

        # Models of any other length contribute an empty list.
        if fieldCount not in (2, 3):
            versions[seedType] = []
            continue

        # Third field (e.g. 'A1') is the 'instructions index': nucleotide + position.
        substitution = None
        if fieldCount == 3:
            nuc, pos = model[2]
            substitution = (int(pos), nuc)

        start, stop = model[0], model[1]
        target = bioDefs.revComp(miRNA_seq[start:stop].replace('U', 'T'))

        if substitution is not None:
            offset, newNuc = substitution
            chars = list(target)      # strings are immutable; edit a list copy
            chars[-offset] = newNuc   # position is counted from the end of the match
            target = ''.join(chars)

        versions[seedType] = [target]

    return versions
def _loadSeed(self):
    """
    Populate ``self.seed`` and ``self.sourceSeqType`` from ``self.sourceSeq``.

    A source sequence of 8 characters or fewer is treated as a 'kmer' and is
    stored whole (U->T) under the key 'kmer'.  Anything longer is treated as
    a 'miRNA', and one seed per entry of ``self.seedModels`` is extracted:

    * ``[start, stop]``          -> ``sourceSeq[start:stop]`` with U->T.
    * ``[start, stop, 'A1']``    -> the same slice, with the character at
      index ``pos - 1`` replaced by ``bioDefs.revComp(nuc)`` so that a later
      reverse complement of the seed yields ``nuc`` in the match.

    Models with any other number of fields are skipped.
    """
    # Short input: a kmer is used as-is.
    if len(self.sourceSeq) <= 8:
        self.sourceSeqType = 'kmer'
        self.seed['kmer'] = self.sourceSeq.replace('U', 'T')
        return

    self.sourceSeqType = 'miRNA'
    for seedType in self.seedModels:
        model = self.seedModels[seedType]
        fieldCount = len(model)

        if fieldCount == 2:
            # No extra info: take the defined slice.
            start, stop = model[0], model[1]
            self.seed[seedType] = self.sourceSeq[start:stop].replace('U', 'T')

        elif fieldCount == 3:
            # Extra data: apply the 'instructions' field (nucleotide + position).
            nuc, pos = model[2]
            offset = int(pos)
            start, stop = model[0], model[1]
            chars = list(self.sourceSeq[start:stop].replace('U', 'T'))
            chars[offset - 1] = bioDefs.revComp(nuc)  # insert reverse-complemented nuc
            self.seed[seedType] = ''.join(chars)
def _loadmatchVersions(self, set4realMatches):
    """
    Initialise ``self.matchVersions`` for an miRNA source sequence and record
    every real seed match in ``set4realMatches``.

    set4realMatches -- a set (mutated in place) collecting real seed-match
                       sequences; it is later used as the restricted list
                       when building control sequences.

    Raises AssertionError if ``self.sourceSeq`` is not longer than 8 nt.
    """
    # The length must be tested, not the string itself: comparing str to int
    # is always True on Python 2 and a TypeError on Python 3.
    assert len(self.sourceSeq) > 8, \
        'ERROR: self.sourceSeq (%s) must be > 8 nt long.' % (self.sourceSeq)

    self.sourceSeqType = 'miRNA'

    # We use the m2_to_m8 version to represent the seedVersion set for an
    # miRNA since all versions can be constructed from this one.
    set4realMatches.add(bioDefs.revComp(self.sourceSeq[1:8].replace('U', 'T')))

    matchVersions = self._buildMatchVersions(self.sourceSeq)
    for seedType in _seedModels:
        self.matchVersions[seedType] = matchVersions[seedType]
        # Log each version of the seed match to use as restrictedList when
        # building Ctrls.  matchVersions[seedType] == ['aSeedMatchSeq'], so
        # it must be indexed to get the sequence itself.
        set4realMatches.add(matchVersions[seedType][0])
def buildCtrlsFromProSeed(self, restrictedList, numOfCtrls=15):
    """
    WARNING: This should only be called after the entire list of real seeds
    has been initialized!

    Computes ``numOfCtrls`` (default 15) permutations of the 'true' proSeed
    matching sequence (m2_to_m8) and derives matchVersions from each one as
    in the true case.  Each permuted sequence is checked against
    ``restrictedList`` to prevent using KNOWN seed matches as controls.

    Ctrl seed matches are stored in a list located at index 1 of the list
    held in ``self.matchVersions[seedType]``.  Every seedVersion of one ctrl
    set shares the same index number within those lists.

    restrictedList -- container of real seed-match sequences that must not
                      be used as controls (only membership is tested).
    numOfCtrls     -- number of control permutations to select.

    Raises AssertionError if any ``self.matchVersions`` entry is not a list
    of length 1 (ctrl seqs may already have been built).

    NOTE: a disabled assert in an earlier revision recommended using
    miRNA.buildCtrlsFromMatchVers() instead of this method.
    """
    # Check whether this has already been done; if so, complain and die.
    # Every entry is validated BEFORE anything is mutated, so a failure does
    # not leave some seed types with an extra list appended.
    for seedType in self.matchVersions:
        assert isinstance(self.matchVersions[seedType], list), \
            'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name, seedType)
        assert len(self.matchVersions[seedType]) == 1, \
            'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name, seedType)

    # Append an empty list as index 1, after the REAL matchSeq, for each version.
    for seedType in self.matchVersions:
        self.matchVersions[seedType].append([])

    matchPerms = [''.join(x) for x in
                  xpermutations.xpermutations(list(self.matchVersions['m2_to_m8'][0]))]

    # Select numOfCtrls random permutations of matchVersions['m2_to_m8'] that
    # are not in the restrictedList.
    # NOTE(review): presumably randFromList_noReplace removes the drawn item
    # from matchPerms; what happens if the candidates run out before enough
    # are accepted depends on that helper -- confirm.
    chosenPerms = []
    while len(chosenPerms) < numOfCtrls:
        permSeq = JamesDefs.randFromList_noReplace(matchPerms)
        if permSeq not in restrictedList:
            chosenPerms.append(permSeq)

    # Use each chosen sequence to generate the different matchVersions.
    for seq in chosenPerms:
        # Create a fake miRNA with the seed at positions 2-8 to feed to
        # _buildMatchVersions().
        fakeMiRNA = 'N%sNNNNNNNNNNNNN' % (bioDefs.revComp(seq))
        matchVersions = self._buildMatchVersions(fakeMiRNA)
        for seedType in self.matchVersions:
            # _buildMatchVersions returns a list of len == 1 per seed type.
            self.matchVersions[seedType][1].append(matchVersions[seedType][0])
oligoType = 'control'  # 'match' or 'control'
# Membership test: the previous form (== 'match' or 'control') was always
# truthy because of the non-empty string operand, so it could never fail.
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1
#    and then pos 2.  Adapt key to reflect which.
# 2) Convert to all uppers and convert U's to T's.
# 3) If oligoType == 'match', revComp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    miRNASeq = miRNAs[miRNA]
    pos1_seed = miRNASeq[:7].upper().replace('U', 'T')
    pos2_seed = miRNASeq[1:8].upper().replace('U', 'T')
    if oligoType == 'match':
        seeds[miRNA + '_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA + '_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA + '_ctrl_pos1'] = pos1_seed
        seeds[miRNA + '_ctrl_pos2'] = pos2_seed

# Write out seed dict as fasta.
Fasta.write(seeds, seedFile)

print("Done.")
from gusPyCode.defs.bioDefs import revComp

# Convert Aa putative TSS seqs from chromosome orientation into +1
# orientation based on each gene's orientation.  Input is read as alternating
# header / sequence lines; records whose header ends in '(1:-1)' are
# reverse-complemented and relabelled '(-1:1)', all others pass through.

inFile = '/Users/biggus/Documents/James/Data/AedesCorePromoterWork/output/Inr_DPE/Aa_Ensbl49_AaegL1.1.plus100minus100._InrDPE_.newInr.chromStrand.fa'
outFile = '/Users/biggus/Documents/James/Data/AedesCorePromoterWork/output/Inr_DPE/Aa_Ensbl49_AaegL1.1.plus100minus100._InrDPE_.newInr.geneStrand.local.fa'

# 'rU' is the Python 2 universal-newline mode this script was written for.
# The handle is closed as soon as the lines are read.
with open(inFile, 'rU') as inHandle:
    chromOriLines = [line.strip() for line in inHandle]

# Pair each header line with the sequence line that follows it; zip()
# drops a trailing unpaired line, if any.
chromOriSeqs = zip(chromOriLines[0::2], chromOriLines[1::2])

geneOriSeqs = []
for header, seq in chromOriSeqs:
    fieldsList = header.split(' ')
    if fieldsList[-1] == '(1:-1)':
        # Flip the strand label and the sequence itself.
        fieldsList[-1] = '(-1:1)'
        header = ' '.join(fieldsList)
        seq = revComp(seq)
    geneOriSeqs.append(header + '\n')
    geneOriSeqs.append(seq + '\n')

# Context manager guarantees the output is flushed and closed.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(geneOriSeqs)

print('Done.')
def _loadSeedComp(self):
    """
    Store in ``self.seedComp`` the reverse complement (bioDefs.revComp) of
    every sequence in ``self.seed``, under the same seed-type key.
    """
    self.seedComp.update(
        (seedType, bioDefs.revComp(seedSeq))
        for seedType, seedSeq in self.seed.items()
    )
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import revComp

# Report, for every kmer listed in kmerFile, which sequences of seqFile
# contain it on either strand (the kmer itself or its reverse complement).

seqFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
kmerFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.7mersInAll.tamoVers.txt'

seqs = Fasta.file2dict(seqFile)
seqNames = sorted(seqs.keys())

# One kmer per line.  'rU' is the Python 2 universal-newline mode this
# script was written for.  The handle is closed once the lines are read.
with open(kmerFile, 'rU') as kmerHandle:
    kmers = [l.strip() for l in kmerHandle]
kmers.append('AAHRRSSSSSSSSSMMMMM')

# NOTE: results is collected but not written anywhere; the code that wrote
# it to an output file was already disabled in this script.
results = []
for kmer in kmers:
    print('>%s:' % (kmer))
    results.append('%s:' % (kmer))
    kmerRevComp = revComp(kmer)  # same for every sequence, so compute once
    for seq in seqNames:
        # NOTE(review): the else is taken to belong to this if (one line of
        # output per sequence) -- confirm against the original layout.
        if kmer in seqs[seq] or kmerRevComp in seqs[seq]:
            print('\t' + seq)
            results.append(seq)
        else:
            print('---NOT FOUND---')

print('Done')