Ejemplo n.º 1
0
 def _buildMatchVersions(self, miRNA_seq):
     """
     Build the seed-match sequence for every entry of _seedModels.
     
     For each seedType the slice miRNA_seq[start:stop] named by the model is
     converted to DNA letters ('U' -> 'T') and passed through
     bioDefs.revComp().  A model carrying a third item (e.g. [0,7,'A1'])
     additionally has one character overwritten: the item is unpacked as
     (nucleotide, position) and the nucleotide is written at index
     -position of the match string.
     
     Returns a dict mapping seedType -> list holding that single match
     string.  The list stays empty for a model with neither 2 nor 3 items.
     """
     versions = {}
     for seedType in _seedModels:
         model = _seedModels[seedType]
         matches = versions[seedType] = []
         fieldCount = len(model)
         if fieldCount not in (2, 3):
             continue
         
         # A third field is the 'instruction', e.g. 'A1' -> ('A', 1).
         hasInstruction = (fieldCount == 3)
         if hasInstruction:
             nuc, pos = model[2]
             pos = int(pos)
         
         start, stop = model[0], model[1]
         match = bioDefs.revComp(miRNA_seq[start:stop].replace('U','T'))
         
         if hasInstruction:
             # Strings are immutable: edit a list copy, then re-join.
             chars = list(match)
             chars[-pos] = nuc
             match = ''.join(chars)
         
         matches.append(match)
     
     return versions
Ejemplo n.º 2
0
 def _loadSeed(self):
     """
     Fill self.seed and set self.sourceSeqType from self.sourceSeq.
     
     A source sequence of 8 characters or fewer is stored whole under the
     key 'kmer'.  A longer one is treated as a miRNA: for every entry of
     self.seedModels the slice [start:stop] is stored with 'U' replaced by
     'T'.  When the model has a third item it is unpacked as
     (nucleotide, position) and bioDefs.revComp(nucleotide) is written at
     index position-1 of that seed.
     """
     # Short input: nothing to slice, keep it as a single kmer.
     if len(self.sourceSeq) <= 8:
         self.sourceSeqType = 'kmer'
         self.seed['kmer'] = self.sourceSeq.replace('U','T')
         return
     
     self.sourceSeqType = 'miRNA'
     for seedType in self.seedModels:
         model = self.seedModels[seedType]
         fieldCount = len(model)
         
         if fieldCount == 2:
             # Plain model: the defined slice is the seed.
             start, stop = model[0], model[1]
             self.seed[seedType] = self.sourceSeq[start:stop].replace('U','T')
         
         elif fieldCount == 3:
             # Model with an 'instruction' such as 'A1'.
             nuc, pos = model[2]
             pos = int(pos)
             start, stop = model[0], model[1]
             bases = list(self.sourceSeq[start:stop].replace('U','T'))
             # Store the revComp'd nucleotide so that revComp of the whole
             # seed later yields the requested nucleotide in the match.
             bases[pos-1] = bioDefs.revComp(nuc)
             self.seed[seedType] = ''.join(bases)
Ejemplo n.º 3
0
 def _loadmatchVersions(self, set4realMatches):
     """
     Populate self.matchVersions for a miRNA and log its real seed matches.
     
     set4realMatches: a set (anything with .add()) that collects every real
         seed-match sequence; per the comments below it is later used as
         the restricted list when control sequences are built.
     
     Raises AssertionError when self.sourceSeq is not longer than 8 nt.
     """
     # The length must be compared, not the sequence itself: comparing a
     # str with an int is always True on Python 2, so the original guard
     # could never fail.
     assert len(self.sourceSeq) > 8, \
            'ERROR: self.sourceSeq (%s) must be > 8 nt long.' % (self.sourceSeq)
     self.sourceSeqType = 'miRNA'
     # We use the m2_to_m8 version to represent the seedVersion set for an
     # miRNA since all versions can be constructed from this one.
     set4realMatches.add(bioDefs.revComp(self.sourceSeq[1:8].replace('U','T')))
     
     matchVersions = self._buildMatchVersions(self.sourceSeq)
     for seedType in _seedModels:
         self.matchVersions[seedType] = matchVersions[seedType]
         # Log each version of the seed match to use as restrictedList when
         # building Ctrls.  matchVersions[seedType] is a one-element list
         # (['aSeedMatchSeq']), hence the [0].
         set4realMatches.add(matchVersions[seedType][0])
Ejemplo n.º 4
0
 def buildCtrlsFromProSeed(self, restrictedList, numOfCtrls=15):
     """
     WARNING: This should only be called after the entire list of real seeds has been initialized!
     
     Picks numOfCtrls (default 15) random permutations of the real m2_to_m8
     match sequence and derives the matchVersions for each of them exactly
     as for the real sequence.  A permutation found in restrictedList is
     skipped so that KNOWN seed matches are never used as controls.
     
     Control matches are stored in a list appended at index 1 of
     self.matchVersions[seedType]; all seedVersions belonging to one control
     share the same position in those lists.
     
     restrictedList: container of sequences that must not be used as
         controls (only tested with 'in').
     numOfCtrls: number of control sequences to generate.
     
     Raises AssertionError if any self.matchVersions[seedType] is not a
     one-element list (e.g. controls were already built).
     
     NOTE: a disabled assert previously at the top of this method advised
     using miRNA.buildCtrlsFromMatchVers() instead of this method.
     """
     # Check whether this has already been done; if so, complain and die.
     # Otherwise append an empty list at index 1, after the REAL matchSeq,
     # for each version.
     for seedType in self.matchVersions:
         assert isinstance(self.matchVersions[seedType], list), \
                'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name,seedType)
         assert len(self.matchVersions[seedType]) == 1, \
                'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name,seedType)
         self.matchVersions[seedType].append([])
     
     realMatch  = self.matchVersions['m2_to_m8'][0]
     matchPerms = [''.join(x) for x in xpermutations.xpermutations(list(realMatch))]
     
     # Select numOfCtrls random permutations of matchVersions['m2_to_m8']
     # that are not in the restrictedList.
     # NOTE(review): what happens once matchPerms runs out of candidates
     # depends on JamesDefs.randFromList_noReplace -- confirm.
     chosenPerms = []
     while len(chosenPerms) < numOfCtrls:
         permSeq = JamesDefs.randFromList_noReplace(matchPerms)
         if permSeq not in restrictedList:
             chosenPerms.append(permSeq)
     
     # Use each chosen permutation to generate the different matchVersions.
     for permSeq in chosenPerms:
         # Create a fake miRNA with the permutation's revComp at the seed
         # location to feed to _buildMatchVersions().
         fakeMiRNA = 'N%sNNNNNNNNNNNNN' % (bioDefs.revComp(permSeq))
         ctrlVersions = self._buildMatchVersions(fakeMiRNA)
         for seedType in self.matchVersions:
             # _buildMatchVersions returns one-element lists, hence the [0].
             self.matchVersions[seedType][1].append(ctrlVersions[seedType][0])
Ejemplo n.º 5
0
oligoType = 'control' # 'match' or 'control'
# `oligoType == 'match' or 'control'` is always truthy (a non-empty string
# on the right of `or`), so test membership explicitly.
if oligoType not in ('match', 'control'):
    raise ValueError('oligoType MUST be only "match" or "control".')

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1
#    and then pos 2. Adapt key to reflect which.
# 2) Convert to all uppers and convert U's to T's.
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA, miRNA_seq in miRNAs.items():
    pos1_seed = miRNA_seq[:7].upper().replace('U','T')
    pos2_seed = miRNA_seq[1:8].upper().replace('U','T')

    if oligoType == 'match':
        seeds[miRNA+'_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA+'_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        seeds[miRNA+'_ctrl_pos1'] = pos1_seed
        seeds[miRNA+'_ctrl_pos2'] = pos2_seed

# Write out seed dict as fasta.
Fasta.write(seeds,seedFile)

print("Done.")
Ejemplo n.º 6
0
from gusPyCode.defs.bioDefs import revComp

# To convert Aa putative TSS seqs into +1 Ori based on gene's Ori


chromFile = '/Users/biggus/Documents/James/Data/AedesCorePromoterWork/output/Inr_DPE/Aa_Ensbl49_AaegL1.1.plus100minus100._InrDPE_.newInr.chromStrand.fa'
outFile = '/Users/biggus/Documents/James/Data/AedesCorePromoterWork/output/Inr_DPE/Aa_Ensbl49_AaegL1.1.plus100minus100._InrDPE_.newInr.geneStrand.local.fa'
geneOriSeqs  = []

# Read every line (whitespace-stripped) and close the input right away.
with open(chromFile, 'rU') as chromHandle:
    chromLines = [line.strip() for line in chromHandle]

# Pair each even-indexed line (header) with the line that follows it
# (sequence); zip() drops a trailing unpaired line.
chromOriSeqs = zip(chromLines[::2], chromLines[1::2])

for header, seq in chromOriSeqs:
    fieldsList = header.split(' ')
    if fieldsList[-1] == '(1:-1)':
        # Flip the orientation tag and reverse-complement the sequence.
        fieldsList[-1] = '(-1:1)'
        geneOriSeqs.append(' '.join(fieldsList)+'\n')
        geneOriSeqs.append(revComp(seq)+'\n')
    else:
        # Any other record is copied through unchanged.
        geneOriSeqs.append(header+'\n')
        geneOriSeqs.append(seq+'\n')

# The context manager guarantees the output is flushed and closed.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(geneOriSeqs)

print('Done.')

Ejemplo n.º 7
0
 def _loadSeedComp(self):
     """
     Store bioDefs.revComp() of every sequence in self.seed under the same
     key in self.seedComp.
     """
     for seedType, seedSeq in self.seed.items():
         self.seedComp[seedType] = bioDefs.revComp(seedSeq)
Ejemplo n.º 8
0
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import revComp

seqFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
kmerFile    = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.7mersInAll.tamoVers.txt'
# NOTE: the disabled output path below is the same file as kmerFile, so
# re-enabling it as-is would overwrite the input.
#outFile     = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.7mersInAll.tamoVers.txt'


seqs     = Fasta.file2dict(seqFile)
seqNames = sorted(seqs)

# One kmer per line; the handle is closed as soon as the list is built.
with open(kmerFile, 'rU') as kmerHandle:
    kmers = [line.strip() for line in kmerHandle]
kmers.append('AAHRRSSSSSSSSSMMMMM')

# For every kmer, report each sequence containing it on either strand.
results = []
for kmer in kmers:
    print('>%s:' % (kmer))
    results.append('%s:' % (kmer))
    for seq in seqNames:
        # revComp is only evaluated when the forward-strand search misses.
        if kmer in seqs[seq] or revComp(kmer) in seqs[seq]:
            print('\t'+seq)
            results.append(seq)
        else:
            print('---NOT FOUND---')

# Writing the collected results to disk is currently disabled:
#outFile = open(outFile, 'w')
#for each in inAllSeqs:
    #outFile.write(each+'\n')




print('Done')