Exemple #1
0
from TAMO.MotifMetrics import Fasta
from gusPyCode.defs.JamesDefs import revComp

# Report every k-mer record whose first field equals the reverse complement
# of one of the seed sequences.
seedsFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/testSeeds.fas'
kMersFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/MDOSoutPut/CqAg_7mers_orthosForMDOS_500afterCoding.mdos.motifzSgte3.txt'

# Mapping of seed name -> seed sequence, as produced by Fasta.file2dict.
seeds = Fasta.file2dict(seedsFile)

# One record per line, split into its tab-separated fields.  The handle is
# closed as soon as the file has been read.
with open(kMersFile, 'rU') as kMersHandle:
    kMers = [line.strip().split('\t') for line in kMersHandle]

print('seedsFile: %s\nkMersFile: %s\n' % (seedsFile, kMersFile))

# Reverse-complement each (upper-cased) seed once instead of once per k-mer
# record.  Seed names are appended in seeds.keys() order so that a k-mer
# matching several seeds is reported in the same order as a nested scan.
seedsByRevComp = {}
for seed in seeds.keys():
    seedsByRevComp.setdefault(revComp(seeds[seed].upper()), []).append(seed)

for l in kMers:
    # l[0] is the first tab-separated field of the record.
    for seed in seedsByRevComp.get(l[0], ()):
        print('>%s\n%s\n' % (seed, '\t'.join(l)))


print('done')
Exemple #2
0
from gusPyCode.defs.JamesDefs import revComp


#========================= User Defined Variables =========================
inFile  = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/MotifGroupPWMs/AllGroups/AllGroups_Aln.fwd.txt'
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/MotifGroupPWMs/AllGroups/AllGroups_Aln.rvCmp.txt'

#==========================================================================

#--------- Script Specific Function Definitions ---------------------


#--------------------------------------------------------------------


# Write the reverse complement of every line of inFile to outFile, one output
# line per input line.

# Read all input lines with surrounding whitespace removed; the handle is
# closed as soon as the file has been read.
with open(inFile, 'rU') as inHandle:
    inLines = [line.strip() for line in inHandle]

# Every line is converted before the output file is opened, so a failure in
# revComp does not leave a truncated output file behind.
outData = [revComp(each) + "\n" for each in inLines]

# The with-block flushes and closes the output before "Done." is reported.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(outData)

print("Done.")
from gusPyCode.defs.JamesDefs import revComp
from sets import Set

# Merge per-k-mer counts so that a k-mer and its reverse complement share a
# single combined total in rvCmpPairDict.
KmerCounts = {'AA':3,'TT':3,'GC':3,'CG':3,'CC':3,'TA':3,'AT':3}
allKmers = KmerCounts.keys()

# Build one set per reverse-complement pair.  Sets hold no duplicates, so a
# forward k-mer and its reverse complement both produce the same set, and the
# outer set keeps only one copy of it.  A k-mer that is its own reverse
# complement (e.g. 'AT') yields a one-element set.
rvCmpPairSets = Set([])
for kmer in allKmers:
    rvCmpPairSets.add(Set([kmer, revComp(kmer)]))

# Key each pair by the string form of its member list and start its total at
# zero.  While the member list is at hand, also record which pair key(s) each
# member belongs to, so the totals can be filled in without eval()-ing the
# string keys back into lists.
rvCmpPairDict = {}
pairKeysByKmer = {}
for pair in rvCmpPairSets:
    members = list(pair)
    pairKey = str(members)
    rvCmpPairDict[pairKey] = 0
    for member in members:
        pairKeysByKmer.setdefault(member, []).append(pairKey)

# Add each k-mer's count to every pair it is a member of.  Every key of
# KmerCounts is in pairKeysByKmer because each one seeded a pair above.
for key in KmerCounts:
    for pairKey in pairKeysByKmer[key]:
        rvCmpPairDict[pairKey] += KmerCounts[key]
#--------------------------------------------------------------------



import re  # needed for bp_10 below; harmless if the file already imports it

#========================= User Defined Variables =========================
inFile =  '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2kb_DmelMosquitoes/2KBup_CombinedDrosophilaAndCulexOrthologs.masked.fas'
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2kb_DmelMosquitoes/2KBup_CombinedDrosophilaAndCulexOrthologs.masked.rvCmp.fas'

#==========================================================================


# Copy inFile to outFile; every line recognised as sequence gets 'nnn' and
# its own reverse complement appended, all other lines are copied unchanged.

# Read all input lines with surrounding whitespace removed; the handle is
# closed as soon as the file has been read.
with open(inFile, 'rU') as inHandle:
    inLines = [line.strip() for line in inHandle]

# A line counts as sequence when it starts with at least ten characters drawn
# from the upper-case letters A, T, G, C and N.
bp_10 = re.compile('^[ATGCN]{10,}')

outLines = []
for line in inLines:
    if bp_10.search(line):
        outLines.append("%snnn%s\n" % (line, revComp(line)))
    else:
        outLines.append(line + '\n')

# The with-block flushes and closes the output before 'Done!' is reported.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(outLines)

print('Done!')