from TAMO.MotifMetrics import Fasta
from gusPyCode.defs.JamesDefs import revComp

# Report every k-mer record whose first field equals the reverse complement
# of one of the seed sequences.

#========================= User Defined Variables =========================
seedsFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/testSeeds.fas'
kMersFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/MDOSoutPut/CqAg_7mers_orthosForMDOS_500afterCoding.mdos.motifzSgte3.txt'
#==========================================================================

# Keys of `seeds` are printed as record headers below; values are the
# sequences that get upper-cased and reverse-complemented.
seeds = Fasta.file2dict(seedsFile)

# One record per line, split on tabs.  The handle is closed as soon as the
# records have been read.
with open(kMersFile, 'rU') as kMersHandle:
    kMers = [line.strip().split('\t') for line in kMersHandle]

print('seedsFile: %s\nkMersFile: %s\n' % (seedsFile, kMersFile))

# Reverse-complement each seed once up front rather than once per k-mer
# record; the seed order of seeds.keys() is kept so output order is unchanged.
seedRevComps = [(seed, revComp(seeds[seed].upper())) for seed in seeds.keys()]

for l in kMers:
    for seed, seedRevComp in seedRevComps:
        # l[0] is the first tab-separated field of the k-mer record.
        if seedRevComp == l[0]:
            print('>%s\n%s\n' % (seed, '\t'.join(l)))

print('done')
from gusPyCode.defs.JamesDefs import revComp

# Write the reverse complement of every line of inFile to outFile,
# one output line per input line.

#========================= User Defined Variables =========================
inFile  = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/MotifGroupPWMs/AllGroups/AllGroups_Aln.fwd.txt'
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/MotifGroupPWMs/AllGroups/AllGroups_Aln.rvCmp.txt'
#==========================================================================

#--------- Script Specific Function Definitions ---------------------
#--------------------------------------------------------------------

# Read and strip all input lines; the handle is closed on leaving the block.
with open(inFile, 'rU') as inHandle:
    inLines = [line.strip() for line in inHandle]

# Every line is passed to revComp as-is; no header/sequence distinction is made.
outData = [revComp(each) + "\n" for each in inLines]

# Closing the output handle explicitly guarantees the data is flushed to disk
# before "Done." is reported.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(outData)

print("Done.")
from gusPyCode.defs.JamesDefs import revComp
from sets import Set

# Collapse per-k-mer counts into one total per reverse-complement pair.

KmerCounts = {'AA':3,'TT':3,'GC':3,'CG':3,'CC':3,'TA':3,'AT':3}
allKmers = KmerCounts.keys()
#nrKmers = []

# Make a set of sets, one per reverse-complement pair.  Sets do not allow
# duplicate entries, so each pair is stored only once.  For a motif such as
# 'AT' the reverse complement IS 'AT', so its set holds the single entry 'AT';
# most motifs give a two-member set of (fwd, rvcmp).
rvCmpPairSets = Set([])
for i in allKmers:
    rvCmpPairSets.add(Set([i, revComp(i)]))

# Each pair is keyed in rvCmpPairDict by the string form of its member list
# (a set itself is not used as the key).  While building those keys, also
# record which pair key(s) every member k-mer belongs to, so the counting
# step can look the key up directly instead of eval()-ing each key string
# back into a list just to test membership.
rvCmpPairDict = {}
pairKeysByKmer = {}
for pair in rvCmpPairSets:
    pairKey = str(list(pair))
    rvCmpPairDict[pairKey] = 0
    for kmer in pair:
        pairKeysByKmer.setdefault(kmer, []).append(pairKey)

# Add each original k-mer count to the total of every pair containing it.
# Every k-mer in KmerCounts is a member of the pair built from it above, so
# the lookup cannot miss.
for key in KmerCounts:
    for pairKey in pairKeysByKmer[key]:
        rvCmpPairDict[pairKey] = rvCmpPairDict[pairKey] + KmerCounts[key]
#--------------------------------------------------------------------
# Copy inFile to outFile, extending every sequence line with 'nnn' followed
# by that line's reverse complement.  All other lines are copied unchanged.
#
# NOTE(review): revComp is expected to be in scope from the
# gusPyCode.defs.JamesDefs import earlier in this file -- confirm.
import re  # used for bp_10 below; not imported anywhere else in the visible file

#========================= User Defined Variables =========================
inFile  = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2kb_DmelMosquitoes/2KBup_CombinedDrosophilaAndCulexOrthologs.masked.fas'
outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2kb_DmelMosquitoes/2KBup_CombinedDrosophilaAndCulexOrthologs.masked.rvCmp.fas'
#==========================================================================

# Read and strip all input lines; the handle is closed on leaving the block.
with open(inFile, 'rU') as inHandle:
    inLines = [line.strip() for line in inHandle]

# A line is treated as sequence when it starts with at least 10 upper-case
# A/T/G/C/N characters.  Lines that do not match (presumably the headers of
# the .fas input) are passed through untouched.
bp_10 = re.compile('^[ATGCN]{10,}')

outLines = []
for line in inLines:
    if bp_10.search(line):
        # original sequence + 'nnn' spacer + reverse complement on one line
        outLines.append("%snnn%s\n" % (line, revComp(line)))
    else:
        outLines.append(line + '\n')

# Closing the output handle explicitly guarantees the data is flushed to disk
# before 'Done!' is reported.
with open(outFile, 'w') as outHandle:
    outHandle.writelines(outLines)

print('Done!')