def spawnOrthoGroups(promoterFileList,nWayOrthoList):
    """Build an OrthoGroup for every complete n-way ortholog group.

    Takes promoterFileList<listOfPaths> and nWayOrthoList<listOfLists> and
    spawns the orthoGroup objects in a dictionary with keys =
    'geneName1:geneName2:etc' that will be used to run the combined
    hypergeometric analysis.

    Args:
        promoterFileList: list of promoter fasta file paths.  Every file is
            loaded with Fasta.file2dict and merged into one
            geneName -> promoter table.  An empty list is accepted and
            simply yields no promoters.
        nWayOrthoList: list of lists of gene names, one inner list per
            ortholog group.

    Returns:
        dict mapping ':'.join(<sorted gene names>) to OrthoGroup(groupDict),
        where groupDict maps each member gene name to its promoter.  Groups
        with any member that has no promoter (absent from the files, or
        present with an empty value) are skipped.

    Raises:
        AssertionError: if promoterFileList is not a list, if its first
            entry is not a string, or if a gene name is loaded more than
            once across the promoter files.

    Side effects:
        The inner list of every group that is kept is sorted in place.
    """
    # validation
    # Raised explicitly (rather than with the assert statement) so the checks
    # still run under "python -O"; the exception type is unchanged.
    if not isinstance(promoterFileList, list):
        raise AssertionError(
            '''promoterFileList must be a list of file paths. You provided type: "%s"'''
            % (type(promoterFileList),))
    if promoterFileList and not isinstance(promoterFileList[0], str):
        raise AssertionError(
            '''promoterFileList must be a list of file paths. promoterFileList[0] != type(''): "%s"'''
            % (type(promoterFileList[0]),))

    # load promoters
    allPromoters = {}
    for promoterFile in promoterFileList:
        oneGenome = Fasta.file2dict(promoterFile)
        for geneName in oneGenome:
            # Direct dict membership test: no per-gene rebuild of the key list.
            if geneName in allPromoters:
                raise AssertionError(
                    '''Detected duplicate gene name in promoterFileList! "%s"'''
                    % (geneName,))
            allPromoters[geneName] = oneGenome[geneName]

    # Build Groups
    orthoGroups = {}
    for group in nWayOrthoList:
        groupDict = {}
        for geneName in group:
            # .get() so that an unknown gene counts as a missing member
            # instead of raising KeyError; an empty promoter is also treated
            # as missing, as before.
            promoter = allPromoters.get(geneName)
            if not promoter:
                break
            groupDict[geneName] = promoter

        # we do not want orthoGroups that are missing members
        # (the length test also rejects groups that list a gene twice).
        # Skip only THIS group: a "break" here would discard every
        # remaining group as well.
        if len(groupDict) != len(group):
            continue

        group.sort()
        orthoGroups[':'.join(group)] = OrthoGroup(groupDict)
    return orthoGroups
from TAMO.seq import Fasta
from gusPyCode.defs import bioDefs

# Builds two 7-mer "seed" oligos for every mature miRNA in miRNAFile: one
# starting at position 1 and one starting at position 2 of the miRNA.
# The result is collected in the module-level dict `seeds`
# (key = miRNA name + suffix describing the oligo, value = 7-mer string).

miRNAFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.fa'
seedFile  = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa'

oligoType = 'control' # 'match' or 'control'

# NOTE: the previous test, "oligoType == 'match' or 'control'", was always
# true because the non-empty string 'control' is truthy on its own; a
# membership test is what actually validates the setting.
assert oligoType in ('match', 'control'), 'oligoType MUST be only "match" or "control".'

# Load miRNA fastas into dict.
miRNAs = Fasta.file2dict(miRNAFile)

# Create new dict for seeds.
seeds = {}

# 1) Cycle through miRNA dict taking 7mers starting at pos 1
#    and then pos2.  Adapt key to reflect which.
# 2) Convert to all uppers and convert U's to T's
# 3) If oligoType == 'match', rvcmp each 7mer and adapt key
#    to reflect which.
for miRNA in miRNAs:
    # Slices are taken on the raw sequence first, then normalised, exactly
    # as before (upper-casing and U->T do not change the length).
    pos1_seed = miRNAs[miRNA][:7].upper().replace('U','T')
    pos2_seed = miRNAs[miRNA][1:8].upper().replace('U','T')
    if oligoType == 'match':
        seeds[miRNA+'_match_pos1'] = bioDefs.revComp(pos1_seed)
        seeds[miRNA+'_match_pos2'] = bioDefs.revComp(pos2_seed)
    else:
        # 'control' oligos are the seed 7-mers themselves (no rvcmp).
        seeds[miRNA+'_ctrl_pos1'] = pos1_seed
        seeds[miRNA+'_ctrl_pos2'] = pos2_seed
from TAMO.MotifTools import top_nmers,Motif
from TAMO import MotifTools
from TAMO.seq import Fasta
from gusPyCode.defs.bioDefs import ifKmerInAll

# Collects, into the list inAllSeqs, every k-mer (taken from the per-sequence
# top_nmers output) for which ifKmerInAll(kmer, seqs, scoreThresh) is true,
# then opens the output file for writing.
# NOTE(review): Motif and MotifTools are not used in the part of the script
# visible here -- they may be needed further down.

seqFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.fas'
outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/mainTwoGenes.8mersInAll.txt'

kmerSize = 8            # length of the k-mers requested from top_nmers
scoreThresh = 0.999999  # passed through to ifKmerInAll; presumably a minimum match score -- TODO confirm

seqs = Fasta.file2dict(seqFile)

# create new dict to store the seqs' kmers
seqsKmers = {}

for i in seqs:
    seqsKmers[i] = top_nmers(kmerSize,[seqs[i]], purge_Ns = 1) # for some reason top_nmers fails silently if given str instead of list

inAllSeqs = []   # k-mers accepted so far, kept free of duplicates
count = 0
# NOTE(review): the original indentation of this loop was lost; the nesting
# of "count+=1" and "print count" below is an assumption (counter of newly
# accepted k-mers, printed as progress) -- confirm against the original file.
for seq in seqsKmers:
    for kmer in seqsKmers[seq]:
        if ifKmerInAll(kmer,seqs,scoreThresh):
            if kmer not in inAllSeqs:
                inAllSeqs.append(kmer)
                count+=1
                print count

# From here on outFile is the open file handle, no longer the path string.
outFile = open(outFile, 'w')
parser.add_option('-f', dest="make_fasta", action="store_true",default=False,
                  help="""Produce relavent fasta files too. (default=%default)""")

(opts, args) = parser.parse_args()

# --- A Little Extra Input Validation ---
# Positional arguments: args[0] = geneListFile, args[1] = fastaFile.
if len(args) < 2:
    parser.print_help()
    print('\nERROR: Both geneListFile and fastaFile are required!')
    exit(1)

# One stripped gene name per line of the gene list file.  The file is read
# inside a "with" block so the handle is closed instead of being leaked.
with open(args[0], 'rU') as geneListFile:
    geneNames = [line.strip() for line in geneListFile]

totalSeqs = Fasta.file2dict(args[1])

randClusterLists = genRandClusters(geneNames,totalSeqs,N=opts.N, keepLen=1)

# -- Make Out Folder --
mkdirp(opts.out_dir)

# Write one name file per random cluster into opts.out_dir.
for i, randCluster in enumerate(randClusterLists):
    # NOTE: the output name is derived by replacing '.txt' in the input
    # path; if args[0] contains no '.txt' the name does not vary with i and
    # every cluster is written to the same file.
    oFileName = args[0].replace('.txt','randomGeneNames_%s.txt' % (i)).split('/')[-1]
    oFilePath = '%s/%s' % (opts.out_dir,oFileName)
    # "with" guarantees the file is flushed and closed even if a write fails.
    with open(oFilePath, 'w') as oFile:
        for name in randCluster:
            oFile.write(name+'\n')

    # --- If Asked, Create Fastas ---
    if opts.make_fasta:
        # Re-read the names that were just written for this cluster.
        with open(oFilePath, 'rU') as namesFile:
            fNames = [line.strip() for line in namesFile]
from TAMO.seq import Fasta #from gusPyCode.defs.JamesDefs import revComp fFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/Fastas/Aa_500afterCoding.usuable.stpCdn.fas' sFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/miRNAs/miRBase/mature.aga.seeds.ctrl.fa' oFile = '/Users/biggus/Documents/James/Data/Tu_miRNA/SeedCountOutPut/counts/miRBaseMatureSeedsOn_Aa_500afterCoding.ctrl.txt' print 'WARNING!!\nThis script now takes the exact k-mer to be searched!!!\nGive it the "match" or the "control" specifically.\n(match is rvcmp\'d version of miRNA seed)\nIT WILL _NOT_ REVCOMP IT FOR YOU!!!!\n' # --------- Fasta Prep --------- fastas = Fasta.file2dict(fFile) seqNames = fastas.keys() seqNames.sort() # seqs are softMasked. This unMaskes them. for seq in fastas: fastas[seq] = fastas[seq].upper() # --------- Seed Prep --------- seeds = Fasta.file2dict(sFile) seedNames = seeds.keys() seedNames.sort() # to make sure we are only looking for uppercase strings for seed in seeds: seeds[seed] = seeds[seed].upper() results = ['#seqName\t'+'\t'.join(seedNames)] def findSeedsInSeq(seeds,seedNames,seqStr,seqName): '''take dict of seeds, a seq, and its name. Return tsv string of seqName followed by 0s and 1s corelating with presence