def countModuleInAll(module, seqDict):
    """Convert a module of IUPAC motifs in place and total its hits over seqDict.

    module  -- list of IUPAC motif strings.  It is modified IN PLACE: every
               entry ``motif`` becomes ``[motif, regExObj]`` where regExObj is
               whatever makeFwdAndRevCompRegExObj() returns for the list
               ``[iupac2regex(motif), iupac2regex(revComp(motif))]``.
    seqDict -- dict of sequence records.  Every record gets its
               ``modulePresent`` attribute reset to 0 before it is searched.

    Returns the sum of the values returned by
    findModuleInLength(module, record, 500) over all records in seqDict
    (0 when seqDict is empty).
    """
    # NOTE(review): makeFwdAndRevCompRegExObj(), as defined in this file, takes
    # a single IUPAC motif string and does the revComp/iupac2regex conversion
    # itself.  Here it is handed a list of two already-converted regex strings;
    # that call is kept exactly as it was -- confirm it is intended.
    for i, IUPACmotif in enumerate(module):
        regExStrs = [JamesDefs.iupac2regex(m)
                     for m in (IUPACmotif, JamesDefs.revComp(IUPACmotif))]
        module[i] = [IUPACmotif, makeFwdAndRevCompRegExObj(regExStrs)]

    # Sum the per-record results; modulePresent is reset first so that stale
    # values from an earlier module do not leak into this search.
    totalHits = 0
    for record in seqDict:
        seqDict[record].modulePresent = 0
        # 500 is the third argument expected by findModuleInLength
        # (presumably a window length in bp -- TODO confirm).
        totalHits += findModuleInLength(module, seqDict[record], 500)
    return totalHits
def revCmp(self, toString=0):
    """Return the reverse complement of this sequence.

    toString -- 0 (default): return a new object of the same class built from
                the reverse-complemented string;
                1: return the reverse-complemented string itself.

    Raises AssertionError if toString is neither 0 nor 1.
    """
    # `toString == 0 or 1` parses as `(toString == 0) or 1`, which is always
    # true and therefore validates nothing; test membership instead.
    assert toString in (0, 1), \
        'toString must be 0 (new object) or 1 (string), got %r' % (toString,)
    rvCmpStr = JamesDefs.revComp(self.toString())
    if toString == 1:
        return rvCmpStr
    return self.__class__(rvCmpStr)
def buildCtrlsFromMatchVers(self, restrictedList, numOfCtrls=15):
    """
    WARNING: This should only be called after the entire list of real seeds
    has been initialized!

    For every seedType in self.matchVersions, draws numOfCtrls (default 15)
    permutations of the real match sequence, skipping any permutation found in
    restrictedList so that KNOWN seed matches are never used as controls.

    The control sequences are stored in a new list appended at index 1 of
    self.matchVersions[seedType] (index 0 holds the real match sequence).
    Returns None.
    """
    # Guard against a second call: every entry must still be a one-element
    # list.  If so, add the (empty) container for its control sequences.
    for seedType in self.matchVersions:
        versions = self.matchVersions[seedType]
        assert type(versions) == type([]), \
            'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name,seedType)
        assert len(versions) == 1, \
            'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name,seedType)
        versions.append([])

    # Permute and screen each seed version.
    for seedType in self.matchVersions:
        modelLen = len(_seedModels[seedType])
        realSeq = self.matchVersions[seedType][0]
        ctrls = self.matchVersions[seedType][1]

        if modelLen == 2:
            # Model without a third field (e.g. [0,7]): permute the whole
            # match sequence.
            matchPermList = [''.join(perm)
                             for perm in xpermutations.xpermutations(list(realSeq))]
            # presumably randFromList_noReplace removes the item it returns
            # from matchPermList -- verify against JamesDefs
            while len(ctrls) < numOfCtrls:
                permSeq = JamesDefs.randFromList_noReplace(matchPermList)
                if permSeq not in restrictedList:
                    ctrls.append(permSeq)

        elif modelLen == 3:
            # Model with an instruction field (e.g. [0,7,'A1']): one nucleotide
            # is fixed, so only the remaining positions are permuted.
            nuc, pos = list(_seedModels[seedType][2])
            # pos stays 1-registered because it is used as a NEGATIVE index
            # (we are dealing with the rvCmp of the miRNA), so pos == 1 means
            # the LAST position of the match sequence.
            pos = int(pos)
            seq2Perm = list(realSeq)
            del seq2Perm[-pos]
            matchPermList = list(xpermutations.xpermutations(seq2Perm))
            while len(ctrls) < numOfCtrls:
                permSeq = JamesDefs.randFromList_noReplace(matchPermList)
                # Put the fixed nucleotide back where it was removed.
                if pos > 1:
                    permSeq.insert(-pos+1, nuc)
                else:
                    permSeq.append(nuc)
                permSeq = ''.join(permSeq)
                if permSeq not in restrictedList:
                    ctrls.append(permSeq)
def makeFwdAndRevCompRegExObj(motif):
    """Compile one case-insensitive regex matching motif in either orientation.

    motif -- IUPAC motif string.

    The motif and its reverse complement (JamesDefs.revComp) are each
    translated with JamesDefs.iupac2regex and joined as the alternation
    '(fwd|rev)'.  Returns the compiled regular-expression object.
    """
    import re
    revCompMotif = JamesDefs.revComp(motif)
    fwdRegExStr = JamesDefs.iupac2regex(motif)
    revRegExStr = JamesDefs.iupac2regex(revCompMotif)
    return re.compile('(%s|%s)' % (fwdRegExStr, revRegExStr), re.IGNORECASE)
def findAllMotifs_SameLine(motifList, seqName, dictOfFastas, resultList):
    """Append one tab-delimited hit line per motif to resultList.

    motifList    -- IUPAC motif strings to search for.
    seqName      -- key of the sequence in dictOfFastas.
    dictOfFastas -- dict whose values expose ``.seq.tostring()``.
    resultList   -- list that receives the output lines (modified in place).

    Each appended line is ``seqName<TAB>motif[<TAB>start ...]<NEWLINE>``.
    The starts are the 1-based start positions of every forward AND every
    reverse-complement match, merged and sorted ascending, so the orientation
    of a hit is not recorded.  Both patterns are searched independently and
    the results are not de-duplicated.  Matching is case-insensitive.
    Returns None.
    """
    if not motifList:
        return
    # The sequence does not change between motifs: convert it once instead of
    # twice per motif.
    seqStr = dictOfFastas[seqName].seq.tostring()

    for motif in motifList:
        fwd_RegExMotif = re.compile(JamesDefs.iupac2regex(motif), re.IGNORECASE)
        # +1 converts the 0-based regex offset to a 1-based coordinate.
        matchLocations = [hit.start()+1 for hit in fwd_RegExMotif.finditer(seqStr)]

        rev_RegExMotif = re.compile(JamesDefs.iupac2regex(JamesDefs.revComp(motif)),
                                    re.IGNORECASE)
        matchLocations.extend([hit.start()+1 for hit in rev_RegExMotif.finditer(seqStr)])

        matchLocations.sort()
        fields = [seqName, motif] + ['%i' % loc for loc in matchLocations]
        resultList.append('\t'.join(fields) + '\n')
def findAllMotifs(motifList, seqName, dictOfFastas, resultList):
    """Append two tab-delimited hit lines per motif to resultList.

    motifList    -- IUPAC motif strings to search for.
    seqName      -- key of the sequence in dictOfFastas.
    dictOfFastas -- dict whose values expose ``.seq.tostring()``.
    resultList   -- list that receives the output lines (modified in place).

    For every motif, first a forward line
        ``seqName<TAB>motif<TAB>[start<TAB> ...]<NEWLINE>``
    and then a reverse-complement line
        ``seqName<TAB>motif_rc<TAB>[start<TAB> ...]<NEWLINE>``
    is appended.  Starts are 1-based and appear in match order; every start is
    followed by a tab, so lines with hits end in ``<TAB><NEWLINE>``.
    Matching is case-insensitive.  Returns None.
    """
    if not motifList:
        return
    # The sequence does not change between motifs: convert it once instead of
    # twice per motif.
    seqStr = dictOfFastas[seqName].seq.tostring()

    def hitLine(label, iupacMotif):
        # Build one output line for the given pattern; +1 converts the 0-based
        # regex offset to a 1-based coordinate.
        regExObj = re.compile(JamesDefs.iupac2regex(iupacMotif), re.IGNORECASE)
        coords = ''.join(['%s\t' % (str(hit.start()+1))
                          for hit in regExObj.finditer(seqStr)])
        return seqName+'\t'+label+'\t'+coords+'\n'

    for motif in motifList:
        resultList.append(hitLine(motif, motif))
        resultList.append(hitLine(motif+'_rc', JamesDefs.revComp(motif)))
def makeFwdAndRevCompRegExObj_IUPAC(motif, equals=0):
    """Compile a case-insensitive regex for motif or its reverse complement.

    motif  -- motif string.  It is inserted into the pattern as-is; unlike
              makeFwdAndRevCompRegExObj() no JamesDefs.iupac2regex()
              translation is applied here.
    equals -- 0 (default): the target only has to CONTAIN the motif,
                 pattern '(fwd|rev)';
              1: the target has to BE the motif, pattern '^(fwd|rev)$';
              any other value: the bare motif is compiled unchanged.

    Returns the compiled regular-expression object.
    """
    import re
    fwdOrRev = '(%s|%s)' % (motif, JamesDefs.revComp(motif))
    if equals == 1:
        # The end-of-string anchor is '$'.  The previous '&' was a literal
        # ampersand, so the exact-match pattern could never match a motif.
        pattern = '^%s$' % (fwdOrRev)
    elif equals == 0:
        pattern = fwdOrRev
    else:
        pattern = motif
    return re.compile(pattern, re.IGNORECASE)
def buildCtrlsFromProSeed(self, restrictedList, numOfCtrls=15):
    """
    WARNING: This should only be called after the entire list of real seeds
    has been initialized!

    Draws numOfCtrls (default 15) permutations of the 'true' proSeed matching
    sequence (self.matchVersions['m2_to_m8'][0]) and derives the match
    versions of each one exactly as in the true case.  A permutation is
    skipped when it is found in restrictedList, to prevent using KNOWN seed
    matches for controls.

    Ctrl seed matches are stored in a list located at index 1 of
    self.matchVersions[seedType].  All seed versions derived from the same
    permutation share the same index in those lists.  Returns None.

    NOTE: a disabled guard in this method used to point callers to
    miRNA.buildCtrlsFromMatchVers() as the preferred way to build controls.
    """
    # Guard against a second call: every entry must still be a one-element
    # list.  If so, add the (empty) container for its control sequences.
    for seedType in self.matchVersions:
        assert type(self.matchVersions[seedType]) == type([]), \
            'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name,seedType)
        assert len(self.matchVersions[seedType]) == 1, \
            'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name,seedType)
        self.matchVersions[seedType].append([])

    matchPerms = [''.join(x)
                  for x in xpermutations.xpermutations(list(self.matchVersions['m2_to_m8'][0]))]

    # Select numOfCtrls random permutations that are not in restrictedList.
    # presumably randFromList_noReplace removes the item it returns from
    # matchPerms -- verify against JamesDefs
    chosenPerms = []
    while len(chosenPerms) < numOfCtrls:
        permSeq = JamesDefs.randFromList_noReplace(matchPerms)
        if permSeq not in restrictedList:
            chosenPerms.append(permSeq)

    # Use each chosen permutation to generate the different match versions.
    for seq in chosenPerms:
        # Fake miRNA carrying the reverse complement of the permutation right
        # after its first base, padded with N, to feed _buildMatchVersions().
        fakeMiRNA = 'N%sNNNNNNNNNNNNN' % (bioDefs.revComp(seq))
        matchVersions = self._buildMatchVersions(fakeMiRNA)
        for seedType in self.matchVersions:
            # index [0] because _buildMatchVersions returns lists of len 1
            self.matchVersions[seedType][1].append(matchVersions[seedType][0])
#========================= User Defined Variables ========================= # InFiles: motifList = map(string.strip, open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/upstream_exclsv-conserved_mosquito-motifs_nr.txt', 'r')) goodAGAPs = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas' # OutFile: outFile = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/upstream_exclsv-conserved_mosquito-motifs_nr.smLine.map' #========================================================================== t1 = time() # Create dict of fasta Objects from those in goodAGAPs dictOfFastas = JamesDefs.fastaFileToBioSeqDict(goodAGAPs, Alphabet='IUPACAmbiguousDNA') # Create resultList to receive outPut resultList = [] c = 0 for seqName in dictOfFastas: findAllMotifs_SameLine(motifList, seqName, dictOfFastas, resultList) lenResultList = len(resultList) c+=1 print 'SeqName:%s itemNumber: %i' % (seqName,c) outFile= open(outFile, 'w') outFile.writelines(resultList)
def convertMotifList(motifList):
    """Expand every IUPAC motif of motifList, in place, into a pair.

    Each entry ``motif`` is replaced by
    ``[motif, JamesDefs.iupac2regex(motif)]``.  The list object itself is
    modified, so callers see the change; nothing is returned.
    """
    for idx, iupacMotif in enumerate(motifList):
        motifList[idx] = [iupacMotif, JamesDefs.iupac2regex(iupacMotif)]
t1 = time() # Populate a Dict with Seq objs for Anopheles boundary seqs # What follows directly is a klugde to get my seqDict vals to have the IUPAC ambiguous alphabet boundarySeqs = list(SeqIO.parse(open(boundarySeqs, "rU"), "fasta")) for record in boundarySeqs : record.seq.alphabet = IUPACAmbiguousDNA boundarySeqs = SeqIO.to_dict(boundarySeqs, key_function = lambda rec : rec.description.split()[0]) # convert iupac motifs to regexs and creat list of lists with each motif represented as ['IUPAC', 'REGEX'] convertMotifList(motifList) # group ClusterDefs by ClusterName clusterDefinitionList = JamesDefs.groupByField(clusterDefinitionList, 0) # This will become a list of tab delim'd params for the hyperGeo func: 'Motif:ClusterID';'motifCountInAll';'len(all)';'motifCountInCluster';'numOfSeqsInCluster' hyperGeoParams_4_motifClusterPairs = [] m=0 for motif in motifList: m+=1 print 'Motif '+str(m) # Count how many seq in total list have motif in either orientation motifCountInAll = None motifCountInAll = countMotifInAll(motif[1], boundarySeqs) for cluster in clusterDefinitionList:
def ensmblTx2BED(ensemblPath,BEDoutPath):
    """Converts an Ensembl exon table to a BED file of transcripts.

    ensemblPath -- path of the input table; its first line is treated as a
                   header and discarded.
    BEDoutPath  -- path of the BED file to write (overwritten).

    Expected input columns (0-based index : name):
         0 Ensembl Gene ID          8 Transcript count
         1 Ensembl Transcript ID    9 Ensembl Exon ID
         2 Chromosome/plasmid      10 Exon Chr Start (bp)
         3 Gene Start (bp)         11 Exon Chr End (bp)
         4 Gene End (bp)           12 Exon Rank in Transcript
         5 Transcript Start (bp)   13 phase
         6 Transcript End (bp)     14 Constitutive Exon
         7 Strand                  15 Biotype

    One 12-field BED line is written per transcript (records are grouped on
    column 1), preceded by a single track line.  Returns None.
    """
    # +++++ func specific Defs +++++
    def getBlockSizes(tx):
        # Exon lengths; start and end are both inclusive, hence the +1.
        return ','.join([str(int(exn[11])-int(exn[10])+1) for exn in tx])

    def getBlockStarts(tx,chrmStart):
        # 0-based exon starts relative to the (0-based) transcript start.
        return ','.join([str(int(exn[10])-1-int(chrmStart)) for exn in tx])

    # +++++ initialize ensembl data +++++
    ensemblFile = open(ensemblPath, 'rU')
    try:
        txList = [l.strip('\n') for l in ensemblFile]
    finally:
        ensemblFile.close()
    txList.pop(0)  # drop the column-title line
    # presumably groupByField_silent also splits every record into its fields
    # (records are indexed by column below) -- verify against JamesDefs
    txList = JamesDefs.groupByField_silent(txList,1)

    # +++++ prepare destination file +++++
    bedFile = open(BEDoutPath,'w')
    try:
        bedFile.write('track name="Ensembl Aa Tx Definitions" description="From %s" useScore=0\n' % (ensemblPath))

        # +++++ loop through the Txs +++++
        for tx in txList:
            # --- sort tx based on lowest coords of each exon ---
            tx.sort(key=lambda x: int(x[10]))

            chrm      = tx[0][2]
            chrmStart = str(int(tx[0][5])-1)  # 1-based start -> 0-based start
            chrmEnd   = tx[0][6]
            name      = tx[0][1]
            score     = '0'
            # NOTE(review): the strand column is written unchanged; confirm
            # the input already uses '+'/'-' as the BED format expects.
            strand    = tx[0][7]
            thkStart  = chrmStart
            thkEnd    = chrmEnd
            rgb       = '0'
            blkCount  = str(len(tx))
            blkSizes  = getBlockSizes(tx)
            blkStarts = getBlockStarts(tx,chrmStart)

            # --- write out line ---
            bedFile.write('%s\n' % ('\t'.join([chrm, chrmStart, chrmEnd, name,
                                               score, strand, thkStart, thkEnd,
                                               rgb, blkCount, blkSizes, blkStarts])))
    finally:
        # Close explicitly so the output is flushed even if a record is bad.
        bedFile.close()
# open and create handle for outFile resFile = open(outFile, 'w') tick = time.clock() # read file into list conflictList = conflictFile.readlines() # remove trailing '\n' from every record LEN_cL = len(conflictList) i = 0 while i < LEN_cL: conflictList[i] = conflictList[i].rstrip('\n') i = i + 1 # group file by target gene id using groupByField fjoinOutByGeneIDList = JamesDefs.groupByField(conflictList, 1) resolverArgs = { 'strandField' : 4, 'lowerBoundProximal' : 10, 'higherBoundProximal' : 11, 'conflictRegionStrt' : 18, 'conflictRegionEnd' : 19, 'whichBoundary':'upStream' } resolvedBoundariesList = resolver(fjoinOutByGeneIDList, resolverArgs) resFile.writelines(resolvedBoundariesList) tock = time.clock()
else: print "WARNING: boundaryRegion variable should only be 'up' or 'down'.\nScript exiting." sys.exit() #-------------------------------------------------- # Strip trailing newlines codingBoundsList = map(string.strip, codingBoundsList) resolvedConflictsList = map(string.strip, resolvedConflictsList) # Convert these into lists of lists so that field vals can be interrogated and copied # Explode tab delimited strings of each record into list of values JamesDefs.explodeDelimitedList(codingBoundsList, '\t') JamesDefs.explodeDelimitedList(resolvedConflictsList, '\t') len_codingBoundsList = len(codingBoundsList) len_resolvedConflictsList = len(resolvedConflictsList) # Populate unUsableList unUsableGeneNames = [] i = 0 while i < len_resolvedConflictsList: if int(resolvedConflictsList[i][5]) < shortestUsableBdryReg: unUseableGene = resolvedConflictsList.pop(i)
# Containers filled while the seedMatches objects are built.  saveObj bundles
# them under string keys; it is not used further in the code visible here.
seenSeeds = set()
miR_matches = {}
saveObj = {"orthoRelations": orthoRelations, "seenSeeds": seenSeeds, "miR_matches": miR_matches}

print "Initializing matchVersions..."
# One seedMatches object per miRNA, stored under the object's own name.
for m in miRNAs:
    seed = miTrgt.seedMatches(miRNAs[m], seenSeeds, orthoRelations=orthoRelations, name=m)
    miR_matches[seed.name] = seed

print "Initializing ctrls..."
# choose one rand miRNA to make ctrls
# NOTE(review): the key is drawn from miRNAs but looked up in miR_matches,
# which is keyed by seed.name -- this assumes seed.name == m; confirm.
randMiRNA = JamesDefs.randFromList_noRplcMulti(miRNAs.keys(), 1)[0]
miR_matches[randMiRNA].buildCtrlsFromMatchVers(seenSeeds, 30)
# randMiRNA is rebound from the key to the seedMatches object itself.
randMiRNA = miR_matches[randMiRNA]

# for m in miR_matches:
#     print m.name
#     for sVer in m.matchVersions:
#         print '%s: %s' % (sVer, m.matchVersions[sVer])
#     print '- '*5

print "Tallying hits..."
randMiRNA.tallyHits(orthoSeqs)

print "Counting hits in orthos..."
randMiRNA.countHitsInOrthos()
def countHitsInOrthos4(self,genomeToken,returnGenes=True): """ Uses results of miRNA.tallyHits() and self.orthos to count how many genes the miRNA seed hits in at least one genome, in at least two orthologs, and in all three orthologs. If returnGenes: returns tuple of two dicts: matchDict(keys=seedType : vals=[None,genesWithMatch,genePairsWithMatch,geneTriplesWithMatch]) ctrlDict(keys=seedType : vals=[[],genesWithMatch_1,genePairsWithMatch_1,geneTriplesWithMatch_1], [],genesWithMatch_2,genePairsWithMatch_2,geneTriplesWithMatch_2], ...]) """ # make sure we have tallied the hits already. assert self.matchData and self.ctrlData, \ 'ERROR: It looks like we have not tallied the hits yet. Call miRNA.tallyHits() first.' if returnGenes: rGeneNames = {} rCtrlNames = {} for seedType in _seedModels: rGeneNames[seedType] = [None,[],[],[]] rCtrlNames[seedType] = JamesDefs.initList(len(self.matchVersions[seedType][1]),[None,[],[],[]]) # Initialize self.matchCounts/self.ctrlCounts for seedType in _seedModels: self.matchCounts[seedType] = [0,0,0,0] self.ctrlCounts[seedType] = [[0]*4 for i in range(len(self.matchVersions[seedType][1]))] # Cycle through self.orthos for orthoSet in self.orthos: assert len(orthoSet) == 3,\ 'ERROR: It seems len(%s) != 3.' 
# Query the matcheData and ctrlData for hits in orthoSet for seedType in _seedModels: genesInMatchD = 0 genesInCtrlD = [0]*len(self.matchVersions[seedType][1]) if returnGenes: geneNames = [] ctrlNames = JamesDefs.initList(len(self.matchVersions[seedType][1]),[]) # Count how many genes in each orthoSet were hit by the respective seedTypes for gene in orthoSet: if gene in self.matchData[seedType]: genesInMatchD += 1 if returnGenes: geneNames.append(gene) for i in range(len(self.ctrlData[seedType])): if gene in self.ctrlData[seedType][i]: genesInCtrlD[i] += 1 if returnGenes: ctrlNames[i].append(gene) # Update self.matchData based on how many hits the orthoSet got for seedType if genesInMatchD == 0: ##self.matchCounts[seedType][0] += 3 pass elif genesInMatchD == 1: if ''.join(geneNames).find(genomeToken) != -1: ##self.matchCounts[seedType][0] += 2 self.matchCounts[seedType][1] += 1 if returnGenes: rGeneNames[seedType][1].extend(geneNames) elif genesInMatchD == 2: if ''.join(geneNames).find(genomeToken) != -1: ##self.matchCounts[seedType][0] += 1 ##self.matchCounts[seedType][1] += 1 # self.matchCounts[seedType][1] += 2 ##self.matchCounts[seedType][2] += 1 if returnGenes: rGeneNames[seedType][1].extend([x for x in geneNames if x.find(genomeToken) != -1]) rGeneNames[seedType][2].append(tuple(sorted(geneNames))) elif genesInMatchD == 3: if ''.join(geneNames).find(genomeToken) != -1: ##self.matchCounts[seedType][1] += 1 # self.matchCounts[seedType][1] += 3 ##self.matchCounts[seedType][2] += 2 # self.matchCounts[seedType][2] += 3 ##self.matchCounts[seedType][3] += 1 if returnGenes: rGeneNames[seedType][1].extend([x for x in geneNames if x.find(genomeToken) != -1]) type2 = [tuple(sorted(x)) for x in xpermutations.xuniqueCombinations(geneNames,2) if ''.join(x).find(genomeToken) != -1] type3 = tuple(sorted(geneNames)) rGeneNames[seedType][2].extend(type2) rGeneNames[seedType][3].append(type3) # Update self.ctrlData based on how many hits the orthoSet got in each ctrl for 
seedType for i in range(len(self.ctrlData[seedType])): if genesInCtrlD[i] == 0: ##self.ctrlCounts[seedType][i][0] += 3 pass elif genesInCtrlD[i] == 1: if ''.join(ctrlNames[i]).find(genomeToken) != -1: ##self.ctrlCounts[seedType][i][1] += 1 if returnGenes: rCtrlNames[seedType][i][1].extend(ctrlNames[i]) elif genesInCtrlD[i] == 2: if ''.join(ctrlNames[i]).find(genomeToken) != -1: ##self.ctrlCounts[seedType][i][1] += 1 # self.ctrlCounts[seedType][i][1] += 2 ##self.ctrlCounts[seedType][i][2] += 1 if returnGenes: rCtrlNames[seedType][i][1].extend([x for x in ctrlNames[i] if x.find(genomeToken) != -1]) rCtrlNames[seedType][i][2].append(tuple(sorted(ctrlNames[i]))) elif genesInCtrlD[i] == 3: if ''.join(ctrlNames[i]).find(genomeToken) != -1: ##self.ctrlCounts[seedType][i][1] += 1 # self.ctrlCounts[seedType][i][1] += 3 ##self.ctrlCounts[seedType][i][2] += 2 ##self.ctrlCounts[seedType][i][3] += 1 if returnGenes: rCtrlNames[seedType][i][1].extend([x for x in ctrlNames[i] if x.find(genomeToken) != -1]) type2 = [tuple(sorted(x)) for x in xpermutations.xuniqueCombinations(ctrlNames[i],2) if ''.join(x).find(genomeToken) != -1] type3 = tuple(sorted(ctrlNames[i])) rCtrlNames[seedType][i][2].extend(type2) rCtrlNames[seedType][i][3].append(type3) if returnGenes: for i in range(1,4): assert len(rGeneNames[seedType][i]) == len(set(rGeneNames[seedType][i])),\ "ERROR: rGeneNames[%s] in miRNA(%s) has redundancy." % (i, self.name) if returnGenes: # store and return rGeneNames self.matchEvents = rGeneNames self.ctrlEvents = rCtrlNames return (rGeneNames,rCtrlNames)
#print 'sys args = %s' % (len(sys.argv)) assert len(sys.argv) == 5, 'Usage: combineOrthologs_fromSeedTabs.py tabFile1 tabfile2 orthoDefs outfile' tabFile1 = map(lambda line: line.strip(), open(sys.argv[1],'rU').readlines()) tabFile2 = map(lambda line: line.strip(), open(sys.argv[2],'rU').readlines()) orthoDefs = map(lambda line: line.strip(), open(sys.argv[3],'rU').readlines()) oFile = sys.argv[4] # affirm that column titles match in each tabFile and save the info assert tabFile1[0] == tabFile2[0], 'Column titles do not match between source files.' columnTitles = tabFile1[0] # cleanse commented lines from both lists tabFile1 = JamesDefs.removeCommentLines(tabFile1,'#') tabFile2 = JamesDefs.removeCommentLines(tabFile2,'#') # create one dict from tabFile1&2 combinedDict = {} for line in tabFile1: fields = line.split('\t',1) combinedDict[fields[0]]=fields[1] for line in tabFile2: fields = line.split('\t',1) combinedDict[fields[0]]=fields[1] # write the new list
#========================= User Defined Variables ========================= # Path to original file originalFastaDict = open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas', 'rU') desiredFastaList = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Clusters.txt' outDir = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/' #========================================================================== desiredFastaList = map(lambda line : line.strip(), open(desiredFastaList, 'rU').readlines()) # Parse clusterDefs into list of clusters listOfClusterDefs = JamesDefs.groupByField(desiredFastaList,0) # Instantiate the fasta rec lists with BioPython Seq using geneID field of discriptor as key to seq objects originalFastaDict = SeqIO.to_dict(SeqIO.parse(originalFastaDict, 'fasta'), key_function = lambda rec : rec.description.split()[0]) for cluster in listOfClusterDefs: print "Working on Cluster: %s" % (cluster[0][0]) # New dict to catch copied seqObjs desiredFastaObjList = [] for rec in cluster: if originalFastaDict.has_key(rec[1]):
# Instantiate the fasta rec lists with BioPython Seq using geneID field of discriptor as key to seq objects genomeOneFastasDict = SeqIO.to_dict(SeqIO.parse(open(genomeFileOne, "rU"), 'fasta'), key_function = lambda rec : rec.description.split()[0]) genomeTwoFastasDict = SeqIO.to_dict(SeqIO.parse(open(genomeFileTwo, "rU"), 'fasta'), key_function = lambda rec : rec.description.split()[0]) # Initiate resultList resultList = [] # Explode orthologList into list of lists JamesDefs.explodeDelimitedList(orthologList, '\t') # Populate a list of GeneIDs in each genome's dict of boundary seqs genomeOneGeneIDs = genomeOneFastasDict.keys() genomeTwoGeneIDs = genomeTwoFastasDict.keys() # Loop through orthologList and call each fasta in orthoPair, format # the new comboFasta and append it to resultList for orthoPair in orthologList: # Test for orthoPair[0] in genomeOneFastasDict and same for orthoPair[1] in genomeTwoFastasDict orthoPair_0_warn = None orthoPair_1_warn = None if orthoPair[0] not in genomeOneGeneIDs: orthoPair_0_warn = 'Yes' if orthoPair[1] not in genomeTwoGeneIDs:
from gusPyCode.defs import JamesDefs
import string

# Re-order the columns of the Culex exon-location table and write the result
# to a new file, one record per line.

# Read the source table; the handle is closed as soon as the lines are in
# memory instead of being left open for the life of the script.
srcHandle = open('/Users/biggus/Documents/MBGB/Rotations/James/Data/Sequence/Culex/Culex_Exon_Location.txt', 'r')
try:
    delimitedList = srcHandle.readlines()
finally:
    srcHandle.close()
delimitedList = map(string.strip, delimitedList)

# The index list is passed straight to reOrderDelimitedList
# (presumably output column i is input column order[i] -- verify).
newList = JamesDefs.reOrderDelimitedList(delimitedList, '\t', [7,1,8,4,6,2,3,5,0])

outFile = open('/Users/biggus/Documents/MBGB/Rotations/James/Data/Sequence/Culex/Culex_Exon_Location_Reordered.txt','w')
try:
    for rec in newList:
        outFile.write(rec+'\n')
finally:
    # Close explicitly so every record is flushed to disk before 'Yay'.
    outFile.close()

print('Yay')
# Read source data into list bioMartList = srcFile.readlines() # remove trailing '\n' from every record LEN_bML = len(bioMartList) i = 0 while i < LEN_bML: bioMartList[i] = bioMartList[i].rstrip('\n') i = i + 1 # Grouping records by gene name and splitting record fields into lists groupedList = JamesDefs.groupByField(bioMartList, 0) # Combine exon records into a single gene line record with start and stop coords for coding region # TranscriptID field will be removed and fields representing the number of exons encountered and # the chromosomal coverage will be appended respectivly to the end of each record oneLineRecordList = combineExons(groupedList, bdryLen) # Write out oneLineRecordList to outFile boundaryFile.writelines(oneLineRecordList) boundaryFile.close() print 'Tada!'
from gusPyCode.defs import JamesDefs #--------- Script Specific Function Definitions --------------------- #-------------------------------------------------------------------- #========================= User Defined Variables ========================= inFile = '/Users/biggus/Documents/MBGB/Rotations/James/Data/mdosJAR_testing/JAR_2KBupAedesAnopheles_7mer.rvCmp.smt2.sortedMotifsOnly.motifs' #outFile = '/Users/biggus/Documents/MBGB/Rotations/James/Data/mdosJAR_testing/JAR_2KBupAedesAnopheles_7mer.rvCmp.smt2.sorted.motifs' #========================================================================== inFile = map(lambda line : line.strip(), open(inFile, 'rU').readlines()) nrList = JamesDefs.nrListBySets(inFile) x=1