def countModuleInAll(module, seqDict):

    import re

    ##motifList = moduleStrs.split('\t')

    #  Calculate the revComp of each motif, convert both orientations to Perl-like regular-expression
    #    strings, compile them into Python regex objects, and reshape the data structure into a list
    #    of lists where each entry = [IUPACmotif, compiledRegExObj]

    for i, IUPACmotif in enumerate(module):
        #  convert data struct and calc revComp
        module[i] = [IUPACmotif, [IUPACmotif, JamesDefs.revComp(IUPACmotif)]]
        for c, seq in enumerate(module[i][1]):
            module[i][1][c] = JamesDefs.iupac2regex(seq)
        module[i][1] = makeFwdAndRevCompRegExObj(module[i][1])
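    #  Resulting shape (sketch; 'TGASTCA' is a hypothetical IUPAC motif, not from the original data):
    #      before:  module == ['TGASTCA', ...]
    #      after:   module == [['TGASTCA', <compiled regex matching the motif or its revComp>], ...]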

    #  Loop over seqDict and count the module in fwd and revComp orientations for presence/absence.
    #  Sum the total hits in totalHits.

    totalHits = 0

    for record in seqDict:
        #  initiate modulePresent attrib to 0
        seqDict[record].modulePresent = 0

        hit = findModuleInLength(module, seqDict[record], 500)
        totalHits += hit

    return totalHits
Example #2
 def revCmp(self, toString=0):
     assert toString in (0, 1)  # 0 = return new SeqObj; 1 = return string
     
     if toString == 1:
         return JamesDefs.revComp(self.toString())
     else:
         return self.__class__(JamesDefs.revComp(self.toString()))
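 # Usage sketch (seqObj is a hypothetical instance of the class this method belongs to):
 #     seqObj.revCmp()            # returns a new SeqObj-like instance holding the reverse complement
 #     seqObj.revCmp(toString=1)  # returns the reverse complement as a plain string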
Example #3
 def buildCtrlsFromMatchVers(self, restrictedList, numOfCtrls=15):
     """
     WARNING: This should only be called after the entire list of real seeds has been initialized!
     
     Computes a user-supplied number of permutations (default=15) of each 'true' matchVersion and screens
     them against restrictedList to prevent using KNOWN seed matches for controls. Ctrl seed matches
     are stored in a list located at the second index of the list in dict entry
     self.matchVersions[seedType].
     """
     
     # check to see whether this has already been done.
     # If so, complain and die.
     # Else, append an empty list as index_1 after REAL matchSeq for each version
     for seedType in self.matchVersions:
         assert type(self.matchVersions[seedType]) == type([]), \
                'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name,seedType)
         assert len(self.matchVersions[seedType]) == 1, \
                'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name,seedType)
         self.matchVersions[seedType].append([])
     
     # permute and screen each seedVersion
     for seedType in self.matchVersions:
         # If the seedModel has NO third 'instructions' index (i.e. it looks like [0,7], not [0,7,'A1'])
         if len(_seedModels[seedType]) == 2:
             # Select numOfCtrls random permutations of matchVersions[seedType] that are not in the 
             # restrictedList.
             matchPermList = [''.join(x) for x in xpermutations.xpermutations(list(self.matchVersions[seedType][0]))]
             while len(self.matchVersions[seedType][1]) < numOfCtrls:
                 permSeq = JamesDefs.randFromList_noReplace(matchPermList)
                 if permSeq not in restrictedList:
                     # Append permuted Seq if not in restrictedList
                     self.matchVersions[seedType][1].append(permSeq)
         # If there IS extra data: use the 'instructions' index to permute only the nucs not explicitly
         # defined in the seedModel
         elif len(_seedModels[seedType]) == 3:
             nuc,pos = list(_seedModels[seedType][2])
             # Leave pos 1-registered: we use negative indexing because we are dealing with the revComp
             # of the miRNA, so pos == 1 actually means the LAST position in matchSeq
             pos = int(pos) 
             # explode seq to remove defined nuc in place
             seq2Perm = list(self.matchVersions[seedType][0])
             del seq2Perm[-pos]
             # Generate permutations of the remaining nucs
             matchPermList = [x for x in xpermutations.xpermutations(seq2Perm)]
             while len(self.matchVersions[seedType][1]) < numOfCtrls:
                 permSeq = JamesDefs.randFromList_noReplace(matchPermList)
                 # Replace nuc and check restricted list.
                 if pos > 1: permSeq.insert(-pos+1,nuc)
                 else:       permSeq.append(nuc)
                 permSeq = ''.join(permSeq)
                 if permSeq not in restrictedList:
                     # Append permuted Seq if not in restrictedList
                     self.matchVersions[seedType][1].append(permSeq) 
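 # After this call each entry of self.matchVersions has the shape described in the docstring
 # ('realMatchSeq' and the ctrl seqs below are hypothetical placeholder values):
 #     self.matchVersions[seedType] == ['realMatchSeq', ['ctrlSeq_1', 'ctrlSeq_2', ..., 'ctrlSeq_numOfCtrls']]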
def makeFwdAndRevCompRegExObj(motif):
    import re
    
    motifPair = [motif, JamesDefs.revComp(motif)]
    
    #  convert iupac string to regEx string
    for i in range(len(motifPair)):
        motifPair[i] = JamesDefs.iupac2regex(motifPair[i])
        
    motif = '(%s|%s)' % (motifPair[0], motifPair[1])
    
    fwdRevComp_regExObj = re.compile(motif, re.IGNORECASE)
    return fwdRevComp_regExObj
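# A self-contained sketch of the same fwd|revComp pattern idea using only the standard library.
# JamesDefs.revComp/iupac2regex are assumed to behave roughly like the two helpers below
# (the IUPAC tables, helper names, and the TATAWAW test motif are illustrative, not from the original code).
import re

_IUPAC_REGEX = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'R': '[AG]', 'Y': '[CT]',
                'S': '[CG]', 'W': '[AT]', 'K': '[GT]', 'M': '[AC]', 'B': '[CGT]',
                'D': '[AGT]', 'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}
_IUPAC_COMP  = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'R': 'Y', 'Y': 'R', 'S': 'S',
                'W': 'W', 'K': 'M', 'M': 'K', 'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D',
                'N': 'N'}

def _revComp(iupac):
    # reverse the motif and complement each IUPAC symbol
    return ''.join([_IUPAC_COMP[c] for c in reversed(iupac.upper())])

def _iupac2regex(iupac):
    # expand each IUPAC symbol into its regex character class
    return ''.join([_IUPAC_REGEX[c] for c in iupac.upper()])

_motif   = 'TATAWAW'  # example motif; W = A or T
_pattern = re.compile('(%s|%s)' % (_iupac2regex(_motif),
                                   _iupac2regex(_revComp(_motif))),
                      re.IGNORECASE)
print _pattern.findall('ccTATAAAAggTTTTATAcc')   # -> ['TATAAAA', 'TTTTATA']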
Example #5
def findAllMotifs_SameLine(motifList, seqName, dictOfFastas, resultList):
    
    for motif in motifList:
        
        #  convert from IUPAC to regEx and search in forward direction
        fwd_RegExMotif = re.compile(JamesDefs.iupac2regex(motif), re.IGNORECASE)
        
        #  initiate the result string with the sequence name (AGAP ID) and the IUPAC motif string
        MatchesStr = seqName+'\t'+motif
        
        #  initiate a list to collect match locations (sorted after both orientations are searched)
        matchLocations = []
        
        #  append each forward hit's start coord to matchLocations
        for fwdMatcheObj in fwd_RegExMotif.finditer(dictOfFastas[seqName].seq.tostring()):
            
            #  add 1 to the start pos to convert from 0-based to 1-based coordinates
            matchLocations.append(fwdMatcheObj.start()+1)
        
        # the per-orientation output below is disabled: fwd and rev hits are combined onto one line to save memory downstream
        ### add trailing newline for printing to file later
        ##fwd_MatchesStr = fwd_MatchesStr+'\n'
        
        ### send fwd results to resultList
        ##resultList.append(fwd_MatchesStr)
        
        #  convert from IUPAC to regEx and search in reverse direction
        rev_RegExMotif = re.compile(JamesDefs.iupac2regex(JamesDefs.revComp(motif)), re.IGNORECASE)
        
        ###  initiate result string for reverse matches with name of AGAP and rev-comp IUPAC motif string
        ##rev_MatchesStr = seqName+'\t'+motif+'_rc\t'
        
        #  append each revComp hit's start coord to matchLocations
        for revMatcheObj in rev_RegExMotif.finditer(dictOfFastas[seqName].seq.tostring()):
            
            #  add 1 to the start pos to convert from 0-based to 1-based coordinates
            matchLocations.append(revMatcheObj.start()+1) 
            
        #  sort locations by start
        matchLocations.sort()
        
        #  format matchStr
        for loc in matchLocations:
            MatchesStr = "%s\t%i" % (MatchesStr, loc)
        
        #  add trailing newline for printing to file later
        MatchesStr = MatchesStr+'\n'
        
        #  send the combined fwd/rev result line to resultList
        resultList.append(MatchesStr)
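# Each appended result line has the form (tab-delimited, 1-based coords, fwd and revComp hits merged and sorted):
#     '<seqName>\t<IUPACmotif>\t<pos1>\t<pos2>\t...\n'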
Example #6
def findAllMotifs(motifList, seqName, dictOfFastas, resultList):
    
    for motif in motifList:
        
        #  convert from IUPAC to regEx and search in forward direction
        fwd_RegExMotif = re.compile(JamesDefs.iupac2regex(motif), re.IGNORECASE)
        
        #  initiate result string for forward matches with name of AGAP and fwd IUPAC motif string
        fwd_MatchesStr = seqName+'\t'+motif+'\t'
        
        #  sequentially append each hit's coords to the end of fwd_MatchesStr
        for fwdMatcheObj in fwd_RegExMotif.finditer(dictOfFastas[seqName].seq.tostring()):
            
            #  add 1 to the start pos to convert from 0-based to 1-based coordinates
            fwd_MatchesStr = fwd_MatchesStr+'%s\t' % (str(fwdMatcheObj.start()+1)) 
            
        # add trailing newline for printing to file later
        fwd_MatchesStr = fwd_MatchesStr+'\n'
        
        # send fwd results to resultList
        resultList.append(fwd_MatchesStr)
        
        #  convert from IUPAC to regEx and search in reverse direction
        rev_RegExMotif = re.compile(JamesDefs.iupac2regex(JamesDefs.revComp(motif)), re.IGNORECASE)
        
        #  initiate result string for reverse matches with name of AGAP and the motif string tagged '_rc'
        rev_MatchesStr = seqName+'\t'+motif+'_rc\t'
        
        #  sequentially append each hit's coords to the end of rev_MatchesStr
        for revMatcheObj in rev_RegExMotif.finditer(dictOfFastas[seqName].seq.tostring()):
            
            #  add 1 to the start pos to convert from 0-based to 1-based coordinates
            rev_MatchesStr = rev_MatchesStr+'%s\t' % (str(revMatcheObj.start()+1)) 
            
        # add trailing newline for printing to file later
        rev_MatchesStr = rev_MatchesStr+'\n'
        
        # send rev results to resultList
        resultList.append(rev_MatchesStr)
def makeFwdAndRevCompRegExObj_IUPAC(motif, equals=0):
    import re
    
    motifPair = [motif, JamesDefs.revComp(motif)]
        
    targetContainsMotif  = '(%s|%s)' % (motifPair[0], motifPair[1])
    targetISMotif        = '^(%s|%s)$' % (motifPair[0], motifPair[1])  # '$' anchors the end: target must BE the motif
    
    if equals == 1:
        motif = targetISMotif
    elif equals == 0:
        motif = targetContainsMotif
    
    fwdRevComp_regExObj = re.compile(motif, re.IGNORECASE)
    return fwdRevComp_regExObj
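# Usage sketch ('TGASTCA' is a hypothetical IUPAC motif; note this variant compiles the IUPAC letters
# literally rather than expanding them to character classes):
#     makeFwdAndRevCompRegExObj_IUPAC('TGASTCA')           # pattern for targets CONTAINING motif or revComp
#     makeFwdAndRevCompRegExObj_IUPAC('TGASTCA', equals=1) # pattern for targets that ARE motif or revComp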
Example #8
 def buildCtrlsFromProSeed(self, restrictedList, numOfCtrls=15):
     """
     WARNING: This should only be called after the entire list of real seeds has been initialized!
     
     Computes numOfCtrls (default 15) permutations of the 'true' proSeed matching sequence (m2_to_m8) and derives
     matchVersions as in the true case. Each permuted sequence is checked against the restrictedList
     to prevent using KNOWN seed matches for controls. Ctrl seed matches are stored in a list located
     at the second index of the list in dict entry self.matchVersions[seedType]. Each seedVersion
     of a ctrl set will share the same index number.
     """
     ##assert True==False, \
            ##"""WARNING!!! miRNA.buildCtrlsFromMatchVers() should be used instead!!
            ##If you REALLY want to use this method, modify or remove this assert statement.
            
            ##But seriously...  use miRNA.buildCtrlsFromMatchVers()."""
     # check to see whether this has already been done.
     # If so, complain and die.
     # Else, append an empty list as index_1 after REAL matchSeq for each version
     for seedType in self.matchVersions:
         assert type(self.matchVersions[seedType]) == type([]), \
                'ERROR: %s.matchVersions[%s] is not type: list.' % (self.name,seedType)
         assert len(self.matchVersions[seedType]) == 1, \
                'ERROR: len(%s.matchVersions[%s]) is not 1; ctrls seqs may have already been built.' % (self.name,seedType)
         self.matchVersions[seedType].append([])
     
     proSeed = self.sourceSeq[1:8]
     matchPerms  = [''.join(x) for x in xpermutations.xpermutations(list(self.matchVersions['m2_to_m8'][0]))]
     
     # Select numOfCtrls random permutations of matchVersions['m2_to_m8'] that are not in the 
     # restrictedList.
     chosenPerms = []
     while len(chosenPerms) < numOfCtrls:
         permSeq = JamesDefs.randFromList_noReplace(matchPerms)
         if permSeq not in restrictedList:
             chosenPerms.append(permSeq)
     
     # Use each chosenSeq to generate the diff matchVersions
     for seq in chosenPerms:
         # Create Fake miRNA with seq at the seed location to feed to _buildMatchVersions()
         seq = 'N%sNNNNNNNNNNNNN' % (bioDefs.revComp(seq))
         matchVersions = self._buildMatchVersions(seq)
         for seedType in self.matchVersions:
             self.matchVersions[seedType][1].append(matchVersions[seedType][0]) # must use index[0] bc _buildMatchVersions returns a list len==1 
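 # Resulting shape mirrors buildCtrlsFromMatchVers (ctrl seqs below are hypothetical placeholders):
 #     self.matchVersions[seedType] == [realMatchSeq, ['ctrlSeq_1', ..., 'ctrlSeq_numOfCtrls']]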
Example #9
#========================= User Defined Variables =========================

#  InFiles:
motifList = map(string.strip, open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/upstream_exclsv-conserved_mosquito-motifs_nr.txt', 'r'))

goodAGAPs = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas'

#  OutFile:
outFile   = '/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Combo/2Kb_AllMosquitoes/MosqMotifs/upstream_exclsv-conserved_mosquito-motifs_nr.smLine.map' 

#==========================================================================

t1 = time()
#  Create dict of fasta Objects from those in goodAGAPs
dictOfFastas = JamesDefs.fastaFileToBioSeqDict(goodAGAPs, Alphabet='IUPACAmbiguousDNA')


#  Create resultList to receive outPut
resultList = []
c = 0
for seqName in dictOfFastas:
    
    findAllMotifs_SameLine(motifList, seqName, dictOfFastas, resultList)
    lenResultList = len(resultList)
    c+=1
    print 'SeqName:%s itemNumber: %i' % (seqName,c)

outFile = open(outFile, 'w')

outFile.writelines(resultList)
Example #10
def convertMotifList(motifList):
    i = 0
    while i < len(motifList):
        motifList[i] = [motifList[i], JamesDefs.iupac2regex(motifList[i])]
        i += 1
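# In-place transformation sketch (assumes JamesDefs.iupac2regex expands IUPAC codes to character classes;
# 'TGASTCA' is a hypothetical motif):
#     ['TGASTCA', ...]  ->  [['TGASTCA', 'TGA[CG]TCA'], ...]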
Example #11
t1 = time()

#  Populate a Dict with Seq objs for Anopheles boundary seqs
#  What follows directly is a kludge to give my seqDict vals the IUPAC ambiguous alphabet
boundarySeqs = list(SeqIO.parse(open(boundarySeqs, "rU"), "fasta"))
for record in boundarySeqs :
    record.seq.alphabet = IUPACAmbiguousDNA

boundarySeqs = SeqIO.to_dict(boundarySeqs, key_function = lambda rec : rec.description.split()[0])

# convert IUPAC motifs to regexes and create a list of lists with each motif represented as ['IUPAC', 'REGEX'] 
convertMotifList(motifList)


#  group ClusterDefs by ClusterName
clusterDefinitionList = JamesDefs.groupByField(clusterDefinitionList, 0)

#  This will become a list of tab delim'd params for the hyperGeo func: 'Motif:ClusterID';'motifCountInAll';'len(all)';'motifCountInCluster';'numOfSeqsInCluster' 
hyperGeoParams_4_motifClusterPairs = []

m=0
for motif in motifList:
    m+=1
    print 'Motif '+str(m)
    #  Count how many seq in total list have motif in either orientation
    motifCountInAll = countMotifInAll(motif[1], boundarySeqs)
 
    
    for cluster in clusterDefinitionList:
Example #12
def ensmblTx2BED(ensemblPath,BEDoutPath):
    """Converts files(see below for colNames) to BED files of transcripts.
    Ensembl Gene ID
    Ensembl Transcript ID
    Chromosome/plasmid
    Gene Start (bp)
    Gene End (bp)
    Transcript Start (bp)
    Transcript End (bp)
    Strand
    Transcript count
    Ensembl Exon ID
    Exon Chr Start (bp)
    Exon Chr End (bp)
    Exon Rank in Transcript
    phase
    Constitutive Exon
    Biotype"""
    # +++++ func specific Defs +++++
    def getBlockSizes(tx):
        blkSzList = []
        for exn in tx:
            blkSzList.append(str(int(exn[11])-int(exn[10])+1))
        return ','.join(blkSzList)
    
    def getBlockStarts(tx,chrmStart):
        blkStrtList = []
        for exn in tx:
            blkStrtList.append(str(int(exn[10])-1-int(chrmStart)))
        return ','.join(blkStrtList)
        
    # +++++ initialize ensembl data +++++
    txList = map(lambda l: l.strip('\n') , open(ensemblPath, 'rU'))
    txList.pop(0)
    txList = JamesDefs.groupByField_silent(txList,1)
    
    # +++++ prepare destination file +++++
    bedFile = open(BEDoutPath,'w')
    bedFile.write('track name="Ensembl Aa Tx Definitions"  description="From %s" useScore=0\n' % (ensemblPath))
    
    # +++++ loop through the Txs +++++
    for tx in txList:
        # --- sort tx based on lowest coords of each exon ---
        tx.sort(key=lambda x: int(x[10]))
        
        chrm      = tx[0][2]
        chrmStart = str(int(tx[0][5])-1)
        chrmEnd   = tx[0][6]
        name      = tx[0][1]
        score     = '0'
        strand    = tx[0][7]
        thkStart  = chrmStart
        thkEnd    = chrmEnd
        rgb       = '0'
        blkCount  = str(len(tx))
        blkSizes  = getBlockSizes(tx)
        blkStarts = getBlockStarts(tx,chrmStart)
        
        # --- write out line ---
        bedFile.write('%s\n' % ('\t'.join([chrm,    
                                           chrmStart,
                                           chrmEnd,
                                           name,
                                           score,
                                           strand,
                                           thkStart,
                                           thkEnd,
                                           rgb,
                                           blkCount,
                                           blkSizes,
                                           blkStarts])))
    bedFile.close()
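# A minimal usage sketch for ensmblTx2BED (both paths are hypothetical placeholders):
#     ensmblTx2BED('/path/to/ensembl_tx_export.txt', '/path/to/ensembl_tx.bed')
# Each output line is a 12-column BED record: chrom, chromStart, chromEnd, name, score, strand,
# thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts.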
# open and create handle for outFile
resFile = open(outFile, 'w')
tick = time.clock()

# read file into list 
conflictList = conflictFile.readlines()
# remove trailing '\n' from every record
conflictList = [line.rstrip('\n') for line in conflictList]


# group file by target gene id using groupByField 
fjoinOutByGeneIDList = JamesDefs.groupByField(conflictList, 1)

resolverArgs = {
                    'strandField' : 4,
                    'lowerBoundProximal' : 10,
                    'higherBoundProximal' : 11,
                    'conflictRegionStrt' : 18,
                    'conflictRegionEnd' : 19,
                    'whichBoundary':'upStream'
                }

resolvedBoundariesList = resolver(fjoinOutByGeneIDList, resolverArgs)

resFile.writelines(resolvedBoundariesList)

tock = time.clock()
Example #14
        else:
            print "WARNING: boundaryRegion variable should only be 'up' or 'down'.\nScript exiting."
            sys.exit()


#--------------------------------------------------       


# Strip trailing newlines
codingBoundsList = map(string.strip, codingBoundsList)
resolvedConflictsList = map(string.strip, resolvedConflictsList)
                            

# Convert these into lists of lists so that field vals can be interrogated and copied 
# Explode tab delimited strings of each record into list of values
JamesDefs.explodeDelimitedList(codingBoundsList, '\t')
JamesDefs.explodeDelimitedList(resolvedConflictsList, '\t')


len_codingBoundsList = len(codingBoundsList)
len_resolvedConflictsList = len(resolvedConflictsList)



# Populate unUsableGeneNames
unUsableGeneNames = []
i = 0
while i < len_resolvedConflictsList:
    
    if int(resolvedConflictsList[i][5]) < shortestUsableBdryReg:
        unUseableGene = resolvedConflictsList.pop(i)
seenSeeds = set()
miR_matches = {}

saveObj = {"orthoRelations": orthoRelations, "seenSeeds": seenSeeds, "miR_matches": miR_matches}


print "Initializing matchVersions..."

for m in miRNAs:
    seed = miTrgt.seedMatches(miRNAs[m], seenSeeds, orthoRelations=orthoRelations, name=m)
    miR_matches[seed.name] = seed

print "Initializing ctrls..."
# choose one rand miRNA to make ctrls
randMiRNA = JamesDefs.randFromList_noRplcMulti(miRNAs.keys(), 1)[0]
miR_matches[randMiRNA].buildCtrlsFromMatchVers(seenSeeds, 30)
randMiRNA = miR_matches[randMiRNA]


# for m in miR_matches:
#     print m.name
#     for sVer in m.matchVersions:
#         print '%s: %s' % (sVer, m.matchVersions[sVer])
#     print '- '*5
print "Tallying hits..."
randMiRNA.tallyHits(orthoSeqs)
print "Counting hits in orthos..."
randMiRNA.countHitsInOrthos()

Example #16
    def countHitsInOrthos4(self,genomeToken,returnGenes=True):
        """
        Uses results of miRNA.tallyHits() and self.orthos to count how many genes the miRNA seed
        hits in at least one genome, in at least two orthologs, and in all three orthologs.  If
        returnGenes: returns tuple of two dicts:
        matchDict(keys=seedType : vals=[None, genesWithMatch, genePairsWithMatch, geneTriplesWithMatch])
        ctrlDict(keys=seedType : vals=[[None, genesWithMatch_1, genePairsWithMatch_1, geneTriplesWithMatch_1],
                                       [None, genesWithMatch_2, genePairsWithMatch_2, geneTriplesWithMatch_2],
                                       ...])
        """
        # make sure we have tallied the hits already.
        assert self.matchData and self.ctrlData, \
               'ERROR:  It looks like we have not tallied the hits yet. Call miRNA.tallyHits() first.'
        
        if returnGenes:
            rGeneNames = {}
            rCtrlNames = {}
            for seedType in _seedModels:
                rGeneNames[seedType] = [None,[],[],[]]
                rCtrlNames[seedType] = JamesDefs.initList(len(self.matchVersions[seedType][1]),[None,[],[],[]])
             
        # Initialize self.matchCounts/self.ctrlCounts
        for seedType in _seedModels:
            self.matchCounts[seedType] = [0,0,0,0]
            self.ctrlCounts[seedType]  = [[0]*4 for i in range(len(self.matchVersions[seedType][1]))]
        # Cycle through self.orthos 
        for orthoSet in self.orthos:
            assert len(orthoSet) == 3,\
                   'ERROR: It seems len(%s) != 3.' % (orthoSet,)
            # Query the matchData and ctrlData for hits in orthoSet
            for seedType in _seedModels:
                genesInMatchD = 0
                genesInCtrlD  = [0]*len(self.matchVersions[seedType][1])
                if returnGenes:
                    geneNames = [] 
                    ctrlNames = JamesDefs.initList(len(self.matchVersions[seedType][1]),[])
                # Count how many genes in each orthoSet were hit by the respective seedTypes
                for gene in orthoSet:
                    if gene in self.matchData[seedType]:
                        genesInMatchD += 1
                        if returnGenes: geneNames.append(gene)
                    for i in range(len(self.ctrlData[seedType])):
                        if gene in self.ctrlData[seedType][i]:
                            genesInCtrlD[i] += 1
                            if returnGenes: ctrlNames[i].append(gene)
                        
                # Update the match tallies based on how many hits the orthoSet got for seedType
                if genesInMatchD == 0:
                    ##self.matchCounts[seedType][0] += 3
                    pass
                elif genesInMatchD == 1:
                    if ''.join(geneNames).find(genomeToken) != -1:
                        ##self.matchCounts[seedType][0] += 2
                        self.matchCounts[seedType][1] += 1
                        if returnGenes:
                            rGeneNames[seedType][1].extend(geneNames)
                elif genesInMatchD == 2:
                    if ''.join(geneNames).find(genomeToken) != -1:
                        ##self.matchCounts[seedType][0] += 1
                        ##self.matchCounts[seedType][1] += 1 # self.matchCounts[seedType][1] += 2
                        ##self.matchCounts[seedType][2] += 1
                        if returnGenes:
                            rGeneNames[seedType][1].extend([x for x in geneNames if x.find(genomeToken) != -1])
                            rGeneNames[seedType][2].append(tuple(sorted(geneNames)))
                elif genesInMatchD == 3:
                    if ''.join(geneNames).find(genomeToken) != -1:
                        ##self.matchCounts[seedType][1] += 1 # self.matchCounts[seedType][1] += 3
                        ##self.matchCounts[seedType][2] += 2 # self.matchCounts[seedType][2] += 3
                        ##self.matchCounts[seedType][3] += 1
                        if returnGenes:
                            rGeneNames[seedType][1].extend([x for x in geneNames if x.find(genomeToken) != -1])
                            type2 = [tuple(sorted(x)) for x in xpermutations.xuniqueCombinations(geneNames,2) if ''.join(x).find(genomeToken) != -1]
                            type3 = tuple(sorted(geneNames))
                            rGeneNames[seedType][2].extend(type2)
                            rGeneNames[seedType][3].append(type3)
                # Update the ctrl tallies based on how many hits the orthoSet got in each ctrl for seedType
                for i in range(len(self.ctrlData[seedType])):
                    if genesInCtrlD[i] == 0:
                        ##self.ctrlCounts[seedType][i][0] += 3
                        pass
                    elif genesInCtrlD[i] == 1:
                        if ''.join(ctrlNames[i]).find(genomeToken) != -1:
                            ##self.ctrlCounts[seedType][i][1] += 1
                            if returnGenes:
                                rCtrlNames[seedType][i][1].extend(ctrlNames[i])
                    elif genesInCtrlD[i] == 2:
                        if ''.join(ctrlNames[i]).find(genomeToken) != -1:
                            ##self.ctrlCounts[seedType][i][1] += 1 # self.ctrlCounts[seedType][i][1] += 2
                            ##self.ctrlCounts[seedType][i][2] += 1
                            if returnGenes:
                                rCtrlNames[seedType][i][1].extend([x for x in ctrlNames[i] if x.find(genomeToken) != -1])
                                rCtrlNames[seedType][i][2].append(tuple(sorted(ctrlNames[i])))
                    elif genesInCtrlD[i] == 3:
                        if ''.join(ctrlNames[i]).find(genomeToken) != -1:
                            ##self.ctrlCounts[seedType][i][1] += 1 # self.ctrlCounts[seedType][i][1] += 3
                            ##self.ctrlCounts[seedType][i][2] += 2
                            ##self.ctrlCounts[seedType][i][3] += 1
                            if returnGenes:
                                rCtrlNames[seedType][i][1].extend([x for x in ctrlNames[i] if x.find(genomeToken) != -1])
                                type2 = [tuple(sorted(x)) for x in xpermutations.xuniqueCombinations(ctrlNames[i],2) if ''.join(x).find(genomeToken) != -1]
                                type3 = tuple(sorted(ctrlNames[i]))
                                rCtrlNames[seedType][i][2].extend(type2)
                                rCtrlNames[seedType][i][3].append(type3)

                if returnGenes:
                    for i in range(1,4):
                        assert len(rGeneNames[seedType][i]) == len(set(rGeneNames[seedType][i])),\
                               "ERROR: rGeneNames[%s] in miRNA(%s) has redundancy." % (i, self.name)
        if returnGenes:
            # store and return rGeneNames
            self.matchEvents = rGeneNames
            self.ctrlEvents  = rCtrlNames
            return (rGeneNames,rCtrlNames)
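    # Return-value sketch, following the docstring above (gene names are hypothetical placeholders):
    #     rGeneNames[seedType] == [None,
    #                              ['geneA', ...],                      # single genes with a match
    #                              [('geneA', 'geneB'), ...],           # ortholog pairs with matches
    #                              [('geneA', 'geneB', 'geneC'), ...]]  # ortholog triples with matches
    #     rCtrlNames[seedType] == [one such 4-element list per ctrl seq]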
#print 'sys args = %s' % (len(sys.argv))
assert len(sys.argv) == 5, 'Usage: combineOrthologs_fromSeedTabs.py tabFile1 tabfile2 orthoDefs outfile'

tabFile1  = map(lambda line: line.strip(), open(sys.argv[1],'rU').readlines())
tabFile2  = map(lambda line: line.strip(), open(sys.argv[2],'rU').readlines())
orthoDefs = map(lambda line: line.strip(), open(sys.argv[3],'rU').readlines())
oFile     = sys.argv[4]


# affirm that column titles match in each tabFile and save the info
assert tabFile1[0] == tabFile2[0], 'Column titles do not match between source files.'
columnTitles = tabFile1[0]

# cleanse commented lines from both lists
tabFile1 = JamesDefs.removeCommentLines(tabFile1,'#')
tabFile2 = JamesDefs.removeCommentLines(tabFile2,'#')


    
# create one dict from tabFile1&2
combinedDict = {}
for line in tabFile1:
    fields = line.split('\t',1)
    combinedDict[fields[0]]=fields[1]

for line in tabFile2:
    fields = line.split('\t',1)
    combinedDict[fields[0]]=fields[1]
    
# write the new list
#========================= User Defined Variables =========================

#  Path to original file
originalFastaDict = open('/Users/biggus/Documents/James/Data/2KB/2kb_Sequence/2kb_Anopheles/2KBupTSS_goodAffyAGAPsFastasOUT.masked.nr.fas', 'rU')

desiredFastaList  = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Clusters.txt'

outDir            = '/Users/biggus/Documents/James/Data/ClusterDefs/TC-Fastas/'

#==========================================================================

desiredFastaList = map(lambda line : line.strip(), open(desiredFastaList, 'rU').readlines())

# Parse clusterDefs into list of clusters
listOfClusterDefs = JamesDefs.groupByField(desiredFastaList,0)




#  Instantiate the fasta rec dict with BioPython Seq objects, using the geneID field of the descriptor as key to the seq objects
originalFastaDict = SeqIO.to_dict(SeqIO.parse(originalFastaDict, 'fasta'),
                                    key_function = lambda rec : rec.description.split()[0])

for cluster in listOfClusterDefs:
    print "Working on Cluster: %s" % (cluster[0][0])
    #  New dict to catch copied seqObjs
    desiredFastaObjList = []
    
    for rec in cluster:
        if originalFastaDict.has_key(rec[1]):    
Example #19


#  Instantiate the fasta rec dicts with BioPython Seq objects, using the geneID field of the descriptor as key to the seq objects
genomeOneFastasDict = SeqIO.to_dict(SeqIO.parse(open(genomeFileOne, "rU"), 'fasta'),
                                    key_function = lambda rec : rec.description.split()[0])

genomeTwoFastasDict = SeqIO.to_dict(SeqIO.parse(open(genomeFileTwo, "rU"), 'fasta'),
                                    key_function = lambda rec : rec.description.split()[0])


#  Initiate resultList
resultList = []

#  Explode orthologList into list of lists
JamesDefs.explodeDelimitedList(orthologList, '\t')

#  Populate a list of GeneIDs in each genome's dict of boundary seqs
genomeOneGeneIDs = genomeOneFastasDict.keys()
genomeTwoGeneIDs = genomeTwoFastasDict.keys()

#  Loop through orthologList and call each fasta in orthoPair, format
#  the new comboFasta and append it to resultList
for orthoPair in orthologList:
    
    #  Test for orthoPair[0] in genomeOneFastasDict and same for orthoPair[1] in genomeTwoFastasDict
    orthoPair_0_warn = None
    orthoPair_1_warn = None
    if orthoPair[0] not in genomeOneGeneIDs:
        orthoPair_0_warn = 'Yes'
    if orthoPair[1] not in genomeTwoGeneIDs:
from gusPyCode.defs import JamesDefs
import string

delimitedList = open('/Users/biggus/Documents/MBGB/Rotations/James/Data/Sequence/Culex/Culex_Exon_Location.txt', 'r').readlines()

delimitedList = map(string.strip, delimitedList)

newList = JamesDefs.reOrderDelimitedList(delimitedList, '\t', [7,1,8,4,6,2,3,5,0])


outFile = open('/Users/biggus/Documents/MBGB/Rotations/James/Data/Sequence/Culex/Culex_Exon_Location_Reordered.txt','w')

for rec in newList:
    outFile.write(rec+'\n')


print 'Yay'
Example #21



# Read source data into list
bioMartList = srcFile.readlines()

# remove trailing '\n' from every record
bioMartList = [line.rstrip('\n') for line in bioMartList]

# Grouping records by gene name and splitting record fields into lists
groupedList = JamesDefs.groupByField(bioMartList, 0)

# Combine exon records into a single gene-line record with start and stop coords for the coding region.
# The TranscriptID field will be removed, and fields for the number of exons encountered and
# the chromosomal coverage will be appended, respectively, to the end of each record
oneLineRecordList = combineExons(groupedList, bdryLen)

# Write out oneLineRecordList to outFile
boundaryFile.writelines(oneLineRecordList)
boundaryFile.close()




print 'Tada!'
Example #22
from gusPyCode.defs import JamesDefs




#--------- Script Specific Function Definitions ---------------------


#--------------------------------------------------------------------



#========================= User Defined Variables =========================
inFile = '/Users/biggus/Documents/MBGB/Rotations/James/Data/mdosJAR_testing/JAR_2KBupAedesAnopheles_7mer.rvCmp.smt2.sortedMotifsOnly.motifs'
#outFile = '/Users/biggus/Documents/MBGB/Rotations/James/Data/mdosJAR_testing/JAR_2KBupAedesAnopheles_7mer.rvCmp.smt2.sorted.motifs'

#==========================================================================


inFile = map(lambda line : line.strip(), open(inFile, 'rU').readlines())

nrList = JamesDefs.nrListBySets(inFile)




        
        
x=1