예제 #1
0
def countBySeed(editFN, microFN, flankAmount, outFN):
    '''Count microRNA seed matches around editing sites, before and after editing.

    For every editing site loaded from editFN, the genomic sequence from
    coordinate - flankAmount to coordinate + flankAmount is fetched (hg19).
    An "edited" copy is made by replacing the base at index flankAmount
    with 'G'.  Each microRNA loaded from microFN is checked for its
    reverse-complemented seed in both sequences.

    flankAmount is converted with int(); the original note says it should
    be +/- 6.

    Side effects:
      * appends microRNA ids to eSite.microTargets / eSite.eMicroTargets,
      * increments microRNA.numBefore / microRNA.numAfter,
      * prints hits for the hard-coded debugging microRNA,
      * writes one tab-separated line per site to outFN:
        ID, chrom:coord, strand, targets before, targets after
        ('None' when a list is empty).
    '''
    flankAmount = int(flankAmount)

    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # Debugging aid: every hit of this microRNA is echoed to stdout.
    checkID = 'hsa-miR-330-5p'

    # The complemented seed does not depend on the site, so compute it once.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        chrom = eSite.chromosome
        coord = eSite.coordinate
        strand = eSite.strand

        flankingSeq = gf.get_seq_from_to(chrom, coord - flankAmount, coord + flankAmount, strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        # str.replace returns a new string; the original code discarded the
        # result, so the T -> U conversion silently never happened.
        # NOTE(review): assumes the complemented seed uses the RNA alphabet (U) - confirm.
        flankingSeq = flankingSeq.replace('T', 'U')
        eFlankingSeq = eFlankingSeq.replace('T', 'U')

        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate, eSite.strand, eSite.gene))
                microRNA.numBefore += 1

            if comSeed in eFlankingSeq:
                microRNA.numAfter += 1
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate, eSite.strand, eSite.gene))
                eSite.eMicroTargets.append(microRNA.id)

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            targets = ','.join(eSite.microTargets) if eSite.microTargets else 'None'
            eTargets = ','.join(eSite.eMicroTargets) if eSite.eMicroTargets else 'None'
            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.strand, targets, eTargets))
예제 #2
0
def getFolded(fN):
    '''Print every editing site with its surrounding sequence as two lines.

    For each site loaded from fN the hg19 sequence from coordinate - 200 to
    coordinate + 200 is fetched on the site's strand, then a header line
    ("> <ID>") and the sequence line are written to stdout.
    '''
    sites = cgEdit.loadEditingSites(fN)
    fetcher = GenomeFetch.GenomeFetch('hg19')

    for site in sites:
        centre = site.coordinate
        # +/- 200 bp window around the editing site
        windowSeq = fetcher.get_seq_from_to(site.chromosome, centre - 200, centre + 200, site.strand)

        print('> %s' % (site.ID,))
        print(windowSeq)
예제 #3
0
def getEditInfo(fN, idList):
    '''Print ID, location, gene and editing ratio for selected editing sites.

    fN     -- passed to cgEdit.loadEditingSites.
    idList -- path of a tab-delimited text file; the first column of each
              line is an integer editing-site ID.  Blank lines are skipped.

    Raises KeyError when a requested ID is not among the loaded sites and
    ValueError when the first column is not an integer.
    '''
    idDict = {}
    for eSite in cgEdit.loadEditingSites(fN):
        idDict[eSite.ID] = eSite

    # Read every requested ID first; the with-block closes the handle.
    wantedIDs = []
    with open(idList, 'r') as idFile:
        for line in idFile:
            fields = line.strip().split('\t')
            if not fields[0]:
                continue
            wantedIDs.append(int(fields[0]))

    for siteID in wantedIDs:
        eSite = idDict[siteID]
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.gene, eSite.eRatio))
예제 #4
0
def updateMicroTargets(editFN, microFN, flankAmount, outFN):
    '''Record which microRNA seeds match each editing site before/after editing.

    For every editing site loaded from editFN, the hg19 sequence from
    coordinate - flankAmount to coordinate + flankAmount is fetched and an
    edited copy is built by replacing the base at index flankAmount with
    'G'.  Each microRNA's reverse-complemented seed is searched in both.

    flankAmount is converted with int(); the original note says it should
    be +/- 6.

    Matches are appended to eSite.microTargets / eSite.eMicroTargets,
    echoed to stdout (lines starting with '@'), and finally written to
    outFN as tab-separated lines:
    ID, chrom:coord, strand, targets before, targets after.
    '''
    flankAmount = int(flankAmount)

    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # The complemented seed does not depend on the site, so compute it once.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        chrom = eSite.chromosome
        coord = eSite.coordinate
        strand = eSite.strand

        # NOTE(review): the fetched sequence is compared as-is, with no T -> U
        # conversion - confirm the complemented seed uses the same alphabet.
        flankingSeq = gf.get_seq_from_to(chrom, coord - flankAmount, coord + flankAmount, strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                print('@ flank %s %s %s %s' % (eSite.ID, microRNA.id, flankingSeq, comSeed))

            if comSeed in eFlankingSeq:
                eSite.eMicroTargets.append(microRNA.id)
                print('@ eFlank %s %s %s %s' % (eSite.ID, microRNA.id, eFlankingSeq, comSeed))

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            targets = ','.join(eSite.microTargets) if eSite.microTargets else 'None'
            eTargets = ','.join(eSite.eMicroTargets) if eSite.eMicroTargets else 'None'
            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.strand, targets, eTargets))
예제 #5
0
def getEditInfo(fN, idList):
    '''Print ID, location, gene and editing ratio for selected editing sites.

    fN     -- passed to cgEdit.loadEditingSites.
    idList -- path of a tab-delimited text file; the first column of each
              line is an integer editing-site ID.  Blank lines are skipped.

    Raises KeyError when a requested ID is not among the loaded sites and
    ValueError when the first column is not an integer.
    '''
    idDict = {}
    for eSite in cgEdit.loadEditingSites(fN):
        idDict[eSite.ID] = eSite

    # Read every requested ID first; the with-block closes the handle.
    wantedIDs = []
    with open(idList, 'r') as idFile:
        for line in idFile:
            fields = line.strip().split('\t')
            if not fields[0]:
                continue
            wantedIDs.append(int(fields[0]))

    for siteID in wantedIDs:
        eSite = idDict[siteID]
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome,
                                  eSite.coordinate, eSite.gene, eSite.eRatio))
예제 #6
0
def overlapWithDegradome(dFN, eFN):
    '''Print how many overlaps exist between editing sites and degradome regions.

    dFN -- tab-delimited file whose second column is a tcc string understood
           by bioLibCG.tccSplit; each region is widened by 3 on both sides.
    eFN -- passed to cgEdit.loadEditingSites.

    Prints the first five widened regions and then the length of the
    result of compareData.compareTwoTcc.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The with-block closes the degradome file even if parsing fails.
    with open(dFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))
예제 #7
0
def overlapWithDegradome(dFN, eFN):
    """Print how many overlaps exist between editing sites and degradome regions.

    dFN -- tab-delimited file whose second column is a tcc string understood
           by bioLibCG.tccSplit; each region is widened by 3 on both sides.
    eFN -- passed to cgEdit.loadEditingSites.

    Prints the first five widened regions and then the length of the
    result of compareData.compareTwoTcc.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The with-block closes the degradome file even if parsing fails.
    with open(dFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))
예제 #8
0
def updateLocationBasedTargets(editFN, contextFN, miLocationFN, gFN):
    '''Report editing sites lying within 22 positions upstream of a microRNA location.

    editFN       -- passed to cgEdit.loadEditingSites.
    contextFN    -- passed to cgEdit.updateContextEditingSites, which fills
                    eSite.context (UTR, EXON, ...).
    miLocationFN -- tab-delimited file: transcript name, microRNA name,
                    integer location.
    gFN          -- passed to cgGenes3.createGeneSetEditing.

    Despite its name the function stores nothing: for each site whose
    context contains '3UTR' it prints every (transcript, microRNA, location,
    site position) combination and, when loc - 22 <= position <= loc, an
    extra line with the site's chrom:coord.

    Raises KeyError if a transcript named in miLocationFN is not in the
    gene set.
    '''
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context

    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = {}
    for t in geneSet.transcripts:
        tName_t[t.id] = t

    # transcript name -> [(microRNA name, location), ...]
    tName_miInfo = {}
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append((ls[1], int(ls[2])))

    for eSite in eSites:
        if '3UTR' not in eSite.context:
            continue
        for tName in eSite.transcripts:
            if tName not in tName_miInfo:
                continue

            t = tName_t[tName]
            # Position of the site in this transcript's mRNA; it is the same
            # for every microRNA entry, so compute it once per transcript.
            ePosition = t.getRelativePositionMRNA(eSite.coordinate, coding=False)

            for miName, loc in tName_miInfo[tName]:
                print('%s %s %s %s' % (tName, miName, loc, ePosition))
                if loc - 22 <= ePosition <= loc:
                    print('%s %s %s:%s' % (tName, miName, eSite.chromosome, eSite.coordinate))
예제 #9
0
def makeTable(fN, eFN):
    '''Print a tab-separated table joining rows of fN with editing-site data.

    fN  -- tab-delimited file; column 1 is an integer editing-site ID,
           column 2 a gene name, and columns 9 and 10 are copied through.
    eFN -- passed to cgEdit.loadEditingSites.

    Each output line holds: gene name, site tcc, site eRatio, column 9,
    column 10.  Raises KeyError for an ID that was not loaded.
    '''
    eID_eSite = {}
    for eSite in cgEdit.loadEditingSites(eFN):
        eID_eSite[eSite.ID] = eSite

    # The with-block closes the input file even if a row is malformed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            eSite = eID_eSite[int(ls[0])]
            gName = ls[1]
            b = ls[8]
            a = ls[9]

            print('%s\t%s\t%s\t%s\t%s' % (gName, eSite.tcc, eSite.eRatio, b, a))
예제 #10
0
def makeTable(fN, eFN):
    '''Print a tab-separated table joining rows of fN with editing-site data.

    fN  -- tab-delimited file; column 1 is an integer editing-site ID,
           column 2 a gene name, and columns 9 and 10 are copied through.
    eFN -- passed to cgEdit.loadEditingSites.

    Each output line holds: gene name, site tcc, site eRatio, column 9,
    column 10.  Raises KeyError for an ID that was not loaded.
    '''
    eID_eSite = {}
    for eSite in cgEdit.loadEditingSites(eFN):
        eID_eSite[eSite.ID] = eSite

    # The with-block closes the input file even if a row is malformed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            eSite = eID_eSite[int(ls[0])]
            gName = ls[1]
            b = ls[8]
            a = ls[9]

            print('%s\t%s\t%s\t%s\t%s' % (gName, eSite.tcc, eSite.eRatio, b, a))
예제 #11
0
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):
    '''Plot a histogram of log2 expression change for genes with altered target sites.

    eFN       -- passed to cgEdit.loadEditingSites.
    targetFN  -- tab-delimited file; column 1 is the site ID.  A site is
                 selected when column 4 is not 'None' and column 5 is 'None'.
    contextFN -- tab-delimited file read as: site ID, transcript ID,
                 transcript type.
    geneFN    -- passed to cgGenes3.createGeneSetEditing.
    eChangeFN -- passed to getERatioDict (gene name -> expression ratio).

    For each selected site the genes of its '3UTR' transcripts are
    collected.  Sites with no such gene or with more than one are skipped;
    each gene is counted once.  The log2 of the gene's ratio is added to
    the histogram shown with matplotlib.
    '''
    print('loading expression ratios')
    gName_eChange = getERatioDict(eChangeFN)

    print('loading eSites and Transcripts')
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(geneFN)

    print('making joint dicts and loading extra data')
    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_gName = {}
    for transcript in geneSet.transcripts:
        tID_gName[transcript.id] = transcript.parent

    # load context data
    eID_tID = {}  # eID: [tID, ...]
    tID_tType = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tID = ls[1]
            tID_tType[tID] = ls[2]
            eID_tID.setdefault(int(ls[0]), []).append(tID)

    print('analyzing')
    # NOTE(review): the message below says "created/destroyed" but the test
    # only keeps rows with a value in column 4 and 'None' in column 5 - confirm.
    altered = []
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[3] != 'None' and ls[4] == 'None':
                altered.append(int(ls[0]))

    print('%s %s' % ('number of created/destroyed sites:', len(altered)))

    alteredSites = [eID_eSite[siteID] for siteID in altered]

    eChanges = []
    gDone = set()
    for eSite in alteredSites:
        # Get gene names for each eSite (3UTR transcripts only)
        genes = []
        for tID in eID_tID[eSite.ID]:
            if tID == 'NONE':
                continue

            gName = tID_gName[tID]
            if tID_tType[tID] != '3UTR':
                print('Not 3UTR %s' % (tID,))
                continue
            if gName not in genes:
                genes.append(gName)

        if len(genes) > 1:
            print('%s %s' % ('more than one gene for eSite...', genes))
            continue
        if not genes:
            # No 3UTR gene for this site.  The original fell through and
            # reused whatever gName was left over from an earlier iteration.
            continue

        gName = genes[0]
        if gName in gDone:
            continue
        gDone.add(gName)

        # Now add expression to histogram list...
        if gName not in gName_eChange:
            print('%s %s' % ('gene not in expression list', gName))
            continue
        eChanges.append(math.log(gName_eChange[gName], 2))

    # Now plot the histogram
    plt.hist(eChanges, 40)
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('# Genes')
    plt.show()
예제 #12
0
def updateSynonomous(eFN, gFN, resultsFN, outFN):
    '''Append synonymous / non-synonymous annotation to a context results file.

    eFN       -- passed to cgEdit.loadEditingSites.
    gFN       -- passed to cgGenes3.createGeneSetEditing.
    resultsFN -- tab-delimited file; column 1 is the editing-site ID,
                 column 3 the transcript ID and column 5 the coding flag
                 ('C' marks a coding hit).
    outFN     -- output path.  Every line of resultsFN is copied with five
                 extra columns: SYN/NON, codon before, codon after, amino
                 acid before, amino acid after ('NA' x5 when the transcript
                 was not analysed).

    For each coding (transcript, site) pair the site's position in the
    coding mRNA is looked up, the base there is replaced by 'G' and the
    affected codon is translated before and after the edit.  Diagnostics
    are printed to stdout along the way.
    '''
    # Load Transcripts and Editing Sites
    print('Loading editing sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('Loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    # Coding transcripts, tID : eID (many:one).  Read once; the original
    # parsed the same file twice into two identical dictionaries.
    codingTID_eID = {}
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[4] == 'C':
                codingTID_eID[ls[2]] = int(ls[0])

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    # The original assigned into an undefined name here (tID[...]) and later
    # read the undefined eJoint / tJoint, so it could never run.
    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    print('Creating scroll dict')
    scrollDict = {}  # transcript: eSite
    for tID in codingTID_eID:
        e = eID_eSite[codingTID_eID[tID]]
        t = tID_transcript.get(tID)
        if t is None:
            # transcript not present in the gene set: skipped, as the
            # original's KeyError handler intended
            continue
        scrollDict[t] = e

    print('Deducing synonomous')
    codonMap = cgSeqMod.loadCodonMap('hg19')
    finalDict = {}  # tID: [SYN/NON, codon before, codon after, aa before, aa after]
    for t in scrollDict:
        eSite = scrollDict[t]

        ePositionInMRNA = t.getRelativePositionMRNA(eSite.coordinate - 1)

        if ePositionInMRNA == -1:
            print('%s %s' % (t.id, 'should not be designated coding...'))
            continue

        mRNA = t.getMRNA(coding=True)

        if mRNA[ePositionInMRNA] != 'A':
            # Diagnostic only: processing continues with this site.
            print('wrong position %s %s:%s %s %s %s %s' % (
                t.id, eSite.chromosome, eSite.coordinate, eSite.strand,
                mRNA[ePositionInMRNA - 5:ePositionInMRNA - 1],
                mRNA[ePositionInMRNA],
                mRNA[ePositionInMRNA + 1:ePositionInMRNA + 5]))

        # edit the site
        emRNA = list(mRNA)
        emRNA[ePositionInMRNA] = 'G'
        emRNA = ''.join(emRNA)

        # Sanity-check the unedited protein sequence
        pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
        if pRNA[0] != 'M':
            print('%s %s %s' % ('Non-canonical Start AA:', pRNA[0:5], mRNA[:10]))
        if pRNA[-1] != '*':
            print('%s %s %s' % ('Non-canonical End AA:', pRNA[-5:], mRNA[-10:]))

        # compare the codon containing the edit
        mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
        emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
        codonNumber = ePositionInMRNA // 3

        # Index the two lists directly; indexing zip() only works on Python 2.
        bCodon = mCodonList[codonNumber]
        aCodon = emCodonList[codonNumber]
        print(t.id)
        print(eSite.ID)
        print(mCodonList[:codonNumber])
        print(mRNA[:ePositionInMRNA])

        baa = cgSeqMod.translateRNA(bCodon, codonMap)
        aaa = cgSeqMod.translateRNA(aCodon, codonMap)
        if baa != aaa:
            synFlag = 'NON'
            bCodonList = list(bCodon)
            aCodonList = list(aCodon)
            for before, after in zip(bCodonList, aCodonList):
                # a 'G' appearing where the original base was neither A nor G
                # means the edit landed on the wrong base
                if before != 'A' and after == 'G' and before != 'G':
                    print('%s %s %s' % ('messed up codon switch', bCodonList, aCodonList))
                    print('%s %s:%s %s %s %s %s %s' % (
                        t.parent, eSite.chromosome, eSite.coordinate,
                        eSite.strand, bCodon, aCodon, baa, aaa))
        else:
            synFlag = 'SYN'

        finalDict[t.id] = [synFlag, bCodon, aCodon, baa, aaa]

    print('writing to file')
    # update line by line; everything is read before the output is opened
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            newLine = line.strip()
            tID = newLine.split('\t')[2]

            if tID in finalDict:
                newLine = newLine + '\t%s\t%s\t%s\t%s\t%s\n' % tuple(finalDict[tID])
            else:
                newLine = newLine + '\tNA\tNA\tNA\tNA\tNA\n'

            newLines.append(newLine)

    # update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
예제 #13
0
def updateContext(editFN, geneSetFN, outFN, refBase = 'A'):
    '''Write, for every overlapped editing site, the transcripts and region type it hits.

    editFN / refBase -- passed to cgEdit.loadEditingSites.
    geneSetFN        -- passed to cgGenes3.createGeneSetEditing.
    outFN            -- output path; one tab-separated line per
                        (site, transcript): site ID, transcript parent,
                        transcript ID, region type, coding flag (C/NC),
                        transcript type.  A site with an empty transcript
                        list gets a single NONE/INTER/NC line.

    Side effect: each loaded site's coordinate is decremented by one and
    its tcc rebuilt from the new coordinate.
    '''
    print(refBase)
    # Load Transcripts and Editing Sites
    print('loading editing sites')
    sites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    # make the sites 0 based: redo coordinate and tcc
    for site in sites:
        shifted = site.coordinate - 1
        site.coordinate = shifted
        site.tcc = bioLibCG.makeTcc(site.chromosome, site.strand, shifted, shifted)

    print('creating joint dictionaries')
    siteByTcc = {}  # tcc : site
    for site in sites:
        siteByTcc[site.tcc] = site

    transcriptsByTcc = {}  # tcc : [transcript, ...]
    for transcript in geneSet.transcripts:
        transcriptsByTcc.setdefault(transcript.tcc, []).append(transcript)

    print('overlapping joints')
    tccOverlaps = compareData.getIndividualOverlaps(siteByTcc.keys(), transcriptsByTcc.keys(), 1)

    print('creating final dictionary')
    transcriptsBySite = {}  # site : [transcript, ...]
    for siteTcc in tccOverlaps:
        hits = []
        transcriptsBySite[siteByTcc[siteTcc]] = hits
        for transcriptTcc in tccOverlaps[siteTcc]:
            hits.extend(transcriptsByTcc[transcriptTcc])

    print('get context info')
    rowFormat = '%s\t%s\t%s\t%s\t%s\t%s\n'
    fOut = open(outFN, 'w')
    for site in transcriptsBySite:
        overlapped = transcriptsBySite[site]

        if len(overlapped) == 0:
            # nothing overlapped: label intergenic
            fOut.write(rowFormat % (site.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE'))
            continue

        for transcript in overlapped:
            isCodingTranscript = '_coding' in transcript.tType
            elementTypes = [element[1] for element in transcript.getOverlappingElements(site.tcc)]

            # UTR labels take precedence; otherwise use the first element type
            if '3UTR' in elementTypes:
                region = '3UTR'
            elif '5UTR' in elementTypes:
                region = '5UTR'
            else:
                region = elementTypes[0]

            # only an EXON hit in a coding transcript is flagged coding
            if region == 'EXON' and isCodingTranscript:
                flag = 'C'
            else:
                flag = 'NC'

            fOut.write(rowFormat % (site.ID, transcript.parent, transcript.id, region, flag, transcript.tType))

    fOut.close()
예제 #14
0
def newExpression(eFN, gFN, contextFN, targetFN, rpkmFN):
    """Plot cumulative log2 expression ratios for genes that gain or lose target sites.

    eFN       -- passed to cgEdit.loadEditingSites.
    gFN       -- passed to cgGenes3.createGeneSetEditing.
    contextFN -- tab-delimited: site ID, gene ID, (unused), region type.
    targetFN  -- tab-delimited: site ID, ..., column 4 = comma-separated
                 targets before editing, column 5 = targets after editing
                 ('None' for an empty list).
    rpkmFN    -- passed to getERatioDict (gene id -> expression ratio).

    Only sites with a '3UTR' region type contribute targets to their gene.
    A gene with more targets after than before counts as "created", one
    with fewer as "destroyed".  The before/after lists of those genes are
    printed and the two groups are plotted with matplotlib.

    Side effects: sets .before/.after on the editing sites named in
    targetFN and on the gene objects.
    """
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    gID_eIDs = {}
    eID_tTypes = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            gID = ls[1]
            eID = int(ls[0])
            tType = ls[3]

            siteIDs = gID_eIDs.setdefault(gID, [])
            if eID not in siteIDs:
                siteIDs.append(eID)

            eID_tTypes.setdefault(eID, []).append(tType)

    # update targeting for eSites
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eSite = eID_eSite[int(ls[0])]
            eSite.before = [] if ls[3] == 'None' else ls[3].split(',')
            eSite.after = [] if ls[4] == 'None' else ls[4].split(',')

    # gene object -> its editing sites
    gene_eSites = {}
    for gID in gID_eIDs:
        try:
            gene = geneSet.set[gID]
        except KeyError:
            # gID not in geneSet.  The original fell through and attached
            # these sites to the previous iteration's gene.
            continue

        gene_eSites[gene] = [eID_eSite[eID] for eID in gID_eIDs[gID]]

    # update before/after editing target sites for GENES
    createdGenes = []
    destroyedGenes = []

    for gene in gene_eSites:
        gene.before = []
        gene.after = []
        for eSite in gene_eSites[gene]:
            if '3UTR' in eID_tTypes[eSite.ID]:
                for micro in eSite.before:
                    gene.before.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))
                for micro in eSite.after:
                    gene.after.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))

        if len(gene.before) > len(gene.after):
            destroyedGenes.append(gene)

        if len(gene.after) > len(gene.before):
            createdGenes.append(gene)

    # print the created/destroyed sites
    for gene in createdGenes:
        print(gene.before)
        print(gene.after)

    for gene in destroyedGenes:
        print(gene.before)
        print(gene.after)

    gName_ratio = getERatioDict(rpkmFN)

    def logRatios(genes):
        # log2 ratio for every gene that has an expression entry
        values = []
        for gene in genes:
            try:
                ratio = gName_ratio[gene.id]
            except KeyError:
                # not in RPKM file --> not expressed.  The original fell
                # through and reused the previous gene's ratio.
                continue
            values.append(math.log(ratio, 2))
        return values

    eChanges = logRatios(createdGenes)
    eChanges2 = logRatios(destroyedGenes)

    # Now plot the histogram
    # NOTE(review): 'normed' was replaced by 'density' in newer Matplotlib
    # releases - confirm against the installed version.
    plt.hist(eChanges, 40, cumulative = True, histtype = 'step', normed = True, label = 'Created')
    plt.hist(eChanges2, 40, cumulative = True, histtype = 'step', normed = True, label = 'Destroyed')
    plt.legend()
    plt.title('eCDF of Genes with Target Site Changes in 3UTRs')
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('Fraction of Genes')
    plt.show()
예제 #15
0
def updateValidatedMicroTargets(editFN, microTargetFN, microSequenceFN, outFN, gFN):
    '''Record validated microRNA seed matches around editing sites, before/after editing.

    editFN          -- passed to cgEdit.loadEditingSites.
    microTargetFN,
    microSequenceFN -- passed to cgMicroRNA.loadMicroRNAFromValidated.
    outFN           -- output path; one tab-separated line per site:
                       ID, chrom:coord, strand, targets before, targets
                       after ('None' for an empty list).
    gFN             -- forwarded to checkIfSeedPresent.

    For each site a +/- 6 window is fetched (hg19), an edited copy is made
    with 'G' at the centre, and both are converted T -> U and stored in
    eSite.flank / eSite.eFlank.  Only microRNAs whose target genes include
    the site's gene are tested; names of matching microRNAs are appended
    to eSite.before / eSite.after.  Diagnostics are printed to stdout.
    '''
    flankAmount = 6
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites)

    miNames_micros = cgMicroRNA.loadMicroRNAFromValidated(microTargetFN, microSequenceFN)
    gf = GenomeFetch.GenomeFetch('hg19')

    # update flanking region
    for eSite in eSites:
        chrom = eSite.chromosome
        coord = eSite.coordinate
        strand = eSite.strand

        flankingSeq = gf.get_seq_from_to(chrom, coord - flankAmount, coord + flankAmount, strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        eSite.flank = flankingSeq.replace('T', 'U')
        eSite.eFlank = eFlankingSeq.replace('T', 'U')

    # target gene name -> microRNAs validated against it
    gName_micros = {}
    for micro in miNames_micros.values():
        for target in micro.targetGenes:
            targeting = gName_micros.setdefault(target, [])
            if micro not in targeting:
                targeting.append(micro)

    gene_m = {}
    for eSite in eSites:
        sharedMicros = gName_micros.get(eSite.gene)
        if sharedMicros is None:
            continue

        for micro in sharedMicros:
            print('')
            print('%s %s %s %s' % (micro.name, eSite.gene, micro.sequence, micro.seed))
            print(micro.comSeed)
            print('%s %s' % (eSite.flank, eSite.eFlank))
            print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate, eSite.flank, eSite.gene))
            if micro.comSeed is None:
                # no complemented seed available for this microRNA
                continue

            tested = gene_m.setdefault(eSite.gene, [])
            if micro not in tested:
                tested.append(micro)

            # flanking
            if micro.comSeed in eSite.flank:
                eSite.before.append(micro.name)

            if micro.comSeed in eSite.eFlank:
                eSite.after.append(micro.name)

    print(len(gene_m))
    count = 0
    for g in gene_m:
        print(g)
        for m in gene_m[g]:
            print('... %s' % (m.name,))
            count += 1
    print(count)

    checkIfSeedPresent(gene_m, gFN)

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            targets = ','.join(eSite.before) if eSite.before else 'None'
            eTargets = ','.join(eSite.after) if eSite.after else 'None'
            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome, eSite.coordinate, eSite.strand, targets, eTargets))
예제 #16
0
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):
    '''Plot a histogram of log2 expression change for genes with altered target sites.

    eFN       -- passed to cgEdit.loadEditingSites.
    targetFN  -- tab-delimited file; column 1 is the site ID.  A site is
                 selected when column 4 is not 'None' and column 5 is 'None'.
    contextFN -- tab-delimited file read as: site ID, transcript ID,
                 transcript type.
    geneFN    -- passed to cgGenes3.createGeneSetEditing.
    eChangeFN -- passed to getERatioDict (gene name -> expression ratio).

    For each selected site the genes of its '3UTR' transcripts are
    collected.  Sites with no such gene or with more than one are skipped;
    each gene is counted once.  The log2 of the gene's ratio is added to
    the histogram shown with matplotlib.
    '''
    print('loading expression ratios')
    gName_eChange = getERatioDict(eChangeFN)

    print('loading eSites and Transcripts')
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(geneFN)

    print('making joint dicts and loading extra data')
    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_gName = {}
    for transcript in geneSet.transcripts:
        tID_gName[transcript.id] = transcript.parent

    # load context data
    eID_tID = {}  # eID: [tID, ...]
    tID_tType = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tID = ls[1]
            tID_tType[tID] = ls[2]
            eID_tID.setdefault(int(ls[0]), []).append(tID)

    print('analyzing')
    # NOTE(review): the message below says "created/destroyed" but the test
    # only keeps rows with a value in column 4 and 'None' in column 5 - confirm.
    altered = []
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[3] != 'None' and ls[4] == 'None':
                altered.append(int(ls[0]))

    print('%s %s' % ('number of created/destroyed sites:', len(altered)))

    alteredSites = [eID_eSite[siteID] for siteID in altered]

    eChanges = []
    gDone = set()
    for eSite in alteredSites:
        # Get gene names for each eSite (3UTR transcripts only)
        genes = []
        for tID in eID_tID[eSite.ID]:
            if tID == 'NONE':
                continue

            gName = tID_gName[tID]
            if tID_tType[tID] != '3UTR':
                print('Not 3UTR %s' % (tID,))
                continue
            if gName not in genes:
                genes.append(gName)

        if len(genes) > 1:
            print('%s %s' % ('more than one gene for eSite...', genes))
            continue
        if not genes:
            # No 3UTR gene for this site.  The original fell through and
            # reused whatever gName was left over from an earlier iteration.
            continue

        gName = genes[0]
        if gName in gDone:
            continue
        gDone.add(gName)

        # Now add expression to histogram list...
        if gName not in gName_eChange:
            print('%s %s' % ('gene not in expression list', gName))
            continue
        eChanges.append(math.log(gName_eChange[gName], 2))

    # Now plot the histogram
    plt.hist(eChanges, 40)
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('# Genes')
    plt.show()
예제 #17
0
def betterSynonymous(eFN, gFN, contextFN, outFN, refBase='A', eBase='G'):
    """Classify each coding editing site as synonymous or non-synonymous.

    For every (editing site, transcript) pair listed in the context file
    whose coding flag is 'C' and whose transcript type does not contain
    '_noncoding', the coding mRNA is edited at the site (eBase is written at
    the site position), the affected codon is translated before and after
    the edit, and one tab-separated line is written to outFN:

        eSite ID, transcript parent, transcript ID, SYN|NON,
        codon before, codon after, amino acid before, amino acid after

    Diagnostics (unexpected reference base, non-canonical start/end amino
    acid, inconsistent codon change) are printed to stdout; they never stop
    processing.

    Args:
        eFN: editing-site file, passed to cgEdit.loadEditingSites.
        gFN: gene-set file, passed to cgGenes3.createGeneSetEditing.
        contextFN: tab-separated context file. Column 0 is the editing-site
            ID (int), column 2 the transcript ID, columns 3 and 4 the site
            type and the coding flag.
        outFN: path of the output file (overwritten).
        refBase: base expected at the site before editing.
        eBase: base written at the site.

    Raises:
        KeyError: if the context file names an editing-site ID that is not
            present in eFN.
    """
    print('loading e sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('loading geneSet')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    # One pass over the context file fills both lookups.
    contextInfo = {}  # eID -> {tID: [siteType, codingFlag]}; last line wins
    eID_tIDs = {}     # eID -> [tID, ...] unique, in first-seen order
    with open(contextFN, 'r') as contextFile:
        for line in contextFile:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            tID = ls[2]
            contextInfo.setdefault(eID, {})[tID] = [ls[3], ls[4]]
            knownTIDs = eID_tIDs.setdefault(eID, [])
            if tID not in knownTIDs:
                knownTIDs.append(tID)

    eID_eSite = dict((eSite.ID, eSite) for eSite in eSites)
    tID_transcript = dict((tr.id, tr) for tr in geneSet.transcripts)

    # eSite -> transcripts from the context file that exist in the gene set.
    eSite_transcripts = {}
    for eID in eID_tIDs:
        eSite = eID_eSite[eID]
        tList = []
        for tID in eID_tIDs[eID]:
            if tID == 'NONE':
                continue
            transcript = tID_transcript.get(tID)
            if transcript is None:
                continue
            tList.append(transcript)
        eSite_transcripts[eSite] = tList

    codonMap = cgSeqMod.loadCodonMap('hg19')
    with open(outFN, 'w') as outF:
        for eSite in eSite_transcripts:
            for transcript in eSite_transcripts[eSite]:

                codingType = contextInfo[eSite.ID][transcript.id][1]
                if '_noncoding' in transcript.tType:
                    continue
                if codingType != 'C':
                    continue

                # eSite.coordinate is shifted by one before the lookup
                # (presumably 1-based -> 0-based; confirm against
                # getRelativePositionMRNA).
                ePositionInMRNA = transcript.getRelativePositionMRNA(
                    eSite.coordinate - 1)
                mRNA = transcript.getMRNA(coding=True)

                if mRNA[ePositionInMRNA] != refBase:
                    print('Editing site was not an %s...' % refBase)

                # Apply the edit to a copy of the coding sequence.
                editedBases = list(mRNA)
                editedBases[ePositionInMRNA] = eBase
                emRNA = ''.join(editedBases)

                # Sanity-check the unedited protein.
                pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
                if pRNA[0] != 'M':
                    print('Non-canonical Start AA: %s %s'
                          % (pRNA[0:5], mRNA[:10]))
                if pRNA[-1] != '*':
                    print('Non-canonical End AA: %s %s'
                          % (pRNA[-5:], mRNA[-10:]))

                # Compare the codon containing the site before and after.
                codonNumber = ePositionInMRNA // 3
                bCodon = cgSeqMod.getCodonListFromRNA(mRNA)[codonNumber]
                aCodon = cgSeqMod.getCodonListFromRNA(emRNA)[codonNumber]
                baa = cgSeqMod.translateRNA(bCodon, codonMap)
                aaa = cgSeqMod.translateRNA(aCodon, codonMap)

                synFlag = 'SYN'
                if baa != aaa:
                    synFlag = 'NON'
                    # Flag any position that became eBase from something
                    # other than refBase (or eBase itself).
                    for before, after in zip(bCodon, aCodon):
                        if (before != refBase and before != eBase
                                and after == eBase):
                            print('messed up codon switch %s %s'
                                  % (list(bCodon), list(aCodon)))
                            print('%s %s:%s %s %s %s %s %s' % (
                                transcript.parent, eSite.chromosome,
                                eSite.coordinate, eSite.strand,
                                bCodon, aCodon, baa, aaa))

                outF.write('\t'.join([
                    str(eSite.ID), transcript.parent, transcript.id, synFlag,
                    bCodon, aCodon, baa, aaa
                ]) + '\n')
예제 #18
0
def updateContext(editFN, geneSetFN, outFN, refBase='A'):
    """Write the transcript context of every overlapped editing site.

    Editing sites are loaded, shifted by -1 (to 0-based, per the original
    comment) and given a fresh tcc, then overlapped with all transcript
    tccs. One tab-separated line is written to outFN per
    (site, transcript) pair:

        eSite ID, transcript parent, transcript ID, element type,
        coding flag (C|NC), transcript type

    The element type is '3UTR' if the site overlaps a 3'UTR element,
    otherwise '5UTR' if it overlaps a 5'UTR element, otherwise the type of
    the first overlapping element. The coding flag is 'C' only for an
    'EXON' element of a transcript whose type contains '_coding'. A site
    with no transcripts gets a single 'INTER' / 'NC' line with 'NONE'
    placeholders.

    Args:
        editFN: editing-site file, passed to cgEdit.loadEditingSites.
        geneSetFN: gene-set file, passed to cgGenes3.createGeneSetEditing.
        outFN: path of the output file (overwritten).
        refBase: reference base forwarded to cgEdit.loadEditingSites.

    Raises:
        IndexError: if a transcript reported as overlapping a site has no
            overlapping element.
    """
    print(refBase)
    # Load transcripts and editing sites.
    print('loading editing sites')
    eSites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    # Make the eSites 0-based: redo coordinate and tcc in place.
    for eSite in eSites:
        eSite.coordinate -= 1
        eSite.tcc = bioLibCG.makeTcc(eSite.chromosome, eSite.strand,
                                     eSite.coordinate, eSite.coordinate)

    print('creating joint dictionaries')
    eJoint = {}  # tcc -> eSite (a later site with the same tcc replaces it)
    for eSite in eSites:
        eJoint[eSite.tcc] = eSite

    tJoint = {}  # tcc -> [transcript, ...]
    for transcript in geneSet.transcripts:
        tJoint.setdefault(transcript.tcc, []).append(transcript)

    print('overlapping joints')
    # NOTE(review): meaning of the trailing 1 is defined by
    # compareData.getIndividualOverlaps; not visible here.
    tccOverlaps = compareData.getIndividualOverlaps(eJoint.keys(),
                                                    tJoint.keys(), 1)

    print('creating final dictionary')
    eSiteTranscripts = {}  # eSite -> [transcript, ...]
    for eTcc in tccOverlaps:
        eSite = eJoint[eTcc]
        overlapping = eSiteTranscripts[eSite] = []
        for tTcc in tccOverlaps[eTcc]:
            overlapping.extend(tJoint[tTcc])

    print('get context info')
    rowFormat = '%s\t%s\t%s\t%s\t%s\t%s\n'
    # 'with' guarantees the file is closed even if a row fails.
    with open(outFN, 'w') as fOut:
        for eSite in eSiteTranscripts:
            transcripts = eSiteTranscripts[eSite]
            if not transcripts:
                # No overlapping transcript: label intergenic.
                fOut.write(rowFormat %
                           (eSite.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE'))
                continue

            for transcript in transcripts:
                codingTranscript = '_coding' in transcript.tType
                tTypes = [
                    x[1] for x in transcript.getOverlappingElements(eSite.tcc)
                ]

                # UTR takes precedence over EXON, so an 'EXON' result below
                # means the site is not in a UTR.
                if '3UTR' in tTypes:
                    tType = '3UTR'
                elif '5UTR' in tTypes:
                    tType = '5UTR'
                else:
                    tType = tTypes[0]  # has to be one thing: exon or intron

                if tType == 'EXON' and codingTranscript:
                    codingFlag = 'C'
                else:
                    codingFlag = 'NC'

                fOut.write(rowFormat %
                           (eSite.ID, transcript.parent, transcript.id, tType,
                            codingFlag, transcript.tType))
예제 #19
0
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):
    """Check listed miRNA target locations against transcript sequences.

    For every (transcript, miRNA, location) row of miLocationFN, the
    reverse complement of the miRNA seed (miSequence[1:8]) is searched for
    in the sequence returned by get3UTRSeq, starting 25 positions before
    the location re-expressed relative to that sequence
    (loc - (len(mRNA) - len(utr))).
    NOTE(review): this arithmetic assumes the get3UTRSeq result is the tail
    of t.getMRNA() -- confirm.

    A hit that lies 1-29 positions before the adjusted location is
    recorded as found; no hit at all is recorded as not found. Prints the
    two counts, a blank line, the found rows, a blank line and the
    not-found rows. Each row is tab-separated:
    miRNA name, transcript name, hit index, adjusted location, location.

    Args:
        editFN: editing-site file, passed to cgEdit.loadEditingSites.
        contextFN: context file, passed to
            cgEdit.updateContextEditingSites.
        miLocationFN: tab-separated file: transcript name, miRNA name,
            integer location.
        miSequenceFN: tab-separated file: miRNA name (stored with an
            'hsa-' prefix), sequence.
        gFN: gene-set file, passed to cgGenes3.createGeneSetEditing.
    """
    # The sites are not used below; the calls are kept as in the original
    # in case loading has side effects (TODO confirm and drop).
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)

    geneSet = cgGenes3.createGeneSetEditing(gFN)
    tName_t = dict((t.id, t) for t in geneSet.transcripts)

    miName_miSequence = {}
    with open(miSequenceFN, 'r') as sequenceFile:
        for line in sequenceFile:
            ls = line.strip().split('\t')
            miName_miSequence['hsa-' + ls[0]] = ls[1]

    tName_miInfo = {}  # transcript name -> [[miName, loc], ...]
    with open(miLocationFN, 'r') as locationFile:
        for line in locationFile:
            ls = line.strip().split('\t')
            tName = ls[0]
            miName = ls[1]
            loc = int(ls[2])
            tName_miInfo.setdefault(tName, []).append([miName, loc])

    # Lists keep first-seen order for printing; sets make the duplicate
    # check O(1) instead of scanning the list each time.
    foundIt, foundSeen = [], set()
    notFoundIt, notFoundSeen = [], set()

    for tName in tName_miInfo:
        if tName not in tName_t:
            continue
        t = tName_t[tName]

        checkSeq = get3UTRSeq(t)
        try:
            mRNA = t.getMRNA()
        except Exception:
            # Best effort: skip transcripts whose mRNA cannot be built.
            continue

        for miName, loc in tName_miInfo[tName]:
            if miName not in miName_miSequence:
                continue
            miSeed = miName_miSequence[miName][1:8]
            rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)

            newLoc = loc - (len(mRNA) - len(checkSeq))
            # Clamp the start: a negative start would be interpreted by
            # str.find as an offset from the END of the sequence.
            finding = checkSeq.find(rcMiSeed, max(newLoc - 25, 0))
            newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding,
                                                newLoc, loc)

            if finding == -1:
                if miName == 'hsa-miR-21':
                    print('%s %s %s' % (loc, len(checkSeq), len(mRNA)))
                    print(mRNA)
                    print(checkSeq)
                if newResult not in notFoundSeen:
                    notFoundSeen.add(newResult)
                    notFoundIt.append(newResult)
            elif 0 < newLoc - finding < 30:
                if newResult not in foundSeen:
                    foundSeen.add(newResult)
                    foundIt.append(newResult)
            # NOTE(review): a hit outside the 1-29 window is recorded in
            # neither list, as in the original -- confirm this is intended.

    print(len(foundIt))
    print(len(notFoundIt))
    print('')
    for result in foundIt:
        print(result)
    print('')
    for result in notFoundIt:
        print(result)