예제 #1
0
def makeBins5(fN, fOut, typeFilter):
    '''Write 100 single-coordinate tcc bins for every qualifying record.

    Each input line is tab-separated with at least three columns:
    id, type, tcc.  A record is skipped when its span (end - start) is
    below 100 or when ``typeFilter`` does not occur in its type column.

    For strand '1' the bins are start, start + 1, ... start + 99; for
    strand '-1' they are end, end - 1, ... end - 99 (presumably walking
    in from the 5' end, per the function name -- confirm).  Any other
    strand value yields no bins, and only the id is written.

    fN         -- path of the input file
    fOut       -- path of the output file (overwritten)
    typeFilter -- substring required in the type column
    '''
    # Context managers guarantee both handles are closed (and the output
    # flushed) even if a malformed line raises part-way through.
    with open(fN, 'r') as inFile:
        with open(fOut, 'w') as outFile:
            for line in inFile:
                ls = line.strip().split('\t')
                recId = ls[0]
                recType, tcc = ls[1:3]
                chrom, strand, st, en = bioLibCG.tccSplit(tcc)

                # only take seqs that are long enough
                if en - st < 100:
                    continue
                if typeFilter not in recType:
                    continue

                if strand == '1':
                    positions = [st + i for i in range(100)]
                elif strand == '-1':
                    positions = [en - i for i in range(100)]
                else:
                    positions = []

                tccBins = [bioLibCG.makeTcc(chrom, strand, pos, pos)
                           for pos in positions]
                outFile.write('\t'.join([str(x) for x in [recId] + tccBins]) + '\n')
예제 #2
0
def collectIntronSeqs(fN, outFN, assembly, amount=100, prime3=True):
    '''Write the terminal ``amount`` bases of each C_INTRON record to outFN.

    Input lines are tab-separated; column 1 must contain 'C_INTRON' and
    column 2 holds the region's tcc.  Regions with end - start < amount
    are skipped.

    fN       -- input file path
    outFN    -- output file path (overwritten), one sequence per line
    assembly -- passed to gf.GenomeFetch
    amount   -- width of the slice taken from the region
    prime3   -- True: take the 3' end (upper-coordinate side for strand
                '1', lower-coordinate side otherwise).  False: take the
                mirrored, 5' side.

    The original only assigned ``tcc`` when prime3 was true, so
    prime3=False raised NameError or silently reused the previous
    record's tcc; the 5' branch here is the mirror image of the 3' one.
    Each fetched sequence is written reversed (seq[::-1]), as before.
    '''
    myG = gf.GenomeFetch(assembly)
    with open(fN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]:
                    continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])

                if end - start < amount:
                    continue

                # The upper-coordinate side is the 3' end on strand '1' and
                # the 5' end on the other strand.
                takeUpperSide = (strand == '1') == bool(prime3)
                if takeUpperSide:
                    tcc = bioLibCG.makeTcc(chrom, strand, end - amount, end)
                else:
                    tcc = bioLibCG.makeTcc(chrom, strand, start, start + amount)

                seq = myG.getSequence(tcc)
                fOut.write('%s\n' % seq[::-1])
예제 #3
0
def collectIntronSeqs(fN, outFN, assembly, amount = 100, prime3 = True):
    '''Extract one end of every C_INTRON region and write it to outFN.

    Only lines whose column 1 contains 'C_INTRON' are used; column 2 is
    the region tcc.  Regions shorter than ``amount`` (end - start) are
    skipped.  With prime3 true the slice is [end - amount, end] for
    strand '1' and [start, start + amount] for any other strand; with
    prime3 false the two choices are swapped (the 5' side).

    Previously ``tcc`` was left unassigned when prime3 was false, giving
    a NameError on the first record or a stale tcc on later ones.

    Sequences come from gf.GenomeFetch(assembly).getSequence and are
    written reversed (seq[::-1]), one per line, as in the original.
    '''
    myG = gf.GenomeFetch(assembly)

    # try/with ensures the handles are released even when a line is malformed
    with open(fN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]:
                    continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])

                if end - start < amount:
                    continue

                if strand == '1':
                    fromUpperSide = bool(prime3)
                else:
                    fromUpperSide = not prime3

                if fromUpperSide:
                    lo, hi = end - amount, end
                else:
                    lo, hi = start, start + amount

                seq = myG.getSequence(bioLibCG.makeTcc(chrom, strand, lo, hi))
                fOut.write('%s\n' % seq[::-1])
예제 #4
0
def get3UTRFromTranscriptome(tranFN, outFN, wholeGene=False):
    '''Write one tcc per transcript line: its 3' UTR, or the whole transcript.

    Columns read from each tab-separated line of tranFN:
    1 = chromosome, 2 = strand (converted by bioLibCG.switchStrandFormat),
    3/4 = transcript start/end, 5/6 = coding start/end.  Both end
    columns are decremented by one before use.

    With wholeGene true the transcript span is written.  Otherwise the
    span is (codingEnd + 1, transcriptEnd) for strand '1' and
    (transcriptStart, codingStart - 1) for any other strand.
    '''
    outHandle = open(outFN, 'w')
    inHandle = open(tranFN, 'r')
    for record in inHandle:
        cols = record.strip().split('\t')
        chromName = cols[1]
        strandFlag = bioLibCG.switchStrandFormat(cols[2])
        txLo = int(cols[3])
        txHi = int(cols[4]) - 1
        # coding bounds are parsed even for wholeGene, as in the original
        cdsLo = int(cols[5])
        cdsHi = int(cols[6]) - 1

        if wholeGene:
            lo, hi = txLo, txHi
        elif strandFlag == '1':
            lo, hi = cdsHi + 1, txHi
        else:
            lo, hi = txLo, cdsLo - 1

        outHandle.write('%s\n' % bioLibCG.makeTcc(chromName, strandFlag, lo, hi))
    inHandle.close()
    outHandle.close()
예제 #5
0
def getTccs(fN):
    '''Print a tcc for every line of the tab-separated file fN.

    Columns used: 0 = chromosome, 1 = start, 2 = end, 5 = strand (passed
    through bioLibCG.switchStrandFormat).  Start and end are forwarded as
    the raw column strings.  The layout looks like BED -- presumably;
    confirm against the callers.
    '''
    # "with" closes the handle; the original left it open.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom = ls[0]
            strand = bioLibCG.switchStrandFormat(ls[5])
            print(bioLibCG.makeTcc(chrom, strand, ls[1], ls[2]))
예제 #6
0
def transcriptSetOverlapTargets(aDir):
    '''Set ``transcriptOverlap`` on every alignment stored under aDir.

    For each alignment a target-site tcc is derived from its tTcc,
    tStart and sLength, converted with cg.convertToAS, and tested
    against the transcripts loaded from the hard-coded Ensembl gene-set
    file.  The flag is True when the converted tcc is among the
    overlapping ones returned by compare.compareTwoTcc, else False.
    The modified alignments are then committed back through the data
    controller.

    The original update loop iterated ``obj`` but read the leftover
    ``alignment`` variable from the first loop, so every object received
    the verdict of the last alignment; each object is now evaluated on
    its own coordinates.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def targetSiteTcc(alignment):
        # Window of sLength positions anchored 19 before the region start
        # (strand '1') or 19 past the region end (other strand), shifted
        # by the alignment's tStart offset.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs (first-seen order kept; the set makes the
    #membership test O(1) -- assumes tccs are hashable strings)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetSiteTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
예제 #7
0
def transcriptSetOverlapTargets(aDir):
    '''Mark each alignment in aDir with whether its target site hits a transcript.

    Steps:
      1. load the transcript set from the hard-coded Ensembl file;
      2. load all alignments via cgDB.dataController;
      3. build the unique target-site tccs and convert them with
         cg.convertToAS;
      4. find the converted tccs that overlap a transcript;
      5. set obj.transcriptOverlap (True/False) per alignment and commit.

    Fixes a stale-variable bug: step 5 used to recompute the tcc from
    ``alignment`` (the last item of the earlier loop) instead of ``obj``,
    so all alignments shared one result.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def siteTccFor(aln):
        # sLength-wide window: starts at region start - 19 + tStart on
        # strand '1'; otherwise ends at region end + 19 - tStart.
        chrom, strand, start, end = cg.tccSplit(aln.tTcc)
        if strand == '1':
            start = start - 19 + aln.tStart
            end = start + aln.sLength
        else:
            end = end + 19 - aln.tStart
            start = end - aln.sLength
        return cg.makeTcc(chrom, strand, start, end)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs, preserving first-seen order
    #(assumes tccs are hashable strings)
    uniqTccs = []
    alreadySeen = set()
    for alignment in id_alignment.values():
        tcc = siteTccFor(alignment)
        if tcc in alreadySeen:
            continue
        alreadySeen.add(tcc)
        uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update each object from its OWN coordinates
    for obj in id_alignment.values():
        obj.transcriptOverlap = cg.convertToAS(siteTccFor(obj)) in overlappingDegTccs

    aDC.commit(id_alignment)
예제 #8
0
def getTccs(fN):
    '''Print one tcc per line of fN.

    Each tab-separated line supplies chromosome (column 0), start
    (column 1), end (column 2) and strand (column 5, converted by
    bioLibCG.switchStrandFormat).  Start and end are passed on as the
    unconverted column text.
    '''
    # The handle was never closed before; the context manager closes it.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand = ls[0], bioLibCG.switchStrandFormat(ls[5])
            start, end = ls[1], ls[2]

            print(bioLibCG.makeTcc(chrom, strand, start, end))
예제 #9
0
def addOne(fN):
    '''Print every tcc from column 0 of fN with start and end raised by one.

    fN -- tab-separated file whose first column is a tcc that
          bioLibCG.tccSplit can parse.
    '''
    # "with" releases the file handle, which the original never closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            print(bioLibCG.makeTcc(chrom, strand, start + 1, end + 1))
예제 #10
0
def loadEditingSites(fN, nt = 'A'):
    '''Load editing sites from a tab-separated file into a list.

    The first line is treated as a header and discarded.  For every
    other line an EditingSite is filled from these columns:
    0 = chromosome, 1 = coordinate (int), 2 = reference base,
    3 = gene, 6 = eRatio (kept as text), 13 = ID (int).

    Strand is '-1' when the reference base equals the complement of
    ``nt`` and '1' otherwise.  The site's tcc spans the single
    coordinate.  Returns the list of EditingSite objects.
    '''
    complement = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}

    handle = open(fN, 'r')
    handle.readline() #header

    sites = []
    for row in handle:
        cols = row.strip().split('\t')

        site = EditingSite()
        site.chromosome = cols[0]
        site.coordinate = int(cols[1])
        site.gene = cols[3]
        site.eRatio = cols[6]
        site.strand = '-1' if cols[2] == complement[nt] else '1'
        site.tcc = bioLibCG.makeTcc(site.chromosome, site.strand,
                                    site.coordinate, site.coordinate)
        site.ID = int(cols[13])

        sites.append(site)
    handle.close()

    return sites
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
    '''Store, per alignment, how much expression sits in the window centre.

    For each ID in aNX.centerExpression a window of sLength positions is
    built from the alignment's tTcc and tStart (anchored 25 before the
    region start on strand '1', or 25 past the region end otherwise) and
    its expression profile is read from the wig data in wigDir.

    The stored value is a three-element list: the fraction of the
    window's summed expression found at sorted-position slices 8:12,
    7:13 and 6:14 (positions sorted ascending, reversed for strand
    '-1').  The list stays [0.0, 0.0, 0.0] when the window sum is zero
    or when the alignment's tELevel does not occur among the window
    values.  Results are saved through aNX.save().
    '''
    extend = 25

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    # slice bounds of the three nested centre windows, narrowest first
    centreSlices = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in stretch.values()

        expressionSum = sum(stretch.values())
        orderedKeys = sorted(stretch.keys(), reverse=(strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centreSlices):
            windowSum = sum((stretch[key] for key in orderedKeys[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = windowSum/expressionSum

    aNX.save()
예제 #12
0
def truncate5(fN):
    '''Write fN + '.trun5' with 50 positions trimmed from each region.

    Input lines are tab-separated with the tcc in column 2; only the
    first three columns are written back.  Regions with end - start < 50
    are counted and dropped.  For strand '1' the start is raised by 50;
    for strand '-1' the end is lowered by 50 (presumably the 5' side,
    per the function name -- confirm).

    Prints "<dropped count> <total count>" when finished and returns
    None.  On an unrecognised strand it prints 'error' and returns 1
    immediately, leaving the output file partially written.
    '''
    tCount = 0
    shortCount = 0
    # Context managers close (and flush) both files on every exit path,
    # including the early "return 1"; the original closed neither.
    with open(fN, 'r') as f:
        with open(fN + '.trun5', 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]

                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < 50:
                    shortCount += 1
                    continue

                if s == '1':
                    st = st + 50
                elif s == '-1':
                    en = en - 50
                else:
                    print('error')
                    return 1

                newTcc = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in [ls[0], ls[1], newTcc]) + '\n')

    print('%s %s' % (shortCount, tCount))
예제 #13
0
def makeTranscriptome(tranFN, outFN):
    '''Write the concatenated exon sequence of each transcript to outFN.

    Columns read from each tab-separated line of tranFN:
    0 = transcript id, 1 = chromosome, 2 = strand (converted by
    bioLibCG.switchStrandFormat), 8/9 = comma-terminated exon start/end
    lists, 10 = gene id.  Exon starts are incremented by one before use.

    Sequences are fetched from the 'hg19' GenomeFetch, joined in listed
    order, and the joined string is reversed for strand '-1'.  Each
    record is written as a '> id:gene:length' header, the sequence, and
    a blank line.
    '''
    quietPrinter = bioLibCG.cgPrint()
    quietPrinter.show = False
    fetcher = GenomeFetch.GenomeFetch('hg19')

    dest = open(outFN, 'w')
    src = open(tranFN, 'r')
    for record in src:
        cols = record.strip().split('\t')
        tChrom = cols[1]
        tStrand = bioLibCG.switchStrandFormat(cols[2])
        # [:-1] drops the trailing comma of each list
        starts = [int(x) + 1 for x in cols[8][:-1].split(',')]
        ends = [int(x) for x in cols[9][:-1].split(',')]
        tID = cols[0]
        gID = cols[10]

        pieces = [fetcher.getSequence(bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd))
                  for eStart, eEnd in zip(starts, ends)]
        mRNA = ''.join(pieces)

        #reverse direction if negative strand
        if tStrand == '-1':
            mRNA = mRNA[::-1]

        dest.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
        dest.write(mRNA + '\n\n')

    dest.close()
    src.close()
예제 #14
0
def truncate(fN):
    '''Write fN + '.trun' with 50 positions trimmed from each region.

    Input lines are tab-separated with the tcc in column 2; only the
    first three columns are written back.  Regions with end - start < 50
    are counted and dropped.  For strand '1' the end is lowered by 50;
    for strand '-1' the start is raised by 50.

    Prints "<dropped count> <total count>" when finished and returns
    None.  On an unrecognised strand it prints 'error' and returns 1
    immediately, leaving the output file partially written.
    '''
    tCount = 0
    shortCount = 0
    # Both handles are now closed on every exit path (normal end, early
    # "return 1", or an exception); previously neither was closed.
    with open(fN, 'r') as f:
        with open(fN + '.trun', 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]

                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < 50:
                    shortCount += 1
                    continue

                if s == '1':
                    en = en - 50
                elif s == '-1':
                    st = st + 50
                else:
                    print('error')
                    return 1

                newTcc = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in [ls[0], ls[1], newTcc]) + '\n')

    print('%s %s' % (shortCount, tCount))
예제 #15
0
def makePeakInputQ(cName, minExpression = 2000):
    '''Uses shell script and qsub to get peaks quickly.

    For every chromosome in cg.acceptableChroms and both strands, the
    points yielded by rangePoints(1, chromosomeLength, 30) are paired
    consecutively into tccs, and one ``qsub ... q.sh <tcc> <cName>
    <minExpression>`` job is submitted per pair.  Each submission is
    waited on before the next one starts.

    NOTE(review): the stdout log name contains only the start
    coordinate, so jobs for different chromosomes/strands share log
    paths, while the stderr log name is fully qualified -- confirm this
    is intended.
    '''
    # kept for parity with the original; the returned config is unused here
    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)

    assembly = conf.conf['assembly']
    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1','-1']:
            print('Getting Peaks for  %s %s' % (chrom, strand))
            lastPoint = 0
            for point in rangePoints(1, chromLens[chrom], 30):
                # the point equal to 1 only seeds the first segment's start
                if point != 1:
                    start, end = lastPoint, point
                    tcc = cg.makeTcc(chrom, strand, start, end)

                    log = 'logs/o-' + str(start)
                    elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
                    command = ['qsub', '-V', '-cwd', '-e', elog, '-o', log,
                               '-l', 'mem=3G', '-l', 'rt=3600',
                               'q.sh', tcc, cName, str(minExpression)]
                    subprocess.Popen(command).wait()
                lastPoint = point
예제 #16
0
def makeTranscriptome(tranFN, outFN):
    '''Build a transcript-sequence file from a tab-separated transcript table.

    Per input line: column 0 is the transcript id, 1 the chromosome,
    2 the strand (run through bioLibCG.switchStrandFormat), 8 and 9 the
    comma-terminated exon start and end lists, 10 the gene id.  One is
    added to every exon start.

    Exon sequences are fetched from GenomeFetch('hg19'), concatenated in
    the order listed, and the resulting string is reversed when the
    strand is '-1'.  Output per transcript: '> id:gene:length', the
    sequence, then an empty line.
    '''
    printer = bioLibCG.cgPrint()
    printer.show = False
    genome = GenomeFetch.GenomeFetch('hg19')

    fOut = open(outFN, 'w')
    f = open(tranFN, 'r')
    for line in f:
        fields = line.strip().split('\t')
        tChrom, tStrand = fields[1], bioLibCG.switchStrandFormat(fields[2])
        # the lists end with a comma, hence the [:-1]
        exonStarts = [int(x) + 1 for x in fields[8][:-1].split(',')]
        exonEnds = [int(x) for x in fields[9][:-1].split(',')]
        tID, gID = fields[0], fields[10]

        mRNA = ''
        for eStart, eEnd in zip(exonStarts, exonEnds):
            exonTcc = bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd)
            mRNA = mRNA + genome.getSequence(exonTcc)

        #reverse direction if negative strand
        if tStrand == '-1':
            mRNA = mRNA[::-1]

        header = '> %s:%s:%s\n' % (tID, gID, len(mRNA))
        fOut.write(header)
        fOut.write(mRNA + '\n\n')

    fOut.close()
    f.close()
예제 #17
0
def profileAroundPoint(zeroPoint, span, cName, ratio=False, ratioCoord=None):
    '''Return the profile around a point, re-indexed so the point is 0.

    span is +/- that number: a span of 30 yields keys -29 to 29.
    zeroPoint must be in tcc format with the point at its start.

    With ratio true, every value is divided by the value at ratioCoord
    (when given and truthy) or at the zero point; a divisor of 0 is
    replaced by 1.  Without ratio the raw values are returned.
    '''
    chrom, strand, centre, _regionEnd = cg.tccSplit(zeroPoint)
    windowTcc = cg.makeTcc(chrom, strand, centre - span, centre + span)
    levels = svCoord([windowTcc], cName)

    offsets = range(1 - span, span)

    if not ratio:
        return dict((off, levels[centre + off]) for off in offsets)

    reference = levels[ratioCoord] if ratioCoord else levels[centre]
    if reference == 0:
        reference = 1

    return dict((off, float(levels[centre + off]) / float(reference))
                for off in offsets)
예제 #18
0
def makePeakInputQ(cName, minExpression=2000):
    '''Uses shell script and qsub to get peaks quickly.

    Walks every chromosome listed in cg.acceptableChroms on both
    strands, splits it at the points produced by
    rangePoints(1, length, 30), and submits one qsub job (q.sh) per
    consecutive pair of points, waiting for each submission to return.

    NOTE(review): 'logs/o-<start>' is reused across chromosomes and
    strands, unlike the fully qualified 'logs/e-...' name -- confirm
    that is intended.
    '''
    # result unused; call retained exactly as in the original
    mConf = c.getConfig('Main.conf')
    assembly = c.getConfig(cName).conf['assembly']

    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            print('Getting Peaks for  %s %s' % (chrom, strand))
            previous = 0
            for current in rangePoints(1, chromLens[chrom], 30):
                segmentStart, previous = previous, current
                if current == 1:
                    # the point equal to 1 opens the first segment only
                    continue

                tcc = cg.makeTcc(chrom, strand, segmentStart, current)

                log = 'logs/o-' + str(segmentStart)
                elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, segmentStart, current)
                job = subprocess.Popen([
                    'qsub', '-V', '-cwd', '-e', elog, '-o', log, '-l',
                    'mem=3G', '-l', 'rt=3600', 'q.sh', tcc, cName,
                    str(minExpression)
                ])
                job.wait()
예제 #19
0
def profileAroundPoint(zeroPoint, span, cName, ratio = False, ratioCoord = None):
    '''Profile of the positions surrounding zeroPoint, keyed by offset.

    zeroPoint must be a tcc whose start is the point of interest.  The
    result has keys 1 - span .. span - 1 (span 30 gives -29 to 29).

    ratio=False returns the scanned values unchanged.  ratio=True
    returns each value as a float fraction of a reference value: the
    one at ratioCoord when that argument is truthy, else the one at the
    zero point.  A reference of 0 is treated as 1.
    '''
    chrom, strand, zPoint, _ignoredEnd = cg.tccSplit(zeroPoint)

    rTcc = cg.makeTcc(chrom, strand, zPoint - span, zPoint + span)
    scanDict = svCoord([rTcc], cName)

    #reorient so that zero is at zero
    returnDict = {}
    if ratio:
        if ratioCoord:
            zeroVal = scanDict[ratioCoord]
        else:
            zeroVal = scanDict[zPoint]
        if zeroVal == 0:
            zeroVal = 1
        for shift in range(1 - span, span):
            returnDict[shift] = float(scanDict[zPoint + shift]) / float(zeroVal)
        return returnDict

    for shift in range(1 - span, span):
        returnDict[shift] = scanDict[zPoint + shift]
    return returnDict
예제 #20
0
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Record the share of window expression lying at the window centre.

    Every ID in aNX.centerExpression gets a list of three fractions.
    The window is sLength positions wide, derived from the alignment's
    tTcc and tStart with a 25-position extension, and its profile is
    read via cgWig.getExpressionProfile.  Positions are sorted
    ascending (descending for strand '-1'); the fractions are the sums
    over slices 8:12, 7:13 and 6:14 divided by the window total.

    The fractions stay 0.0 when the window total is zero or when the
    alignment's tELevel is not one of the window values.
    '''
    extend = 25

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'],
             [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        stretch = cgWig.getExpressionProfile(
            bioLibCG.makeTcc(chrom, strand, start, end), wigDict)

        #make sure peak is in the small range
        peakLevel = aNX.tELevel[aID]
        peakInRange = peakLevel in stretch.values()

        expressionSum = sum(stretch.values())

        positions = sorted(stretch.keys())
        if strand == '-1':
            positions = positions[::-1]

        if expressionSum != 0 and peakInRange:
            # widen the centre slice by one position per side each step
            for slot in range(3):
                centre = positions[8 - slot:12 + slot]
                total = 0.0
                for position in centre:
                    total += stretch[position]
                aNX.centerExpression[aID][slot] = total / expressionSum

    aNX.save()
예제 #21
0
def extendPeakTest(tcc, pRange, minVal, maxAvgNoise, minPeakLength, maxPeakLength, cName):
    '''Extend a peak outward from its position and decide whether to keep it.

    The ratio profile around the peak (offsets 1 - pRange .. pRange - 1,
    from stepVectorScan.profileAroundPoint with ratio=True) is walked
    left and then right from the centre; the stretch continues while the
    value is above minVal.  The average of the profile values outside
    the stretch is taken as noise.

    Returns the tcc of the extended stretch when
    minPeakLength < length < maxPeakLength and the average noise is
    below maxAvgNoise; otherwise returns False.  False is also returned
    when no positions lie outside the stretch, so no noise average can
    be formed.

    Changes from the original: the bare ``except:`` around the division
    is replaced by an explicit zero-length check, so unrelated errors
    are no longer swallowed; pRange <= 1 returns False instead of
    raising IndexError; indentation no longer mixes tabs and spaces.
    Progress messages are still printed.
    '''
    chrom, strand, peakPosition, end = cg.tccSplit(tcc)
    ratioProfile = stepVectorScan.profileAroundPoint(tcc, pRange, cName, ratio=True)

    #left: from the middle outward (-1 down to 1 - pRange)
    startFinal = 1 - pRange
    for i in range(-1, -pRange, -1):
        if ratioProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch L')
            startFinal = i + 1
            break

    #right: from the middle outward (1 up to pRange - 1)
    endFinal = pRange - 1
    for i in range(1, pRange):
        if ratioProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch R')
            endFinal = i - 1
            break

    peakLength = endFinal - startFinal + 1

    #avg expression around peak check...
    low = startFinal
    high = endFinal
    lowRange = list(range(1 - pRange, low))
    highRange = list(range(high + 1, pRange))
    totalLength = len(lowRange) + len(highRange)
    print('%s %s %s %s %s %s' % (totalLength, pRange, low, high, lowRange, highRange))

    noiseExpression = 0
    for i in lowRange:
        noiseExpression += ratioProfile[i]
    for i in highRange:
        noiseExpression += ratioProfile[i]

    # no flanking positions -> no noise estimate; the original reached
    # this outcome through a caught ZeroDivisionError
    if totalLength == 0:
        return False
    avgNoise = noiseExpression / float(totalLength)

    #filter out peaks that look a certain way.
    if (minPeakLength < peakLength < maxPeakLength) and (avgNoise < maxAvgNoise):
        goodTcc = cg.makeTcc(chrom, strand, peakPosition + startFinal, peakPosition + endFinal)
        print('*KEEPER')
        return goodTcc

    return False
예제 #22
0
    def getOverlappingElements(self, tcc):
        '''Given region, Which element (INTRON, EXON, 5UTR, 3UTR).

        Returns a list of [segment, label] pairs, one for every segment
        of self.utr5, self.exonList, self.intronList and self.utr3 (in
        that order) whose tcc overlaps ``tcc`` according to
        bioLibCG.tccOverlap.  Labels are '5UTR', 'EXON', 'INTRON' and
        '3UTR'.

        An IndexError raised while scanning either UTR list ends that
        scan quietly; matches found before the error are kept.

        NOTE(review): the original ended with a branch meant to merge
        'EXON' and 'INTRON' into 'EXON_INTRON'.  It tested for bare
        strings in a list that only ever holds [segment, label] pairs,
        so it could never run.  It is removed here; the return value is
        unchanged.  Adding EXON_INTRON / EXON_UTR detection would change
        the result shape and needs a decision by the callers.
        '''
        overlappingElements = []

        def collect(segments, label):
            # append [segment, label] for each segment overlapping tcc
            for segment in segments:
                segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand,
                                              segment[0], segment[1])
                if bioLibCG.tccOverlap(segmentTcc, tcc):
                    overlappingElements.append([segment, label])

        try:
            collect(self.utr5, '5UTR')
        except IndexError:
            pass

        collect(self.exonList, 'EXON')
        collect(self.intronList, 'INTRON')

        try:
            collect(self.utr3, '3UTR')
        except IndexError:
            pass

        return overlappingElements
예제 #23
0
def getTccFromBowtieLine(line):
    '''Build a tcc from one tab-separated alignment line.

    Columns used: 1 = strand ('+' becomes '1', anything else '-1'),
    2 = chromosome, 3 = start (int), 4 = a text field whose length is
    added to start to obtain the end.
    '''
    fields = line.strip().split('\t')
    chrom = fields[2]
    strand = '1' if fields[1] == '+' else '-1'
    start = int(fields[3])
    end = start + len(fields[4])

    return cg.makeTcc(chrom, strand, start, end)
예제 #24
0
def getTccFromBedLine(line):
    '''Build a tcc from one tab-separated line.

    Columns used: 0 = chromosome, 1 = start (int), 2 = end (int),
    5 = strand ('+' becomes '1', anything else '-1').
    '''
    fields = line.strip().split('\t')
    chrom = fields[0]
    strand = '1' if fields[5] == '+' else '-1'
    start = int(fields[1])
    end = int(fields[2])

    return cg.makeTcc(chrom, strand, start, end)
예제 #25
0
def getTccFromBowtieLine(line):
    '''Convert a tab-separated alignment line into a tcc.

    Chromosome is column 2, strand column 1 ('+' maps to '1', every
    other value to '-1'), start column 3 as an int; the end is the
    start plus the length of column 4.
    '''
    columns = line.strip().split('\t')
    chrom, strandSymbol = columns[2], columns[1]
    if strandSymbol == '+':
        strand = '1'
    else:
        strand = '-1'
    start = int(columns[3])

    return cg.makeTcc(chrom, strand, start, start + len(columns[4]))
예제 #26
0
def getTccFromBedLine(line):
    '''Convert a tab-separated line into a tcc.

    Chromosome is column 0, strand column 5 ('+' maps to '1', every
    other value to '-1'), start and end columns 1 and 2 as ints.
    '''
    columns = line.strip().split('\t')
    chrom, strandSymbol = columns[0], columns[5]
    if strandSymbol == '+':
        strand = '1'
    else:
        strand = '-1'
    start, end = int(columns[1]), int(columns[2])

    return cg.makeTcc(chrom, strand, start, end)
예제 #27
0
def toTcc(fN, strand):
    '''Print a tcc for each 'chrom:start-end' entry in column 0 of fN.

    fN     -- tab-separated file; only the first column is read
    strand -- strand value used for every tcc

    Start and end are handed to bioLibCG.makeTcc as the text found in
    the file (no int conversion).
    '''
    # "with" closes the handle, which the original left open.
    with open(fN, 'r') as f:
        for line in f:
            region = line.strip().split('\t')[0]

            chrom = region.split(':')[0]
            span = region.split(':')[1]
            start = span.split('-')[0]
            end = span.split('-')[1]

            print(bioLibCG.makeTcc(chrom, strand, start, end))
예제 #28
0
def toTcc(fN, strand):
    '''Turn 'chrom:start-end' regions into tccs and print them.

    The region is taken from the first tab-separated column of each
    line of fN; ``strand`` is applied to all of them.  Start and end
    are passed on as text, exactly as read.
    '''
    # Context manager added so the file is always closed.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            chromPart = ls[0].split(':')
            chrom = chromPart[0]
            bounds = chromPart[1].split('-')
            start, end = bounds[0], bounds[1]

            newTcc = bioLibCG.makeTcc(chrom, strand, start, end)
            print(newTcc)
예제 #29
0
def getTccFromUCSCLine(line):
    '''Build a tcc from one tab-separated line (format may change...).

    Columns used: 1 = chromosome, 2 = start (int), 3 = end (int),
    6 = strand ('+' becomes '1', anything else '-1').
    '''
    fields = line.strip().split('\t')
    chrom = fields[1]
    strand = '1' if fields[6] == '+' else '-1'
    start = int(fields[2])
    end = int(fields[3])

    return cg.makeTcc(chrom, strand, start, end)
예제 #30
0
def getTccFromUCSCLine(line):
    '''Convert a tab-separated line into a tcc (format may change...).

    Chromosome is column 1, strand column 6 ('+' maps to '1', every
    other value to '-1'), start and end columns 2 and 3 as ints.
    '''
    columns = line.strip().split('\t')
    chrom, strandSymbol = columns[1], columns[6]
    if strandSymbol == '+':
        strand = '1'
    else:
        strand = '-1'
    start, end = int(columns[2]), int(columns[3])

    return cg.makeTcc(chrom, strand, start, end)
예제 #31
0
def peakToSeq(peakFN, extend, outFN, assembly):
    '''Write the sequence of every peak, widened by ``extend`` on both sides.

    peakFN   -- tab-separated file with the peak tcc in column 0
    extend   -- positions subtracted from the start and added to the
                end (converted with int(); may be negative)
    outFN    -- output file (overwritten), one sequence per line
    assembly -- passed to GenomeFetch.GenomeFetch
    '''
    #extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    fetcher = GenomeFetch.GenomeFetch(assembly)

    # Both files are closed by the context managers; previously neither
    # was closed, so buffered output depended on garbage collection.
    with open(peakFN, 'r') as f:
        with open(outFN, 'w') as outF:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                newTcc = bioLibCG.makeTcc(chrom, strand, start - extend, end + extend)
                outF.write(fetcher.getSequence(newTcc) + '\n')
예제 #32
0
def plotResults(rFN, smallCName, degCName):
    '''Plot every result line of rFN with 30 extra positions on each side.

    The tcc is taken from the first tab-separated column of each line,
    widened by 30 at both ends, and handed to cgPlot.plotSmallDeg
    together with the two config names, the literal 'newResults', the
    stripped line and the 1-based line number as text.
    '''
    # "with" closes the handle; enumerate replaces the manual counter.
    with open(rFN, 'r') as f:
        for i, line in enumerate(f, 1):
            stripped = line.strip()
            chrom, strand, start, end = bioLibCG.tccSplit(stripped.split('\t')[0])

            newTcc = bioLibCG.makeTcc(chrom, strand, start - 30, end + 30)
            cgPlot.plotSmallDeg(newTcc, smallCName, degCName, 'newResults', stripped, str(i))
예제 #33
0
def peakToSeq(peakFN, extend, outFN, assembly='hg19'):
    '''Write the sequence of every peak, widened by ``extend`` on both sides.

    peakFN   -- tab-separated file with the peak tcc in column 0
    extend   -- positions subtracted from the start and added to the
                end (converted with int(); may be negative)
    outFN    -- output file (overwritten), one sequence per line
    assembly -- genome assembly for GenomeFetch; defaults to 'hg19',
                the value that used to be hard-coded, so existing
                three-argument calls behave as before
    '''
    #extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    fetcher = GenomeFetch.GenomeFetch(assembly)

    # Context managers close both files, which the original never did.
    with open(peakFN, 'r') as f:
        with open(outFN, 'w') as outF:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                start, end = start - extend, end + extend
                newTcc = bioLibCG.makeTcc(chrom, strand, start, end)
                outF.write(fetcher.getSequence(newTcc) + '\n')
def plotPairs(oDir, aDir, cName):
    '''Plot the expression profile at the first filtered target found.

    Origin RNAs are loaded from oDir and alignments from aDir.  For the
    first origin RNA with passedFilter set and a non-empty
    filteredTargets list, the target window (sLength positions,
    anchored 19 before the region start on strand '1' or 19 past the
    region end otherwise, shifted by tStart) is profiled with
    cgPeaks.stretch and plotted, positions ascending (descending for
    strand '-1').  Diagnostic values are printed along the way.

    Returns 0 after showing that first plot, or None if nothing
    qualified.

    NOTE(review): the ``return 0`` inside the inner loop means only one
    pair is ever plotted.  That is kept as-is because it looks like a
    deliberate debugging stop -- confirm before removing.

    Cleanup relative to the original: the dead ``xVals = range(...)``
    assignment (immediately overwritten) and the unused ``oID`` are
    gone, and keys are sorted with sorted() instead of in-place
    list.sort() on dict.keys().
    '''
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oRNA in id_oRNA.values():

        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:

            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength
            print(sLen)
            print(oRNA.sequence)
            print(oRNA.tcc)
            print(alignment.tTcc)
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print('%s %s %s %s' % (chrom, strand, start, end))
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            xVals = sorted(stretch.profile.keys(), reverse=(strand == '-1'))
            yVals = [stretch.profile[x] for x in xVals]
            print('%s %s' % (xVals, len(xVals)))
            print('%s %s' % (yVals, len(yVals)))

            plt.plot(xVals, yVals)
            plt.show()

            return 0
예제 #35
0
    def getOverlappingElements(self, tcc):
        """Given region, Which element (INTRON, EXON, 5UTR, 3UTR).

        Scans self.utr5, self.exonList, self.intronList and self.utr3 in
        that order and returns a list of [segment, label] pairs for the
        segments whose tcc overlaps ``tcc`` (bioLibCG.tccOverlap).
        Labels are "5UTR", "EXON", "INTRON" and "3UTR".

        An IndexError during either UTR scan stops that scan without
        raising; earlier matches are kept.

        NOTE(review): the former trailing block that was meant to
        replace "EXON" + "INTRON" with "EXON_INTRON" has been dropped.
        It looked for plain strings in a list of [segment, label]
        pairs, so its condition was always false and it never ran.
        Results are identical to before.  Real EXON_INTRON / EXON_UTR
        detection would alter the result shape and is left as a
        follow-up.
        """
        overlappingElements = []

        def addOverlaps(segments, label):
            # record every segment of this kind that overlaps tcc
            for segment in segments:
                segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
                if bioLibCG.tccOverlap(segmentTcc, tcc):
                    overlappingElements.append([segment, label])

        try:
            addOverlaps(self.utr5, "5UTR")
        except IndexError:
            pass

        addOverlaps(self.exonList, "EXON")
        addOverlaps(self.intronList, "INTRON")

        try:
            addOverlaps(self.utr3, "3UTR")
        except IndexError:
            pass

        return overlappingElements
예제 #36
0
def plotASProfile(tcc, cName, directory = None, min = 0, extra = "0"):
    """Draw the sense and antisense profiles of ``tcc`` into one PNG file.

    The file is named ``<extra>.<tcc>.png`` and is placed under ``directory``
    when one is given.  Both profiles come from ``cgPeaks.stretch``; the
    antisense one is requested for the same coordinates with the strand
    field flipped between '1' and '-1'.

    Returns 0 without plotting when the highest level of either stretch is
    below ``min``; otherwise returns None after the plot has been written.
    """
    pngName = extra + '.' + tcc + '.png'
    if directory:
        pngName = directory + '/' + pngName

    # Sense profile, ordered by coordinate.
    senseStretch = cgPeaks.stretch(tcc, cName)
    if senseStretch.getHighestLevel() < min:
        return 0
    senseX = sorted(senseStretch.profile.keys())
    senseY = [senseStretch.profile[coord] for coord in senseX]

    # Antisense profile: same chromosome and coordinates, opposite strand.
    chrom, strand, start, end = tcc.strip().split(':')
    oppositeStrand = '-1' if strand == '1' else '1'
    antiTcc = cg.makeTcc(chrom, oppositeStrand, start, end)

    antiStretch = cgPeaks.stretch(antiTcc, cName)
    if antiStretch.getHighestLevel() < min:
        return 0  # the antisense side is held to the same minimum
    antiX = sorted(antiStretch.profile.keys())
    antiY = [antiStretch.profile[coord] for coord in antiX]

    # Two stacked R screens: sense on top, antisense below.
    gDevice = importr('grDevices')
    gDevice.png(file=pngName, width=1680, height=1050)
    r('split.screen(c(2,1))')
    panels = (
        ('screen(1)', senseX, senseY, "(Syn) Expression Level"),
        ('screen(2)', antiX, antiY, "(Anti) Expression Level"),
    )
    for screenCmd, xs, ys, yLabel in panels:
        r(screenCmd)
        r.plot(xs, ys, xlab = "Coordinates", ylab = yLabel)
        r.lines(xs, ys, type = "b")
    gDevice.dev_off()
def plotPairs(oDir, aDir, cName):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oID, oRNA in id_oRNA.items():

        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:

            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength
            print sLen
            print oRNA.sequence
            print oRNA.tcc
            print alignment.tTcc
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print chrom, strand, start, end
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            sortedKeys = stretch.profile.keys()
            sortedKeys.sort()

            if strand == '-1':
                sortedKeys.reverse()

            xVals = range(1, sLen + 2)
            xVals = sortedKeys
            yVals = [stretch.profile[x] for x in sortedKeys]
            print xVals, len(xVals)
            print yVals, len(yVals)

            plt.plot(xVals, yVals)
            plt.show()

            return 0
예제 #38
0
def updateSequence(oFN, oFF, extend, assembly):
    """Replace each record's sequence with one fetched over a widened tcc.

    A ``Nexus`` built from ``oFN`` and ``oFF`` is loaded with its 'sequence'
    and 'tcc' fields.  For every record it steps through, the tcc is widened
    by ``extend`` on both sides, the sequence for the widened tcc is fetched
    from a ``GenomeFetch`` for ``assembly``, and stored on the record.  The
    Nexus is saved once all records have been visited.
    """
    records = Nexus(oFN, oFF)
    records.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)

    while records.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(records.tcc)
        widenedTcc = bioLibCG.makeTcc(chrom, strand,
                                      start - extend, end + extend)
        records.sequence = fetcher.getSequence(widenedTcc)

    records.save()
예제 #39
0
def plotResults(rFN, smallCName, degCName):
    """Plot every result line of ``rFN`` with ``cgPlot.plotSmallDeg``.

    The first tab-separated field of each line is read as a tcc and widened
    by 30 on both sides.  ``cgPlot.plotSmallDeg`` is then called with the
    widened tcc, ``smallCName``, ``degCName``, the literal 'newResults', the
    stripped line and the 1-based line number as a string.

    The file is opened in a ``with`` block so it is closed even when a line
    fails to parse or plotting raises.
    """
    with open(rFN, 'r') as f:
        for i, line in enumerate(f, 1):
            record = line.strip()
            chrom, strand, start, end = bioLibCG.tccSplit(
                record.split('\t')[0])

            newTcc = bioLibCG.makeTcc(chrom, strand, start - 30, end + 30)
            cgPlot.plotSmallDeg(newTcc, smallCName, degCName, 'newResults',
                                record, str(i))
예제 #40
0
def updateSequence(oFN, oFF, extend, assembly):
    """Replace each record's sequence with one fetched over a widened tcc.

    A ``Nexus`` built from ``oFN`` and ``oFF`` is loaded with its 'sequence'
    and 'tcc' fields.  For every record it steps through, the tcc is widened
    by ``extend`` on both sides, the sequence for the widened tcc is fetched
    from a ``GenomeFetch`` for ``assembly``, and stored on the record.  The
    Nexus is saved once all records have been visited.
    """
    records = Nexus(oFN, oFF)
    records.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)

    while records.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(records.tcc)
        widenedTcc = bioLibCG.makeTcc(chrom, strand,
                                      start - extend, end + extend)
        records.sequence = fetcher.getSequence(widenedTcc)

    records.save()
예제 #41
0
def countWithBinsSet(dFN, binDir, type = 'INTRON'):

        dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
        dNX.load(['tcc'])

        
        numBins = 1
        
        c_s_bin_set = {}

        for chrom in bioLibCG.humanChromosomes:
                for strand in ('1', '-1'):
                        #initialize data structure
                        for i in range(0, numBins):
                                c_s_bin_set.setdefault(chrom, {}).setdefault(strand, {})[i] = set()
                        f = open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r')
                        for line in f:
                                ls = line.strip().split('\t')
                                tccs = ls[1:numBins + 1]
                                for i in range(0,numBins):
                                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                                        for j in range(sta, end + 1):
                                                c_s_bin_set[chrom][strand][i].add(j)

        #collect dTtcs in list
        dTccs = []
        for dID in dNX.tcc:
                tcc = dNX.tcc[dID]
                c, s, st, en = bioLibCG.tccSplit(tcc)
                if s == '1':
                        s = '-1'
                        en = st
                else:
                        s = '1'
                        st = en
                dTccs.append(bioLibCG.makeTcc(c,s,st,en))

        #make bCounts
        binCounts = [0] * numBins

        #count for each bin
        for i in range(0, numBins):
                for dTcc in dTccs:
                        c, s, st, en = bioLibCG.tccSplit(dTcc)
                        for j in range(st, en + 1):
                                if j in c_s_bin_set[c][s][i]:
                                        binCounts[i] += 1
                print '%s\t%s' % (i, binCounts[i])
예제 #42
0
def countWithBinsSet(dFN, binDir, type='INTRON'):

    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1

    c_s_bin_set = {}

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            for i in range(0, numBins):
                c_s_bin_set.setdefault(chrom, {}).setdefault(strand,
                                                             {})[i] = set()
            f = open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r')
            for line in f:
                ls = line.strip().split('\t')
                tccs = ls[1:numBins + 1]
                for i in range(0, numBins):
                    ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                    for j in range(sta, end + 1):
                        c_s_bin_set[chrom][strand][i].add(j)

    #collect dTtcs in list
    dTccs = []
    for dID in dNX.tcc:
        tcc = dNX.tcc[dID]
        c, s, st, en = bioLibCG.tccSplit(tcc)
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #make bCounts
    binCounts = [0] * numBins

    #count for each bin
    for i in range(0, numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            for j in range(st, en + 1):
                if j in c_s_bin_set[c][s][i]:
                    binCounts[i] += 1
        print '%s\t%s' % (i, binCounts[i])
def markCenterExpression(aDir, cName):
    """Store, per alignment, the share of expression in three central windows.

    Alignments are loaded from ``aDir`` through ``cgDB.dataController``.  For
    each one the target tcc is narrowed to a window of ``sLength`` positions
    (anchored 19 - tStart before the start on strand '1', mirrored from the
    end otherwise) and the ``cgPeaks.stretch`` profile for that window is
    taken.  With the profile coordinates ordered (descending on strand '-1'),
    the levels at index slices [8:12], [7:13] and [6:14] are summed and
    divided by the stretch's total level; the three ratios are stored in
    ``alignment.centerExpression``.  The list stays [0.0, 0.0, 0.0] when the
    total level is 0.  All alignments are committed at the end.
    """
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # (lower, upper) slice bounds, one pair per centerExpression slot.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(
            bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        orderedCoords = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum == 0:
            continue

        for slot, (lower, upper) in enumerate(centerWindows):
            windowTotal = 0.0
            for coord in orderedCoords[lower:upper]:
                windowTotal += stretch.profile[coord]
            alignment.centerExpression[slot] = windowTotal / expressionSum

    aDC.commit(id_alignment)
예제 #44
0
def markCenterExpression(aDir, cName):
    """Store, per alignment, the share of expression in three central windows.

    Alignments are loaded from ``aDir`` through ``cgDB.dataController``.  For
    each one the target tcc is narrowed to a window of ``sLength`` positions
    (anchored 19 - tStart before the start on strand '1', mirrored from the
    end otherwise) and the ``cgPeaks.stretch`` profile for that window is
    taken.  With the profile coordinates ordered (descending on strand '-1'),
    the levels at index slices [8:12], [7:13] and [6:14] are summed and
    divided by the stretch's total level; the three ratios are stored in
    ``alignment.centerExpression``.  The list stays [0.0, 0.0, 0.0] when the
    total level is 0.  All alignments are committed at the end.
    """
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # (lower, upper) slice bounds, one pair per centerExpression slot.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(
            bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        orderedCoords = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum == 0:
            continue

        for slot, (lower, upper) in enumerate(centerWindows):
            windowTotal = 0.0
            for coord in orderedCoords[lower:upper]:
                windowTotal += stretch.profile[coord]
            alignment.centerExpression[slot] = windowTotal / expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aFN, cName, rn=None, tn=None):
    """Store, per alignment ID, the share of expression in three central windows.

    A ``cgNexusFlat.Nexus`` over ``aFN`` is loaded with the centerExpression,
    tTcc, tStart and sLength fields; ``[rn, tn]`` is handed to ``load``
    unchanged.  For each ID the target tcc is narrowed to a window of
    ``sLength`` positions (anchored 19 - tStart before the start on strand
    '1', mirrored from the end otherwise) and the ``cgPeaks.stretch`` profile
    for that window is taken.  With the profile coordinates ordered
    (descending on strand '-1'), the levels at index slices [8:12], [7:13]
    and [6:14] are summed and divided by the stretch's total level.  The
    entry stays [0.0, 0.0, 0.0] when the total level is 0.  The Nexus is
    saved at the end.
    """
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    # (lower, upper) slice bounds, one pair per centerExpression slot.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(
            bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        orderedCoords = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum == 0:
            continue

        for slot, (lower, upper) in enumerate(centerWindows):
            windowTotal = 0.0
            for coord in orderedCoords[lower:upper]:
                windowTotal += stretch.profile[coord]
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
예제 #46
0
def profileTargetsHistoAS(tccList, cName, name='boxplot'):
    """Collect sense and antisense profiles around each tcc's highest peak and box-plot them.

    For every tcc in ``tccList`` the highest peak is looked up on the tcc
    itself and on its ``cg.convertToAS`` counterpart; a tcc is skipped when
    either lookup returns None.  ``svs.profileAroundPoint`` is then called
    for a tcc running from the sense peak to the original end, and again for
    the antisense conversion of that tcc (with ``ratioCoord`` set to the
    antisense peak).  The values returned for each coordinate key are
    accumulated across tccs and passed to ``plot.boxPlotHistoAS``.
    """
    # Passed positionally as the second argument of svs.profileAroundPoint.
    # Named `span` so the builtin range() is not shadowed.
    span = 50
    histDict = {}  # {coord: [value, ...]} for the sense profiles
    histDictAS = {}  # same layout for the antisense profiles
    for tcc in tccList:

        chrom, strand, _, end = cg.tccSplit(tcc)

        # Highest peak (sense)
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span=2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Highest peak (antisense)
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span=2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Profile around point (sense)
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio=True)
        for coord in cProfile:
            histDict.setdefault(coord, []).append(cProfile[coord])

        # Profile around point (antisense)
        zPointAS = cg.convertToAS(zPoint)
        cProfileAS = svs.profileAroundPoint(zPointAS,
                                            span,
                                            cName,
                                            ratio=True,
                                            ratioCoord=highestCoordAS)
        for coord in cProfileAS:
            histDictAS.setdefault(coord, []).append(cProfileAS[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name=name)
def markCenterExpression(aFN, cName, rn = None, tn = None):
    """Store, per alignment ID, the share of expression in three central windows.

    A ``cgNexusFlat.Nexus`` over ``aFN`` is loaded with the centerExpression,
    tTcc, tStart and sLength fields; ``[rn, tn]`` is handed to ``load``
    unchanged.  For each ID the target tcc is narrowed to a window of
    ``sLength`` positions (anchored 19 - tStart before the start on strand
    '1', mirrored from the end otherwise) and the ``cgPeaks.stretch`` profile
    for that window is taken.  With the profile coordinates ordered
    (descending on strand '-1'), the levels at index slices [8:12], [7:13]
    and [6:14] are summed and divided by the stretch's total level.  The
    entry stays [0.0, 0.0, 0.0] when the total level is 0.  The Nexus is
    saved at the end.
    """
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength'], [rn, tn])

    # (lower, upper) slice bounds, one pair per centerExpression slot.
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(
            bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        orderedCoords = sorted(stretch.profile.keys())
        if strand == '-1':
            orderedCoords.reverse()

        if expressionSum == 0:
            continue

        for slot, (lower, upper) in enumerate(centerWindows):
            windowTotal = 0.0
            for coord in orderedCoords[lower:upper]:
                windowTotal += stretch.profile[coord]
            aNX.centerExpression[aID][slot] = windowTotal / expressionSum

    aNX.save()
예제 #48
0
def getTccFromSamLine(line):
    '''Build a tcc from one SAM alignment line; return None if it cannot be parsed.

    SAM files start with header lines that do not have the alignment
    columns.  Those (and any other line with too few fields or non-numeric
    FLAG/POS) print a warning followed by the line, and yield None.

    Column 2 (FLAG) selects the strand, column 3 is the chromosome, column 4
    the start; the end is the start plus the length of column 10 (the read
    sequence).  FLAG is a bit field, so the reverse-strand bit 0x10 is
    tested instead of comparing the whole field to '16'; reverse-strand
    reads that carry additional flag bits are reported as '-1' as well.
    '''
    try:
        lineSplit = line.strip().split('\t')
        flag = int(lineSplit[1])
        chrom = lineSplit[2]
        start = int(lineSplit[3])
        end = start + len(lineSplit[9])
    except (IndexError, ValueError):
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement.
        print('Warning: line failed parsing')
        print(line.strip())
        return None

    strand = '-1' if flag & 0x10 else '1'
    return cg.makeTcc(chrom, strand, start, end)
예제 #49
0
def locateASignals(dataFN, outFN, rn = None, tn = None):

    #load data
    NX = cgNexusFlat.Nexus(dataFN, ASite)
    NX.load(['coord', 'sequence'], [rn, tn])

    f = open(outFN, 'w')
    for id in NX.ids:
        chrom, strand, start, end = bioLibCG.tccSplit(NX.coord[id])
        if len(NX.sequence[id]) < 10: continue
        print NX.sequence[id], '\n'
        checkFrames = bioLibCG.returnFrames(NX.sequence[id], 6)
        for i, frame in enumerate(checkFrames):
            if frame == 'AATAAA':
                #assume 0-based...?
                siteStart, siteEnd = start + i, start + i + 5
                f.write('%s\n' % bioLibCG.makeTcc(chrom, strand, siteStart, siteEnd))
    f.close()
예제 #50
0
def overlapWithDegradome(dFN, eFN):

    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    f = open(dFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
        start = start - 3
        end = end + 3
        degTccs.append(bioLibCG.makeTcc(chrom, strand, start, end))
    print degTccs[0:5]
    eTccs = [eSite.tcc for eSite in eSites]

    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print len(overlaps)
예제 #51
0
def overlapWithDegradome(dFN, eFN):

    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    f = open(dFN, "r")
    for line in f:
        ls = line.strip().split("\t")
        chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
        start = start - 3
        end = end + 3
        degTccs.append(bioLibCG.makeTcc(chrom, strand, start, end))
    print degTccs[0:5]
    eTccs = [eSite.tcc for eSite in eSites]

    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print len(overlaps)
예제 #52
0
def getTccFromSamLine(line):
    '''Build a tcc from one SAM alignment line; return None if it cannot be parsed.

    SAM files start with header lines that do not have the alignment
    columns.  Those (and any other line with too few fields or non-numeric
    FLAG/POS) print a warning followed by the line, and yield None.

    Column 2 (FLAG) selects the strand, column 3 is the chromosome, column 4
    the start; the end is the start plus the length of column 10 (the read
    sequence).  FLAG is a bit field, so the reverse-strand bit 0x10 is
    tested instead of comparing the whole field to '16'; reverse-strand
    reads that carry additional flag bits are reported as '-1' as well.
    '''
    try:
        lineSplit = line.strip().split('\t')
        flag = int(lineSplit[1])
        chrom = lineSplit[2]
        start = int(lineSplit[3])
        end = start + len(lineSplit[9])
    except (IndexError, ValueError):
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement.
        print('Warning: line failed parsing')
        print(line.strip())
        return None

    strand = '-1' if flag & 0x10 else '1'
    return cg.makeTcc(chrom, strand, start, end)
예제 #53
0
def profileTargetsHistoAS(tccList, cName, name = 'boxplot'):
    """Collect sense and antisense profiles around each tcc's highest peak and box-plot them.

    For every tcc in ``tccList`` the highest peak is looked up on the tcc
    itself and on its ``cg.convertToAS`` counterpart; a tcc is skipped when
    either lookup returns None.  ``svs.profileAroundPoint`` is then called
    for a tcc running from the sense peak to the original end, and again for
    the antisense conversion of that tcc (with ``ratioCoord`` set to the
    antisense peak).  The values returned for each coordinate key are
    accumulated across tccs and passed to ``plot.boxPlotHistoAS``.
    """
    # Passed positionally as the second argument of svs.profileAroundPoint.
    # Named `span` so the builtin range() is not shadowed.
    span = 50
    histDict = {}  # {coord: [value, ...]} for the sense profiles
    histDictAS = {}  # same layout for the antisense profiles
    for tcc in tccList:

        chrom, strand, _, end = cg.tccSplit(tcc)

        # Highest peak (sense)
        tccStretch = cgPeaks.stretch(tcc, cName)
        tccStretch.createPeaks(span = 2)
        highestCoord = tccStretch.getHighestPeak()
        if highestCoord is None:
            continue

        # Highest peak (antisense)
        tccStretchAS = cgPeaks.stretch(cg.convertToAS(tcc), cName)
        tccStretchAS.createPeaks(span = 2)
        highestCoordAS = tccStretchAS.getHighestPeak()
        if highestCoordAS is None:
            continue

        # Profile around point (sense)
        zPoint = cg.makeTcc(chrom, strand, highestCoord, end)
        cProfile = svs.profileAroundPoint(zPoint, span, cName, ratio = True)
        for coord in cProfile:
            histDict.setdefault(coord, []).append(cProfile[coord])

        # Profile around point (antisense)
        zPointAS = cg.convertToAS(zPoint)
        cProfileAS = svs.profileAroundPoint(zPointAS, span, cName, ratio = True, ratioCoord = highestCoordAS)
        for coord in cProfileAS:
            histDictAS.setdefault(coord, []).append(cProfileAS[coord])

    plot.boxPlotHistoAS(histDict, histDictAS, name = name)