Esempio n. 1
0
	def transcriptOverlaps(self, tccs):
		'''Collect the transcripts whose tcc is reported by compare.compareTwoTcc.

		tccs must be a list. Any other type prints a warning and the
		integer 1 is returned instead of a transcript list.
		'''
		if not isinstance(tccs, list):
			print('transcript overlaps: NEED TCC LIST, not a single tcc!')
			return 1

		print('%s %s' % ('num of tccs being compared', len(tccs)))

		# tcc --> ids of all transcripts that share that tcc.
		idsByTcc = {}
		for gene in self.genes:
			for transcript in gene.transcripts:
				idsByTcc.setdefault(transcript.tcc, []).append(transcript.id)

		print('%s %s' % ('num of tcc transcript tccs', len(idsByTcc.keys())))

		overlapped = compare.compareTwoTcc(idsByTcc.keys(), tccs, 1)

		# Second pass returns the transcript objects themselves, in gene order.
		return [transcript
			for gene in self.genes
			for transcript in gene.transcripts
			if transcript.tcc in overlapped]
Esempio n. 2
0
def transcriptSetOverlap(aDir, AS):
    """Store a transcriptOverlap flag on every OriginRNA record under aDir.

    aDir -- location handed to cgDB.dataController.
    AS   -- truthy to test cg.convertToAS(tcc) rather than the stored tcc
            (the original note says the AS peak is where the targeted
            transcript lies).

    A record is flagged True when its tested tcc is among the values
    compare.compareTwoTcc returns for the overlapping transcripts; all
    records are committed back at the end.
    """
    AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    def testedTcc(oRNA):
        # The tcc that is actually compared for this record.
        if AS:
            return cg.convertToAS(oRNA.tcc)
        return oRNA.tcc

    degTccs = [testedTcc(oRNA) for oRNA in id_oRNA.values()]

    # Transcripts hit by any tested tcc, then the tested tccs that hit them.
    hitTranscripts = allExons.transcriptOverlaps(degTccs)
    hitTranscriptTccs = [transcript.tcc for transcript in hitTranscripts]
    hitDegTccs = compare.compareTwoTcc(degTccs, hitTranscriptTccs, 1)

    for oRNA in id_oRNA.values():
        oRNA.transcriptOverlap = testedTcc(oRNA) in hitDegTccs

    oRNA_DC.commit(id_oRNA)
def transcriptSetOverlap(aDir, AS):
    """Store a transcriptOverlap flag on every OriginRNA record under aDir.

    aDir -- location handed to cgNexusFlat.dataController.
    AS   -- truthy to test cg.convertToAS(tcc) rather than the stored tcc.

    A record is flagged True when its tested tcc is among the values
    compare.compareTwoTcc returns for the overlapping transcripts; all
    records are committed back at the end.
    """
    AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    oRNA_DC = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()
    if AS:
        degTccs = [cg.convertToAS(x.tcc) for x in id_oRNA.values()]
    else:
        degTccs = [x.tcc for x in id_oRNA.values()]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update every record, then persist
    for obj in id_oRNA.values():
        if AS:
            degTcc = cg.convertToAS(obj.tcc)
        else:
            degTcc = obj.tcc
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
Esempio n. 4
0
	def transcriptOverlaps(self, tccs):
		'''Collect the transcripts whose tcc is reported by compare.compareTwoTcc.

		tccs must be a list. Any other type prints a warning and the
		integer 1 is returned instead of a transcript list.
		'''
		if not isinstance(tccs, list):
			print('transcript overlaps: NEED TCC LIST, not a single tcc!')
			return 1

		print('%s %s' % ('num of tccs being compared', len(tccs)))

		# tcc --> ids of all transcripts that share that tcc.
		idsByTcc = {}
		for gene in self.genes:
			for transcript in gene.transcripts:
				idsByTcc.setdefault(transcript.tcc, []).append(transcript.id)

		print('%s %s' % ('num of tcc transcript tccs', len(idsByTcc.keys())))

		overlapped = compare.compareTwoTcc(idsByTcc.keys(), tccs, 1)

		# Second pass returns the transcript objects themselves, in gene order.
		return [transcript
			for gene in self.genes
			for transcript in gene.transcripts
			if transcript.tcc in overlapped]
Esempio n. 5
0
def compareMouseHuman(mFN, hFNLift):
    """Print compareTwoTcc(..., amount=True) for two sets of tccs.

    mFN     -- file loaded through cgNexusFlat.Nexus as OriginRNA records;
               only the 'tcc' attribute is loaded.
    hFNLift -- tab-separated text file; the first column of each line is
               taken as a tcc.
    """
    aDC = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    aDC.load(['tcc'])

    # Close the lift file even if a line cannot be parsed.
    f = open(hFNLift, 'r')
    try:
        bTccs = [line.strip().split('\t')[0] for line in f]
    finally:
        f.close()

    aTccs = [aDC.tcc[rnaID] for rnaID in aDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
Esempio n. 6
0
def compareTwoMouse(aFN, bFN):
    """Print compareTwoTcc(..., amount=True) for the tccs of two Nexus files.

    Both aFN and bFN are loaded through cgNexusFlat.Nexus as OriginRNA
    records with only the 'tcc' attribute.
    """
    def loadTccNexus(fileName):
        # Open one file and pull in just its tcc attribute.
        nexus = cgNexusFlat.Nexus(fileName, cgOriginRNAFlat.OriginRNA)
        nexus.load(['tcc'])
        return nexus

    aDC = loadTccNexus(aFN)
    bDC = loadTccNexus(bFN)

    aTccs = [aDC.tcc[rnaID] for rnaID in aDC.tcc]
    bTccs = [bDC.tcc[rnaID] for rnaID in bDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
def compareMouseHuman(mFN, hFNLift):
    """Print compareTwoTcc(..., amount=True) for two sets of tccs.

    mFN     -- file loaded through cgNexusFlat.Nexus as OriginRNA records;
               only the 'tcc' attribute is loaded.
    hFNLift -- tab-separated text file; the first column of each line is
               taken as a tcc.
    """
    aDC = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    aDC.load(['tcc'])

    # Close the lift file even if a line cannot be parsed.
    f = open(hFNLift, 'r')
    try:
        bTccs = [line.strip().split('\t')[0] for line in f]
    finally:
        f.close()

    aTccs = [aDC.tcc[rnaID] for rnaID in aDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
Esempio n. 8
0
def transcriptSetOverlapTargets(aDir):
    """Store a transcriptOverlap flag on every cgAlignment record under aDir.

    For each alignment a tcc is rebuilt from tTcc, tStart and sLength,
    converted with cg.convertToAS, and tested against the known transcript
    set. The flag is True when that converted tcc is among the values
    returned by compare.compareTwoTcc; records are committed at the end.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetTcc(alignment):
        # Rebuild the tcc for one alignment.
        # NOTE(review): the 19 looks like a fixed flank added around tTcc
        # upstream -- confirm against the code that produces tTcc.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs (first-seen order, set only for fast lookup)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update: each record is tested with its OWN coordinates (the previous
    #code reused the last alignment of the loop above for every record)
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
Esempio n. 9
0
def transcriptSetOverlapTargets(aDir):
    """Store a transcriptOverlap flag on every cgAlignment record under aDir.

    For each alignment a tcc is rebuilt from tTcc, tStart and sLength,
    converted with cg.convertToAS, and tested against the known transcript
    set. The flag is True when that converted tcc is among the values
    returned by compare.compareTwoTcc; records are committed at the end.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetTcc(alignment):
        # Rebuild the tcc for one alignment.
        # NOTE(review): the 19 looks like a fixed flank added around tTcc
        # upstream -- confirm against the code that produces tTcc.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs (first-seen order, set only for fast lookup)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update: each record is tested with its OWN coordinates (the previous
    #code reused the last alignment of the loop above for every record)
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def compareTwoMouse(aFN, bFN):
    """Print compareTwoTcc(..., amount=True) for the tccs of two Nexus files.

    Both aFN and bFN are loaded through cgNexusFlat.Nexus as OriginRNA
    records with only the 'tcc' attribute.
    """
    def loadTccNexus(fileName):
        # Open one file and pull in just its tcc attribute.
        nexus = cgNexusFlat.Nexus(fileName, cgOriginRNAFlat.OriginRNA)
        nexus.load(['tcc'])
        return nexus

    aDC = loadTccNexus(aFN)
    bDC = loadTccNexus(bFN)

    aTccs = [aDC.tcc[rnaID] for rnaID in aDC.tcc]
    bTccs = [bDC.tcc[rnaID] for rnaID in bDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
def splitExonsIntrons(cName = None):
	'''Split the sorted prediction file into ".exons" and ".introns" files.

	cName -- configuration name passed to c.getConfig.

	A hairpin counts as exonic when compare.compareTwoTcc reports an
	overlap amount of at least minOverlap (50) with the organism's exon
	list. Prediction lines whose 8th tab-separated field is the CID of
	such a hairpin go to the ".exons" file, all others to ".introns".
	'''
	# Result is unused; the call is kept in case loading has side effects.
	c.cgConfig('Main.conf')
	conf = c.getConfig(cName)

	#init
	organism = conf.conf['organism']
	minOverlap = 50
	cHairs = getHairpins.getHairpins() #CID: HAIRPIN
	exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
	hairpins = [cHairs[CID] for CID in cHairs]

	print('checking overlaps')
	#check which hairpins overlap exons and by how much
	exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True)
	print('%s %s' % ('  ', len(exonOverlapped)))

	print('removing partial introns')
	#keep only the ones that overlap at least minOverlap (single pass
	#instead of collecting and list.remove()-ing each short overlap)
	exonOverlapped = [pair for pair in exonOverlapped if not pair[1] < minOverlap]
	print('%s %s %s %s' % ('  ', len(exonOverlapped), 'out of', len(cHairs.keys())))

	#get CIDs of exons; sets make the per-line lookup below constant time
	exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
	exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

	#Open sorted predictions and write lines with CIDs to respective files
	predFile = exonFile = intronFile = None
	try:
		predFile = open(conf.conf['resultsSorted'], 'r')
		exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
		intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
		for line in predFile:
			if line.split('\t')[7] in exonCIDs:
				exonFile.write(line)
			else:
				intronFile.write(line)
	finally:
		# Close whatever was opened, even if a line is malformed.
		for handle in (predFile, exonFile, intronFile):
			if handle is not None:
				handle.close()
Esempio n. 12
0
def splitExonsIntrons(cName=None):
    """Split the sorted prediction file into ".exons" and ".introns" files.

    cName -- configuration name passed to c.getConfig.

    A hairpin counts as exonic when compare.compareTwoTcc reports an
    overlap amount of at least minOverlap (50) with the organism's exon
    list. Prediction lines whose 8th tab-separated field is the CID of
    such a hairpin go to the ".exons" file, all others to ".introns".
    """
    # Result is unused; the call is kept in case loading has side effects.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  #CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    #check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('%s %s' % ('  ', len(exonOverlapped)))

    print('removing partial introns')
    #keep only the ones that overlap at least minOverlap (single pass
    #instead of collecting and list.remove()-ing each short overlap)
    exonOverlapped = [pair for pair in exonOverlapped if not pair[1] < minOverlap]
    print('%s %s %s %s' % ('  ', len(exonOverlapped), 'out of', len(cHairs.keys())))

    #get CIDs of exons; sets make the per-line lookup below constant time
    exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    #Open sorted predictions and write lines with CIDs to respective files
    predFile = exonFile = intronFile = None
    try:
        predFile = open(conf.conf['resultsSorted'], 'r')
        exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
        intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
        for line in predFile:
            if line.split('\t')[7] in exonCIDs:
                exonFile.write(line)
            else:
                intronFile.write(line)
    finally:
        # Close whatever was opened, even if a line is malformed.
        for handle in (predFile, exonFile, intronFile):
            if handle is not None:
                handle.close()
Esempio n. 13
0
def reportDifference(oldFN, newFN):
    """Print which lines of two coordinate files are shared, old-only or new-only.

    oldFN, newFN -- tab-separated text files whose first column is a tcc.

    Under 'Both' one line is printed per coordinate returned by
    compareData.compareTwoTcc(oldCoords, newCoords, 1): the first line
    (new file first, then old) that contains the coordinate. Under 'old'
    and 'new' every line of the respective file not printed under 'Both'
    is listed.
    """
    def readLines(fileName):
        # Read a whole file, closing it even if reading fails.
        f = open(fileName, 'r')
        try:
            return f.readlines()
        finally:
            f.close()

    oldList = readLines(oldFN)
    oldCoords = [x.strip().split('\t')[0] for x in oldList]

    newList = readLines(newFN)
    newCoords = [x.strip().split('\t')[0] for x in newList]

    bothList = newList + oldList

    bothCoords = compareData.compareTwoTcc(oldCoords, newCoords, 1)

    print('Both')
    knownLines = set()
    for coord in bothCoords:
        for line in bothList:
            # NOTE(review): substring test, so 'chr1:1:10:20' also matches a
            # line starting 'chr1:1:10:200' -- confirm this is acceptable.
            if coord in line:
                print(line.strip())
                knownLines.add(line)
                break

    print('old')
    for line in oldList:
        if line not in knownLines:
            print(line.strip())

    print('new')
    for line in newList:
        if line not in knownLines:
            print(line.strip())
Esempio n. 14
0
def reportDifference(oldFN, newFN):
    """Print which lines of two coordinate files are shared, old-only or new-only.

    oldFN, newFN -- tab-separated text files whose first column is a tcc.

    Under 'Both' one line is printed per coordinate returned by
    compareData.compareTwoTcc(oldCoords, newCoords, 1): the first line
    (new file first, then old) that contains the coordinate. Under 'old'
    and 'new' every line of the respective file not printed under 'Both'
    is listed.
    """
    def readLines(fileName):
        # Read a whole file, closing it even if reading fails.
        f = open(fileName, 'r')
        try:
            return f.readlines()
        finally:
            f.close()

    oldList = readLines(oldFN)
    oldCoords = [x.strip().split('\t')[0] for x in oldList]

    newList = readLines(newFN)
    newCoords = [x.strip().split('\t')[0] for x in newList]

    bothList = newList + oldList

    bothCoords = compareData.compareTwoTcc(oldCoords, newCoords, 1)

    print('Both')
    knownLines = set()
    for coord in bothCoords:
        for line in bothList:
            # NOTE(review): substring test, so 'chr1:1:10:20' also matches a
            # line starting 'chr1:1:10:200' -- confirm this is acceptable.
            if coord in line:
                print(line.strip())
                knownLines.add(line)
                break

    print('old')
    for line in oldList:
        if line not in knownLines:
            print(line.strip())

    print('new')
    for line in newList:
        if line not in knownLines:
            print(line.strip())
Esempio n. 15
0
def overlapWithDegradome(dFN, eFN):
    """Print how many editing-site tccs compareTwoTcc reports against widened degradome tccs.

    dFN -- tab-separated file; the second column of each line is a tcc,
           which is widened by 3 on both sides before comparison.
    eFN -- file passed to cgEdit.loadEditingSites.

    The first five widened tccs are printed as a sanity check, followed
    by the length of the compareData.compareTwoTcc result.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    f = open(dFN, 'r')
    try:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    finally:
        # The handle used to be left open.
        f.close()
    print(degTccs[0:5])
    eTccs = [eSite.tcc for eSite in eSites]

    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))
Esempio n. 16
0
def overlapWithDegradome(dFN, eFN):
    """Print how many editing-site tccs compareTwoTcc reports against widened degradome tccs.

    dFN -- tab-separated file; the second column of each line is a tcc,
           which is widened by 3 on both sides before comparison.
    eFN -- file passed to cgEdit.loadEditingSites.

    The first five widened tccs are printed as a sanity check, followed
    by the length of the compareData.compareTwoTcc result.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    f = open(dFN, "r")
    try:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    finally:
        # The handle used to be left open.
        f.close()
    print(degTccs[0:5])
    eTccs = [eSite.tcc for eSite in eSites]

    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))
Esempio n. 17
0
def countWithBins(dFN, binDir, type = 'INTRON'):
    """Print, for each of 50 bins, what compareTwoTcc reports against the degradome.

    dFN    -- file loaded through cgNexusFlat.Nexus as cgDegPeak.Peak
              records ('tcc' attribute only).
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files, one
              per chromosome in bioLibCG.humanChromosomes and strand;
              columns 2-51 of each line are the tccs of bins 0-49.
    type   -- file-name prefix of the bin files.

    Every degradome tcc is moved to the opposite strand and collapsed to
    a single coordinate before comparison.
    """
    print('loading degradome')
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    print('loading bins')
    bins = [[] for _ in range(50)]

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            f = open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r')
            try:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:51]
                    for i in range(0, 50):
                        bins[i].append(tccs[i])
            finally:
                # One handle per chromosome/strand used to stay open.
                f.close()

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    print('%s %s' % (len(dTccs), len(bins[0])))

    for i in range(0, 50):
        print(i)
        overlaps = compareData.compareTwoTcc(dTccs, bins[i], 1)
        print(len(overlaps))
        print(overlaps)
Esempio n. 18
0
def compareTccs(humanFN, liftCoords, rn = None, tn = None):
    '''Print each item compareData.compareTwoTcc returns for human vs. lifted tccs.

    humanFN    -- file loaded through cgNexusFlat.Nexus as OriginRNA
                  records ('tcc' attribute only).
    liftCoords -- tab-separated text file; the first column of each line
                  is taken as a tcc.
    rn, tn     -- forwarded to Nexus.load as [rn, tn].

    Duplicate tccs are removed from both lists before comparison.
    '''
    f = open(liftCoords, 'r')
    try:
        mouseList = [line.strip().split('\t')[0] for line in f]
    finally:
        # The handle used to be left open.
        f.close()

    DC = cgNexusFlat.Nexus(humanFN, cgOriginRNAFlat.OriginRNA)
    DC.load(['tcc'], [rn, tn])
    humanList = [DC.tcc[rnaID] for rnaID in DC.tcc]

    mouseList = list(set(mouseList))
    humanList = list(set(humanList))

    x = compareData.compareTwoTcc(humanList, mouseList)

    for i in x:
        print(i)
Esempio n. 19
0
def compareTccs(humanFN, liftCoords, rn=None, tn=None):
    '''Print each item compareData.compareTwoTcc returns for human vs. lifted tccs.

    humanFN    -- file loaded through cgNexusFlat.Nexus as OriginRNA
                  records ('tcc' attribute only).
    liftCoords -- tab-separated text file; the first column of each line
                  is taken as a tcc.
    rn, tn     -- forwarded to Nexus.load as [rn, tn].

    Duplicate tccs are removed from both lists before comparison.
    '''
    f = open(liftCoords, 'r')
    try:
        mouseList = [line.strip().split('\t')[0] for line in f]
    finally:
        # The handle used to be left open.
        f.close()

    DC = cgNexusFlat.Nexus(humanFN, cgOriginRNAFlat.OriginRNA)
    DC.load(['tcc'], [rn, tn])
    humanList = [DC.tcc[rnaID] for rnaID in DC.tcc]

    mouseList = list(set(mouseList))
    humanList = list(set(humanList))

    x = compareData.compareTwoTcc(humanList, mouseList)

    for i in x:
        print(i)
Esempio n. 20
0
def updateMicroRNAOverlap(aDir, microFN):
    """Store a microOverlap flag on every OriginRNA record under aDir.

    aDir    -- location handed to cgDB.dataController.
    microFN -- text file with one coordinate per line.

    A record's flag is True when its tcc is among the values returned by
    compare.compareTwoTcc(microCoords, smallCoords, 2); the records are
    committed back afterwards.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # One stripped line per micro coordinate.
    microFile = open(microFN, 'r')
    microCoords = [coordLine.strip() for coordLine in microFile]
    microFile.close()

    smallCoords = [record.tcc for record in id_oRNA.values()]

    smallOverlaps = compare.compareTwoTcc(microCoords, smallCoords, 2)

    for record in id_oRNA.values():
        record.microOverlap = record.tcc in smallOverlaps

    oRNA_DC.commit(id_oRNA)
Esempio n. 21
0
def countWithBins(dFN, binDir, type='INTRON'):
    """Print, for each of 50 bins, what compareTwoTcc reports against the degradome.

    dFN    -- file loaded through cgNexusFlat.Nexus as cgDegPeak.Peak
              records ('tcc' attribute only).
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files, one
              per chromosome in bioLibCG.humanChromosomes and strand;
              columns 2-51 of each line are the tccs of bins 0-49.
    type   -- file-name prefix of the bin files.

    Every degradome tcc is moved to the opposite strand and collapsed to
    a single coordinate before comparison.
    """
    print('loading degradome')
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    print('loading bins')
    bins = [[] for _ in range(50)]

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            f = open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r')
            try:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:51]
                    for i in range(0, 50):
                        bins[i].append(tccs[i])
            finally:
                # One handle per chromosome/strand used to stay open.
                f.close()

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    print('%s %s' % (len(dTccs), len(bins[0])))

    for i in range(0, 50):
        print(i)
        overlaps = compareData.compareTwoTcc(dTccs, bins[i], 1)
        print(len(overlaps))
        print(overlaps)
def transcriptSetOverlapDegFile(degFile):
    """Build the lines of degFile with a transcript-overlap marker appended.

    degFile -- tab-separated file; the second column of each line is a
               tcc, which is converted with cg.convertToAS before testing.

    For every line, '1' or '0' is passed to cg.appendToLine(line, flag, 3)
    depending on whether the converted tcc is among the values returned by
    compare.compareTwoTcc. The resulting lines are returned as a list.

    NOTE(review): the earlier code tested every line with the LAST line's
    tcc and threw the new lines away. Whether they were meant to be
    written back to degFile cannot be told from here, so they are only
    returned -- confirm with the callers.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    f = open(degFile, 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()

    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update newLines, pairing each line with its own converted tcc
    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'
        newLines.append(cg.appendToLine(line, inTran, 3))

    return newLines
def updateMicroRNAOverlap(aDir, microFN):
    """Store a microOverlap flag on every OriginRNA record under aDir.

    aDir    -- location handed to cgNexusFlat.dataController.
    microFN -- text file with one coordinate per line.

    A record's flag is True when its tcc is among the values returned by
    compare.compareTwoTcc(microCoords, smallCoords, 2); the records are
    committed back afterwards.
    """
    oRNA_DC = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    #Put micro and small coords into lists
    f = open(microFN, 'r')
    try:
        microCoords = [x.strip() for x in f]
    finally:
        f.close()
    smallCoords = [x.tcc for x in id_oRNA.values()]

    #overlap them
    smallOverlaps = compare.compareTwoTcc(microCoords, smallCoords, 2)

    #For each sRNA, save overlap value.
    for oRNA in id_oRNA.values():
        oRNA.microOverlap = oRNA.tcc in smallOverlaps

    oRNA_DC.commit(id_oRNA)
Esempio n. 24
0
def transcriptSetOverlapDegFile(degFile):
    """Build the lines of degFile with a transcript-overlap marker appended.

    degFile -- tab-separated file; the second column of each line is a
               tcc, which is converted with cg.convertToAS before testing.

    For every line, '1' or '0' is passed to cg.appendToLine(line, flag, 3)
    depending on whether the converted tcc is among the values returned by
    compare.compareTwoTcc. The resulting lines are returned as a list.

    NOTE(review): the earlier code tested every line with the LAST line's
    tcc and threw the new lines away. Whether they were meant to be
    written back to degFile cannot be told from here, so they are only
    returned -- confirm with the callers.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    f = open(degFile, 'r')
    try:
        lines = f.readlines()
    finally:
        f.close()

    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update newLines, pairing each line with its own converted tcc
    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'
        newLines.append(cg.appendToLine(line, inTran, 3))

    return newLines
Esempio n. 25
0
    def getOverlappingTranscriptList(self, tccs):
        """Return, as one flat list, the transcripts reported for the given tccs.

        tccs must be a list; for any other type the integer 1 is returned.
        A transcript is included when its tcc is among the values returned
        by compare.compareTwoTcc(all transcript tccs, tccs, 1).
        """
        if not isinstance(tccs, list):
            return 1

        # Distinct transcript tccs; the dict is used only for its keys.
        distinctTccs = {}
        for gene in self.genes:
            for transcript in gene.transcripts:
                distinctTccs[transcript.tcc] = 1

        overlapped = compare.compareTwoTcc(distinctTccs.keys(), tccs, 1)

        # Second pass returns the transcript objects, in gene order.
        return [transcript
                for gene in self.genes
                for transcript in gene.transcripts
                if transcript.tcc in overlapped]
Esempio n. 26
0
    def getOverlappingTranscriptList(self, tccs):
        '''Return, as one flat list, the transcripts reported for the given tccs.

        tccs must be a list; for any other type the integer 1 is returned.
        A transcript is included when its tcc is among the values returned
        by compare.compareTwoTcc(all transcript tccs, tccs, 1).
        '''
        if not isinstance(tccs, list):
            return 1

        # Distinct transcript tccs; the dict is used only for its keys.
        distinctTccs = {}
        for gene in self.genes:
            for transcript in gene.transcripts:
                distinctTccs[transcript.tcc] = 1

        overlapped = compare.compareTwoTcc(distinctTccs.keys(), tccs, 1)

        # Second pass returns the transcript objects, in gene order.
        return [transcript
                for gene in self.genes
                for transcript in gene.transcripts
                if transcript.tcc in overlapped]
Esempio n. 27
0
#get results that are only noncoding

import bioLibCG as cg
import compareData as compare
predName = '/home/chrisgre/projects/NoncodingMouse/results/NCmouse-s3k8b17.bothNCandC.results'
keepList = compare.tccFileToList('keepNoncoding.tcc', 0)
predList = compare.tccFileToList(predName, 1)

# Prediction tccs that compareTwoTcc reports against the noncoding keep-list.
keepers = compare.compareTwoTcc(predList, keepList, 1)
print(len(keepers))

#now go back through pred file and create a new file with only lines that have noncoding in them

predFile = open(predName, 'r')
predLines = predFile.readlines()
predFile.close()

# Dict keys de-duplicate lines matched by more than one keeper.
newLines = {}
for keeper in keepers:
	for line in predLines:
		if keeper in line:
			newLines[line] = 1

# Close the output explicitly so the data is flushed even when this code is
# imported or run from a longer session.
outFile = open('NCmouse.noncoding.results', 'w')
try:
	for line in newLines:
		outFile.write(line)
finally:
	outFile.close()
Esempio n. 28
0
# Slide a frameLength-wide window, one frameShift at a time, across every
# sorted cluster and record how many cluster hits compareTwoTcc reports for it.
frameLength = 200
frameShift = 1
for cluster in sortedClusters:
	# Chromosome, strand and start come from the first hit, the end from the last.
	firstFields = cluster[0].split(":")
	clusterChrom = firstFields[0]
	clusterStrand = firstFields[1]
	firstCoord = int(firstFields[2])
	lastCoord = int(cluster[-1].split(":")[3])

	startCoord = firstCoord
	while startCoord < lastCoord:
		# Window is centred on startCoord.
		rangeStart = startCoord - frameLength // 2
		rangeEnd = startCoord + frameLength // 2
		rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
		overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
		hitCount = len(overlappedList)

		outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
		startCoord += frameShift
outputFile.close()

print('%s %s' % ('Output Hits per Frame:', timer.split()))
print('%s %s' % ('Overall Time:', timer.report()))


Esempio n. 29
0
def defineClusters(cName=None):
    """Group connected mature tccs into clusters and write two report files.

    cName -- configuration name passed to cgConfig.getConfig.

    Steps, each followed by a timing print:
      1. read the mature tccs from conf['resultsRaw'];
      2. build the connection dict with compare.makeConnectionsDict;
      3. collect every group of transitively connected tccs as a cluster;
      4. sort each cluster with cg.sortTccList and write one comma-
         terminated line per cluster to conf['sortedClusters'];
      5. for every cluster, slide a 200-wide window one coordinate at a
         time from the first hit's start to the last hit's end and write
         'windowTcc<TAB>hitCount' to conf['hitsPerFrame'].
    """
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName)  #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1)  # list of all mature micro in tcc
    print('%s %s' % ('List getting', timer.split()))

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('%s %s' % ('Make connections:', timer.split()))

    #Now have to define Clusters...
    added = set()

    def collectCluster(seed):
        # Depth-first walk with an explicit stack. Neighbours are pushed in
        # reverse so items are visited in the same order as the former
        # recursive version, without hitting the recursion limit.
        members = []
        pending = [seed]
        while pending:
            item = pending.pop()
            if item in added:
                continue
            added.add(item)
            members.append(item)
            pending.extend(reversed(list(matureConnections[item])))
        return members

    clusters = []
    for tcc in matureTccs:
        if tcc not in added:
            clusters.append(collectCluster(tcc))

    print('%s %s' % ('Make Clusters', timer.split()))

    #Sort Clusters.
    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]

    print('%s %s' % ('Sort Clusters:', timer.split()))

    #Output sorted cluster file
    clusterFile = open(conf.conf['sortedClusters'], 'w')
    try:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    finally:
        clusterFile.close()

    print('%s %s' % ('Store intermediate data:', timer.split()))

    #output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2
    outputFile = open(conf.conf['hitsPerFrame'], 'w')
    try:
        for cluster in sortedClusters:
            #chromosome, strand and start come from the first hit, the end from the last
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                #count how many hits there are in the window centred on startCoord
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                            startCoord - halfFrame,
                                            startCoord + halfFrame)
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                hitCount = len(overlappedList)

                outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
                startCoord = startCoord + frameShift
    finally:
        outputFile.close()

    print('%s %s' % ('Output Hits per Frame:', timer.split()))
    print('%s %s' % ('Overall Time:', timer.report()))
Esempio n. 30
0
def defineClusters(cName = None):
	'''Group connected mature tccs into clusters and write two report files.

	cName -- configuration name passed to cgConfig.getConfig.

	Steps, each followed by a timing print:
	  1. read the mature tccs from conf['resultsRaw'];
	  2. build the connection dict with compare.makeConnectionsDict;
	  3. collect every group of transitively connected tccs as a cluster;
	  4. sort each cluster with cg.sortTccList and write one comma-
	     terminated line per cluster to conf['sortedClusters'];
	  5. for every cluster, slide a 200-wide window one coordinate at a
	     time from the first hit's start to the last hit's end and write
	     'windowTcc<TAB>hitCount' to conf['hitsPerFrame'].
	'''
	#Start Timer
	timer = cg.cgTimer()
	timer.start()

	#Get list of mature tccs
	conf = cgConfig.getConfig(cName) #passed or default
	finalMirFileName = conf.conf['resultsRaw']
	matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
	print('%s %s' % ('List getting', timer.split()))

	#make connections dict
	matureConnections = compare.makeConnectionsDict(matureTccs)
	print('%s %s' % ('Make connections:', timer.split()))

	#Now have to define Clusters...
	added = set()

	def collectCluster(seed):
		# Depth-first walk with an explicit stack. Neighbours are pushed in
		# reverse so items are visited in the same order as the former
		# recursive version, without hitting the recursion limit.
		members = []
		pending = [seed]
		while pending:
			item = pending.pop()
			if item in added:
				continue
			added.add(item)
			members.append(item)
			pending.extend(reversed(list(matureConnections[item])))
		return members

	clusters = []
	for tcc in matureTccs:
		if tcc not in added:
			clusters.append(collectCluster(tcc))

	print('%s %s' % ('Make Clusters', timer.split()))

	#Sort Clusters.
	sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]

	print('%s %s' % ('Sort Clusters:', timer.split()))

	#Output sorted cluster file
	clusterFile = open(conf.conf['sortedClusters'], 'w')
	try:
		for cluster in sortedClusters:
			for hit in cluster:
				clusterFile.write('%s,' % hit)
			clusterFile.write('\n')
	finally:
		clusterFile.close()

	print('%s %s' % ('Store intermediate data:', timer.split()))

	#output hitsAround file
	frameLength = 200
	frameShift = 1
	halfFrame = frameLength // 2
	outputFile = open(conf.conf['hitsPerFrame'], 'w')
	try:
		for cluster in sortedClusters:
			#chromosome, strand and start come from the first hit, the end from the last
			firstFields = cluster[0].split(":")
			clusterChrom = firstFields[0]
			clusterStrand = firstFields[1]
			firstCoord = int(firstFields[2])
			lastCoord = int(cluster[-1].split(":")[3])

			startCoord = firstCoord
			while startCoord < lastCoord:
				#count how many hits there are in the window centred on startCoord
				rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, startCoord - halfFrame, startCoord + halfFrame)
				overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
				hitCount = len(overlappedList)

				outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
				startCoord = startCoord + frameShift
	finally:
		outputFile.close()

	print('%s %s' % ('Output Hits per Frame:', timer.split()))
	print('%s %s' % ('Overall Time:', timer.report()))