Ejemplo n.º 1
0
def name(fN):
    """Template placeholder that accepts a file name and does nothing.

    The stub originally had no body at all, which is a syntax error; a
    docstring-only body keeps the signature and makes the module importable.

    Args:
        fN: file name (currently unused).

    Returns:
        None.
    """

if __name__ == "__main__":
        import sys
        if sys.argv[1] == "help":
                bioLibCG.gd(sys.argv[0])
        else:
                bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
Ejemplo n.º 2
0
import bioLibCG
import cgDB
import cgOriginRNA


def probeMicro(oDir):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():

        if oRNA.passedFilter:
            print oRNA.id, oRNA.sequence, oRNA.tcc, oRNA.tccs


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(probeMicro, sys.argv)
Ejemplo n.º 3
0
import bioLibCG

def parseTargets(fN):
    """Print the distinct names listed in columns 3 and 4 of each line.

    Every line of the tab-separated file ``fN`` is processed on its own:
    the comma-separated entries of column 3 followed by those of column 4
    are printed one per line, in order of first appearance, skipping the
    literal ``'None'`` and any entry already printed for that same line.

    Args:
        fN: path of the tab-separated input file (needs at least 5 columns).

    Raises:
        IndexError: if a line has fewer than 5 columns.
    """
    # The context manager closes the file even if a malformed line raises.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # De-duplication is per line, so the set is reset for each line.
            seen = set()
            for m in ls[3].split(',') + ls[4].split(','):
                if m == 'None' or m in seen:
                    continue
                # Single-argument parenthesised print behaves the same on
                # Python 2 and Python 3.
                print(m)
                seen.add(m)



if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(parseTargets, sys.argv)

Ejemplo n.º 4
0
			for line in f:
				
				lChrom, start, end, val = (line.strip().split('\t'))
				start, end, val = int(start), int(end), int(val)
				if val < 1: continue
				#print start, end, val
				for i in range(start, end):
					try:
						hitDict[lChrom][strand][i] += val
					except (KeyError,TypeError):
						if not lChrom in hitDict:
							hitDict[lChrom] = {}
						if not strand in hitDict[lChrom]:
							hitDict[lChrom][strand] = {}
						hitDict[lChrom][strand][i] = val
		
		#write results to wig file
		writeWigFromHitDict(hitDict, assembly, name, directory)


if __name__ == "__main__":
	import sys
	
        cg.submitArgs(makeWigMem, sys.argv)
	#cg.submitArgs(mixWig, sys.argv)

				
				

	
Ejemplo n.º 5
0
import bioLibCG

def createResultsFile(peakFN, outFN):
    """Write every stripped line of ``peakFN`` to ``outFN`` prefixed by its index.

    Each output line has the form ``<index>\\t<peak>\\n`` where ``index``
    starts at 0.

    Args:
        peakFN: path of the input file, one peak per line.
        outFN: path of the output file (overwritten).
    """
    # Read everything first so the input is closed before the output opens.
    with open(peakFN, 'r') as f:
        peaks = [x.strip() for x in f]

    # The original never closed the output handle; the context manager
    # guarantees the data is flushed and the descriptor released.
    with open(outFN, 'w') as outF:
        outF.writelines('%s\t%s\n' % (i, peak) for i, peak in enumerate(peaks))

if __name__ == "__main__":
        import sys

        bioLibCG.submitArgs(createResultsFile, sys.argv)

Ejemplo n.º 6
0
import bioLibCG

def filterDups(fN, oFN):
    """Copy lines of ``fN`` to ``oFN``, keeping only the first line per sequence.

    The sequence is the second tab-separated column of the stripped line.
    Lines are written unchanged and in their original order.

    Args:
        fN: path of the tab-separated input file.
        oFN: path of the output file (overwritten).

    Raises:
        IndexError: if a line has fewer than 2 columns.
    """
    # A set gives O(1) membership tests; the original list made the whole
    # pass quadratic in the number of distinct sequences.
    knownSeqs = set()
    # Both files are closed even when a malformed line raises.
    with open(oFN, 'w') as outF, open(fN, 'r') as f:
        for line in f:
            seq = line.strip().split('\t')[1]
            if seq in knownSeqs:
                continue
            outF.write(line)
            knownSeqs.add(seq)

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(filterDups, sys.argv)
Ejemplo n.º 7
0
import bioLibCG
import GenomeFetch

def peakToSeq(peakFN, extend, outFN, assembly):
    """Write the sequence of every peak, widened by ``extend``, one per line.

    For each line of ``peakFN`` the first tab-separated column is split with
    ``bioLibCG.tccSplit``; ``extend`` is subtracted from the start and added
    to the end, the coordinate is rebuilt with ``bioLibCG.makeTcc`` and the
    result of ``gf.getSequence`` for it is written to ``outFN``.

    Args:
        peakFN: path of the tab-separated peak file.
        extend: amount to widen each side by (converted with ``int``);
            a negative value shrinks the interval.
        outFN: path of the output file (overwritten).
        assembly: value passed to ``GenomeFetch.GenomeFetch``.
    """
    # extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    gf = GenomeFetch.GenomeFetch(assembly)

    # The original left both handles open; the context manager closes them
    # (and flushes the output) even if a lookup fails part-way through.
    with open(outFN, 'w') as outF, open(peakFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            start, end = start - extend, end + extend
            newTcc = bioLibCG.makeTcc(chrom, strand, start, end)
            outF.write(gf.getSequence(newTcc) + '\n')

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(peakToSeq, sys.argv)
Ejemplo n.º 8
0
                longestT = transcript

        # Get the coordinate ends/etc
        starts, ends = [], []
        if longestUTR is None:
            continue
        for utrPair in longestUTR:
            starts.append(utrPair[0])
            ends.append(utrPair[1])

        starts.sort()
        ends.sort()

        startS = ",".join([str(x) for x in starts])
        endS = ",".join([str(x) for x in ends])

        print "%s\t%s\t%s\t%s\t%s\t%s" % (
            transcript.id,
            transcript.parent,
            transcript.chromosome,
            transcript.strand,
            startS,
            endS,
        )


if __name__ == "__main__":
    import sys

    bioLibCG.submitArgs(getCoords, sys.argv)
        for each duplicate sequence

        make set of targets for each oID --> set each oid's targets'''

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['sequence', 'filteredTargets'], [rn, tn])

    knownSeq_targets = {}

    #create oID groups and target sets.
    for oID in oNX.sequence:
        currSeq = oNX.sequence[oID]

        #add targets to set
        for tID in oNX.filteredTargets[oID]:
            knownSeq_targets.setdefault(currSeq, set()).add(tID)

    for oID in oNX.sequence:

        currSeq = oNX.sequence[oID]

        newTargets = list(knownSeq_targets.get(currSeq, set()))
        oNX.filteredTargets[oID] = newTargets

    oNX.save()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(updateSeqDuplicateMultiTcc, sys.argv)
Ejemplo n.º 10
0
import bioLibCG
import GenomeFetch


def getPeakSequences(peakFN, extend=0):
    """Print the hg19 sequence of every peak in ``peakFN``, widened by ``extend``.

    Each stripped line of the file is split with ``bioLibCG.tccSplit``;
    ``extend`` (converted with ``int``) is subtracted from the start and
    added to the end before the sequence is fetched and printed.
    """
    margin = int(extend)

    peakFile = open(peakFN, 'r')
    tccs = [raw.strip() for raw in peakFile]
    peakFile.close()

    fetcher = GenomeFetch.GenomeFetch('hg19')

    for tcc in tccs:
        chrom, strand, lo, hi = bioLibCG.tccSplit(tcc)
        sequence = fetcher.get_seq_from_to(chrom, lo - margin, hi + margin,
                                           strand)
        print(sequence)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getPeakSequences, sys.argv)
Ejemplo n.º 11
0
import bioLibCG


def parseTargets(fN):
    """Print the distinct names listed in columns 3 and 4 of each line.

    Every line of the tab-separated file ``fN`` is processed on its own:
    the comma-separated entries of column 3 followed by those of column 4
    are printed one per line, in order of first appearance, skipping the
    literal ``'None'`` and any entry already printed for that same line.

    Args:
        fN: path of the tab-separated input file (needs at least 5 columns).

    Raises:
        IndexError: if a line has fewer than 5 columns.
    """
    # The context manager closes the file even if a malformed line raises.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            candidates = ls[3].split(',') + ls[4].split(',')
            # De-duplication is per line, so the set is reset for each line.
            printed = set()
            for m in candidates:
                if m != 'None' and m not in printed:
                    # Single-argument parenthesised print behaves the same
                    # on Python 2 and Python 3.
                    print(m)
                    printed.add(m)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(parseTargets, sys.argv)
Ejemplo n.º 12
0
        ls = line.strip().split('\t')
        '''
                for i, text in enumerate(ls):
                        print i, text
                '''
        tID = ls[0]
        chr = 'chr' + ls[1]
        strand = ls[4]
        tss, tse = ls[2], ls[3]
        css, cse = tss, tss
        exons = getBracketList(ls[18])
        numExons = len(exons)
        exonStarts = ','.join([x[0] for x in exons])
        exonEnds = ','.join([x[1] for x in exons])
        geneName = ls[8]
        geneName = ensID_gID.get(geneName, geneName)
        stat5 = 'none'
        stat3 = 'none'
        tCoding = 'pseudogene_noncoding'
        unused = 'none'
        gCoding = 'pseudogene_noncoding'

        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
            tID, chr, strand, tss, tse, css, cse, numExons, exonStarts,
            exonEnds, geneName, stat5, stat3, tCoding, unused, gCoding)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(convertPsuedo, sys.argv)
Ejemplo n.º 13
0
    cLength = 1
    letters = list(seq)
    for i, letter in enumerate(letters):
        if i == 0: continue

        if letters[i] == letters[i - 1]:
            cLength += 1
            if cLength > highestLength:
                highestLength = cLength
        else:
            cLength = 1

    return highestLength


def filterContigs(fN, outFN):
    """Copy to ``outFN`` the lines of ``fN`` whose sequence scores below 7.

    The sequence is the second tab-separated column; a line is kept when
    ``getContigLength(seq)`` is strictly less than 7. Kept lines are written
    unchanged.

    Args:
        fN: path of the tab-separated input file.
        outFN: path of the output file (overwritten).

    Raises:
        IndexError: if a line has fewer than 2 columns.
    """
    # Neither handle was closed originally, so buffered output could be lost
    # on abnormal exit; the context manager closes both.
    with open(outFN, 'w') as fOut, open(fN, 'r') as f:
        for line in f:
            seq = line.strip().split('\t')[1]
            if getContigLength(seq) < 7:
                fOut.write(line)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(filterContigs, sys.argv)
Ejemplo n.º 14
0
import subprocess
import bioLibCG
import os


def parRun(numParts, memoryAmount, scriptName, *args):
    """Launch ``scriptName`` once per part through the qJob/qDo wrapper scripts.

    For every part index from 1 to ``numParts`` the command
    ``$HOME/exec/qJob<memoryAmount>.sh $HOME/exec/qDo.sh scriptName *args
    <index> <numParts>`` is started, and the call waits for it to return
    before starting the next one.
    """
    numParts = int(numParts)

    for partIndex in xrange(1, numParts + 1):
        home = os.environ['HOME']

        # pick the qJob wrapper that matches the requested memory amount
        launcher = '%s/exec/qJob%s.sh' % (home, memoryAmount)
        runner = '%s/exec/qDo.sh' % (home)

        # wrapper scripts, target script, its arguments, then the part info
        command = [launcher, runner, scriptName]
        command.extend(args)
        command.extend([str(partIndex), str(numParts)])

        # start the job and block until the launched command returns
        job = subprocess.Popen(command)
        job.wait()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(parRun, sys.argv)
Ejemplo n.º 15
0
                        codingFlag = None
                        tTypes = [ x[1] for x in transcript.getOverlappingElements(eSite.tcc)]
                        

                        if '3UTR' in tTypes:
                                tType = '3UTR'
                        elif '5UTR' in tTypes:
                                tType = '5UTR'
                        else:
                                tType = tTypes[0] #has to be one thing...exon or intron
                        #This only works because UTR takes precedence over EXON in TYPE.
                        if tType == 'EXON':
                                if codingTranscript:
                                        codingFlag = 'C'
                                else:
                                        codingFlag = 'NC'
                        else:
                                codingFlag = 'NC'


                        fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (eSite.ID, transcript.parent, transcript.id, tType, codingFlag, transcript.tType))
                        #fOut.write('%s:%s:%s\t%s\t%s\t%s\t%s\n' % (eSite.chromosome, eSite.strand, eSite.coordinate, transcript.parent, transcript.id, tType, codingFlag))

        fOut.close()


if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(updateContext, sys.argv)

Ejemplo n.º 16
0
    complexities = []
    seqNumTargets = []

    for line in f:
        ls = line.strip().split('\t')
        id = int(ls[0])
        complexity = id_comp[id]
        numTargets = float(ls[1])
        complexities.append(complexity)
        seqNumTargets.append(numTargets)

    plt.plot(complexities,
             seqNumTargets,
             'bo',
             label='simulated sRNA',
             color='blue')
    plt.legend()
    plt.ylabel(
        'Number of targets per small RNA (filter: O >0-NoMicroTran, T YesTran4Mis > .55 6bp'
    )
    plt.xlabel('Complexity of smallRNA')
    plt.title('Origin RNA Target simulation')
    plt.show()
    f.close()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(plotEntropyTargets, sys.argv)
Ejemplo n.º 17
0
import bioLibCG
from siRnaPredict import getEntropy 


def filterEntropy(fN, outFN, minEntropy = 1.15):
    """Copy to ``outFN`` the lines of ``fN`` whose sequence entropy exceeds a cutoff.

    The sequence is the second tab-separated column; a line is kept
    unchanged when ``getEntropy(seq)`` is strictly greater than
    ``minEntropy``.

    Args:
        fN: path of the tab-separated input file.
        outFN: path of the output file (overwritten).
        minEntropy: cutoff, converted with ``float`` (default 1.15).

    Raises:
        IndexError: if a line has fewer than 2 columns.
    """
    minEntropy = float(minEntropy)

    # Neither handle was closed originally; the context manager closes both
    # so the output is flushed even if getEntropy raises.
    with open(outFN, 'w') as fOut, open(fN, 'r') as f:
        for line in f:
            seq = line.strip().split('\t')[1]
            if getEntropy(seq) > minEntropy:
                fOut.write(line)
                        

        

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(filterEntropy, sys.argv)


Ejemplo n.º 18
0
import bioLibCG


def numTargets(fN):
    """Print, for each line, its ID and its de-duplicated target list.

    The ID is column 0 and the comma-separated targets are column 4 of the
    tab-separated file. Targets keep their order of first appearance and
    are printed as ``<id> <t1,t2,...>``.

    Args:
        fN: path of the tab-separated input file (needs at least 5 columns).

    Raises:
        IndexError: if a line has fewer than 5 columns.
    """
    # The context manager closes the file, which the original never did.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # 'rowID' avoids shadowing the builtin id().
            rowID, targets = ls[0], ls[4].split(',')

            # Order-preserving de-duplication with O(1) membership tests.
            seen = set()
            uniq = []
            for target in targets:
                if target not in seen:
                    seen.add(target)
                    uniq.append(target)

            # One pre-formatted argument prints the same on Python 2 and 3.
            print('%s %s' % (rowID, ','.join(uniq)))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(numTargets, sys.argv)
Ejemplo n.º 19
0
        for line in f:
                ls = line.strip().split(' ')
                sID = int(ls[0])
                tID = int(ls[1])
                sOffset = int(ls[4])
                try:
                        mismatchPositions = [(int(x) + sOffset) for x in ls[9].split(',')]
                except IndexError:
                        mismatchPositions = []
                ss = id_qSeq[sID]
                ts = id_dSeq[tID]

                sSpaces = ''.join([' ' for i in list(range(0,sOffset))])
                tSpaces = [' ' for i in list(ts)]
                for i in mismatchPositions:
                        tSpaces[i] = 'X'
                

                #graphically display
                print ts
                print ''.join(tSpaces)
                print '%s%s' % (sSpaces, ss)
                print 
                

        

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(checkAlignment, sys.argv)
Ejemplo n.º 20
0
            compareList = zip(mCodonList, emCodonList)
            codonNumber = ePositionInMRNA // 3
            codonPair = compareList[codonNumber]
            bCodon = codonPair[0]
            aCodon = codonPair[1]
            baa = cgSeqMod.translateRNA(bCodon, map)
            aaa = cgSeqMod.translateRNA(aCodon, map)
            synFlag = 'SYN'
            if baa != aaa:
                synFlag = 'NON'
                bCodonList = list(bCodon)
                aCodonList = list(aCodon)
                matchedLetters = zip(bCodonList, aCodonList)
                for pair in matchedLetters:
                    if pair[0] != 'A':
                        if pair[1] == 'G' and pair[0] != 'G':
                            print 'messed up codon switch', bCodonList, aCodonList
                            print t.parent, '%s:%s' % (
                                eSite.chromosome, eSite.coordinate
                            ), eSite.strand, bCodon, aCodon, baa, aaa

            outF.write('\t'.join([
                str(eSite.ID), transcript.parent, transcript.id, synFlag,
                bCodon, aCodon, baa, aaa
            ]) + '\n')


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(betterSynonymous, sys.argv)
Ejemplo n.º 21
0
import bioLibCG
import cgAlign


def createDatabases(targetsFN, wordSize, runName):
    """Build and write the word and sequence databases for a targets file.

    Every stripped line of ``targetsFN`` becomes a ``cgAlign.cgSeq`` numbered
    by its 0-based line index. The word database (built with ``wordSize``
    converted by ``int``) and the sequence database are then created and
    written under ``runName`` through the ``cgAlign`` helpers.
    """
    wordSize = int(wordSize)

    # one cgSeq per line, numbered by position in the file
    targetsFile = open(targetsFN, "r")
    print("obtaining sequences")
    sequences = [
        cgAlign.cgSeq(index, rawLine.strip())
        for index, rawLine in enumerate(targetsFile)
    ]
    targetsFile.close()

    print("making word db")
    words = cgAlign.createWordDatabase(sequences, wordSize)
    cgAlign.writeWordDatabase(words, runName)

    print("making seq db")
    seqs = cgAlign.createSequenceDatabase(sequences)
    cgAlign.writeSequenceDatabase(seqs, runName)


if __name__ == "__main__":
    import sys

    bioLibCG.submitArgs(createDatabases, sys.argv)
Ejemplo n.º 22
0
                print a.tTcc, a.id

    aDC.commit(id_alignment)


def appendTranInfo(aDir, degSmallFN):
    """Set ``transcriptOverlap`` on every alignment from a per-target table.

    ``degSmallFN`` is tab-separated: column 0 is the integer target ID and
    column 3 an integer flag. Flags 1 and 0 are stored as ``True`` and
    ``False``; any other integer is stored as-is. Each loaded alignment gets
    the value for its ``tID`` and the result is committed back.

    Args:
        aDir: directory passed to ``cgDB.dataController``.
        degSmallFN: path of the tab-separated flag file.

    Raises:
        KeyError: if an alignment's ``tID`` is missing from ``degSmallFN``.
    """
    aDC = cgDB.dataController(aDir, cgAlignment)
    id_alignment = aDC.load()

    tID_tranVal = {}
    # The original never closed this handle; the context manager does.
    with open(degSmallFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            val = int(ls[3])
            # Only 0 and 1 are turned into booleans, as before.
            if val in (0, 1):
                val = bool(val)
            tID_tranVal[int(ls[0])] = val

    for alignment in id_alignment.values():
        alignment.transcriptOverlap = tID_tranVal[alignment.tID]

    aDC.commit(id_alignment)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(loadAlignments, sys.argv)
Ejemplo n.º 23
0
import bioLibCG
import compareData
import cgEdit


def overlapWithDegradome(dFN, eFN):
    """Print how many editing sites overlap the widened degradome intervals.

    Column 1 of every line in ``dFN`` is split with ``bioLibCG.tccSplit``,
    widened by 3 on each side and rebuilt with ``bioLibCG.makeTcc``. The
    first five rebuilt coordinates are printed, then the length of the
    result of ``compareData.compareTwoTcc(eTccs, degTccs, 1)``.

    Args:
        dFN: path of the tab-separated degradome file.
        eFN: path passed to ``cgEdit.loadEditingSites``.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The original never closed this handle; the context manager does.
    with open(dFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            # widen each interval by 3 on both sides
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    # Single-argument parenthesised print is identical on Python 2 and 3.
    print(degTccs[0:5])
    eTccs = [eSite.tcc for eSite in eSites]

    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))


if __name__ == "__main__":
    import sys

    bioLibCG.submitArgs(overlapWithDegradome, sys.argv)
Ejemplo n.º 24
0
import bioLibCG


def uniqueColumn(fN, column, whole=False):
    """Print the distinct values of one column, or one full line per value.

    Lines of the tab-separated file ``fN`` are keyed by the value found in
    ``column``; when a value repeats, the last line seen for it wins.

    Args:
        fN: path of the tab-separated input file.
        column: 0-based column index (converted with ``int``).
        whole: if truthy, print the stripped line stored for each distinct
            value; otherwise print the value itself.

    Raises:
        ValueError: if ``column`` cannot be converted to an integer.
        IndexError: if a line has too few columns.
    """
    # Convert once instead of once per line.
    column = int(column)

    u = {}
    # The original never closed this handle; the context manager does.
    with open(fN, 'r') as f:
        for line in f:
            stripped = line.strip()
            u[stripped.split('\t')[column]] = stripped

    for key in u:
        # Single-argument parenthesised print is identical on Python 2 and 3.
        print(u[key] if whole else key)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(uniqueColumn, sys.argv)
Ejemplo n.º 25
0
import bioLibCG
import cgEdit

def getEditInfo(fN, idList):
    """Print ID, location, gene and eRatio for the editing sites listed in a file.

    Editing sites are loaded with ``cgEdit.loadEditingSites(fN)`` and indexed
    by their ``ID``. The first tab-separated column of every line in
    ``idList`` is read as an integer ID, and the matching site is printed as
    ``<ID> <chromosome>:<coordinate> <gene> <eRatio>``.

    Args:
        fN: path passed to ``cgEdit.loadEditingSites``.
        idList: path of a file whose first column holds integer site IDs.

    Raises:
        KeyError: if a requested ID is not among the loaded sites.
    """
    eSites = cgEdit.loadEditingSites(fN)

    idDict = dict((eSite.ID, eSite) for eSite in eSites)

    # Locals no longer shadow the builtins list() and id(); the context
    # manager closes the handle the original left open.
    with open(idList, 'r') as f:
        wantedIDs = [int(line.strip().split('\t')[0]) for line in f]

    for wantedID in wantedIDs:
        eSite = idDict[wantedID]
        # One pre-formatted argument prints the same on Python 2 and 3.
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome,
                                  eSite.coordinate, eSite.gene, eSite.eRatio))
               
if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(getEditInfo, sys.argv)
               
Ejemplo n.º 26
0
def plotContextPie(oDir, contextFN):
    """Print how often each transcript type occurs among filtered records.

    ``contextFN`` is tab-separated: column 0 is an integer record ID and
    column 5 a transcript type. For every loaded record whose
    ``passedFilter`` is truthy, the types listed for its ``id`` are tallied
    and each ``<type> <count>`` pair is printed. Despite the name, nothing
    is plotted here.

    Args:
        oDir: directory passed to ``cgDB.dataController``.
        contextFN: path of the tab-separated context file.

    Raises:
        KeyError: if a filtered record's ``id`` has no entry in ``contextFN``.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    oID_tTypes = {}
    # The original never closed this handle; the context manager does.
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # 'oID' avoids shadowing the builtin id().
            oID = int(ls[0])
            oID_tTypes.setdefault(oID, []).append(ls[5])

    tType_count = {}
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue

        for tType in oID_tTypes[oRNA.id]:
            tType_count[tType] = tType_count.get(tType, 0) + 1

    for tType, count in tType_count.items():
        # One pre-formatted argument prints the same on Python 2 and 3.
        print('%s %s' % (tType, count))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(plotContextPie, sys.argv)
Ejemplo n.º 27
0
import subprocess
import bioLibCG
import os

def parRun(numParts, memoryAmount, scriptName, *args):
    """Run ``scriptName`` once for each part via the qJob/qDo wrapper scripts.

    Parts are numbered 1 through ``numParts``. Each command consists of the
    memory-specific qJob script, the qDo script, ``scriptName``, the extra
    ``args``, the part number and the total number of parts. Each command
    is waited on before the next one starts.
    """
    totalParts = int(numParts)

    for part in xrange(1, totalParts + 1):
        execDir = '%s/exec' % (os.environ['HOME'])

        # qJob script is chosen by the requested memory amount
        qJobScript = '%s/qJob%s.sh' % (execDir, memoryAmount)
        qDoScript = '%s/qDo.sh' % (execDir)

        # full argument vector handed to the wrapper
        argv = [qJobScript, qDoScript, scriptName]
        argv += list(args)
        argv += [str(part), str(totalParts)]

        # run the job and wait for the launched command to return
        subprocess.Popen(argv).wait()
        

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(parRun, sys.argv)



Ejemplo n.º 28
0
                if gType == t:
                    #totalNum += 1.0/lg
                    #theSum += 1.0/lg
                    totalNum += 1.0
                    theSum += 1
        fracs.append(totalNum)

    labels = [
        'Introns (%s)' % fracs[0],
        'Intergenic(%s)' % fracs[1],
        'Exons (%s)' % fracs[2],
        '3\'UTR (%s)' % fracs[3],
        '5\'UTR (%s)' % fracs[4]
    ]
    fracs = [float(x) / theSum for x in fracs]

    explode = (0.1, 0.1, 0.2, 0.1, 0.1)
    pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True)
    title('Editing Site Genomic Location',
          bbox={
              'facecolor': '1.0',
              'pad': 10
          })

    show()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(makeContextPieBetter, sys.argv)
Ejemplo n.º 29
0
import bioLibCG
import matplotlib.pyplot as plt
import siRnaPredict as si

def makeComplexityHist(fN):
    """Show a 30-bin histogram of sequence entropy for names containing 'hsa'.

    For every line of the tab-separated file, column 3 is the name and
    column 4 the sequence; ``si.getEntropy(seq)`` is collected whenever the
    name contains ``'hsa'``. The values are passed to ``plt.hist`` with 30
    bins and shown.

    Args:
        fN: path of the tab-separated input file (needs at least 5 columns).

    Raises:
        IndexError: if a line has fewer than 5 columns.
    """
    histVals = []
    # The original never closed this handle; the context manager releases it
    # before the (blocking) plot window is shown.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            name = ls[3]
            seq = ls[4]

            if 'hsa' in name:
                histVals.append(si.getEntropy(seq))

    plt.hist(histVals, 30)
    plt.show()


if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(makeComplexityHist, sys.argv)


Ejemplo n.º 30
0
import bioLibCG


def addIDs(fN, outFN):
    """Copy ``fN`` to ``outFN``, prefixing each stripped line with its index.

    Each output line has the form ``<index>\\t<stripped line>\\n`` where
    ``index`` starts at 0.

    Args:
        fN: path of the input file.
        outFN: path of the output file (overwritten).
    """
    # The input handle was never closed originally; the context manager
    # closes both files. enumerate() replaces the manual counter.
    with open(outFN, 'w') as fOut, open(fN, 'r') as f:
        for i, line in enumerate(f):
            fOut.write('%s\t%s\n' % (i, line.strip()))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(addIDs, sys.argv)
Ejemplo n.º 31
0
	newLines = []
	for line in f:
	        ls = line.strip().split('\t') 
                degTcc = cg.convertToAS(ls[1])
                chrom, strand, start, end = cg.tccSplit(degTcc)
                if chrom != runningChrom:
                        continue

                if strand != runningStrand:
                        continue

                inTran = '0'
                for i in xrange(start, end + 1):
                        if i in coordSet:
                                inTran = '1'
                                break

		#update newLines
                newLine = cg.appendToLine(line, inTran, 3)
                newLines.append(newLine)         
	f.close()

        f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
        f.writelines(newLines)
        f.close()


if __name__ == "__main__":
	import sys
        cg.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
Ejemplo n.º 32
0
def subtractTwoTccLists(tccListKeep, tccListOther):
    """Return ``tccListKeep`` with the regions overlapping ``tccListOther`` subtracted.

    Entries of ``tccListKeep`` reported by ``compareTwoTcc(..., 1)`` are
    removed from a copy of the list and replaced by whatever
    ``recurseSubtract`` returns for the two overlap lists. A warning is
    printed if either input list overlaps itself. The input lists are not
    modified.
    """
    # Both lists are expected to be COLLAPSED already; warn when they are not.
    for label, tccList in (('KEEPER', tccListKeep), ('OTHER', tccListOther)):
        if checkIfOverlaps(tccList):
            print('THE %s LIST HAS OVERLAPS (SUBTRACTION)' % label)

    # NOTE(review): modes 1 and 2 presumably select which side's overlapping
    # entries are returned -- confirm against compareTwoTcc.
    keepHits = compareTwoTcc(tccListKeep, tccListOther, 1)
    otherHits = compareTwoTcc(tccListKeep, tccListOther, 2)

    trimmed = recurseSubtract(keepHits, otherHits)

    # Work on a copy so the caller's list is left untouched.
    remaining = list(tccListKeep)

    # Drop every keeper entry that overlapped, then add the trimmed pieces.
    for hit in keepHits:
        remaining.remove(hit)
    remaining.extend(trimmed)

    return remaining


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getIndividualOverlaps, sys.argv)
Ejemplo n.º 33
0
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01
        '''

        mDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
        id_masterObj = mDC.load()
        
        #recurse through all the runs
        masterBN = bioLibCG.getBaseFileName(masterDir)

        for slaveDir in bioLibCG.recursePaths(parDir, end = masterBN):

        
                oDC = cgDB.dataController(slaveDir, cgAlignment.cgAlignment)
                id_slaveObj = oDC.load()
       
                id_masterObj = cgDB.mergeTwoObjects(id_masterObj, id_slaveObj, cgOriginRNA.OriginRNA) 
        
        mDC.commit(id_masterObj)

def mergeDir(dirName):
        """Merge ``dirName`` by delegating to ``cgDB.mergeDirectory``.

        ``cgOriginRNA.OriginRNA`` is passed as the second argument
        (presumably the record class stored in the directory -- confirm
        against cgDB).
        """

        cgDB.mergeDirectory(dirName, cgOriginRNA.OriginRNA)

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(mergeDir, sys.argv)
Ejemplo n.º 34
0
import bioLibCG

def formatFile(fN):
    """Print every line of ``fN`` with columns 0 and 11 removed.

    Each stripped line is split on tabs; columns 1 through 10 followed by
    columns 12 onwards are re-joined with tabs and printed.

    Args:
        fN: path of the tab-separated input file.
    """
    # The original never closed this handle; the context manager does.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # Slicing never raises, so short lines just yield fewer columns.
            # Single-argument parenthesised print is identical on
            # Python 2 and 3.
            print('\t'.join(ls[1:11] + ls[12:]))

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(formatFile, sys.argv)

                        midCheck = 0
                        highCheck = 0


                        #check mismatches
                        for i in lowRange:
                                if i in alignment.mismatchPositions:
                                        lowCheck = 1
                                        break

                
                        for i in midRange:
                                if i in alignment.mismatchPositions:
                                        midCheck = 1
                                        break

                        for i in highRange:
                                if i in alignment.mismatchPositions:
                                        highCheck = 1
                                        break

                        fOut.write('%s\t%s\t%s\t%s\t%s\n' % (sID, tID, lowCheck, midCheck, highCheck))
        f.close()


if __name__ == "__main__":
        import sys
        #bioLibCG.submitArgs(markMismatchedPairs, sys.argv)
        bioLibCG.submitArgs(markCenterExpression, sys.argv)
        
Ejemplo n.º 36
0
import bioLibCG


def blankIDs(fN, outFN, numIDs=None):
    '''Write the integers 0..N-1 to outFN, one per line.

    N is int(numIDs) when numIDs is truthy; otherwise it is the value
    returned by bioLibCG.getNumFileLines(fN).
    '''
    count = int(numIDs) if numIDs else bioLibCG.getNumFileLines(fN)

    idLines = ['%s\n' % n for n in xrange(0, count)]

    out = open(outFN, 'w')
    out.writelines(idLines)
    out.close()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(blankIDs, sys.argv)
Ejemplo n.º 37
0
        fracs = []
        theSum = 0.0
        for t in types:
                totalNum = 0.0
                for gene in geneDict:
                        lg = len(geneDict[gene])
                        for gType in geneDict[gene]:   
                                if gType == t:
                                        #totalNum += 1.0/lg
                                        #theSum += 1.0/lg
                                        totalNum += 1.0
                                        theSum += 1
                fracs.append(totalNum)


        
        
        labels = ['Introns (%s)' % fracs[0], 'Intergenic(%s)' % fracs[1], 'Exons (%s)' % fracs[2], '3\'UTR (%s)' % fracs[3], '5\'UTR (%s)' % fracs[4]]
        fracs = [float(x)/theSum for x in fracs]

        explode=(0.1, 0.1, 0.2, 0.1, 0.1)
        pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True)
        title('Editing Site Genomic Location', bbox={'facecolor':'1.0', 'pad':10})

        show()

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(makeContextPieBetter, sys.argv)

Ejemplo n.º 38
0
import bioLibCG
import cgEdit
import GenomeFetch

def getFolded(fN):
    """Print a '> ID' header and the +/-200 hg19 sequence for each editing site.

    Sites come from ``cgEdit.loadEditingSites(fN)``. For each one the
    sequence from ``coordinate - 200`` to ``coordinate + 200`` on the site's
    chromosome and strand is fetched with ``get_seq_from_to`` and printed
    after the header line.
    """
    sites = cgEdit.loadEditingSites(fN)
    fetcher = GenomeFetch.GenomeFetch('hg19')

    for site in sites:
        # window of 200 on either side of the editing site
        center = site.coordinate
        windowSeq = fetcher.get_seq_from_to(site.chromosome, center - 200,
                                            center + 200, site.strand)

        print('> %s' % (site.ID,))
        print(windowSeq)


if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(getFolded, sys.argv)
                
Ejemplo n.º 39
0
import bioLibCG

def countUniqueID(fN):
    """Print the number of distinct IDs in ``fN`` and one line per ID.

    The ID is the first tab-separated column of the stripped line. When an
    ID repeats, the last line seen for it is the one kept. The count is
    printed first, followed by the kept (stripped) lines.

    Args:
        fN: path of the tab-separated input file.
    """
    countDict = {}

    # The context manager closes the file even if reading fails.
    with open(fN, 'r') as f:
        for line in f:
            stripped = line.strip()
            # 'rowID' avoids shadowing the builtin id().
            rowID = stripped.split('\t')[0]
            countDict[rowID] = stripped

    # Single-argument parenthesised print is identical on Python 2 and 3.
    print(len(countDict))
    for rowID in countDict:
        print(countDict[rowID])


if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(countUniqueID, sys.argv)

Ejemplo n.º 40
0
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01
        '''

    mDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    id_masterObj = mDC.load()

    #recurse through all the runs
    masterBN = bioLibCG.getBaseFileName(masterDir)

    for slaveDir in bioLibCG.recursePaths(parDir, end=masterBN):

        oDC = cgDB.dataController(slaveDir, cgAlignment.cgAlignment)
        id_slaveObj = oDC.load()

        id_masterObj = cgDB.mergeTwoObjects(id_masterObj, id_slaveObj,
                                            cgOriginRNA.OriginRNA)

    mDC.commit(id_masterObj)


def mergeDir(dirName):
    """Merge ``dirName`` by delegating to ``cgDB.mergeDirectory``.

    ``cgOriginRNA.OriginRNA`` is passed as the second argument (presumably
    the record class stored in the directory -- confirm against cgDB).
    """

    cgDB.mergeDirectory(dirName, cgOriginRNA.OriginRNA)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(mergeDir, sys.argv)
Ejemplo n.º 41
0
        f.readline()
        for line in f:
                ls = line.strip().split('\t')
                '''
                for i, text in enumerate(ls):
                        print i, text
                '''
                tID = ls[0]
                chr = 'chr' + ls[1]
                strand = ls[4]
                tss, tse = ls[2], ls[3]
                css, cse = tss, tss
                exons = getBracketList(ls[18])
                numExons = len(exons)
                exonStarts = ','.join([x[0] for x in exons])
                exonEnds = ','.join([x[1] for x in exons])
                geneName = ls[8]
                geneName = ensID_gID.get(geneName, geneName)
                stat5 = 'none'
                stat3 = 'none'
                tCoding = 'pseudogene_noncoding'
                unused = 'none'
                gCoding = 'pseudogene_noncoding'


                print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (tID, chr, strand, tss, tse, css, cse, numExons, exonStarts, exonEnds, geneName, stat5, stat3, tCoding, unused, gCoding)

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(convertPsuedo, sys.argv)
Ejemplo n.º 42
0
                        codingTranscript =  '_coding' in transcript.tType
                        tType = None
                        codingFlag = None

                        tTypes = [x[1] for x in transcript.getOverlappingElements(oRNA.tcc)]
                        
                        #categorize border types
                       
                        tType = ds.spotItem(tTypes)
                          
                        if tType == 'EXON' or 'EXON_INTRON':
                                if codingTranscript:
                                        codingFlag = 'C'
                                else:
                                        codingFlag = 'NC'
                        else:
                                codingFlag = 'NC'

                        
                        oRNA.transcriptIDs.append(transcript.id)
                        oRNA.transcriptContexts.append(tType)
                        oRNA.transcriptTypes.append(transcript.tType)
                        oRNA.transcriptCodingTypes.append(codingFlag)

        oDC.commit(id_oRNA)

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(updateContext, sys.argv)

import bioLibCG
import cgDB
import cgOriginRNA


def getSeqs(oDir):
    """Print the key of every loaded record that survives three exclusions.

    A record is skipped when ``sequenceDuplicate`` is truthy, when
    ``totalContigLength`` is greater than 6, or when ``endContigLength`` is
    greater than 6. The checks run in that order.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for key, record in records.items():
        rejected = (record.sequenceDuplicate
                    or record.totalContigLength > 6
                    or record.endContigLength > 6)
        if not rejected:
            print("%s" % key)


if __name__ == "__main__":
    import sys

    bioLibCG.submitArgs(getSeqs, sys.argv)
Ejemplo n.º 44
0
import bioLibCG
import cgDB
import cgAlignment


def probeAlignments(aDir):
    """Print details of the alignments matching the hard-coded (sID, tID) pairs.

    Alignments are loaded through ``cgDB.dataController``. For each one
    whose ``sID`` and ``tID`` equal a pair in ``probePairs``, its id, sID,
    tID, centerExpression, mismatchStatus, numMismatches and
    transcriptOverlap are printed, separated by spaces.
    """
    probePairs = [[6, 35934]]

    controller = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    loaded = controller.load()

    for aln in loaded.values():
        for wantedS, wantedT in probePairs:
            if aln.sID != wantedS or aln.tID != wantedT:
                continue
            fields = (aln.id, aln.sID, aln.tID, aln.centerExpression,
                      aln.mismatchStatus, aln.numMismatches,
                      aln.transcriptOverlap)
            print('%s %s %s %s %s %s %s' % fields)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(probeAlignments, sys.argv)
Ejemplo n.º 45
0
        rects1 = ax.bar(ind, menMeans, width, color='r')

        womenMeans = (49344, 43652, 40213, 36490, 25724, 6236, 5237, 4639, 4108, 2774)
        womenStd =   (220,209,202,191,152,13,12,11,11,9)
        rects2 = ax.bar(ind+width, womenMeans, width, color='y', yerr=womenStd)

        # add some
        ax.set_ylabel('Total Number of Targets')
        ax.set_xlabel('descriptor (a:b:c)\n a = # bp from center where NO mismatches allowed\n b = # of bp from center where at least c% of degradome expression must be found')
        ax.set_title('Total SNR')
        ax.set_xticks(ind+width)
        ax.set_xticklabels( ('4.6.30', '4.6.50', '4.6.60', '4.6.70', '4.6.90', '6.6.30', '6.6.50', '6.6.60', '6.6.70', '6.6.90'))
        ax.axis([0,10,0,70000])

        ax.legend( (rects1[0], rects2[0]), ('Observed', 'Simulated') )

        def autolabel(rects):
                # attach some text labels
                for rect in rects:
                        height = rect.get_height()
                        ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height), ha='center', va='bottom')

        autolabel(rects1)
        autolabel(rects2)

        plt.show()

if __name__ == "__main__":
        import sys
        bioLibCG.submitArgs(plotBarSNR, sys.argv)
Ejemplo n.º 46
0
import bioLibCG
import compareData
import cgEdit


def overlapWithDegradome(dFN, eFN):
    """Print how many overlaps exist between editing sites and padded degradome regions.

    dFN -- tab-separated file; the second column of each line is split with
           bioLibCG.tccSplit into chrom, strand, start and end.
    eFN -- file handed to cgEdit.loadEditingSites.

    Each degradome region is widened by 3 on both sides before the comparison.
    The first five widened coordinates are printed as a sanity check, followed
    by the number of overlaps returned by compareData.compareTwoTcc.
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # 'with' releases the handle even when a line fails to parse
    # (the original never closed the file).
    with open(dFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    print(degTccs[0:5])
    eTccs = [eSite.tcc for eSite in eSites]

    # NOTE(review): the meaning of the third argument (1) is defined by
    # compareData.compareTwoTcc -- confirm there.
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)

    print(len(overlaps))


if __name__ == "__main__":
    # Executed only when run as a script, not on import.
    import sys
    # overlapWithDegradome takes two parameters (dFN, eFN).
    # NOTE(review): presumably submitArgs fills them from argv -- confirm in bioLibCG.
    bioLibCG.submitArgs(overlapWithDegradome, sys.argv)
import bioLibCG

def targetCount(fN):
    """Count how often each target occurs in the fifth column of a tab-separated file.

    fN -- path of the input file; column five of every line is a
          comma-separated list of targets.

    Prints one 'target<TAB>count' line per distinct target, in dictionary
    iteration order. Raises IndexError for a line with fewer than five columns.
    """
    targetDict = {}
    # 'with' closes the file deterministically (the original leaked the handle).
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            for target in ls[4].split(','):
                targetDict[target] = targetDict.get(target, 0) + 1

    for target in targetDict:
        print(target + '\t' + str(targetDict[target]))


if __name__ == "__main__":
        # Script entry point; targetCount and sys.argv go to bioLibCG.submitArgs.
        # NOTE(review): argument mapping is done inside submitArgs -- confirm in bioLibCG.
        import sys
        bioLibCG.submitArgs(targetCount, sys.argv)
Ejemplo n.º 48
0
    f = open(degFile, 'r')
    newLines = []
    for line in f:
        ls = line.strip().split('\t')
        degTcc = cg.convertToAS(ls[1])
        chrom, strand, start, end = cg.tccSplit(degTcc)
        if chrom != runningChrom:
            continue

        if strand != runningStrand:
            continue

        inTran = '0'
        for i in xrange(start, end + 1):
            if i in coordSet:
                inTran = '1'
                break

    #update newLines
        newLine = cg.appendToLine(line, inTran, 3)
        newLines.append(newLine)
    f.close()

    f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
    f.writelines(newLines)
    f.close()

if __name__ == "__main__":
    import sys
    # Dispatch by name: the first command-line argument selects a module-level
    # object via globals(), and argv[1:] (the name plus the remaining arguments)
    # is passed along to cg.submitArgs.
    # Raises IndexError when no argument is given and KeyError when the name
    # is not defined at module level.
    cg.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
Ejemplo n.º 49
0
                for i,letter in enumerate(seq):
                        if i == 0: continue

                        if seq[i] == seq[i-1]:
                                cLength5 += 1
                        else:
                                break
                #3'
                cLength = 1
                revSeq = [x for x in reversed(seq)]
                for i,letter in enumerate(revSeq):
                        if i == 0: continue

                        if revSeq[i] == revSeq[i-1]:
                                cLength += 1
                        else:
                                break

                highest = cLength5
                if cLength > cLength5:
                        highest = cLength

                oRNA.endContigLength = highest                        
               
        oDC.commit(id_oRNA)

if __name__ == "__main__":
        import sys
        # Two functions are submitted back to back with the very same argv list.
        # NOTE(review): both calls receive identical arguments -- verify that
        # updateEndContig and updateTotalContig really share a signature and
        # that running both on every invocation is intended.
        bioLibCG.submitArgs(updateEndContig, sys.argv)
        bioLibCG.submitArgs(updateTotalContig, sys.argv)
Ejemplo n.º 50
0
            highRange = range(7, 15)
            lowCheck = 0
            midCheck = 0
            highCheck = 0

            #check mismatches
            for i in lowRange:
                if i in alignment.mismatchPositions:
                    lowCheck = 1
                    break

            for i in midRange:
                if i in alignment.mismatchPositions:
                    midCheck = 1
                    break

            for i in highRange:
                if i in alignment.mismatchPositions:
                    highCheck = 1
                    break

            fOut.write('%s\t%s\t%s\t%s\t%s\n' %
                       (sID, tID, lowCheck, midCheck, highCheck))
    f.close()


if __name__ == "__main__":
    import sys
    # Only markCenterExpression is currently wired to the command line; the
    # markMismatchedPairs call below is kept disabled.
    #bioLibCG.submitArgs(markMismatchedPairs, sys.argv)
    bioLibCG.submitArgs(markCenterExpression, sys.argv)
Ejemplo n.º 51
0
                        mmVal = mmDict[sID][tID]
                        if mmVal == 1:
                                continue

                        #check center Expression
                        centerVal = centerDict[sID][tID]
                        if centerVal < minCenterLevel:
                                continue
	        	
                        newTargetList.append(str(tID))

                if len(newTargetList) < 1: continue 
                newTargets = ','.join(newTargetList)

		#update newLines
	        newLines.append(bioLibCG.appendToLine(line, newTargets, int(updatePosition)))
                
	f.close()
	
	
	#update file
	f = open(outFN, 'w')
	f.writelines(newLines)
	f.close()



if __name__ == "__main__":
        # Script entry point: filterTargets plus the raw argv list are handed
        # to bioLibCG.submitArgs.
        # NOTE(review): presumably submitArgs unpacks argv into the call -- confirm.
        import sys
        bioLibCG.submitArgs(filterTargets, sys.argv)
Ejemplo n.º 52
0
            if fChrom != chrom: continue
            print '  ', fN, fChrom
            f = open(fN, 'r')
            f.readline()  #header
            strand = cg.getBaseFileName(fN).strip().split('.')[-2]
            for line in f:

                lChrom, start, end, val = (line.strip().split('\t'))
                start, end, val = int(start), int(end), int(val)
                if val < 1: continue
                #print start, end, val
                for i in range(start, end):
                    try:
                        hitDict[lChrom][strand][i] += val
                    except (KeyError, TypeError):
                        if not lChrom in hitDict:
                            hitDict[lChrom] = {}
                        if not strand in hitDict[lChrom]:
                            hitDict[lChrom][strand] = {}
                        hitDict[lChrom][strand][i] = val

        #write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)


if __name__ == "__main__":
    import sys

    # makeWigMem is the active command-line target; the mixWig call is disabled.
    # NOTE(review): 'cg' is not bound in the visible part of this script --
    # presumably an alias for bioLibCG; confirm against the file's imports.
    cg.submitArgs(makeWigMem, sys.argv)
    #cg.submitArgs(mixWig, sys.argv)
Ejemplo n.º 53
0
import cgDB
import cgOriginRNA
import bioLibCG

def probeORNA(oDir):
        """Pretty-print every loaded OriginRNA record whose passedFilter attribute is truthy.

        oDir is passed, with the OriginRNA class, to cgDB.dataController; the
        records returned by its load() are then scanned.
        """
        controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        records = controller.load()

        for rna in records.values():
                # guard clause: records that did not pass the filter are skipped
                if not rna.passedFilter:
                        continue
                cgOriginRNA.prettyPrint(rna)


if __name__ == "__main__":
        # Runs only on direct execution; probeORNA and sys.argv are given to
        # bioLibCG.submitArgs.
        # NOTE(review): submitArgs' handling of argv is not visible here -- confirm.
        import sys
        bioLibCG.submitArgs(probeORNA, sys.argv)
Ejemplo n.º 54
0
                for i,letter in enumerate(seq):
                        if i == 0: continue

                        if seq[i] == seq[i-1]:
                                cLength5 += 1
                        else:
                                break
                #3'
                cLength = 1
                revSeq = [x for x in reversed(seq)]
                for i,letter in enumerate(revSeq):
                        if i == 0: continue

                        if revSeq[i] == revSeq[i-1]:
                                cLength += 1
                        else:
                                break

                highest = cLength5
                if cLength > cLength5:
                        highest = cLength

                oNX.endContigLength[oID] = highest                        
               
        oNX.save()

if __name__ == "__main__":
        import sys
        # Both updaters are invoked in sequence, each with the same argv list.
        # NOTE(review): confirm that sharing one argument list between
        # updateEndContig and updateTotalContig is intended.
        bioLibCG.submitArgs(updateEndContig, sys.argv)
        bioLibCG.submitArgs(updateTotalContig, sys.argv)
Ejemplo n.º 55
0
import bioLibCG

def fixup(eFN, tableFN):
    """Append an editing ratio to each table row and fill in missing gene names.

    eFN     -- tab-separated file; columns 1 and 2 form the key 'col1:col2',
               column 4 is the gene name and column 7 the ratio.
    tableFN -- tab-separated table; its key is 'chr' + col1 + ':' + col2.

    For every table row the ratio looked up by key is appended as a new last
    column, and a fourth column equal to 'NONE' is replaced by the gene name
    from eFN. Rows are printed tab-separated.

    Raises KeyError when a table row has no matching key in eFN.
    """
    coord_gName = {}
    coord_eRatio = {}
    # Each file gets its own 'with' block so both handles are closed; the
    # original rebound 'f' and never closed either one.
    with open(eFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            coord = '%s:%s' % (ls[0], ls[1])
            coord_gName[coord] = ls[3]
            coord_eRatio[coord] = ls[6]

    with open(tableFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # table rows carry the chromosome without the 'chr' prefix
            coord = 'chr%s:%s' % (ls[0], ls[1])
            ls.append(coord_eRatio[coord])
            if ls[3] == 'NONE':
                ls[3] = coord_gName[coord]

            print('\t'.join(ls))
                                
        

if __name__ == "__main__":
        # fixup takes two parameters (eFN, tableFN).
        # NOTE(review): presumably bioLibCG.submitArgs supplies them from
        # argv -- confirm in bioLibCG.
        import sys
        bioLibCG.submitArgs(fixup, sys.argv)
Ejemplo n.º 56
0
def createDatabases(targetsFN, wordSize, runName, hasIDs=False):
    """Build and write the word database and the sequence database for a target file.

    targetsFN -- input file with one sequence per line; when hasIDs is set each
                 line is 'id<TAB>sequence', otherwise the zero-based line
                 number is used as the id.
    wordSize  -- word length, converted with int() (may arrive as a string).
    runName   -- passed through to the cgAlign write functions.
    hasIDs    -- the boolean True or the string "True" enables id parsing;
                 every other value disables it.

    Raises ValueError when hasIDs is set and a line does not split into
    exactly two tab-separated fields.
    """
    wordSize = int(wordSize)
    # Accept a real boolean as well as the command-line string: the original
    # comparison (hasIDs == "True") silently turned a boolean True into False.
    hasIDs = (hasIDs is True) or (hasIDs == "True")
    print('using IDs %s' % hasIDs)
    #make sequence list out of targets, make db, write to file
    seqList = []
    print('obtaining sequences')
    # 'with' closes the file even if a malformed line raises
    with open(targetsFN, 'r') as f:
        for i, line in enumerate(f):
            if hasIDs:
                theID, seq = line.strip().split('\t')
            else:
                theID, seq = i, line.strip()
            seqList.append(cgAlign.cgSeq(theID, seq))

    print('making word db')
    wordDB = cgAlign.createWordDatabase(seqList, wordSize)
    cgAlign.writeWordDatabase(wordDB, runName)

    print('making seq db')
    seqDB = cgAlign.createSequenceDatabase(seqList)
    cgAlign.writeSequenceDatabase(seqDB, runName)


if __name__ == "__main__":
    import sys
    # createDatabases converts wordSize with int() and compares hasIDs with
    # the string "True", so string-valued arguments are expected here.
    # NOTE(review): presumably submitArgs passes argv items through as
    # strings -- confirm in bioLibCG.
    bioLibCG.submitArgs(createDatabases, sys.argv)
Ejemplo n.º 57
0
import bioLibCG
import cgGenes3
import cgSeqMod

def testit(gFN):
    """Print the coding mRNA around a fixed position for each transcript that covers it.

    gFN -- file handed to cgGenes3.createGeneSetEditing.

    For every transcript a blank line is printed first. If
    getRelativePositionMRNA(35872409) is not -1, the transcript id, the
    relative index, the mRNA split around that index and its translation
    (using the 'hg19' codon map) follow.
    """
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    # named codonMap so the builtin map() is not shadowed
    codonMap = cgSeqMod.loadCodonMap('hg19')
    for gene in geneSet.genes:
        for transcript in gene.transcripts:
            try:
                print('')
                mRNA = transcript.getMRNA(coding = True)
                i = transcript.getRelativePositionMRNA(35872409)
                if i == -1:
                    continue
                print(transcript.id)
                print(i)
                print('%s %s %s' % (mRNA[:i], mRNA[i], mRNA[i + 1:]))
                print(cgSeqMod.translateRNA(mRNA, codonMap))
            except Exception:
                # Best-effort probe: a transcript that fails is skipped, but
                # KeyboardInterrupt and SystemExit are no longer swallowed as
                # they were by the original bare 'except:'.
                pass


if __name__ == "__main__":
        # Script entry point; testit and sys.argv are passed to bioLibCG.submitArgs.
        # NOTE(review): argument handling lives in submitArgs -- confirm in bioLibCG.
        import sys
        bioLibCG.submitArgs(testit, sys.argv)

Ejemplo n.º 58
0
        #error checks
        if len(c) > (keyPositions[-1] + 1):
            raise NameError("non-keyword arguments can not follow kw args")

        if continuityCheck(keyPositions):
            raise NameError("two keyword designations in a row!")

        for kw in possibleKeywords:
            if c.count(kw) > 1:
                raise NameError("keyword was used twice!")

        #update nonkw
        newNonkw = [x for x in nonkw[:keyPositions[0]]]

        #update kw
        for position in keyPositions:
            key, val = nonkw[position], nonkw[position + 1]
            kw[key] = val

        return dFxn(*newNonkw, **kw)

    return wrapped


if __name__ == "__main__":
    import sys
    # First argument "help" calls bioLibCG.gd with the script path; any other
    # value is treated as the name of a module-level object to run.
    # NOTE(review): sys.argv[1] raises IndexError when the script is started
    # without arguments, and an unknown name raises KeyError from globals().
    if sys.argv[1] == "help":
        bioLibCG.gd(sys.argv[0])
    else:
        # argv[1:] keeps the selected name as the first element
        bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
Ejemplo n.º 59
0
                #3UTR
                if strand == '1':
                        if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                                p.tell('3 is none')
                                c += 1 
                                if codingStatus:
                                        e += 1
                                range3 = ()
                        else:
                                range3 = (cEnd + 1, tEnd)
                else:
                        if cStart == tStart or cStart == tEnd + 1:
                                p.tell('3 is none')
                                c += 1
                                if codingStatus:
                                        e += 1
                                range3 = ()
                        else:
                                range3 = (tStart, cStart - 1)

                a += 1

        print a, b, c, d, e                

if __name__ == "__main__":
        import sys
        # "help" as first argument routes to bioLibCG.gd(script path); otherwise
        # the first argument names the module-level object handed to submitArgs
        # together with argv[1:].
        # NOTE(review): no guard for a missing argument (IndexError) or an
        # unknown name (KeyError).
        if sys.argv[1] == "help":
                bioLibCG.gd(sys.argv[0])
        else:
                bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
Ejemplo n.º 60
0
import bioLibCG
import cgEdit


def getEditInfo(fN, idList):
    """Print id, location, gene and editing ratio for the editing sites listed in a file.

    fN     -- file handed to cgEdit.loadEditingSites.
    idList -- path of a tab-separated file whose first column holds integer
              site ids, one per line.

    Sites are printed in the order their ids appear in idList, as
    'ID chromosome:coordinate gene eRatio'.

    Raises ValueError for a non-integer id and KeyError for an id that is not
    among the loaded sites.
    """
    eSites = cgEdit.loadEditingSites(fN)

    idDict = {}
    for eSite in eSites:
        idDict[eSite.ID] = eSite

    # Locals renamed so the builtins 'list' and 'id' are no longer shadowed;
    # 'with' closes the id file, which the original left open.
    with open(idList, 'r') as f:
        wantedIDs = [int(line.strip().split('\t')[0]) for line in f]

    for siteID in wantedIDs:
        eSite = idDict[siteID]
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome,
                                  eSite.coordinate, eSite.gene, eSite.eRatio))


if __name__ == "__main__":
    import sys
    # getEditInfo takes two parameters (fN, idList).
    # NOTE(review): presumably bioLibCG.submitArgs fills them from argv -- confirm.
    bioLibCG.submitArgs(getEditInfo, sys.argv)