def name(fN):
    """Placeholder entry point.

    The original definition had no body at all, which is a syntax error;
    a docstring-only body keeps the signature and makes the module loadable.
    fN is accepted but not used. Returns None.
    """


if __name__ == "__main__":
    import sys

    # The guard below uses bioLibCG, but no import of it is visible with this
    # definition, so it is imported locally to avoid a NameError.
    import bioLibCG

    # "help" as the first argument is routed to bioLibCG.gd; any other value
    # is looked up as a module-level name and dispatched with the remaining
    # command-line arguments.
    if sys.argv[1] == "help":
        bioLibCG.gd(sys.argv[0])
    else:
        bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
import bioLibCG
import cgDB
import cgOriginRNA


def probeMicro(oDir):
    """Print id, sequence, tcc and tccs for every record that passed the filter.

    oDir is handed to cgDB.dataController together with the OriginRNA class.
    The values of the loaded mapping are scanned, and one space-separated
    line is printed for each record whose passedFilter attribute is truthy.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for record in records.values():
        if not record.passedFilter:
            continue
        print('%s %s %s %s' % (record.id, record.sequence,
                               record.tcc, record.tccs))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(probeMicro, sys.argv)
import bioLibCG


def parseTargets(fN):
    """Print the distinct names listed in columns 3 and 4 of each line of fN.

    fN is a tab-separated file. Columns 3 and 4 (zero-based) each hold a
    comma-separated list. For every line the two lists are concatenated and
    each entry is printed once, in first-seen order, skipping the literal
    string 'None'. De-duplication is per line, not across the whole file.
    """
    # 'with' guarantees the handle is closed; the original never closed it.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            micro = ls[3].split(',') + ls[4].split(',')

            # A set gives O(1) membership; printing order is still input order.
            seen = set()
            for m in micro:
                if m == 'None' or m in seen:
                    continue
                print(m)
                seen.add(m)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(parseTargets, sys.argv)
for line in f: lChrom, start, end, val = (line.strip().split('\t')) start, end, val = int(start), int(end), int(val) if val < 1: continue #print start, end, val for i in range(start, end): try: hitDict[lChrom][strand][i] += val except (KeyError,TypeError): if not lChrom in hitDict: hitDict[lChrom] = {} if not strand in hitDict[lChrom]: hitDict[lChrom][strand] = {} hitDict[lChrom][strand][i] = val #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory) if __name__ == "__main__": import sys cg.submitArgs(makeWigMem, sys.argv) #cg.submitArgs(mixWig, sys.argv)
import bioLibCG


def createResultsFile(peakFN, outFN):
    """Write every line of peakFN to outFN prefixed with a running index.

    Each output line is '<index>\\t<stripped input line>\\n', with the index
    starting at 0.
    """
    with open(peakFN, 'r') as f:
        peaks = [x.strip() for x in f]

    # The original never closed the output handle, so flushing depended on
    # interpreter shutdown; 'with' closes it deterministically.
    with open(outFN, 'w') as outF:
        for i, peak in enumerate(peaks):
            outF.write('%s\t%s\n' % (i, peak))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(createResultsFile, sys.argv)
import bioLibCG


def filterDups(fN, oFN):
    """Copy lines of fN to oFN, keeping only the first line per sequence.

    The sequence is column 1 (zero-based) of each tab-separated line. Lines
    are written unchanged and in input order.
    """
    # A set replaces the original list so each membership test is O(1)
    # instead of a linear scan over every sequence seen so far.
    knownSeqs = set()

    with open(oFN, 'w') as outF:
        with open(fN, 'r') as f:
            for line in f:
                seq = line.strip().split('\t')[1]
                if seq in knownSeqs:
                    continue
                outF.write(line)
                knownSeqs.add(seq)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(filterDups, sys.argv)
import bioLibCG
import GenomeFetch


def peakToSeq(peakFN, extend, outFN, assembly):
    """Write one fetched sequence per line of peakFN to outFN.

    Column 0 of each tab-separated line is split with bioLibCG.tccSplit into
    chrom, strand, start and end. The interval is widened by `extend` on both
    sides (a negative value shrinks it), rebuilt with bioLibCG.makeTcc and
    passed to GenomeFetch.getSequence for the given assembly.

    extend is converted with int(), so a numeric string is accepted.
    """
    # extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    gf = GenomeFetch.GenomeFetch(assembly)

    # Neither handle was closed in the original; 'with' closes both.
    with open(outFN, 'w') as outF:
        with open(peakFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
                newTcc = bioLibCG.makeTcc(chrom, strand,
                                          start - extend, end + extend)
                outF.write(gf.getSequence(newTcc) + '\n')


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(peakToSeq, sys.argv)
longestT = transcript # Get the coordinate ends/etc starts, ends = [], [] if longestUTR is None: continue for utrPair in longestUTR: starts.append(utrPair[0]) ends.append(utrPair[1]) starts.sort() ends.sort() startS = ",".join([str(x) for x in starts]) endS = ",".join([str(x) for x in ends]) print "%s\t%s\t%s\t%s\t%s\t%s" % ( transcript.id, transcript.parent, transcript.chromosome, transcript.strand, startS, endS, ) if __name__ == "__main__": import sys bioLibCG.submitArgs(getCoords, sys.argv)
for each duplicate sequence make set of targets for each oID --> set each oid's targets''' oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA) oNX.load(['sequence', 'filteredTargets'], [rn, tn]) knownSeq_targets = {} #create oID groups and target sets. for oID in oNX.sequence: currSeq = oNX.sequence[oID] #add targets to set for tID in oNX.filteredTargets[oID]: knownSeq_targets.setdefault(currSeq, set()).add(tID) for oID in oNX.sequence: currSeq = oNX.sequence[oID] newTargets = list(knownSeq_targets.get(currSeq, set())) oNX.filteredTargets[oID] = newTargets oNX.save() if __name__ == "__main__": import sys bioLibCG.submitArgs(updateSeqDuplicateMultiTcc, sys.argv)
import bioLibCG
import GenomeFetch


def getPeakSequences(peakFN, extend=0):
    """Print the hg19 sequence for every coordinate string in peakFN.

    Each stripped line of peakFN is split with bioLibCG.tccSplit. The
    interval is widened by `extend` on both sides and the sequence is fetched
    with GenomeFetch.get_seq_from_to. extend is converted with int().
    """
    margin = int(extend)

    handle = open(peakFN, 'r')
    coords = [raw.strip() for raw in handle]
    handle.close()

    fetcher = GenomeFetch.GenomeFetch('hg19')
    for coord in coords:
        chrom, strand, lo, hi = bioLibCG.tccSplit(coord)
        lo = lo - margin
        hi = hi + margin
        print(fetcher.get_seq_from_to(chrom, lo, hi, strand))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getPeakSequences, sys.argv)
ls = line.strip().split('\t') ''' for i, text in enumerate(ls): print i, text ''' tID = ls[0] chr = 'chr' + ls[1] strand = ls[4] tss, tse = ls[2], ls[3] css, cse = tss, tss exons = getBracketList(ls[18]) numExons = len(exons) exonStarts = ','.join([x[0] for x in exons]) exonEnds = ','.join([x[1] for x in exons]) geneName = ls[8] geneName = ensID_gID.get(geneName, geneName) stat5 = 'none' stat3 = 'none' tCoding = 'pseudogene_noncoding' unused = 'none' gCoding = 'pseudogene_noncoding' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( tID, chr, strand, tss, tse, css, cse, numExons, exonStarts, exonEnds, geneName, stat5, stat3, tCoding, unused, gCoding) if __name__ == "__main__": import sys bioLibCG.submitArgs(convertPsuedo, sys.argv)
cLength = 1 letters = list(seq) for i, letter in enumerate(letters): if i == 0: continue if letters[i] == letters[i - 1]: cLength += 1 if cLength > highestLength: highestLength = cLength else: cLength = 1 return highestLength def filterContigs(fN, outFN): fOut = open(outFN, 'w') f = open(fN, 'r') for line in f: ls = line.strip().split('\t') seq = ls[1] if getContigLength(seq) < 7: fOut.write(line) if __name__ == "__main__": import sys bioLibCG.submitArgs(filterContigs, sys.argv)
import subprocess
import bioLibCG
import os


def parRun(numParts, memoryAmount, scriptName, *args):
    """Launch scriptName once per part through the qJob/qDo wrapper scripts.

    For each part i in 1..numParts the command is
        $HOME/exec/qJob<memoryAmount>.sh $HOME/exec/qDo.sh scriptName
        <args...> <i> <numParts>
    Each command is started with subprocess.Popen and waited on before the
    next one is started.
    """
    total = int(numParts)

    for part in range(1, total + 1):
        # Pick the qJob wrapper that matches the requested memory amount.
        home = os.environ['HOME']
        jobWrapper = '%s/exec/qJob%s.sh' % (home, memoryAmount)
        doWrapper = '%s/exec/qDo.sh' % (home)

        # Assemble the argument vector: wrappers, script, user args, part info.
        command = [jobWrapper, doWrapper, scriptName]
        command.extend(args)
        command.extend([str(part), str(total)])

        # Run the job and block until it returns.
        subprocess.Popen(command).wait()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(parRun, sys.argv)
codingFlag = None tTypes = [ x[1] for x in transcript.getOverlappingElements(eSite.tcc)] if '3UTR' in tTypes: tType = '3UTR' elif '5UTR' in tTypes: tType = '5UTR' else: tType = tTypes[0] #has to be one thing...exon or intron #This only works because UTR takes precedence over EXON in TYPE. if tType == 'EXON': if codingTranscript: codingFlag = 'C' else: codingFlag = 'NC' else: codingFlag = 'NC' fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (eSite.ID, transcript.parent, transcript.id, tType, codingFlag, transcript.tType)) #fOut.write('%s:%s:%s\t%s\t%s\t%s\t%s\n' % (eSite.chromosome, eSite.strand, eSite.coordinate, transcript.parent, transcript.id, tType, codingFlag)) fOut.close() if __name__ == "__main__": import sys bioLibCG.submitArgs(updateContext, sys.argv)
complexities = [] seqNumTargets = [] for line in f: ls = line.strip().split('\t') id = int(ls[0]) complexity = id_comp[id] numTargets = float(ls[1]) complexities.append(complexity) seqNumTargets.append(numTargets) plt.plot(complexities, seqNumTargets, 'bo', label='simulated sRNA', color='blue') plt.legend() plt.ylabel( 'Number of targets per small RNA (filter: O >0-NoMicroTran, T YesTran4Mis > .55 6bp' ) plt.xlabel('Complexity of smallRNA') plt.title('Origin RNA Target simulation') plt.show() f.close() if __name__ == "__main__": import sys bioLibCG.submitArgs(plotEntropyTargets, sys.argv)
import bioLibCG
from siRnaPredict import getEntropy


def filterEntropy(fN, outFN, minEntropy = 1.15):
    """Copy lines of fN whose sequence entropy exceeds minEntropy to outFN.

    The sequence is column 1 (zero-based) of each tab-separated line and is
    scored with siRnaPredict.getEntropy. A line is kept only when its score
    is strictly greater than minEntropy. minEntropy is converted with
    float(), so a numeric string is accepted.
    """
    minEntropy = float(minEntropy)

    # Neither handle was closed in the original; 'with' closes both.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                seq = line.strip().split('\t')[1]
                if getEntropy(seq) > minEntropy:
                    fOut.write(line)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(filterEntropy, sys.argv)
import bioLibCG


def numTargets(fN):
    """Print each line's id followed by its de-duplicated target list.

    For every tab-separated line, column 0 is the id and column 4 is a
    comma-separated target list. Duplicates are dropped, keeping first-seen
    order, and the result is printed as '<id> <t1,t2,...>'.
    """
    # The original never closed the handle.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            rowID = ls[0]  # renamed from 'id' to stop shadowing the builtin

            # The set gives O(1) membership; the list keeps output order.
            seen = set()
            uniq = []
            for target in ls[4].split(','):
                if target not in seen:
                    seen.add(target)
                    uniq.append(target)

            print('%s %s' % (rowID, ','.join(uniq)))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(numTargets, sys.argv)
for line in f: ls = line.strip().split(' ') sID = int(ls[0]) tID = int(ls[1]) sOffset = int(ls[4]) try: mismatchPositions = [(int(x) + sOffset) for x in ls[9].split(',')] except IndexError: mismatchPositions = [] ss = id_qSeq[sID] ts = id_dSeq[tID] sSpaces = ''.join([' ' for i in list(range(0,sOffset))]) tSpaces = [' ' for i in list(ts)] for i in mismatchPositions: tSpaces[i] = 'X' #graphically display print ts print ''.join(tSpaces) print '%s%s' % (sSpaces, ss) print if __name__ == "__main__": import sys bioLibCG.submitArgs(checkAlignment, sys.argv)
compareList = zip(mCodonList, emCodonList) codonNumber = ePositionInMRNA // 3 codonPair = compareList[codonNumber] bCodon = codonPair[0] aCodon = codonPair[1] baa = cgSeqMod.translateRNA(bCodon, map) aaa = cgSeqMod.translateRNA(aCodon, map) synFlag = 'SYN' if baa != aaa: synFlag = 'NON' bCodonList = list(bCodon) aCodonList = list(aCodon) matchedLetters = zip(bCodonList, aCodonList) for pair in matchedLetters: if pair[0] != 'A': if pair[1] == 'G' and pair[0] != 'G': print 'messed up codon switch', bCodonList, aCodonList print t.parent, '%s:%s' % ( eSite.chromosome, eSite.coordinate ), eSite.strand, bCodon, aCodon, baa, aaa outF.write('\t'.join([ str(eSite.ID), transcript.parent, transcript.id, synFlag, bCodon, aCodon, baa, aaa ]) + '\n') if __name__ == "__main__": import sys bioLibCG.submitArgs(betterSynonymous, sys.argv)
import bioLibCG
import cgAlign


def createDatabases(targetsFN, wordSize, runName):
    """Build and write the word and sequence databases for a targets file.

    Every stripped line of targetsFN becomes a cgAlign.cgSeq whose id is the
    zero-based line number. The word database (built with the given
    wordSize) and the sequence database are each written under runName.
    wordSize is converted with int().
    """
    wordSize = int(wordSize)

    # Turn the targets into a sequence list, then build and write each db.
    handle = open(targetsFN, "r")
    print("obtaining sequences")
    seqList = [cgAlign.cgSeq(idx, raw.strip())
               for idx, raw in enumerate(handle)]
    handle.close()

    print("making word db")
    cgAlign.writeWordDatabase(
        cgAlign.createWordDatabase(seqList, wordSize), runName)

    print("making seq db")
    cgAlign.writeSequenceDatabase(
        cgAlign.createSequenceDatabase(seqList), runName)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(createDatabases, sys.argv)
print a.tTcc, a.id aDC.commit(id_alignment) def appendTranInfo(aDir, degSmallFN): aDC = cgDB.dataController(aDir, cgAlignment) id_alignment = aDC.load() tID_tranVal = {} f = open(degSmallFN, 'r') for line in f: ls = line.strip().split('\t') val = int(ls[3]) if val == 1: val = True if val == 0: val = False tID_tranVal[int(ls[0])] = val for alignment in id_alignment.values(): transcriptOverlap = tID_tranVal[alignment.tID] alignment.transcriptOverlap = transcriptOverlap aDC.commit(id_alignment) if __name__ == "__main__": import sys bioLibCG.submitArgs(loadAlignments, sys.argv)
import bioLibCG
import compareData
import cgEdit


def overlapWithDegradome(dFN, eFN):
    """Print how many editing sites overlap widened degradome intervals.

    Column 1 of each tab-separated line of dFN is split with
    bioLibCG.tccSplit, widened by 3 on each side and rebuilt with
    bioLibCG.makeTcc. The first five widened intervals are printed, then the
    length of compareData.compareTwoTcc(editing tccs, degradome tccs, 1).
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The original never closed the handle.
    with open(dFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    # NOTE(review): the meaning of mode 1 is defined in compareData.
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(overlapWithDegradome, sys.argv)
import bioLibCG


def uniqueColumn(fN, column, whole=False):
    """Print the distinct values of one column of a tab-separated file.

    column is the zero-based column index and is converted with int(). When
    `whole` is truthy, the last full (stripped) line seen for each distinct
    value is printed instead of the value. Output order follows dict
    iteration order.

    NOTE(review): any non-empty string for `whole`, including "False", is
    truthy; this matches the original behaviour.
    """
    colIndex = int(column)  # converted once, not once per line

    u = {}
    # The original never closed the handle.
    with open(fN, 'r') as f:
        for line in f:
            stripped = line.strip()
            u[stripped.split('\t')[colIndex]] = stripped

    if whole:
        for key in u:
            print(u[key])
    else:
        for key in u:
            print(key)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(uniqueColumn, sys.argv)
import bioLibCG
import cgEdit


def getEditInfo(fN, idList):
    """Print ID, location, gene and editing ratio for the requested sites.

    Editing sites are loaded from fN with cgEdit.loadEditingSites and indexed
    by their ID attribute. idList is a tab-separated file whose column 0
    holds integer IDs. One line is printed per requested ID, in file order.
    A requested ID with no matching site raises KeyError.
    """
    eSites = cgEdit.loadEditingSites(fN)
    idDict = dict((eSite.ID, eSite) for eSite in eSites)

    # Renamed from 'list' / 'id' so the builtins are no longer shadowed.
    wantedIDs = []
    # The original never closed the handle.
    with open(idList, 'r') as f:
        for line in f:
            wantedIDs.append(int(line.strip().split('\t')[0]))

    for siteID in wantedIDs:
        eSite = idDict[siteID]
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome,
                                  eSite.coordinate, eSite.gene, eSite.eRatio))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getEditInfo, sys.argv)
def plotContextPie(oDir, contextFN):
    """Print how often each transcript type occurs among filtered records.

    contextFN is a tab-separated file: column 0 is an integer record id and
    column 5 a transcript type. For every record loaded from oDir whose
    passedFilter attribute is truthy, the types listed for its id are
    tallied. One '<type> <count>' line is printed per type.

    Despite the name, this block prints counts and plots nothing. A filtered
    record with no entry in contextFN raises KeyError, as in the original.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    oID_tTypes = {}
    # The original never closed the handle.
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            oID = int(ls[0])  # renamed from 'id' to stop shadowing the builtin
            oID_tTypes.setdefault(oID, []).append(ls[5])

    tType_count = {}
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        for tType in oID_tTypes[oRNA.id]:
            tType_count[tType] = tType_count.get(tType, 0) + 1

    for tType, count in tType_count.items():
        print('%s %s' % (tType, count))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(plotContextPie, sys.argv)
if gType == t: #totalNum += 1.0/lg #theSum += 1.0/lg totalNum += 1.0 theSum += 1 fracs.append(totalNum) labels = [ 'Introns (%s)' % fracs[0], 'Intergenic(%s)' % fracs[1], 'Exons (%s)' % fracs[2], '3\'UTR (%s)' % fracs[3], '5\'UTR (%s)' % fracs[4] ] fracs = [float(x) / theSum for x in fracs] explode = (0.1, 0.1, 0.2, 0.1, 0.1) pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True) title('Editing Site Genomic Location', bbox={ 'facecolor': '1.0', 'pad': 10 }) show() if __name__ == "__main__": import sys bioLibCG.submitArgs(makeContextPieBetter, sys.argv)
import bioLibCG
import matplotlib.pyplot as plt
import siRnaPredict as si


def makeComplexityHist(fN):
    """Show a 30-bin histogram of sequence entropy for 'hsa' entries of fN.

    For each tab-separated line, column 3 is the name and column 4 the
    sequence. Only lines whose name contains 'hsa' are scored with
    siRnaPredict.getEntropy.
    """
    histVals = []
    # The original never closed the handle.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            name, seq = ls[3], ls[4]
            if 'hsa' in name:
                histVals.append(si.getEntropy(seq))

    plt.hist(histVals, 30)
    plt.show()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(makeComplexityHist, sys.argv)
import bioLibCG


def addIDs(fN, outFN):
    """Write each line of fN to outFN prefixed with a zero-based id and a tab.

    Input lines are stripped of surrounding whitespace before being written.
    """
    # The input handle was never closed in the original; both are closed here.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for i, line in enumerate(f):
                fOut.write('%s\t%s\n' % (i, line.strip()))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(addIDs, sys.argv)
newLines = [] for line in f: ls = line.strip().split('\t') degTcc = cg.convertToAS(ls[1]) chrom, strand, start, end = cg.tccSplit(degTcc) if chrom != runningChrom: continue if strand != runningStrand: continue inTran = '0' for i in xrange(start, end + 1): if i in coordSet: inTran = '1' break #update newLines newLine = cg.appendToLine(line, inTran, 3) newLines.append(newLine) f.close() f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w') f.writelines(newLines) f.close() if __name__ == "__main__": import sys cg.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
def subtractTwoTccLists(tccListKeep, tccListOther):
    """Return tccListKeep with the regions covered by tccListOther removed.

    Both lists should already be collapsed; a warning is printed for either
    list that checkIfOverlaps reports as overlapping. Entries of the keep
    list that overlap the other list are replaced by whatever
    recurseSubtract returns for them. Entries that do not overlap are
    returned as they are. The input lists are not modified.
    """
    # Both lists should already be COLLAPSED!!!
    if checkIfOverlaps(tccListKeep):
        print('THE KEEPER LIST HAS OVERLAPS (SUBTRACTION)')
    if checkIfOverlaps(tccListOther):
        print('THE OTHER LIST HAS OVERLAPS (SUBTRACTION)')

    # NOTE(review): modes 1 and 2 presumably select the overlapping members
    # of the first and second list respectively; confirm in compareTwoTcc.
    keepHits = compareTwoTcc(tccListKeep, tccListOther, 1)
    otherHits = compareTwoTcc(tccListKeep, tccListOther, 2)
    remainder = recurseSubtract(keepHits, otherHits)

    # Work on a copy so the caller's list is left untouched.
    result = list(tccListKeep)
    for hit in keepHits:
        result.remove(hit)
    result.extend(remainder)

    return result


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getIndividualOverlaps, sys.argv)
pRuns --run.00 ----oRNA (slave: pRuns/run.00/oRNA) ----aDir --run.01 ''' mDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment) id_masterObj = mDC.load() #recurse through all the runs masterBN = bioLibCG.getBaseFileName(masterDir) for slaveDir in bioLibCG.recursePaths(parDir, end = masterBN): oDC = cgDB.dataController(slaveDir, cgAlignment.cgAlignment) id_slaveObj = oDC.load() id_masterObj = cgDB.mergeTwoObjects(id_masterObj, id_slaveObj, cgOriginRNA.OriginRNA) mDC.commit(id_masterObj) def mergeDir(dirName): cgDB.mergeDirectory(dirName, cgOriginRNA.OriginRNA) if __name__ == "__main__": import sys bioLibCG.submitArgs(mergeDir, sys.argv)
import bioLibCG


def formatFile(fN):
    """Print each line of fN with columns 0 and 11 removed.

    Lines are tab-separated. Columns 1-10 and 12 onward (zero-based) are
    kept in order and re-joined with tabs.
    """
    # The original never closed the handle.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            print('\t'.join(ls[1:11] + ls[12:]))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(formatFile, sys.argv)
midCheck = 0 highCheck = 0 #check mismatches for i in lowRange: if i in alignment.mismatchPositions: lowCheck = 1 break for i in midRange: if i in alignment.mismatchPositions: midCheck = 1 break for i in highRange: if i in alignment.mismatchPositions: highCheck = 1 break fOut.write('%s\t%s\t%s\t%s\t%s\n' % (sID, tID, lowCheck, midCheck, highCheck)) f.close() if __name__ == "__main__": import sys #bioLibCG.submitArgs(markMismatchedPairs, sys.argv) bioLibCG.submitArgs(markCenterExpression, sys.argv)
import bioLibCG


def blankIDs(fN, outFN, numIDs=None):
    '''Write consecutive IDs 0..N-1, one per line, to outFN.

    N is int(numIDs) when numIDs is truthy. Otherwise it is the number of
    lines in fN as reported by bioLibCG.getNumFileLines.'''
    count = int(numIDs) if numIDs else bioLibCG.getNumFileLines(fN)

    idLines = ['%s\n' % n for n in range(0, count)]

    out = open(outFN, 'w')
    out.writelines(idLines)
    out.close()


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(blankIDs, sys.argv)
fracs = [] theSum = 0.0 for t in types: totalNum = 0.0 for gene in geneDict: lg = len(geneDict[gene]) for gType in geneDict[gene]: if gType == t: #totalNum += 1.0/lg #theSum += 1.0/lg totalNum += 1.0 theSum += 1 fracs.append(totalNum) labels = ['Introns (%s)' % fracs[0], 'Intergenic(%s)' % fracs[1], 'Exons (%s)' % fracs[2], '3\'UTR (%s)' % fracs[3], '5\'UTR (%s)' % fracs[4]] fracs = [float(x)/theSum for x in fracs] explode=(0.1, 0.1, 0.2, 0.1, 0.1) pie(fracs, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True) title('Editing Site Genomic Location', bbox={'facecolor':'1.0', 'pad':10}) show() if __name__ == "__main__": import sys bioLibCG.submitArgs(makeContextPieBetter, sys.argv)
import bioLibCG
import cgEdit
import GenomeFetch


def getFolded(fN):
    """Print a '> ID' header and the flanking hg19 sequence for each site.

    Editing sites are loaded from fN with cgEdit.loadEditingSites. For each
    site the sequence from coordinate-200 to coordinate+200 is fetched with
    GenomeFetch.get_seq_from_to. No folding is done in this block.
    """
    sites = cgEdit.loadEditingSites(fN)
    fetcher = GenomeFetch.GenomeFetch('hg19')

    for site in sites:
        # Take +/- 200 bp around the editing site.
        chrom = site.chromosome
        strand = site.strand
        center = site.coordinate
        flank = fetcher.get_seq_from_to(chrom, center - 200,
                                        center + 200, strand)

        print('> %s' % (site.ID,))
        print(flank)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getFolded, sys.argv)
import bioLibCG


def countUniqueID(fN):
    """Print the number of distinct column-0 IDs in fN, then one line per ID.

    For each ID the last (stripped) line seen with that ID is the one
    printed. Output order follows dict iteration order.
    """
    handle = open(fN, 'r')
    lastLineByID = {}
    for raw in handle:
        text = raw.strip()
        key = text.split('\t')[0]
        lastLineByID[key] = text
    handle.close()

    print(len(lastLineByID))
    for key in lastLineByID:
        print(lastLineByID[key])


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(countUniqueID, sys.argv)
----oRNA (slave: pRuns/run.00/oRNA) ----aDir --run.01 ''' mDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment) id_masterObj = mDC.load() #recurse through all the runs masterBN = bioLibCG.getBaseFileName(masterDir) for slaveDir in bioLibCG.recursePaths(parDir, end=masterBN): oDC = cgDB.dataController(slaveDir, cgAlignment.cgAlignment) id_slaveObj = oDC.load() id_masterObj = cgDB.mergeTwoObjects(id_masterObj, id_slaveObj, cgOriginRNA.OriginRNA) mDC.commit(id_masterObj) def mergeDir(dirName): cgDB.mergeDirectory(dirName, cgOriginRNA.OriginRNA) if __name__ == "__main__": import sys bioLibCG.submitArgs(mergeDir, sys.argv)
f.readline() for line in f: ls = line.strip().split('\t') ''' for i, text in enumerate(ls): print i, text ''' tID = ls[0] chr = 'chr' + ls[1] strand = ls[4] tss, tse = ls[2], ls[3] css, cse = tss, tss exons = getBracketList(ls[18]) numExons = len(exons) exonStarts = ','.join([x[0] for x in exons]) exonEnds = ','.join([x[1] for x in exons]) geneName = ls[8] geneName = ensID_gID.get(geneName, geneName) stat5 = 'none' stat3 = 'none' tCoding = 'pseudogene_noncoding' unused = 'none' gCoding = 'pseudogene_noncoding' print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (tID, chr, strand, tss, tse, css, cse, numExons, exonStarts, exonEnds, geneName, stat5, stat3, tCoding, unused, gCoding) if __name__ == "__main__": import sys bioLibCG.submitArgs(convertPsuedo, sys.argv)
codingTranscript = '_coding' in transcript.tType tType = None codingFlag = None tTypes = [x[1] for x in transcript.getOverlappingElements(oRNA.tcc)] #categorize border types tType = ds.spotItem(tTypes) if tType == 'EXON' or 'EXON_INTRON': if codingTranscript: codingFlag = 'C' else: codingFlag = 'NC' else: codingFlag = 'NC' oRNA.transcriptIDs.append(transcript.id) oRNA.transcriptContexts.append(tType) oRNA.transcriptTypes.append(transcript.tType) oRNA.transcriptCodingTypes.append(codingFlag) oDC.commit(id_oRNA) if __name__ == "__main__": import sys bioLibCG.submitArgs(updateContext, sys.argv)
import bioLibCG
import cgDB
import cgOriginRNA


def getSeqs(oDir):
    """Print the id of every record in oDir that survives three checks.

    A record is skipped when it is flagged as a sequence duplicate, when its
    totalContigLength exceeds 6, or when its endContigLength exceeds 6.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for key, record in records.items():
        # The checks are evaluated in the original order and short-circuit.
        rejected = (record.sequenceDuplicate
                    or record.totalContigLength > 6
                    or record.endContigLength > 6)
        if not rejected:
            print("%s" % key)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(getSeqs, sys.argv)
import bioLibCG
import cgDB
import cgAlignment


def probeAlignments(aDir):
    """Print details of alignments matching the hard-coded (sID, tID) pairs.

    Alignments are loaded from aDir. For each one whose sID and tID equal a
    probe pair, a space-separated line is printed with its id, sID, tID,
    centerExpression, mismatchStatus, numMismatches and transcriptOverlap.
    """
    probePairs = [[6, 35934]]

    controller = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    alignments = controller.load()

    for aln in alignments.values():
        for wantedS, wantedT in probePairs:
            if aln.sID != wantedS or aln.tID != wantedT:
                continue
            print('%s %s %s %s %s %s %s' % (
                aln.id, aln.sID, aln.tID, aln.centerExpression,
                aln.mismatchStatus, aln.numMismatches, aln.transcriptOverlap))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(probeAlignments, sys.argv)
rects1 = ax.bar(ind, menMeans, width, color='r') womenMeans = (49344, 43652, 40213, 36490, 25724, 6236, 5237, 4639, 4108, 2774) womenStd = (220,209,202,191,152,13,12,11,11,9) rects2 = ax.bar(ind+width, womenMeans, width, color='y', yerr=womenStd) # add some ax.set_ylabel('Total Number of Targets') ax.set_xlabel('descriptor (a:b:c)\n a = # bp from center where NO mismatches allowed\n b = # of bp from center where at least c% of degradome expression must be found') ax.set_title('Total SNR') ax.set_xticks(ind+width) ax.set_xticklabels( ('4.6.30', '4.6.50', '4.6.60', '4.6.70', '4.6.90', '6.6.30', '6.6.50', '6.6.60', '6.6.70', '6.6.90')) ax.axis([0,10,0,70000]) ax.legend( (rects1[0], rects2[0]), ('Observed', 'Simulated') ) def autolabel(rects): # attach some text labels for rect in rects: height = rect.get_height() ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height), ha='center', va='bottom') autolabel(rects1) autolabel(rects2) plt.show() if __name__ == "__main__": import sys bioLibCG.submitArgs(plotBarSNR, sys.argv)
import bioLibCG
import compareData
import cgEdit


def overlapWithDegradome(dFN, eFN):
    """Print how many editing sites overlap widened degradome intervals.

    Column 1 of each tab-separated line of dFN is split with
    bioLibCG.tccSplit, widened by 3 on each side and rebuilt with
    bioLibCG.makeTcc. The first five widened intervals are printed, then the
    length of compareData.compareTwoTcc(editing tccs, degradome tccs, 1).
    """
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # The original never closed the handle.
    with open(dFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    # NOTE(review): the meaning of mode 1 is defined in compareData.
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(overlapWithDegradome, sys.argv)
import bioLibCG


def targetCount(fN):
    """Print how many times each target occurs in column 4 of fN.

    Column 4 (zero-based) of each tab-separated line is a comma-separated
    target list. Every occurrence is counted, including repeats within one
    line. Output is '<target>\\t<count>' in dict iteration order.
    """
    targetDict = {}
    # The original never closed the handle.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            for target in ls[4].split(','):
                targetDict[target] = targetDict.get(target, 0) + 1

    for target in targetDict:
        print(target + '\t' + str(targetDict[target]))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(targetCount, sys.argv)
f = open(degFile, 'r') newLines = [] for line in f: ls = line.strip().split('\t') degTcc = cg.convertToAS(ls[1]) chrom, strand, start, end = cg.tccSplit(degTcc) if chrom != runningChrom: continue if strand != runningStrand: continue inTran = '0' for i in xrange(start, end + 1): if i in coordSet: inTran = '1' break #update newLines newLine = cg.appendToLine(line, inTran, 3) newLines.append(newLine) f.close() f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w') f.writelines(newLines) f.close() if __name__ == "__main__": import sys cg.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
for i,letter in enumerate(seq): if i == 0: continue if seq[i] == seq[i-1]: cLength5 += 1 else: break #3' cLength = 1 revSeq = [x for x in reversed(seq)] for i,letter in enumerate(revSeq): if i == 0: continue if revSeq[i] == revSeq[i-1]: cLength += 1 else: break highest = cLength5 if cLength > cLength5: highest = cLength oRNA.endContigLength = highest oDC.commit(id_oRNA) if __name__ == "__main__": import sys bioLibCG.submitArgs(updateEndContig, sys.argv) bioLibCG.submitArgs(updateTotalContig, sys.argv)
highRange = range(7, 15) lowCheck = 0 midCheck = 0 highCheck = 0 #check mismatches for i in lowRange: if i in alignment.mismatchPositions: lowCheck = 1 break for i in midRange: if i in alignment.mismatchPositions: midCheck = 1 break for i in highRange: if i in alignment.mismatchPositions: highCheck = 1 break fOut.write('%s\t%s\t%s\t%s\t%s\n' % (sID, tID, lowCheck, midCheck, highCheck)) f.close() if __name__ == "__main__": import sys #bioLibCG.submitArgs(markMismatchedPairs, sys.argv) bioLibCG.submitArgs(markCenterExpression, sys.argv)
mmVal = mmDict[sID][tID] if mmVal == 1: continue #check center Expression centerVal = centerDict[sID][tID] if centerVal < minCenterLevel: continue newTargetList.append(str(tID)) if len(newTargetList) < 1: continue newTargets = ','.join(newTargetList) #update newLines newLines.append(bioLibCG.appendToLine(line, newTargets, int(updatePosition))) f.close() #update file f = open(outFN, 'w') f.writelines(newLines) f.close() if __name__ == "__main__": import sys bioLibCG.submitArgs(filterTargets, sys.argv)
if fChrom != chrom: continue print ' ', fN, fChrom f = open(fN, 'r') f.readline() #header strand = cg.getBaseFileName(fN).strip().split('.')[-2] for line in f: lChrom, start, end, val = (line.strip().split('\t')) start, end, val = int(start), int(end), int(val) if val < 1: continue #print start, end, val for i in range(start, end): try: hitDict[lChrom][strand][i] += val except (KeyError, TypeError): if not lChrom in hitDict: hitDict[lChrom] = {} if not strand in hitDict[lChrom]: hitDict[lChrom][strand] = {} hitDict[lChrom][strand][i] = val #write results to wig file writeWigFromHitDict(hitDict, assembly, name, directory) if __name__ == "__main__": import sys cg.submitArgs(makeWigMem, sys.argv) #cg.submitArgs(mixWig, sys.argv)
import cgDB
import cgOriginRNA
import bioLibCG


def probeORNA(oDir):
    """Pretty-print every record in oDir whose passedFilter flag is truthy.

    Records are loaded through cgDB.dataController and printed with
    cgOriginRNA.prettyPrint.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for record in records.values():
        if not record.passedFilter:
            continue
        cgOriginRNA.prettyPrint(record)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(probeORNA, sys.argv)
for i,letter in enumerate(seq): if i == 0: continue if seq[i] == seq[i-1]: cLength5 += 1 else: break #3' cLength = 1 revSeq = [x for x in reversed(seq)] for i,letter in enumerate(revSeq): if i == 0: continue if revSeq[i] == revSeq[i-1]: cLength += 1 else: break highest = cLength5 if cLength > cLength5: highest = cLength oNX.endContigLength[oID] = highest oNX.save() if __name__ == "__main__": import sys bioLibCG.submitArgs(updateEndContig, sys.argv) bioLibCG.submitArgs(updateTotalContig, sys.argv)
import bioLibCG


def fixup(eFN, tableFN):
    """Print tableFN rows with the editing ratio appended and gene names filled.

    eFN is tab-separated. Columns 0 and 1 form the key '<c0>:<c1>', column 3
    is the gene name and column 6 the editing ratio. For each tableFN row the
    key 'chr<c0>:<c1>' is looked up, the ratio is appended as a new column,
    and column 3 is replaced by the gene name when it holds 'NONE'. A row
    whose key is absent from eFN raises KeyError.
    """
    coord_gName = {}
    coord_eRatio = {}

    # The original rebound 'f' without closing either file.
    with open(eFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            coord = '%s:%s' % (ls[0], ls[1])
            coord_gName[coord] = ls[3]
            coord_eRatio[coord] = ls[6]

    with open(tableFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            # Table rows carry the chromosome without the 'chr' prefix.
            coord = 'chr%s:%s' % (ls[0], ls[1])
            ls.append(coord_eRatio[coord])
            if ls[3] == 'NONE':
                ls[3] = coord_gName[coord]
            print('\t'.join(ls))


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(fixup, sys.argv)
def createDatabases(targetsFN, wordSize, runName, hasIDs=False):
    """Build and write the word and sequence databases for a targets file.

    targetsFN holds one target per line. When hasIDs is enabled each line
    must be '<id>\\t<sequence>'. Otherwise the whole stripped line is the
    sequence and the zero-based line number is used as its id. wordSize is
    converted with int(). Both databases are written under runName.

    hasIDs is enabled by the boolean True or the string "True". The original
    compared only against the string, so a real boolean True was silently
    treated as False.
    """
    wordSize = int(wordSize)
    hasIDs = (str(hasIDs) == "True")
    print('using IDs %s' % (hasIDs,))

    # Make the sequence list out of the targets, then build and write each db.
    seqList = []
    print('obtaining sequences')
    with open(targetsFN, 'r') as f:
        for i, line in enumerate(f):
            if hasIDs:
                theID, seq = line.strip().split('\t')
            else:
                theID, seq = i, line.strip()
            seqList.append(cgAlign.cgSeq(theID, seq))

    print('making word db')
    wordDB = cgAlign.createWordDatabase(seqList, wordSize)
    cgAlign.writeWordDatabase(wordDB, runName)

    print('making seq db')
    seqDB = cgAlign.createSequenceDatabase(seqList)
    cgAlign.writeSequenceDatabase(seqDB, runName)


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(createDatabases, sys.argv)
import bioLibCG
import cgGenes3
import cgSeqMod


def testit(gFN, position=35872409):
    """Print the coding mRNA around a genomic position for each transcript.

    For every transcript of every gene loaded from gFN, the position is
    mapped into the coding mRNA with getRelativePositionMRNA. Transcripts for
    which that returns -1 are skipped. For the rest, the transcript id, the
    relative index, the mRNA split around that index and its translation are
    printed. A blank line is printed before each transcript is examined.

    position defaults to the value that was hard-coded in the original and
    is converted with int(), so a numeric string is accepted.
    """
    position = int(position)

    geneSet = cgGenes3.createGeneSetEditing(gFN)
    # Renamed from 'map' so the builtin is no longer shadowed.
    codonMap = cgSeqMod.loadCodonMap('hg19')

    for gene in geneSet.genes:
        for transcript in gene.transcripts:
            try:
                print('')
                mRNA = transcript.getMRNA(coding = True)
                i = transcript.getRelativePositionMRNA(position)
                if i == -1:
                    continue
                print(transcript.id)
                print(i)
                print('%s %s %s' % (mRNA[:i], mRNA[i], mRNA[i + 1:]))
                print(cgSeqMod.translateRNA(mRNA, codonMap))
            except Exception:
                # Best-effort scan: a transcript that fails is skipped. The
                # original bare 'except:' also swallowed KeyboardInterrupt
                # and SystemExit; those now propagate.
                continue


if __name__ == "__main__":
    import sys
    bioLibCG.submitArgs(testit, sys.argv)
#error checks if len(c) > (keyPositions[-1] + 1): raise NameError("non-keyword arguments can not follow kw args") if continuityCheck(keyPositions): raise NameError("two keyword designations in a row!") for kw in possibleKeywords: if c.count(kw) > 1: raise NameError("keyword was used twice!") #update nonkw newNonkw = [x for x in nonkw[:keyPositions[0]]] #update kw for position in keyPositions: key, val = nonkw[position], nonkw[position + 1] kw[key] = val return dFxn(*newNonkw, **kw) return wrapped if __name__ == "__main__": import sys if sys.argv[1] == "help": bioLibCG.gd(sys.argv[0]) else: bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])
#3UTR if strand == '1': if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1: p.tell('3 is none') c += 1 if codingStatus: e += 1 range3 = () else: range3 = (cEnd + 1, tEnd) else: if cStart == tStart or cStart == tEnd + 1: p.tell('3 is none') c += 1 if codingStatus: e += 1 range3 = () else: range3 = (tStart, cStart - 1) a += 1 print a, b, c, d, e if __name__ == "__main__": import sys if sys.argv[1] == "help": bioLibCG.gd(sys.argv[0]) else: bioLibCG.submitArgs(globals()[sys.argv[1]], sys.argv[1:])