]) header.extend(["shoresl", "shoresr", "shelvesl", "shelvesr"]) header.extend([ "lineContentGene", "sineContentGene", "lineContentPromoter", "sineContentPromoter" ]) for genelist in genelists: header.append(genelist.getFriendlyName()) outputcsv.writerow(header) # if expression file then use those genes for iterator # else use all ensembl genes if exprfile != None: exprCSV = IndexedCSV(exprfile, keyPos=1) iter = exprCSV else: iter = genedata for testid in iter: # look up meth value try: if exprfile != None: ensembl = exprCSV[testid][ensemblidcol] if ensembl not in genedata: # RNA-Seq is not a single unique gene, skip continue
opts, args = getopt.getopt(sys.argv[1:], "", ["gene-expr-transcriptionfactor-file="]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" sys.exit(2) infile = None for o, a in opts: if o=="--gene-expr-transcriptionfactor-file": infile = a print "Matrix Dir:",infile assert infile!=None ensemblidcol = "ensemblid" inCSV = IndexedCSV(infile,keyPos=1) keys = inCSV.keys transcriptionFactors = keys[keys.index("significant")+1:] print transcriptionFactors assert len(set(transcriptionFactors))==len(transcriptionFactors) transcriptionFactorUp = collections.defaultdict(int) transcriptionFactorDown = collections.defaultdict(int) transcriptionFactorNC = collections.defaultdict(int) totalUp = 0 totalDown = 0
sys.argv[1:], "", [ # command args go here "gene-expression-difference=", "read-group-tracking=", "output=" ]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" sys.exit(2) for o, a in opts: if o == "--gene-expression-difference": differences = IndexedCSV(a, key="test_id") elif o == "--read-group-tracking": readgrouptracking = IndexedCSV(a, useRowNumAsKey=True) xlocindex = ColumnIndex(readgrouptracking, "tracking_id") elif o == "--output": outputFile = a else: print "Unknown parameter: " + o + " " + a sys.exit(2) # get a suitable header replicatenames = [] for readtrackingrow in xlocindex["XLOC_000001"]: condition = readgrouptracking[readtrackingrow]["condition"] replicate = readgrouptracking[readtrackingrow]["replicate"] name = condition + "." + replicate
Created on 18 Jan 2011 @author: mcbryan ''' import sys import getopt from sequence.genome import Genome import subprocess import math import csv from filesystem.mkdir import makeDirectory from csvfile.indexedcsv import IndexedCSV build = "hg18" ucscNames = IndexedCSV( "/home/mcbryan/mount/publicdata/hg18/ucsc/ucsc-genes-xref.csv") ################### # isPcr needs: # ~/bin/x86_64/gfServer -stepSize=5 start localhost 20000 ~/mount/publicdata/hg18/assembly/twoBit/hg18.2bit ################### splitString = lambda v, l: [ v[i * l:(i + 1) * l] for i in range(int(math.ceil(len(v) / float(l)))) ] if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], "p:o:", ["primers=", "outputFolder="])
for o, a in opts: if o == "-i": infile = a elif o == "-o": outfile = a elif o == "-g": genelists.append(GeneList(a)) elif o == "-r": if (a.count(",") > 0): a, padding = a.split(",") regions.append(BedIntervalTree(a, padding=int(padding))) else: regions.append(BedIntervalTree(a)) elif o == "-c": controlAffyExpressionData = IndexedCSV(a) #Annotated difference file input elif o == "-e": rnaSeqExpressionData = IndexedCSV(a, key="test_id") elif o == "-a": assembly = a UPSTREAM_PROMOTOR_DIST = 2000 DOWNSTREAM_PROMOTOR_DIST = 2000 writer = csv.writer(open(outfile, "w"), delimiter="\t") genome = Genome(assembly) ###
assert bSample != None assert affyfile != None assert affyfccol != None assert affyexprcol != None assert outputfile != None print affyexprcol genedata = EnsemblGenes(assembly=build) genome = Genome(genomeBuild=build) affyannotation = NetAffxAnnotation(genome=build, cdfname="HG-U133_Plus_2") affyCSV = IndexedCSV(affyfile) affyEnsemblLogFCs = collections.defaultdict(list) affyEnsemblExprs = collections.defaultdict(list) # make a list of genenames in each genelist and store in the genelist for genelist in genelists: genelist.genenames = set() for e in genelist: if e in genedata: genelist.genenames.add(genedata[e].name) for affy in affyCSV: ensembls = affyannotation.getValues(affy, "Ensembl") if len(ensembls) == 1: affyFC = float(affyCSV[affy][affyfccol]) affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log(
"outputfolder=" ]) except getopt.GetoptError, err: # print help information and exit: print str(err) # will print something like "option -a not recognized" print "Usage: xyz" sys.exit(2) isoformFile = None geneFile = None for o, a in opts: if (o=="-G") or (o=="--gtf"): gtfFileLoc = a elif (o=="-i") or (o=="--isoformexp"): isoformFile = IndexedCSV(a,keyPos=0) elif (o=="-g") or (o=="--geneexp"): geneFile = IndexedCSV(a,keyPos=0) elif (o=="-o") or (o=="--outputfolder"): outputFilePrefix = a else: print "Unknown parameter: "+o+" "+a sys.exit(2) makeDirectory(outputFilePrefix) gtfReader = csv.reader(open(gtfFileLoc,"r"),delimiter="\t") class Transcript(list): def __init__(self): self.chr = None
elif o == "--affyfc": affyfc = a elif o == "--rnalogfc": rnalogfc = a elif o == "--rnasignificant": rnasig = a elif o == "--rnakey": rnakeypos = int(a) # read in gene_exp.diff as indexedcsv affyannotation = NetAffxAnnotation(genome="hg18", array="HG-U133_Plus_2", version="29") affyCSV = IndexedCSV(affyfile) rnaseqCSV = IndexedCSV(rnaseqfile, keyPos=rnakeypos) fig = plt.figure(figsize=(12, 12), dpi=100) ax = fig.add_subplot(111) affyEnsemblLogFCs = collections.defaultdict(list) affyEnsemblSig = collections.defaultdict(bool) for affy in affyCSV: ensembls = affyannotation.getValues(affy, "Ensembl") if len(ensembls) == 1: affyFC = float(affyCSV[affy][affyfc]) affylogFC = math.log(affyFC) if affyFC > 0.0 else math.log( abs(affyFC)) * -1.0 affyEnsemblLogFCs[ensembls[0]].append(affylogFC)
def __init__(self, assembly="hg18"):
    """Load the UCSC known-gene tables for *assembly* and index each transcript.

    Four CSV files are read from ``~/mount/publicdata/<assembly>/ucsc/``:
    ``ucsc-genes-xref.csv``, ``ucsc-genes-knownIsoforms.csv`` (keyed on
    column 1), ``ucsc-knownToEnsembl.csv`` and ``ucsc-knowngenes.csv``.

    For every transcript located on chr1-chr22, chrX or chrY this:

    * stores a ``UCSCTranscript`` under the transcript id in ``self``
      (``self`` is used as a mapping via ``self[transcriptId] = ...``),
    * creates the transcript's ``UCSCCluster`` in ``self.clusters`` on first
      sight and adds the transcript to it,
    * appends the transcript id to
      ``self.reverseEnsemblMappings[ensemblTranscript]``; transcripts with no
      Ensembl mapping are collected under the key ``None``.

    Transcripts on any other chromosome name are skipped.

    Args:
        assembly: genome build name used to locate the data directory.

    Raises:
        AssertionError: if a strand is neither "+" nor "-", if the number of
            exon starts/ends disagrees with ``exonCount``, or if a transcript
            has no entry in the known-isoforms table.
    """
    self.clusters = {}

    baseLocation = os.path.expanduser("~/mount/publicdata/" + assembly +
                                      "/ucsc/")
    xrefs = IndexedCSV(baseLocation + "ucsc-genes-xref.csv")
    clusters = IndexedCSV(baseLocation + "ucsc-genes-knownIsoforms.csv",
                          keyPos=1)
    ensemblMappings = IndexedCSV(baseLocation + "ucsc-knownToEnsembl.csv")

    self.reverseEnsemblMappings = collections.defaultdict(list)

    transcripts = IndexedCSV(baseLocation + "ucsc-knowngenes.csv")

    # Built once, outside the loop: chr1..chr22 plus the sex chromosomes.
    # Anything else found in the table is ignored.
    standardChromosomes = frozenset(
        ["chr%d" % number for number in range(1, 23)] + ["chrX", "chrY"])

    for transcriptId in transcripts:
        record = transcripts[transcriptId]

        chrom = record["chrom"]
        if chrom not in standardChromosomes:
            continue

        strand = record["strand"]
        assert strand == "+" or strand == "-"

        if transcriptId in ensemblMappings:
            ensemblTranscript = ensemblMappings[transcriptId]["value"]
        else:
            ensemblTranscript = None

        start = int(record["txStart"])
        end = int(record["txEnd"])
        exonCount = int(record["exonCount"])

        # Both columns are comma separated.  UCSC terminates the list with a
        # comma, which leaves an empty field after split(); drop empty fields
        # instead of calling remove(''), which raises ValueError when the
        # trailing comma is absent.
        exonStarts = [field for field in record["exonStarts"].split(",")
                      if field != ""]
        exonEnds = [field for field in record["exonEnds"].split(",")
                    if field != ""]

        # make sure it's consistent
        assert len(exonStarts) == len(exonEnds) and exonCount == len(
            exonStarts), str(record)

        # NOTE: the proteinID (not always present) and alignID columns are
        # not used at the moment.

        if transcriptId in xrefs:
            genesymbol = xrefs[transcriptId]["geneSymbol"]
        else:
            genesymbol = None

        assert transcriptId in clusters
        clusterid = clusters[transcriptId]["clusterId"]
        if clusterid not in self.clusters:
            self.clusters[clusterid] = UCSCCluster(clusterid)

        self[transcriptId] = UCSCTranscript(transcriptId, chrom, start, end,
                                            strand, clusterid, exonStarts,
                                            exonEnds, genesymbol,
                                            ensemblTranscript)

        self.reverseEnsemblMappings[ensemblTranscript].append(transcriptId)
        self.clusters[clusterid].addToCluster(self[transcriptId])
def isDownstream(distance, strand): if strand == "+": return 'Y' if distance <= 0 else 'N' elif strand == "-": return 'Y' if distance >= 0 else 'N' else: # wtf went wrong here exit(-1) if not affyComparisonFile == None: #affyMapping = ExtendedBed(os.path.expanduser("~/mount/publicdata/positions2affy/HG-U133Plus2.csv"), chrPos=0, startPos = 2, stopPos=3, defaultkeys=["chr", "strand", "start", "stop", "affy"]) #print affyMapping.getValuesOfOverlappingIntervals("chr16", 72982016, 72983513) affyComparison = IndexedCSV(affyComparisonFile) headerRow = ['Index', 'ColumnID', 'Symbol', 'Chr', 'Mapinfo', 'Coord'] headerRow.extend([ 'PD30.Avg', 'PD56.Avg', 'Fold change', 'Log2MethFC', 'Bonferroni(p-value (PD56 vs. PD30))', 'Meth' ]) headerRow.extend(['In Gene', 'Genes', 'Names', 'Gene Bounds']) headerRow.extend([ TTS_TTS_Distance_Human + ' up or Gene Body', TTS_TTS_Distance_Human + ' up or Gene Body Genes', TTS_TTS_Distance_Human + ' up or Gene Body Names', TTS_TTS_Distance_Human + ' up or Gene Body Gene Bounds' ]) headerRow.extend([ 'Gene TSS Distance',
def addCounts(counts, arg, suffix):
    """Merge the count table stored at ``arg + suffix`` into *counts*.

    The file is opened as an ``IndexedCSV``.  For every row key in it a
    ``Count`` is built from the row's "AssignedCount", "MinCount" and
    "MaxCount" columns and stored at ``counts[key][arg]``.

    Args:
        counts: two-level mapping, row key -> arg -> Count, updated in place.
            Presumably a defaultdict (or pre-populated) so that
            ``counts[key]`` exists before assignment -- verify against caller.
        arg: file name prefix; also used as the inner key in *counts*.
        suffix: string appended to *arg* to form the file path.
    """
    table = IndexedCSV(arg + suffix)
    for key in table:
        record = table[key]
        counts[key][arg] = Count(record["AssignedCount"],
                                 record["MinCount"],
                                 record["MaxCount"])
extend = 150 exprCSV = None outputfile = None upstreamPromotor = 5000 downstreamPromotor = 1000 genelists = [] exactmatch = False genedata = EnsemblGenes(assembly="hg18") for opt, value in opts: if opt == "--gene-expression-file": exprCSV = IndexedCSV(value, keyPos=1) print "Expr File:", value elif opt == "--fccol": fccol = value elif opt == "--exprcols": exprcols = value.split(",") elif opt == "--outputfile": outputfile = value elif opt == "--promotorsize": upstreamPromotor = int(value) downstreamPromotor = int(value) elif opt == "--aSample": aSample = Bed(value, extend=extend) print "A Sample:", value elif opt == "--bSample": bSample = Bed(value, extend=extend)
affyfilelocation = a elif (o == "--rnaseq"): rnaseqfilelocation = a elif (o == "--output"): outputFileLoc = a assert affyfilelocation != None and rnaseqfilelocation != None assert outputFileLoc != None ucscgenedata = UCSC.UCSCTranscripts(assembly="hg18") ensemblgenedata = Ensembl.EnsemblGenes(assembly="hg18", annotation="ncbi36.1") affyannotation = NetAffxAnnotation() affyfile = IndexedCSV(affyfilelocation) rnaseqfile = IndexedCSV(rnaseqfilelocation, keyPos=1) print "Read Files" plotx = [] ploty = [] allaffys = 0 allrnaclusters = 0 finalentries = 0 affySymbols = collections.defaultdict(list) rnaSymbols = collections.defaultdict(list) for affy in affyfile:
elif o in ["-a", "--affyarray"]: affyComparisonFile = a elif o in ["--bed"]: bedTrackLoc = a elif o in ["--bedgraphfc"]: bedGraphFcTrackLoc = a elif o in ["--bedgraphbeta"]: bedGraphBetaTrackLoc = a elif o in ["--bedgraphp"]: bedGraphPTrackLoc = a elif o in ["--genebygene"]: geneByGeneOutputLoc = a elif o in ["--affyvalues"]: affyValueColumns = a.split(",") reader = IndexedCSV(infile) writer = csv.writer(open(outfile, "w"), delimiter="\t") ### def distanceHumanReadable(dist): return str(dist / 1000) + "kb" TSS_TTS_Distance = 1000 SURROUNDING_SEQUENCE_Distance = 250 # each side WINDOW_SIZE = 500 WINDOW_OFFSET = 5
outlocation = None for o, a in opts: if o == "-1": location1 = a elif o == "-2": location2 = a elif o == "-o": outlocation = a elif o == "--key1": key1 = int(a) elif o == "--key2": key2 = int(a) one = IndexedCSV(location1, keyPos=key1) two = IndexedCSV(location2, keyPos=key2) with open(outlocation, "w") as outfile: csvout = csv.writer(outfile, delimiter="\t") header = list(one.keys) header.extend(two.keys) header.insert(0, "Key") csvout.writerow(header) for item in one: row = [item]