def process(samples, cancer, infile, flog, mapping, fout):
    """Append the segments of one single-sample file to ``fout``.

    ``infile`` is read as one header line followed by tab-separated rows of
    ``sample, chr, start, end, numMark, segMean``.  The id found in the
    first data row is translated through ``mapping`` and screened; when it
    passes, it is appended to ``samples`` and every row is written to
    ``fout`` as ``sample <tab> chrom <tab> start <tab> end <tab> segMean``.

    Nothing is written when the id is missing from ``mapping`` or already
    present in ``samples`` (both cases are reported to ``flog`` and
    stdout), or when TCGAUtil screens it out: a non-"TCGA" id listed in
    ``UUID_NORMAL_CELLLINE``, ``barcode_SampleType() == False``, or a
    sample type code of "10", "11" or "20".

    Returns None in every case.
    """
    # one sample a file: read it once and release the handle right away
    # (the original reopened the file and never closed the second handle)
    with open(infile, 'U') as fin:
        lines = fin.readlines()

    # lines[0] is the header; the first data row names the sample
    first = lines[1] if len(lines) > 1 else ""
    sample = first.strip().split("\t")[0]

    if sample not in mapping:
        message = "ERROR sample not in sdrf = " + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print(message)
        return
    sample = mapping[sample]

    if sample in samples:
        message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print(message)
        return

    # Test for barcode or UUID
    # throw out all normals and control Analyte
    if sample[0:4] != "TCGA":
        if sample in TCGAUtil.UUID_NORMAL_CELLLINE:
            return
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            return
        elif sampleTypeCode in ["10", "11", "20"]:  # check TCGAUtil for codes
            return

    samples.append(sample)

    for line in lines[1:]:
        # rstrip("\n") instead of line[:-1]: a last row without a trailing
        # newline must not lose the final character of segMean
        rowSample, chrom, start, end, numMark, segMean = line.rstrip("\n").split("\t")
        rowSample = mapping[rowSample]
        if chrom == "23":
            chrom = "chrX"
        elif chrom == "24":
            chrom = "chrY"
        elif chrom == "M":
            continue  # rows on "M" are dropped
        else:
            chrom = "chr" + chrom
        # coordinates may arrive as floats; truncate to integers
        start = str(int(float(start)))
        end = str(int(float(end)))
        fout.write(rowSample + "\t" + chrom + "\t" + start + "\t" + end + "\t" + segMean + "\n")
def process(c, dataMatrix, allSamples, samples, probes, cancer, infile, flog,
            BETA_POS, offset, maxLength):
    """Load one single-sample file into one column of ``dataMatrix``.

    The sample id is field ``BETA_POS`` of the first line.  Samples already
    in ``allSamples``/``samples`` are reported to ``flog`` and stdout and
    skipped; samples screened out through TCGAUtil are skipped as well.
    Otherwise the sample gets the next column index in ``samples``, is
    recorded in ``allSamples``, the second line is skipped, and every
    remaining row stores ``float(value) + offset`` at
    ``dataMatrix[probes[probe]][samples[sample]]``.  A probe seen for the
    first time gets the next row index in ``probes`` and a fresh row of
    ``maxLength`` empty strings.

    Returns ``c + 1`` when the sample was loaded, else ``c`` unchanged.
    """
    # NOTE(review): source indentation was lost; the matrix store is
    # assumed to sit inside the "value is not a null marker" test.
    nullMarkers = ["", "null", "NULL", "Null", "NA"]

    # one sample a file
    fin = open(infile, 'U')
    header = fin.readline()
    sample = header[:-1].split("\t")[BETA_POS]

    rejected = False
    if sample in allSamples or sample in samples:
        message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print(message)
        rejected = True
    # Test for barcode or UUID; throw out control analytes
    elif sample[0:4] != "TCGA":
        if sample in TCGAUtil.UUID_CELLLINE:
            print("control cell line ignore " + sample)
            rejected = True
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            rejected = True
        elif sampleTypeCode in ["20"]:
            print("control cell line ignore " + sample)
            rejected = True

    if rejected:
        fin.close()
        return c

    column = len(samples)
    samples[sample] = column
    allSamples[sample] = ""
    c += 1

    fin.readline()  # second line carries no data
    for row in fin.readlines():
        fields = row[:-1].split("\t")
        probe = fields[0]
        value = fields[BETA_POS]
        if probe not in probes:
            probes[probe] = len(probes)
            dataMatrix.append([""] * maxLength)
        if value not in nullMarkers:
            dataMatrix[probes[probe]][column] = float(value) + offset
    fin.close()
    return c
def process(c, dataMatrix, allSamples, samples, probes, cancer, infile, flog,
            BETA_POS, offset, maxLength):
    """Read one single-sample file and fill that sample's matrix column.

    Field ``BETA_POS`` of the first line is the sample id.  A duplicate
    (already in ``allSamples`` or ``samples``) is logged to ``flog`` and
    stdout; a sample rejected by the TCGAUtil screening is dropped.  In
    both cases ``c`` is returned unchanged.  Otherwise the sample is
    registered in ``samples`` (next column index) and ``allSamples``, one
    further line is skipped, and for each remaining row the value in field
    ``BETA_POS`` is stored as ``float(value) + offset`` unless it is one of
    "", "null", "NULL", "Null", "NA".  New probes are registered in
    ``probes`` and get a row of ``maxLength`` empty strings.

    Returns the updated sample counter.
    """
    # NOTE(review): original indentation was lost; storing into dataMatrix
    # is assumed to happen only for non-null values.

    def giveUp(handle):
        # every rejection path closes the file and hands back c untouched
        handle.close()
        return c

    # one sample a file
    fin = open(infile, 'U')
    sample = fin.readline()[:-1].split("\t")[BETA_POS]

    if sample in allSamples or sample in samples:
        message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print(message)
        return giveUp(fin)

    # Test for barcode or UUID; throw out control analytes
    if sample[0:4] == "TCGA":
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            return giveUp(fin)
        if sampleTypeCode in ["20"]:
            print("control cell line ignore " + sample)
            return giveUp(fin)
    elif sample in TCGAUtil.UUID_CELLLINE:
        print("control cell line ignore " + sample)
        return giveUp(fin)

    samples[sample] = len(samples)
    allSamples[sample] = ""
    y = samples[sample]

    fin.readline()  # skip the line after the header
    for line in fin.readlines():
        data = line[:-1].split("\t")
        probe, value = data[0], data[BETA_POS]
        if probe not in probes:
            probes[probe] = len(probes)
            dataMatrix.append([""] * maxLength)
        if value in ("", "null", "NULL", "Null", "NA"):
            continue
        dataMatrix[probes[probe]][y] = float(value) + offset
    fin.close()
    return c + 1
def process(dataMatrix, samples, cancer, infile, mapping, flog):
    """Load one single-sample, two-column file into ``dataMatrix``.

    The raw sample id is the second tab-separated field of the first line
    and is translated through ``mapping``.  Ids missing from ``mapping``
    or already in ``samples`` are reported to ``flog`` and stdout; ids
    rejected by the TCGAUtil screening are dropped.  Otherwise the id is
    appended to ``samples``, one more line is skipped, and every remaining
    ``key <tab> value`` row is stored as ``dataMatrix[key][sample]``, with
    "", "null", "NULL", "Null" and "NA" all stored as "NA".

    Returns None.
    """
    # one sample a file
    fin = open(infile, 'U')
    header = fin.readline()
    sample = header.strip().split("\t")[1]

    problem = None
    if sample not in mapping:
        problem = "ERROR sample not in sdrf = "
    else:
        sample = mapping[sample]
        if sample in samples:
            problem = "ERROR duplicated sample = "
    if problem is not None:
        fin.close()
        message = problem + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print(message)
        return

    # Test for barcode or UUID; throw out control analytes
    if sample[0:4] != "TCGA":
        if sample in TCGAUtil.UUID_CELLLINE:
            print("control cell line ignore " + sample)
            fin.close()
            return
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            fin.close()
            return
        if sampleTypeCode in ["20"]:
            fin.close()
            print("control cell line ignore " + sample)
            return

    samples.append(sample)
    fin.readline()  # skip the line after the header
    for row in fin.readlines():
        hugo, value = row[:-1].split("\t")
        if value in ["", "null", "NULL", "Null", "NA"]:
            value = "NA"
        dataMatrix.setdefault(hugo, {})[sample] = value
    fin.close()
    return
def add_col_PseudoSample(clinMatrix, col):
    """Copy column ``col`` from each sample row onto its integration-id row.

    For every row with a non-empty value in ``col``, the row id is turned
    into an integration id via ``TCGAUtil.barcode_IntegrationId`` (row ids
    are assumed to be TCGA barcodes).  The value is set on that row if it
    exists, otherwise a new row holding just that value is added.  The
    matrix is validated afterwards and a message is printed on failure.
    """
    # NOTE(review): source indentation was lost; validate() is assumed to
    # run once, after the loop.
    for row in clinMatrix.getROWs():
        st = clinMatrix.getDATA(row, col)
        if st == None or st == "":
            continue
        # assuming sample ids are TCGA barcode
        integration_id = TCGAUtil.barcode_IntegrationId(row)
        if not clinMatrix.hasRow(integration_id):
            clinMatrix.addNewRows([integration_id], {col: st})
        else:
            clinMatrix.setDATA(integration_id, col, st)

    if clinMatrix.validate() == False:
        print("%s %s %s" % ("add pseudoSample clinical infor", col, "fail"))
def betaMean(total, count, samples, probes, cancer, infile, flog):
    """Add one single-sample file's values to a running sum and count.

    The sample id is field ``BETA_POS`` (a module-level constant) of the
    first line.  Duplicates (already in ``samples``) are reported to
    ``flog`` and stdout; samples rejected by the TCGAUtil screening are
    dropped.  Otherwise the sample is registered in ``samples``, one more
    line is skipped, and for each remaining row the probe is registered in
    ``probes`` and the value, unless it is "", "null", "NULL", "Null" or
    "NA", is added to ``total`` while ``count`` is incremented.

    Returns ``(total, count)`` on every path.  The original returned None
    when ``barcode_SampleType()`` was False, which breaks any caller that
    unpacks the result.
    """
    nullMarkers = ["", "null", "NULL", "Null", "NA"]

    # one sample a file; "with" closes the handle on every exit path
    with open(infile, 'r') as fin:
        line = fin.readline()
        sample = line.strip().split("\t")[BETA_POS]

        if sample in samples:
            message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__
            flog.write(message + "\n")
            print(message)
            return total, count

        # Test for barcode or UUID; throw out control analytes
        if sample[0:4] != "TCGA":
            print(sample)
            if sample in TCGAUtil.UUID_CELLLINE:
                print("control cell line ignore " + sample)
                return total, count
        else:
            sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
            if sampleTypeCode == False:  # likely a uuid
                return total, count
            elif sampleTypeCode in ["20"]:
                print("control cell line ignore " + sample)
                return total, count

        samples[sample] = len(samples)

        fin.readline()  # skip the line after the header
        for line in fin:
            # rstrip("\n") rather than line[:-1] so a last row without a
            # trailing newline keeps its final digit
            probe, value = line.rstrip("\n").split("\t")[0:BETA_POS + 1]
            if probe not in probes:
                probes[probe] = len(probes)
            if value in nullMarkers:
                continue
            total = total + float(value)
            count = count + 1

    return total, count
def uuid_2_barcode(clinMatrix, uuidcol, mapDic, flog):
    """Replace uuids in column ``uuidcol`` with barcodes and rename it.

    Values that ``TCGAUtil.is_barcode`` already accepts are left alone.
    Any other value is looked up, lower-cased, in ``mapDic`` and replaced
    throughout the column.  When a KeyError is raised, the row is queued
    for removal and the uuid is reported to stdout and ``flog``.  Queued
    rows are removed in one call, and the column is finally renamed to
    "tcgaBarCode".
    """
    # convert uuid to barcode, if uuid not found, remove the sample
    notFound = []
    for row in clinMatrix.getROWs():
        uuid = clinMatrix.getDATA(row, uuidcol)
        if TCGAUtil.is_barcode(uuid) == True:
            continue
        try:
            # a KeyError from either the lookup or the replacement counts
            # as "not found", exactly as before
            clinMatrix.replaceValueInCol(uuidcol, uuid, mapDic[uuid.lower()])
        except KeyError:
            notFound.append(row)
            print(uuid + " not found")
            flog.write(uuid + " not found\n")

    if notFound and not clinMatrix.removeRows(notFound, True):
        print("fail to validate")

    clinMatrix.replaceColName(uuidcol, "tcgaBarCode")
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Register ``samples`` in ``sMap`` and index them by integration id.

    For each sample id:

    * an id not starting with "TCGA" is treated as a uuid and looked up,
      lower-cased, in ``aliquote_dic``; the barcode found is linked to the
      uuid in ``sMap`` and used from there on;
    * the barcode is split on "-" and progressively longer prefixes are
      linked parent -> child in ``sMap``;
    * ``intDic[integration id]`` collects the original ids and
      ``sampleDic[original id]`` records the integration id, as returned
      by ``TCGAUtil.barcode_IntegrationId``.

    A uuid missing from ``aliquote_dic`` is printed and skipped, the same
    way ``TCGASampleMap`` handles it.  The original fell through and used
    ``TCGAbarcode`` anyway: a NameError on the first sample, or a link to
    the previous sample's barcode later on.
    """
    for sample in samples:
        # TCGA uuid handling
        uuid = sample
        if sample[0:4] != "TCGA":
            key = sample.lower()
            if key not in aliquote_dic:
                print(sample)
                continue
            TCGAbarcode = aliquote_dic[key]
            sMap.addLink(TCGAbarcode, sample)
            sample = TCGAbarcode

        # do TCGA barcode trick
        parts = sample.split("-")
        parent = "-".join(parts[0:3])
        if len(parts) > 3 and len(parts[3]) == 3:
            # a three-character 4th field is linked in two steps: first its
            # two leading characters, then the whole field
            child = parent + "-" + parts[3][0:2]
            sMap.addLink(parent, child)
            parent = child
            child = "-".join(parts[0:4])
            sMap.addLink(parent, child)
            parent = child
        # NOTE(review): when the 4th field is not three characters long it
        # is never linked and the chain continues from the 3-field prefix.
        for part in parts[4:]:
            child = parent + "-" + part
            # add parent child
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(sample)
        intDic.setdefault(intID, []).append(uuid)
        sampleDic[uuid] = intID
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Add ``samples`` to ``sMap`` and fill the two id lookup tables.

    Ids that do not start with "TCGA" are treated as uuids: the barcode is
    taken from ``aliquote_dic`` (keyed by the lower-cased uuid), linked to
    the uuid in ``sMap``, and then processed like any other barcode.  The
    barcode is split on "-" and each longer prefix is linked to the
    previous one.  ``intDic`` maps an integration id (from
    ``TCGAUtil.barcode_IntegrationId``) to the list of original ids that
    produced it; ``sampleDic`` maps each original id to its integration id.

    Fix: a uuid that is not in ``aliquote_dic`` is now printed and skipped.
    Previously the code carried on with an unbound or stale
    ``TCGAbarcode``, which either raised NameError or linked the uuid to
    the wrong barcode.
    """
    for original in samples:
        barcode = original

        # TCGA uuid handling
        if original[0:4] != "TCGA":
            lowered = original.lower()
            if lowered not in aliquote_dic:
                print(original)
                continue
            barcode = aliquote_dic[lowered]
            sMap.addLink(barcode, original)

        # do TCGA barcode trick: build the parent -> child chain
        parts = barcode.split("-")
        chain = ["-".join(parts[0:3])]
        if len(parts) > 3 and len(parts[3]) == 3:
            # three-character 4th field: two leading characters first,
            # then the complete field
            chain.append(chain[0] + "-" + parts[3][0:2])
            chain.append("-".join(parts[0:4]))
        # NOTE(review): a 4th field of any other length is left out of
        # the chain, as in the original.
        for part in parts[4:]:
            chain.append(chain[-1] + "-" + part)
        for parent, child in zip(chain, chain[1:]):
            sMap.addLink(parent, child)

        intID = TCGAUtil.barcode_IntegrationId(barcode)
        if intID in intDic:
            intDic[intID].append(original)
        else:
            intDic[intID] = [original]
        sampleDic[original] = intID
def CAVMid(dir, outDir, cancer, log, REALRUN):
    """Copy datasets found under ``dir`` to ``outDir`` with ids rewritten.

    Datasets are discovered with ``cgWalk`` and grouped by sample map.

    * clinicalMatrix / mutationVector: the ``.json`` (and any
      ":clinicalFeature" file it names) is copied.  Unless ``REALRUN`` is
      -1, or 0 for a mutationVector, the data file is rewritten so that the
      first column holds ``sampleDic[sample]``; rows whose id has no usable
      entry are copied unchanged.
    * genomicMatrix: the ``.json`` is copied and, when ``REALRUN`` is 1,
      the matrix is handed to ``process`` together with the id tables.

    Exits the interpreter if a genomicMatrix header contains an empty
    sample id.  ``log`` is not used.
    """
    # NOTE(review): indentation was lost in the source; nesting below is
    # reconstructed from the data flow.
    print("%s %s" % (cancer, sys._getframe().f_code.co_name))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # removeExistMaps
    for mapName in existMaps:
        if mapName not in missingMaps:
            missingMaps[mapName] = existMaps[mapName]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # unused below; call kept in case it has side effects (TODO confirm)
    sample_dic = TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.system("mkdir " + outDir)

    for mapName in missingMaps:
        print(mapName)
        sMap = SampleMapNew(None, mapName)
        for name in missingMaps[mapName]:
            samples = []
            intDic = {}     # keyed on CAVMid
            sampleDic = {}  # keyed on original sample id
            obj = bookDic[name]
            print(obj["name"])

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                with open(outfile + ".json", 'r') as fin:
                    J = json.load(fin)
                if ":clinicalFeature" in J:
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json " + cFoutfile + ".json")

                if REALRUN == -1:
                    continue
                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                # read the data once; the original opened it twice and
                # closed neither handle
                with open(obj['path'], 'r') as fin:
                    header = fin.readline()
                    rows = fin.readlines()

                # ordered, de-duplicated first-column ids
                seen = set()
                for line in rows:
                    sample = line.split("\t")[0]
                    if sample != "" and sample not in seen:
                        seen.add(sample)
                        samples.append(sample)

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                with open(outfile, 'w') as fout:
                    fout.write(header)
                    for line in rows:
                        data = line.split("\t")
                        try:
                            prefix = sampleDic[data[0]] + "\t"
                        except (KeyError, TypeError):
                            # id unknown, or its integration id is not a
                            # string: keep the row as it is (the original
                            # used a bare except here)
                            fout.write(line)
                        else:
                            fout.write(prefix)
                            fout.write("\t".join(data[1:]))

            if obj['type'] == "genomicMatrix":
                with open(obj['path'], 'U') as fin:
                    headerIds = fin.readline()[:-1].split("\t")[1:]
                for sample in headerIds:
                    if sample == "":
                        print("%s %s" % (name, "has bad empty sample id"))
                        sys.exit()
                    samples.append(sample)

                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a one-column clinical matrix giving every sample ``value``.

    When ``REALRUN`` is true, the genomicMatrix datasets under ``inDir``
    are scanned for sample ids (only if exactly one sample map is found;
    otherwise the function returns early), each id is reduced to its
    integration id via TCGAUtil, and ``outDir/cancer/var`` is written with
    one ``integration id <tab> value`` row per id.  The matching ``.json``
    is written afterwards.  With ``doDerived`` set, ``doDerivedCancer`` is
    called for LUAD/LUSC (LUNG), COAD/READ (COADREAD) and GBM/LGG (GBMLGG).
    """
    # NOTE(review): indentation was lost in the source; the data file is
    # assumed to be written inside "if REALRUN" and the json outside it.
    print(inDir)
    print(outDir)

    if REALRUN:
        bookDic = cgWalk(inDir, 1)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # removeExistMaps
        for mapName in existMaps:
            if mapName not in missingMaps:
                missingMaps[mapName] = existMaps[mapName]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        mapName = list(missingMaps.keys())[0]
        print(mapName)

        # mutationVector files are deliberately not scanned (the disabled
        # code was marked "take too long")
        samples = []
        for name in missingMaps[mapName]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            fin = open(obj['path'], 'U')
            for sample in fin.readline()[:-1].split("\t")[1:]:
                if sample == "":
                    print("%s %s" % (name, "has bad empty sample id"))
                    sys.exit()
                if sample not in samples:
                    samples.append(sample)
            fin.close()

        intDic = {}
        for sample in samples:
            # TCGA uuid handling: unknown uuids are passed through as-is
            if sample[0:4] != "TCGA":
                TCGAbarcode = aliquote_dic.get(sample.lower(), sample)
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID == None:  # ids is on patient level above integration level
                continue
            intDic.setdefault(intID, "")

        fout = open(outDir + cancer + "/" + var, "w")
        fout.write("sample\t" + var + "\n")
        for intId in intDic:
            fout.write(intId + "\t" + value + "\n")
        fout.close()

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ")"

    oHandle = open(outDir + cancer + "/" + var + ".json", "w")
    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    if doDerived:
        derivedOf = {
            "LUAD": "LUNG", "LUSC": "LUNG",
            "COAD": "COADREAD", "READ": "COADREAD",
            "GBM": "GBMLGG", "LGG": "GBMLGG",
        }
        if cancer in derivedOf:
            doDerivedCancer(var, outDir, cancer, derivedOf[cancer], REALRUN)
def geneRPKM(inDir, outDir, cancer, flog, PATHPATTERN, suffix, namesuffix,
             dataProducer, REALRUN, clean):
    """Build the miRNA isoform expression dataset for one cancer.

    Steps:

    1. Pick, per archive, the highest-release ``.tar.gz`` in ``inDir``
       whose name contains ``PATHPATTERN`` and the module-level ``LEVEL``
       and that has a ``.md5`` next to it; unpack it into ``tmpDir`` when
       ``REALRUN`` and ``clean`` are set.
    2. When ``REALRUN`` is set: collect acceptable samples, feed each
       sample file to ``process`` in batches of 2500, write each batch
       with ``outputMatrix``, paste the batch files into
       ``outDir/cancer/namesuffix``, write the probeMap, and derive the
       gene-level matrix with the external conversion script.
    3. Always: write the probeMap, dataset and gene-level ``.json`` files.

    Returns None.  Exits the interpreter when a batch failed to write.

    Fix over the original: the dataset ``.json`` is opened only after the
    track name has been validated, so a rejected name no longer leaks an
    open handle and leaves an empty file behind.
    """
    # NOTE(review): indentation was lost in the source.  The REALRUN block
    # is assumed to run through the gene-matrix conversion (probefile and
    # genefile are recomputed afterwards for the json section).
    BCGSC = "British Columbia Cancer Agency TCGA genome characterization center"

    def isDataArchive(name):
        # archive of the requested platform/level, never the .md5 sidecar
        return (name.find(PATHPATTERN) != -1 and name.find(LEVEL) != -1 and
                name.find(".tar.gz") != -1 and name.find("md5") == -1)

    def archiveAndRelease(name):
        # "<...>.<archive>.<release>.<x>.tar.gz" -> (archive, release)
        info = name.split(".")
        return info[-5], int(info[-4])

    def sampleFromFileName(name):
        # bcgsc puts the sample name in the file name
        if dataProducer == BCGSC:
            return name.split(".")[0][:15]
        print("please check how to identify sample name")
        return ""

    os.system("rm -rf tmp_*")
    if os.path.exists(tmpDir):
        if clean:
            os.system("rm -rf " + tmpDir + "*")
    else:
        os.system("mkdir " + tmpDir)

    # multiple files in dir mode: find latest release in each archive
    lastRelease = {}
    for name in os.listdir(inDir):
        if not isDataArchive(name):
            continue
        if not os.path.exists(inDir + name + ".md5"):
            print("file has no matching .md5 throw out " + name)
            continue
        archive, release = archiveAndRelease(name)
        if archive not in lastRelease or lastRelease[archive] < release:
            lastRelease[archive] = release

    rootDir = ""
    for name in os.listdir(inDir):
        if not isDataArchive(name):
            continue
        if not os.path.exists(inDir + name + ".md5"):
            continue
        # keep only the file that is the latest release for its archive
        archive, release = archiveAndRelease(name)
        if release != lastRelease[archive]:
            continue
        # uncompress, multiple file mode
        if not clean:
            rootDir = tmpDir
        elif REALRUN:
            os.system("tar -xzf " + inDir + name + " -C " + tmpDir)
            rootDir = tmpDir

    # make sure there is data
    if REALRUN and (rootDir == "" or not os.path.exists(rootDir)):
        print("%s %s %s %s" % ("ERROR expect data, but wrong dirpath",
                               rootDir, cancer, __name__))
        return

    # set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    cgFileName = namesuffix

    # data processing multiple dirs mode
    if REALRUN:
        # hg19 or not: switch pattern if any file carries the hg19 suffix
        hg19Pattern = ".hg19.mirbase20.isoform.quantification"
        HG19 = any(name.find(hg19Pattern) != -1
                   for dataDir in os.listdir(rootDir)
                   for name in os.listdir(rootDir + dataDir))
        pattern = hg19Pattern if HG19 else ".isoform.quantification"

        # first pass: decide which samples are acceptable
        allSamples = {}
        for dataDir in os.listdir(rootDir):
            for name in os.listdir(rootDir + dataDir):
                if name.find(pattern) == -1:
                    continue
                sample = sampleFromFileName(name)
                if sample == "":
                    continue

                if sample in allSamples:
                    print(len(allSamples))
                    message = ("ERROR duplicated sample = " + sample + " " +
                               cancer + " " + __name__ + name)
                    flog.write(message + "\n")
                    print(message)
                    continue

                # Test for barcode or UUID; throw out control analytes
                if sample[0:4] != "TCGA":
                    if sample in TCGAUtil.UUID_CELLLINE:
                        print("control cell line ignore " + sample)
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False:  # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print("control cell line ignore " + sample)
                        continue

                allSamples[sample] = len(allSamples)

        # second pass: load the data in batches
        c = 0
        dataMatrix = []
        mapping = {}
        tmpSamples = {}
        genes = {}
        oldgenes = {}
        files = []
        GOOD = 1
        BATCH = 2500
        valuePOS = 3
        LOG2 = 1
        for dataDir in os.listdir(rootDir):
            for name in os.listdir(rootDir + dataDir):
                if name.find(pattern) == -1:
                    continue
                infile = rootDir + dataDir + "/" + name
                sample = sampleFromFileName(name)
                if sample == "":
                    continue
                if sample in tmpSamples:  # duplicated samples
                    continue
                if sample not in allSamples:
                    continue

                tmpSamples[sample] = len(tmpSamples)
                c = c + 1
                process(dataMatrix, mapping, tmpSamples, sample, genes, cancer,
                        infile, flog, valuePOS, LOG2, BATCH)

                if (c % BATCH) == 0:
                    # batch full: flush it and start a new one
                    tmpout = "tmp_" + str(int(c / float(BATCH)))
                    r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes,
                                     tmpout, flog)
                    if r:
                        GOOD = 0
                    dataMatrix = []
                    tmpSamples = {}
                    oldgenes = copy.deepcopy(genes)
                    genes = {}
                    files.append(tmpout)

        if (c % BATCH) != 0:
            # flush the last, partially filled batch
            tmpout = "tmp_" + str(int(c / float(BATCH)) + 1)
            files.append(tmpout)
            r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout,
                             flog)
            if r:
                GOOD = 0

        # paste all together
        outfile = outDir + cancer + "/" + cgFileName
        if GOOD:
            os.system("paste -d '' " + " ".join(files) + " > " + outfile)
        for tmpName in files:
            os.system("rm " + tmpName)
        if not GOOD:
            sys.exit()

        # probeMap
        probefile = outDir + cancer + "/" + namesuffix + ".probeMap"
        outputProbeMap(probefile, mapping)

        # transcript datafile
        datafile = outDir + cancer + "/" + cgFileName
        if not os.path.exists(datafile):
            return

        # gene datafile
        genefile = outDir + cancer + "/" + cgFileName + "_gene"
        os.system(
            "python ../support/genomicMatrixToGeneMatrix_memInEfficient.py " +
            datafile + " " + probefile + ' ' + genefile + " add 1 1")

    today = datetime.date.today().isoformat()

    # probeMap json
    probefile = outDir + cancer + "/" + namesuffix + ".probeMap"
    J = {}
    J['type'] = 'probeMap'
    J['assembly'] = 'hg19'
    J['name'] = cancer + '_' + namesuffix + ".probeMap"
    with open(probefile + ".json", "w") as fout:
        fout.write(json.dumps(J, indent=-1))

    # dataset json
    J = {}
    # stable
    J["cgDataVersion"] = 1
    J["redistribution"] = True
    J["dataProducer"] = dataProducer
    J["colNormalization"] = True
    J["PLATFORM"] = PATHPATTERN
    J["type"] = "genomicMatrix"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"

    # multiple dirs
    J["url"] = TCGAUtil.remoteBase + inDir.replace(TCGAUtil.localBase, "")
    J["version"] = today
    J["wrangler"] = "cgData TCGAscript " + __name__ + " processed on " + today
    J["unit"] = "log2(RPM+1)"
    J[":probeMap"] = cancer + '_' + namesuffix + ".probeMap"

    # NOTE(review): any other PATHPATTERN leaves platformTitle unbound and
    # the description below raises NameError, as in the original.
    if PATHPATTERN in ["IlluminaHiSeq_miRNASeq"]:
        platformTitle = "Illumina HiSeq 2000 RNA Sequencing platform"
    if PATHPATTERN in ["IlluminaGA_miRNASeq"]:
        platformTitle = " Illumina Genome Analyzer RNA Sequencing platform"

    # change description
    J["description"] = ""
    J["dataSubType"] = "miRNA isoform expression RNAseq"
    J["label"] = suffix
    J["longTitle"] = ("TCGA " + TCGAUtil.cancerOfficial[cancer] + " (" + cancer +
                      ") miRNA isoform expression by RNAseq (" + suffix + ")")
    J["description"] = (
        J["description"] + "TCGA " + TCGAUtil.cancerOfficial[cancer] + " (" +
        cancer + ") miRNA expression by RNAseq. The miRNA expression profile was measured experimentally using the " +
        platformTitle + " by the " + dataProducer + "." +
        " Level 3 interpreted level data was downloaded from TCGA data coordination center. Download data is in the unit of reads per million mapped reads (RPM). This dataset shows the miRNA transcription estimates in log2 (RPM). For more information see: http://nar.oxfordjournals.org/content/early/2015/08/13/nar.gkv808.full ."
    )
    J["description"] = (
        J["description"] +
        "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each miRNA to zero by independently subtracting the mean of each miRNA across the cohort on the fly. Users can view the original non-normalized values by adjusting visualization settings."
    )
    J["description"] = J["description"] + "<br><br>"
    J["wrangling_procedure"] = "Level_3 Data (file names: *.isoform.quantification.txt) download from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
    J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
    J["sample_type"] = ["tumor"]
    J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ")"
    J['domain'] = "TCGA"
    J['tags'] = ["cancer"] + TCGAUtil.tags[cancer]
    J['owner'] = "TCGA"
    J['gdata_tags'] = ["transcription", "miRNA"]

    # change cgData
    J["name"] = "TCGA_" + cancer + "_" + namesuffix
    name = trackName_fix(J['name'])
    if name == False:
        message = "bad object name, need fix otherwise break loader, too long " + J["name"]
        print(message)
        flog.write(message + "\n")
        return
    J["name"] = name

    # open only now, so a rejected name leaves no handle or empty file
    with open(outDir + cancer + "/" + cgFileName + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    # gene datafile json: same metadata minus the isoform-only keys
    genefile = outDir + cancer + "/" + cgFileName + "_gene"
    J.pop("name")
    J.pop(":probeMap")
    J.pop("description")
    J["longTitle"] = ("TCGA " + TCGAUtil.cancerOfficial[cancer] + " (" + cancer +
                      ") miRNA mature strand expression by RNAseq (" + suffix + ")")
    J["dataSubType"] = "miRNA mature strand expression RNAseq"
    J["label"] = suffix
    J["wrangling_procedure"] = "Level_3 Data (file names: *.isoform.quantification.txt) download from TCGA DCC, for each sample, all isoform expression for the same miRNA mature strand are added together, log2(total_RPM +1) transformed, and deposited at UCSC into Xena repository."
    with open(genefile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    return
def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    """Build the sample map and integration-id list for one cancer.

    Datasets under ``dir`` are discovered with ``cgWalk`` and grouped by
    sample map.  For each map, when ``REALRUN`` is 1, sample ids are
    gathered from every genomicMatrix header and from the first column of
    every mutationVector / clinicalMatrix.  Uuids are translated to
    barcodes through ``TCGAUtil.uuid_Aliquot_all()`` (unknown uuids are
    printed and skipped), each barcode's prefix chain is linked in the
    sample map, and its integration id is recorded.  The sample map and
    integration-id files are written under ``outDir/cancer/`` when
    ``REALRUN`` is 1; the two ``.json`` descriptions are always written.

    ``log`` is not used.  Returns None.

    Fix over the original: the ``.tmp`` reader, the sample map output and
    the final ``.json`` handle were never closed; all files are now
    closed deterministically.
    """
    # NOTE(review): indentation was lost in the source; the output section
    # is assumed to sit inside the per-map loop.
    # print status
    print("%s %s" % (cancer, __name__))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    # removeExistMaps
    for mapName in existMaps:
        if mapName not in missingMaps:
            missingMaps[mapName] = existMaps[mapName]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # unused below; call kept in case it has side effects (TODO confirm)
    sample_dic = TCGAUtil.uuid_Sample_all()

    cancerDir = outDir + cancer + "/"

    # missingMaps --- actually this is all the maps
    for mapName in missingMaps:
        print(mapName)
        print(missingMaps[mapName])
        sMap = SampleMapNew(None, mapName)

        # integration id
        intName = mapName + ".integrationID"
        if intName in bookDic:
            with open(bookDic[intName]["path"], 'r') as fin:
                integrationID = IntegrationId(intName, fin)
        else:
            integrationID = IntegrationId(intName)

        samples = []
        for name in missingMaps[mapName]:
            if REALRUN != 1:
                continue
            print(name)
            obj = bookDic[name]

            if obj['type'] == "genomicMatrix":
                with open(obj['path'], 'U') as fin:
                    headerIds = fin.readline()[:-1].split("\t")[1:]
                for sample in headerIds:
                    if sample == "":
                        print("%s %s" % (name, "has bad empty sample id"))
                    if sample not in samples:
                        samples.append(sample)

            elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                path = obj['path']
                os.system("cut -f 1 " + path + " |sort |uniq > .tmp")
                # NOTE(review): the first line of the *sorted* output is
                # discarded; presumably meant to drop the header. Confirm.
                with open('.tmp', 'r') as fin:
                    fin.readline()
                    for line in fin.readlines():
                        sample = line.strip()
                        if sample == "":
                            break
                        if sample not in samples:
                            samples.append(sample)
            else:
                continue

        for sample in samples:
            if REALRUN != 1:
                continue

            # TCGA uuid handling
            if sample[0:4] != "TCGA":
                key = sample.lower()
                if key not in aliquote_dic:
                    print(sample)
                    continue
                TCGAbarcode = aliquote_dic[key]
                # link the barcode to both spellings of the uuid
                sMap.addLink(TCGAbarcode, sample.lower())
                sMap.addLink(TCGAbarcode, sample.upper())
                sample = TCGAbarcode

            # do TCGA barcode trick
            parts = sample.split("-")
            if len(parts) > 3 and len(parts[3]) == 3:
                # split a three-character 4th field into 2 + 1 characters
                parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]

            parent = "-".join(parts[0:3])
            for i in range(3, len(parts)):
                # field 4 is appended without a separator
                if i != 4:
                    child = parent + "-" + parts[i]
                else:
                    child = parent + parts[i]
                # add parent child
                sMap.addLink(parent, child)
                parent = child

            intID = TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)

        # output sampleMap
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        if not os.path.exists(cancerDir):
            os.makedirs(cancerDir)

        if REALRUN == 1:
            with open(cancerDir + mapName, "w") as oHandle:
                sMap.store(oHandle)
            # output integrationID
            with open(cancerDir + "integrationID", "w") as oHandle:
                integrationID.store(oHandle)

        # output integrationID json
        J = {}
        J['name'] = intName
        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"
        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = datetime.date.today().isoformat()
        with open(cancerDir + "integrationID.json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

        # output json (reuses the integration-id metadata as a base)
        J['name'] = mapName
        J['type'] = "sampleMap"
        J["version"] = datetime.date.today().isoformat()
        J["cgDataVersion"] = 1
        J[":integrationId"] = intName

        # add info for old clinical data
        if os.path.exists(cancerDir + "oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"

        # special code
        if cancer in TCGAUtil.featurePriority and len(TCGAUtil.featurePriority[cancer]) >= 5:
            J["VIS"] = 5

        # blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = [
                "TCGA-FQ-6551",
                "TCGA-FQ-6552",
                "TCGA-FQ-6553",
                "TCGA-FQ-6554",
                "TCGA-FQ-6555",
                "TCGA-FQ-6558",
                "TCGA-FQ-6559",
            ]

        with open(cancerDir + mapName + ".json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

    return
def geneRPKM(inDir, outDir, cancer, flog, PATHPATTERN, suffix, namesuffix, dataProducer, REALRUN, clean): garbage = [tmpDir] os.system("rm -rf tmp_*") if os.path.exists(tmpDir): if clean: os.system("rm -rf " + tmpDir + "*") else: os.system("mkdir " + tmpDir) #multiple files in dir mode lastRelease = {} for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): print "file has no matching .md5 throw out", file continue #find lastest in each archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if not lastRelease.has_key(archive): lastRelease[archive] = release else: if lastRelease[archive] < release: lastRelease[archive] = release rootDir = "" lastDate = None remoteDataDirExample = "" for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): continue #find the file that is the lastest release for the archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if release != lastRelease[archive]: continue #file latest date newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime) if not lastDate: lastDate = newDate if lastDate < newDate: lastDate = newDate if remoteDataDirExample == "": remoteDataDirExample = file[:-7] #is tar.gz?, uncompress multiple file mode if not clean: rootDir = tmpDir elif string.find(file, ".tar.gz") != -1 and REALRUN and clean: os.system("tar -xzf " + inDir + file + " -C " + tmpDir) rootDir = tmpDir #make sure there is data if REALRUN and (rootDir == "" or not os.path.exists(rootDir)): print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__ return #set output dir 
if not os.path.exists(outDir): os.makedirs(outDir) if not os.path.exists(outDir + cancer + "/"): os.makedirs(outDir + cancer + "/") cgFileName = namesuffix #data processing multiple dirs mode if REALRUN: allSamples = {} for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" #v2 bcgsc gene pattern = ".gene.quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") == -1: #check if there is .v2 if string.find(file, ".v2.") == -1: V2 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".v2") != -1: V2 = 1 break if V2: continue if string.find(file, ".hg19.") == -1: HG19 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".hg19.") != -1: HG19 = 1 break if HG19: continue infile = rootDir + dataDir + "/" + file # bcgsc stupid sample name in file name if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center": sample = string.split(file, ".")[0] else: print "please check how to identify sample name" #v2 bcgsc exon pattern = ".exon.quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") != -1: #check if there is .v2 if string.find(file, ".v2.") == -1: V2 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".v2.") != -1: V2 = 1 break if V2: continue if string.find(file, ".hg19.") == -1: HG19 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".hg19.") != -1: HG19 = 1 break if HG19: continue infile = rootDir + dataDir + "/" + file # bcgsc stupid sample name in file name if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center": sample = string.split(file, ".")[0] else: print "please check how to identify sample name" #v2 pattern = "rsem.genes.normalized_results" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") == -1: infile = rootDir + dataDir + "/" + file # unc stupid sample name in file name if dataProducer == "University 
of North Carolina TCGA genome characterization center": sample = string.split(file, ".")[2] else: print "please check how to identify sample name" #v2 exon from unc pattern = "bt.exon_quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") != -1: infile = rootDir + dataDir + "/" + file # unc stupid sample name in file name if dataProducer == "University of North Carolina TCGA genome characterization center": sample = string.split(file, ".")[2] else: print "please check how to identify sample name" if sample == "": continue if sample in allSamples: print len(allSamples) message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__ + file flog.write(message + "\n") print message continue # Test for barcode or UUID #throw out all normals and control Analyte if sample[0:4] != "TCGA": if TCGAUtil.UUID_CELLLINE.has_key(sample): print "control cell line ignore", sample continue else: sampleTypeCode = TCGAUtil.barcode_SampleType(sample) if sampleTypeCode == False: # likely a uuid continue elif sampleTypeCode in ["20"]: print "control cell line ignore", sample continue p = len(allSamples) allSamples[sample] = p c = 0 dataMatrix = [] tmpSamples = {} genes = {} oldgenes = {} files = [] GOOD = 1 for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" #bcgsc v1 and 2 pattern = "gene.quantification" altpattern = ".v2.gene.quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") == -1: #check if there is .v2 if string.find(file, ".v2.") == -1: V2 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, altpattern) != -1: V2 = 1 break if V2: continue if string.find(file, ".hg19.") == -1: HG19 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".hg19.") != -1: HG19 = 1 break if HG19: continue infile = rootDir + dataDir + "/" + file # bcgsc stupid sample name in file name if dataProducer == "British Columbia Cancer Agency TCGA genome 
characterization center": sample = string.split(file, ".")[0] else: print "please check how to identify sample name" valuePOS = 3 LOG2 = 1 RANK = 0 #bcgsc exon v1 and v2 pattern = "exon.quantification" altpattern = ".v2.exon.quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") != -1: #check if there is .v2 if string.find(file, ".v2.") == -1: V2 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, altpattern) != -1: V2 = 1 break if V2: continue if string.find(file, ".hg19.") == -1: HG19 = 0 for file2 in os.listdir(rootDir + dataDir): if string.find(file2, ".hg19.") != -1: HG19 = 1 break if HG19: continue infile = rootDir + dataDir + "/" + file # bcgsc stupid sample name in file name if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center": sample = string.split(file, ".")[0] else: print "please check how to identify sample name" valuePOS = 3 LOG2 = 1 RANK = 0 #v2 pattern = "rsem.genes.normalized_results" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") == -1: infile = rootDir + dataDir + "/" + file # unc stupid sample name in file name if dataProducer == "University of North Carolina TCGA genome characterization center": sample = string.split(file, ".")[2] else: print "please check how to identify sample name" if string.find( namesuffix, "percentile" ) != -1: #generated percentileRANK based data RANK = 1 else: RANK = 0 valuePOS = 1 LOG2 = 1 #v2 exon from unc pattern = "bt.exon_quantification" if string.find(file, pattern) != -1 and string.find( namesuffix, "exon") != -1: infile = rootDir + dataDir + "/" + file # unc stupid sample name in file name if dataProducer == "University of North Carolina TCGA genome characterization center": sample = string.split(file, ".")[2] else: print "please check how to identify sample name" valuePOS = 3 LOG2 = 1 RANK = 0 if sample == "": continue if sample in tmpSamples: #duplicated samples continue if sample not in allSamples: 
continue p = len(tmpSamples) tmpSamples[sample] = p c = c + 1 #print c if RANK: process_percentileRANK(dataMatrix, tmpSamples, sample, genes, cancer, infile, flog, valuePOS, 250) else: process(dataMatrix, tmpSamples, sample, genes, cancer, infile, flog, valuePOS, LOG2, 250) if (c % 250) == 0: tmpout = "tmp_" + str(int(c / 250.0)) r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout, flog) if r: GOOD = 0 dataMatrix = [] tmpSamples = {} oldgenes = copy.deepcopy(genes) genes = {} files.append(tmpout) if (c % 250) != 0: tmpout = "tmp_" + str(int(c / 250.0) + 1) files.append(tmpout) r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout, flog) if r: GOOD = 0 #paste all together outfile = outDir + cancer + "/" + cgFileName if GOOD: os.system("paste -d \'\' " + string.join(files, " ") + " > " + outfile) for file in files: os.system("rm " + file) if not GOOD: sys.exit() datafile = outDir + cancer + "/" + cgFileName if not os.path.exists(datafile): return oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w") J = {} #stable J["redistribution"] = True J["groupTitle"] = "TCGA " + TCGAUtil.cancerGroupTitle[cancer] J["dataProducer"] = dataProducer J["colNormalization"] = True J["PLATFORM"] = PATHPATTERN J["type"] = "genomicMatrix" J[":sampleMap"] = "TCGA." 
+ cancer + ".sampleMap" #multiple dirs J["url"]=TCGAUtil.remoteBase \ +string.replace(inDir,TCGAUtil.localBase,"") J["version"] = datetime.date.today().isoformat() J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today( ).isoformat() if string.find(PATHPATTERN, "IlluminaHiSeq") != -1: #IlluminaHiSeq platformTitle = "Illumina HiSeq 2000 RNA Sequencing platform" elif string.find(PATHPATTERN, "IlluminaGA") != -1: #IlluminaGA platformTitle = " Illumina Genome Analyzer RNA Sequencing platform" assert platformTitle #change description J["description"] = "" J["RNAtype"] = "polyA+" if string.find(namesuffix, "total") != -1: #totalRNA J["RNAtype"] = "total RNA" EXONGENE = "GENE" if string.find(namesuffix, "exon") != -1: #exon EXONGENE = "EXON" if EXONGENE == "GENE": #gene J[":probeMap"] = "hugo" J["dataSubType"] = "gene expression RNAseq" if cancer not in ["OV", "STAD"]: J["label"] = suffix J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + ")" else: if dataProducer == "University of North Carolina TCGA genome characterization center": J["label"] = suffix + " UNC" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + " UNC)" else: J["label"] = suffix + " BC" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + " BC)" J["notes"] = "the probeMap is hugo for the short term, however probably around 10% of the gene symbols are not HUGO names, but ENTRE genes" if string.find(namesuffix, "percentile") != -1: #percentile J["description"] = J[ "description"] + "For each sample, we rank genes RSEM values between 0% to 100%. This dataset is gene expression estimation in percentile rank, which higher value representing higher expression. 
The dataset can be used to compare this RNAseq data with other cohorts when the other data is processed in the same way (i.e. percentile ranking)." else: #basic J["description"]= J["description"] + "The gene expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \ " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the gene-level transcription estimates, " else: #exon J["dataSubType"] = "exon expression RNAseq" J[":probeMap"] = "unc_RNAseq_exon.hg19" if cancer not in ["OV", "STAD"]: J["label"] = suffix J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + ")" else: if dataProducer == "University of North Carolina TCGA genome characterization center": J["label"] = suffix + " UNC" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + " UNC)" else: J["label"] = suffix + " BC" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[ "RNAtype"] + " " + suffix + " BC)" J["description"]= J["description"] +" The exon expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \ " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the exon-level transcription estimates, " #wrangling stuff if PATHPATTERN in ["IlluminaHiSeq_RNASeqV2", "IlluminaGA_RNASeqV2" ] and string.find(namesuffix, "exon") == -1: if string.find(namesuffix, "percentile") == -1: #basic J["description"] = J[ "description"] + "as in log2(x+1) transformed RSEM normalized count." 
J["unit"] = "log2(norm_count+1)" J["wrangling_procedure"] = "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository" else: #percentile J["unit"] = "percentile rank" J["wrangling_procedure"] = "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, percentile ranked, and processed at UCSC into Xena repository." elif string.find(namesuffix, "exon") != -1: #exon J["description"] = J[ "description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)." J["wrangling_procedure"] = "Level_3 data (file names: *.exon_quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository." J["unit"] = "log2(RPKM+1)" else: J["description"] = J[ "description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)." J["wrangling_procedure"] = "Level_3 data (file names: *.gene.quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository." J["unit"] = "log2(RPKM+1)" #mapping to genomics region if string.find(namesuffix, "exon") == -1: #gene J["description"] = J[ "description"] + " Genes are mapped onto the human genome coordinates using UCSC Xena HUGO probeMap (see ID/Gene mapping link below for details)." else: #exon J["description"] = J[ "description"] + " Exons are mapped onto the human genome coordinates using UCSC Xena unc_RNAseq_exon probeMap (see ID/Gene mapping link below for details." 
#reference if dataProducer == "University of North Carolina TCGA genome characterization center": J["description"] = J["description"] +\ " Reference to method description from "+dataProducer+": <a href=\"" + TCGAUtil.remoteBase +string.replace(inDir,TCGAUtil.localBase,"") +remoteDataDirExample+"/DESCRIPTION.txt\" target=\"_blank\"><u>DCC description</u></a>" # comparison if string.find(namesuffix, "exon") == -1: # gene if string.find(namesuffix, "percentile") != -1: #percentile gene J["description"] = J[ "description"] + "<br><br>For comparing data within this cohort, we recommend to use the \"gene expression RNAseq\" dataset. For questions regarding the gene expression of this particular cohort in relation to other types tumors, you can use the pancan normalized version of the \"gene expression RNAseq\" data. For comparing with data outside TCGA, we recommend using the percentile version if the non-TCGA data is normalized by percentile ranking. For more information, please see our Data FAQ: <a href=https://docs.google.com/document/d/1q-7Tkzd7pci4Rz-_IswASRMRzYrbgx1FTTfAWOyHbmk/edit?usp=sharing target=\"_blank\"><u>here</u></a>." #viz setting J["description"] = J["description"] +\ "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each gene or exon to zero by independently subtracting the mean of each gene or exon on the fly. Users can view the original non-normalized values by adjusting visualization settings." 
J["description"] = J["description"] + "<br><br>" J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer] J["sample_type"] = ["tumor"] J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer] J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[ cancer] + " (" + cancer + ")" J['tags'] = ["cancer"] + TCGAUtil.tags[cancer] J['owner'] = "TCGA" J['gdata_tags'] = ["transcription"] #change cgData J["name"] = "TCGA_" + cancer + "_exp_" + namesuffix name = trackName_fix(J['name']) if name == False: message = "bad object name, need fix otherwise break loader, too long " + J[ "name"] print message flog.write(message + "\n") return else: J["name"] = name oHandle.write(json.dumps(J, indent=-1)) oHandle.close() return
def RPPA(inDir, outDir, cancer, flog, REALRUN): print cancer, sys._getframe().f_code.co_name PATHPATTERN = "MDA_RPPA_Core" dataProducer = "MD Anderson Cancer Center TCGA proteome characterization center" garbage = [tmpDir] if os.path.exists(tmpDir): os.system("rm -rf " + tmpDir + "*") else: os.system("mkdir " + tmpDir) #multiple files in dir mode lastRelease = {} for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): print "file has no matching .md5 throw out", file continue #find lastest in each archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if not lastRelease.has_key(archive): lastRelease[archive] = release else: if lastRelease[archive] < release: lastRelease[archive] = release rootDir = "" lastDate = None remoteDataDirExample = "" for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): continue #find the file that is the lastest release for the archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if release != lastRelease[archive]: continue #file latest date newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime) if not lastDate: lastDate = newDate if lastDate < newDate: lastDate = newDate if remoteDataDirExample == "": remoteDataDirExample = file[:-7] #is tar.gz?, uncompress multiple file mode if string.find(file, ".tar.gz") != -1 and REALRUN: os.system("tar -xzf " + inDir + file + " -C " + tmpDir) rootDir = tmpDir #make sure there is data if REALRUN and (rootDir == "" or not os.path.exists(rootDir)): print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__ 
return #set output dir if not os.path.exists(outDir): os.makedirs(outDir) if not os.path.exists(outDir + cancer + "/"): os.makedirs(outDir + cancer + "/") cgFileName = "RPPA" #data processing multiple dirs mode if REALRUN: aliquote_dic = TCGAUtil.uuid_Aliquot_all() dataMatrix = {} allSamples = [] probes = [] for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" pattern = "protein_expression" if string.find(file, pattern) != -1: infile = rootDir + dataDir + "/" + file sample = string.split(file, ".")[5] if sample == "": continue # Test for barcode or UUID #throw out all normals and control Analyte if sample[0:4] != "TCGA": if aliquote_dic.has_key(string.lower(sample)): if TCGAUtil.UUID_CELLLINE.has_key(sample): print "control cell line ignore", sample continue else: print "unknow id:", sample continue else: sampleTypeCode = TCGAUtil.barcode_SampleType(sample) if sampleTypeCode == False: # likely a uuid continue elif sampleTypeCode in ["20"]: print "control cell line ignore", sample continue if sample not in allSamples: allSamples.append(sample) for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" pattern = "protein_expression" if string.find(file, pattern) != -1: infile = rootDir + dataDir + "/" + file sample = string.split(file, ".")[5] if sample == "": continue if sample not in allSamples: continue valuePOS = 1 process(dataMatrix, allSamples, sample, probes, cancer, infile, flog, valuePOS) outfile = outDir + cancer + "/" + cgFileName outputMatrix(dataMatrix, allSamples, probes, outfile) oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w") J = {} #stable J["dataSubType"] = "protein expression RPPA" J["redistribution"] = True J["dataProducer"] = dataProducer J["colNormalization"] = True J["PLATFORM"] = "M.D. Anderson Reverse Phase Protein Array Core platform" J["type"] = "genomicMatrix" J[":sampleMap"] = "TCGA." 
+ cancer + ".sampleMap" #multiple dirs J["url"]=TCGAUtil.remoteBase \ +string.replace(inDir,TCGAUtil.localBase,"") J["version"] = datetime.date.today().isoformat() J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today( ).isoformat() J["wrangling_procedure"] = "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository" J["label"] = "RPPA" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") reverse phase protein array" J[":probeMap"] = "md_anderson_antibodies" J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer] J["sample_type"] = ["tumor"] J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer] J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[ cancer] + " (" + cancer + ")" J['domain'] = "TCGA" J['owner'] = "TCGA" J["tags"] = ["cancer"] + TCGAUtil.tags[cancer] J["unit"] = "normalized RPPA value" J["description"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>" J["description"] = J[ "description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>" #change cgData J["name"] = "TCGA_" + cancer + "_RPPA" name = trackName_fix(J['name']) if name == False: message = "bad object name, need fix otherwise break loader, too long " + J[ "name"] print message flog.write(message + "\n") return else: J["name"] = name oHandle.write(json.dumps(J, indent=-1)) oHandle.close() return
# Top-level script: import TCGAUtil and call its four UUID helper functions
# one after the other.
import TCGAUtil
# NOTE(review): "dic" is rebound by the next statement, so the aliquot result
# is discarded right away -- presumably the calls are made for their side
# effects inside TCGAUtil (or as a smoke test); confirm before removing any.
dic = TCGAUtil.uuid_Aliquot_all()
dic = TCGAUtil.uuid_Sample_all()
# return values of the two cell-line helpers are not kept
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
# Top-level script: import TCGAUtil and call its four UUID helper functions
# one after the other.
import TCGAUtil
# NOTE(review): "dic" is rebound by the next statement, so the aliquot result
# is discarded right away -- presumably the calls are made for their side
# effects inside TCGAUtil (or as a smoke test); confirm before removing any.
dic=TCGAUtil.uuid_Aliquot_all()
dic=TCGAUtil.uuid_Sample_all()
# return values of the two cell-line helpers are not kept
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
def RPPA (inDir, outDir, cancer, flog,REALRUN): print cancer, sys._getframe().f_code.co_name PATHPATTERN = "MDA_RPPA_Core" dataProducer= "MD Anderson Cancer Center TCGA proteome characterization center" garbage=[tmpDir] if os.path.exists( tmpDir ): os.system("rm -rf "+tmpDir+"*") else: os.system("mkdir "+tmpDir) #multiple files in dir mode lastRelease={} for file in os.listdir(inDir): #find the file if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1: pass else: continue if not os.path.exists(inDir +file+".md5"): print "file has no matching .md5 throw out", file continue #find lastest in each archive info = string.split(file,".") archive = info [-5] release = int(info [-4]) if not lastRelease.has_key(archive): lastRelease[archive]= release else: if lastRelease[archive]< release: lastRelease[archive]=release rootDir ="" lastDate=None remoteDataDirExample ="" for file in os.listdir(inDir): #find the file if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1: pass else: continue if not os.path.exists(inDir +file+".md5"): continue #find the file that is the lastest release for the archive info = string.split(file,".") archive = info [-5] release = int(info [-4]) if release != lastRelease[archive]: continue #file latest date newDate= datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime) if not lastDate: lastDate = newDate if lastDate < newDate: lastDate = newDate if remoteDataDirExample =="": remoteDataDirExample = file[:-7] #is tar.gz?, uncompress multiple file mode if string.find(file,".tar.gz")!=-1 and REALRUN : os.system("tar -xzf "+inDir+file +" -C "+tmpDir) rootDir =tmpDir #make sure there is data if REALRUN and (rootDir =="" or not os.path.exists(rootDir)): print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__ return #set output dir if not os.path.exists( outDir ): 
os.makedirs( outDir ) if not os.path.exists( outDir +cancer+"/"): os.makedirs( outDir+cancer+"/" ) cgFileName= "RPPA" #data processing multiple dirs mode if REALRUN: aliquote_dic =TCGAUtil.uuid_Aliquot_all() dataMatrix={} allSamples=[] probes=[] for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir+dataDir): sample ="" pattern ="protein_expression" if string.find(file,pattern)!=-1: infile = rootDir+dataDir+"/"+file sample = string.split(file,".")[5] if sample =="": continue # Test for barcode or UUID #throw out all normals and control Analyte if sample[0:4]!="TCGA": if aliquote_dic.has_key(string.lower(sample)): if TCGAUtil.UUID_CELLLINE.has_key(sample): print "control cell line ignore", sample continue else: print "unknow id:", sample continue else: sampleTypeCode = TCGAUtil.barcode_SampleType(sample) if sampleTypeCode == False: # likely a uuid continue elif sampleTypeCode in ["20"]: print "control cell line ignore", sample continue if sample not in allSamples: allSamples.append(sample) for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir+dataDir): sample ="" pattern ="protein_expression" if string.find(file,pattern)!=-1: infile = rootDir+dataDir+"/"+file sample = string.split(file,".")[5] if sample =="": continue if sample not in allSamples: continue valuePOS=1 process(dataMatrix,allSamples,sample, probes, cancer,infile,flog, valuePOS) outfile = outDir+cancer+"/"+cgFileName outputMatrix(dataMatrix, allSamples, probes, outfile) oHandle = open(outDir+cancer+"/"+cgFileName+".json","w") J={} #stable J["dataSubType"]="protein expression RPPA" J["redistribution"]= True J["dataProducer"]= dataProducer J["colNormalization"]=True J["PLATFORM"]= "M.D. 
Anderson Reverse Phase Protein Array Core platform" J["type"]= "genomicMatrix" J[":sampleMap"]="TCGA."+cancer+".sampleMap" #multiple dirs J["url"]=TCGAUtil.remoteBase \ +string.replace(inDir,TCGAUtil.localBase,"") J["version"]= datetime.date.today().isoformat() J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat() J["wrangling_procedure"]= "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository" J["label"]= "RPPA" J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") reverse phase protein array" J[":probeMap"]= "md_anderson_antibodies" J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer] J["sample_type"]=["tumor"] J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer] J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")" J['domain']="TCGA" J['owner']="TCGA" J["tags"]=["cancer"]+ TCGAUtil.tags[cancer] J["unit"]="normalized RPPA value" J["description"]= "TCGA "+ TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>" J["description"] = J["description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>" #change cgData J["name"]="TCGA_"+cancer+"_RPPA" name = trackName_fix(J['name']) if name ==False: message = "bad object name, need fix otherwise break loader, too long "+J["name"] print message flog.write(message+"\n") return else: J["name"]=name oHandle.write( json.dumps( J, indent=-1 ) ) oHandle.close() return
def CAVMid(dir, outDir, cancer, log, REALRUN): print cancer, sys._getframe().f_code.co_name ignore = 1 bookDic = cgWalk(dir, ignore) existMaps = collectSampleMaps(bookDic) missingMaps = collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map] = existMaps[map] # all aliquote uuid dic aliquote_dic = TCGAUtil.uuid_Aliquot_all() sample_dic = TCGAUtil.uuid_Sample_all() if not os.path.exists(outDir): os.system("mkdir " + outDir) for map in missingMaps: print map sMap = SampleMapNew(None, map) for name in missingMaps[map]: samples = [] intDic = {} #keyed on CAVMid sampleDic = {} #keyd on original sample id obj = bookDic[name] print obj["name"] if obj['type'] in ["clinicalMatrix", "mutationVector"]: outfile = outDir + os.path.basename(obj['path']) os.system("cp " + obj['path'] + ".json " + outfile + ".json") fin = open(outfile + ".json", 'r') J = json.load(fin) fin.close() if J.has_key(":clinicalFeature"): cFobj = bookDic[J[":clinicalFeature"]] cFoutfile = outDir + os.path.basename(cFobj['path']) os.system("cp " + cFobj['path'] + " " + cFoutfile) os.system("cp " + cFobj['path'] + ".json " + cFoutfile + ".json") if REALRUN == -1: continue if REALRUN == 0 and obj['type'] == "mutationVector": continue fin = open(obj['path'], 'r') fin.readline() for line in fin.readlines(): sample = string.split(line, "\t")[0] if sample not in samples and sample != "": samples.append(sample) buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic) fin = open(obj['path'], 'r') fout = open(outfile, 'w') fout.write(fin.readline()) for line in fin.readlines(): data = string.split(line, "\t") sample = data[0] try: fout.write(sampleDic[sample] + "\t") fout.write(string.join(data[1:], "\t")) except: fout.write(line) fout.close() if obj['type'] == "genomicMatrix": fin = open(obj['path'], 'U') for sample in string.split(fin.readline()[:-1], "\t")[1:]: if sample == "": print name, "has bad empty sample id" sys.exit() 
samples.append(sample) fin.close() outfile = outDir + os.path.basename(obj['path']) os.system("cp " + obj['path'] + ".json " + outfile + ".json") if REALRUN != 1: continue buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic) process(obj['path'], outfile, samples, intDic)
def geneRPKM (inDir, outDir, cancer,flog,PATHPATTERN,suffix, namesuffix, dataProducer,REALRUN,clean):
    """Build the RNAseq expression matrix for one cancer and write its .json metadata.

    Steps, in order:
      1. Prepare the module-level ``tmpDir`` scratch directory and remove
         stale ``tmp_*`` chunk files from the current directory.
      2. Scan ``inDir`` for ``.tar.gz`` archives whose name contains
         ``PATHPATTERN`` and the module-level ``LEVEL`` and that have a
         matching ``.md5`` file; keep the highest release per archive and
         (when ``REALRUN`` and ``clean``) extract them into ``tmpDir``.
      3. When ``REALRUN``: walk the extracted directories twice -- first to
         collect the accepted sample ids, then to load each sample file via
         ``process`` / ``process_percentileRANK`` -- writing the matrix in
         chunks of 250 samples that are finally joined with ``paste``.
      4. If the data file exists, write ``<data file>.json`` describing it.

    Parameters:
        inDir        -- directory holding the archives; file names are
                        appended directly, so it must end with a separator.
        outDir       -- output base directory; data goes to
                        ``outDir + cancer + "/" + namesuffix``.
        cancer       -- cancer code; used in paths and as key into the
                        ``TCGAUtil`` lookup tables.
        flog         -- object with ``write()``; receives error messages.
        PATHPATTERN  -- substring an archive name must contain; also stored
                        as ``PLATFORM`` and used to pick the platform title.
        suffix       -- text used in the dataset ``label`` / ``longTitle``.
        namesuffix   -- output file name; the substrings "exon",
                        "percentile" and "total" select the processing mode
                        and metadata wording.
        dataProducer -- producing centre; decides how the sample id is
                        parsed out of a data file name.
        REALRUN      -- truthy: extract and build the matrix; falsy: only
                        (re)write the metadata for an existing data file.
        clean        -- truthy: wipe ``tmpDir`` and re-extract; falsy: reuse
                        whatever is already in ``tmpDir``.

    Returns None.  Calls ``sys.exit()`` when any chunk failed to write.
    """
    # NOTE(review): `garbage` is never read again in this function.
    garbage=[tmpDir]
    os.system("rm -rf tmp_*")
    if os.path.exists( tmpDir ):
        if clean:
            os.system("rm -rf "+tmpDir+"*")
    else:
        os.system("mkdir "+tmpDir)

    #multiple files in dir mode
    # Pass 1 over inDir: record the highest release number per archive.
    lastRelease={}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue

        if not os.path.exists(inDir +file+".md5"):
            print "file has no matching .md5 throw out", file
            continue

        #find lastest in each archive
        # The archive id and release are taken from fixed positions counted
        # from the end of the dot-separated file name.
        info = string.split(file,".")
        archive = info [-5]
        release = int(info [-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive]= release
        else:
            if lastRelease[archive]< release:
                lastRelease[archive]=release

    # Pass 2 over inDir: extract only the latest release of each archive.
    rootDir =""
    lastDate=None
    remoteDataDirExample =""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue

        if not os.path.exists(inDir +file+".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file,".")
        archive = info [-5]
        release = int(info [-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        # NOTE(review): lastDate is computed here but not used afterwards
        # in this function.
        newDate= datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate

        # First accepted archive name without ".tar.gz"; used later to build
        # the DESCRIPTION.txt link in the dataset description.
        if remoteDataDirExample =="":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if not clean:
            rootDir =tmpDir
        elif string.find(file,".tar.gz")!=-1 and REALRUN and clean:
            os.system("tar -xzf "+inDir+file +" -C "+tmpDir)
            rootDir =tmpDir

    #make sure there is data
    if REALRUN and (rootDir =="" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists( outDir ):
        os.makedirs( outDir )
    if not os.path.exists( outDir +cancer+"/"):
        os.makedirs( outDir+cancer+"/" )

    cgFileName= namesuffix

    #data processing multiple dirs mode
    if REALRUN:
        # First walk: decide which samples are kept (allSamples maps the
        # sample id to its insertion index).
        allSamples={}

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""

                #v2 bcgsc gene
                pattern =".gene.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    #check if there is .v2
                    # Skip a non-v2 file when the same directory also holds
                    # a v2 file.
                    # NOTE(review): this test uses ".v2" while the other
                    # branches use ".v2." / altpattern -- confirm intended.
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".v2")!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    # Likewise prefer an hg19 file when one is present.
                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"

                #v2 bcgsc exon
                pattern =".exon.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".v2.")!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"

                #v2
                pattern ="rsem.genes.normalized_results"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"

                #v2 exon from unc
                pattern ="bt.exon_quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"

                # No branch recognised the file (or the producer is unknown).
                if sample=="":
                    continue
                if sample in allSamples:
                    print len(allSamples)
                    message = "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__ +file
                    flog.write(message+"\n")
                    print message
                    continue

                # Test for barcode or UUID
                #throw out all normals and control Analyte
                if sample[0:4]!="TCGA":
                    if TCGAUtil.UUID_CELLLINE.has_key(sample):
                        print "control cell line ignore", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False: # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue

                p=len(allSamples)
                allSamples[sample]=p

        # Second walk: load the data of every accepted sample, flushing the
        # in-memory matrix to a "tmp_<k>" file after each 250 samples.
        c=0
        dataMatrix=[]
        tmpSamples={}
        genes={}
        oldgenes={}
        files=[]
        GOOD=1
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample=""

                #bcgsc v1 and 2
                pattern ="gene.quantification"
                altpattern =".v2.gene.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,altpattern)!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"
                    # Settings forwarded to process(): value column, LOG2
                    # flag, and whether to percentile-rank instead.
                    valuePOS=3
                    LOG2=1
                    RANK=0

                #bcgsc exon v1 and v2
                pattern ="exon.quantification"
                altpattern =".v2.exon.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,altpattern)!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"
                    valuePOS=3
                    LOG2=1
                    RANK=0

                #v2
                pattern ="rsem.genes.normalized_results"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"
                    if string.find(namesuffix,"percentile") !=-1: #generated percentileRANK based data
                        RANK=1
                    else:
                        RANK=0
                    valuePOS=1
                    LOG2=1

                #v2 exon from unc
                pattern ="bt.exon_quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"
                    valuePOS=3
                    LOG2=1
                    RANK=0

                if sample=="":
                    continue
                # tmpSamples is reset after every flush, so this only
                # detects duplicates within the current chunk.
                if sample in tmpSamples: #duplicated samples
                    continue
                # Only samples accepted by the first walk are loaded.
                if sample not in allSamples:
                    continue

                p=len(tmpSamples)
                tmpSamples[sample]=p

                c=c+1
                #print c
                if RANK:
                    process_percentileRANK(dataMatrix,tmpSamples,sample,genes, cancer,infile,flog, valuePOS,250)
                else:
                    process(dataMatrix,tmpSamples,sample,genes, cancer,infile,flog, valuePOS,LOG2,250)

                # Flush a full chunk of 250 samples and start a fresh one;
                # the previous gene index is handed to outputMatrix as
                # oldgenes for the next chunk.
                if (c % 250)==0:
                    tmpout="tmp_"+ str(int(c/250.0))
                    r =outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout, flog)
                    if r:
                        GOOD=0

                    dataMatrix=[]
                    tmpSamples={}
                    oldgenes=copy.deepcopy(genes)
                    genes ={}
                    files.append(tmpout)

        # Flush the last, partially filled chunk.
        if (c % 250)!=0:
            tmpout= "tmp_"+ str(int(c/250.0)+1)
            files.append(tmpout)
            r= outputMatrix(dataMatrix, tmpSamples, genes, oldgenes,tmpout, flog)
            if r:
                GOOD=0

        #paste all together
        # The chunk files are joined column-wise with an empty delimiter;
        # they are removed whether or not the join happened.
        outfile = outDir+cancer+"/"+cgFileName
        if GOOD:
            os.system("paste -d \'\' "+string.join(files," ")+" > "+ outfile)
        for file in files:
            os.system("rm "+ file)
        if not GOOD:
            sys.exit()

    # Metadata is only written for an existing data file.
    datafile= outDir+cancer+"/"+cgFileName
    if not os.path.exists(datafile):
        return

    # NOTE(review): the .json file is opened (and truncated) here, long
    # before it is written; the early return after trackName_fix below
    # leaves it empty and never closes the handle -- confirm and fix.
    oHandle = open(outDir+cancer+"/"+cgFileName+".json","w")

    J={}
    #stable
    J["redistribution"]= True
    J["groupTitle"]="TCGA "+TCGAUtil.cancerGroupTitle[cancer]
    J["dataProducer"]= dataProducer
    J["colNormalization"]=True
    J["PLATFORM"]= PATHPATTERN
    J["type"]= "genomicMatrix"
    J[":sampleMap"]="TCGA."+cancer+".sampleMap"

    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"]= datetime.date.today().isoformat()
    J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat()

    # NOTE(review): when PATHPATTERN matches neither substring,
    # platformTitle is never bound and the assert below fails with a
    # NameError-style error rather than an AssertionError.
    if string.find(PATHPATTERN, "IlluminaHiSeq")!=-1: #IlluminaHiSeq
        platformTitle ="Illumina HiSeq 2000 RNA Sequencing platform"
    elif string.find(PATHPATTERN, "IlluminaGA")!=-1: #IlluminaGA
        platformTitle =" Illumina Genome Analyzer RNA Sequencing platform"
    assert platformTitle

    #change description
    # The description is assembled piece by piece in the sections below.
    J["description"]=""

    J["RNAtype"]="polyA+"
    if string.find(namesuffix, "total")!=-1: #totalRNA
        J["RNAtype"]= "total RNA"

    EXONGENE= "GENE"
    if string.find(namesuffix, "exon")!=-1: #exon
        EXONGENE = "EXON"

    if EXONGENE == "GENE": #gene
        J[":probeMap"]= "hugo"
        J["dataSubType"]="gene expression RNAseq"
        # For OV and STAD the label additionally names the producing
        # centre (UNC or BC).
        if cancer not in ["OV", "STAD"]:
            J["label"]= suffix
            J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+")"
        else:
            if dataProducer =="University of North Carolina TCGA genome characterization center":
                J["label"]= suffix +" UNC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" UNC)"
            else:
                J["label"]= suffix+" BC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" BC)"

        J["notes"]= "the probeMap is hugo for the short term, however probably around 10% of the gene symbols are not HUGO names, but ENTRE genes"
        if string.find(namesuffix, "percentile") != -1: #percentile
            J["description"]= J["description"] + "For each sample, we rank genes RSEM values between 0% to 100%. This dataset is gene expression estimation in percentile rank, which higher value representing higher expression. The dataset can be used to compare this RNAseq data with other cohorts when the other data is processed in the same way (i.e. percentile ranking)."
        else: #basic
            J["description"]= J["description"] + "The gene expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                              " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the gene-level transcription estimates, "
    else: #exon
        J["dataSubType"]="exon expression RNAseq"
        J[":probeMap"]= "unc_RNAseq_exon.hg19"
        if cancer not in [ "OV","STAD"]:
            J["label"]= suffix
            J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+")"
        else:
            if dataProducer =="University of North Carolina TCGA genome characterization center":
                J["label"]= suffix+" UNC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" UNC)"
            else:
                J["label"]= suffix+" BC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" BC)"

        J["description"]= J["description"] +" The exon expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                          " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the exon-level transcription estimates, "

    #wrangling stuff
    # Unit and wrangling text depend on platform (RNASeqV2 vs. not) and on
    # the exon / percentile mode.
    if PATHPATTERN in [ "IlluminaHiSeq_RNASeqV2","IlluminaGA_RNASeqV2"] and string.find(namesuffix, "exon")==-1:
        if string.find(namesuffix, "percentile")==-1: #basic
            J["description"] = J["description"] + "as in log2(x+1) transformed RSEM normalized count."
            J["unit"]="log2(norm_count+1)"
            J["wrangling_procedure"]= "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository"
        else: #percentile
            J["unit"]="percentile rank"
            J["wrangling_procedure"]= "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, percentile ranked, and processed at UCSC into Xena repository."
    elif string.find(namesuffix, "exon")!=-1: #exon
        J["description"] = J["description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"]= "Level_3 data (file names: *.exon_quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"]="log2(RPKM+1)"
    else:
        J["description"] = J["description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"]= "Level_3 data (file names: *.gene.quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"]="log2(RPKM+1)"

    #mapping to genomics region
    if string.find(namesuffix, "exon")==-1: #gene
        J["description"] = J["description"] + " Genes are mapped onto the human genome coordinates using UCSC Xena HUGO probeMap (see ID/Gene mapping link below for details)."
    else: #exon
        # NOTE(review): the closing ")" is missing in this sentence.
        J["description"] = J["description"] + " Exons are mapped onto the human genome coordinates using UCSC Xena unc_RNAseq_exon probeMap (see ID/Gene mapping link below for details."

    #reference
    if dataProducer =="University of North Carolina TCGA genome characterization center":
        J["description"] = J["description"] +\
                           " Reference to method description from "+dataProducer+": <a href=\"" + TCGAUtil.remoteBase +string.replace(inDir,TCGAUtil.localBase,"") +remoteDataDirExample+"/DESCRIPTION.txt\" target=\"_blank\"><u>DCC description</u></a>"

    # comparison
    if string.find(namesuffix, "exon")==-1: # gene
        if string.find(namesuffix, "percentile")!=-1: #percentile gene
            J["description"]= J["description"] +"<br><br>For comparing data within this cohort, we recommend to use the \"gene expression RNAseq\" dataset. For questions regarding the gene expression of this particular cohort in relation to other types tumors, you can use the pancan normalized version of the \"gene expression RNAseq\" data. For comparing with data outside TCGA, we recommend using the percentile version if the non-TCGA data is normalized by percentile ranking. For more information, please see our Data FAQ: <a href=https://docs.google.com/document/d/1q-7Tkzd7pci4Rz-_IswASRMRzYrbgx1FTTfAWOyHbmk/edit?usp=sharing target=\"_blank\"><u>here</u></a>."

    #viz setting
    J["description"] = J["description"] +\
                       "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each gene or exon to zero by independently subtracting the mean of each gene or exon on the fly. Users can view the original non-normalized values by adjusting visualization settings."
    J["description"] = J["description"] +"<br><br>"

    J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
    J["sample_type"]=["tumor"]
    J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"
    J['tags']=["cancer"]+ TCGAUtil.tags[cancer]
    J['owner']="TCGA"
    J['gdata_tags'] =["transcription"]

    #change cgData
    # trackName_fix returns False when the name cannot be used; in that
    # case nothing is written to the .json file.
    J["name"]="TCGA_"+cancer+"_exp_"+namesuffix
    name = trackName_fix(J['name'])
    if name ==False:
        message = "bad object name, need fix otherwise break loader, too long "+J["name"]
        print message
        flog.write(message+"\n")
        return
    else:
        J["name"]=name

    oHandle.write( json.dumps( J, indent=-1 ) )
    oHandle.close()
    return
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a one-column phenotype file assigning *value* to every sample.

    When *REALRUN* is truthy, the sample ids of every ``genomicMatrix``
    attached to the (single) sample map found under *inDir* are collected
    from the matrix header rows, converted to integration ids with
    ``TCGAUtil.barcode_IntegrationId`` (aliquot uuids are first translated
    to barcodes), and written to ``outDir + cancer + "/" + var`` as
    ``<integration id>\\t<value>`` rows under a ``sample\\t<var>`` header.
    Mutation vectors are deliberately not scanned (too slow).

    The ``.json`` metadata next to that file is written whether or not
    *REALRUN* is set, and, when *doDerived* is truthy, the derived combined
    cohort (LUNG, COADREAD or GBMLGG) is refreshed through
    ``doDerivedCancer``.

    Parameters:
        var       -- phenotype/column name; also the output file name.
        value     -- string written for every sample.
        inDir     -- directory indexed with ``cgWalk``.
        outDir    -- output base directory (must end with a separator).
        cancer    -- cancer code used in paths and metadata.
        REALRUN   -- truthy to (re)generate the data file.
        doDerived -- truthy to also update the derived cohort.

    Returns None.  Returns early, writing nothing, when *REALRUN* is set
    and the number of sample maps is not exactly one.  Exits via
    ``sys.exit()`` on an empty sample id in a matrix header.
    """
    print(inDir)
    print(outDir)

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        # Fold the existing sample maps into the missing ones so that all
        # sample maps are considered.
        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)
        for mapName in existMaps:
            if mapName not in missingMaps:
                missingMaps[mapName] = existMaps[mapName]

        # aliquot uuid -> barcode lookup
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): the result of this call was never used; the call
        # itself is kept in case it has side effects -- confirm and remove.
        TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        mapName = next(iter(missingMaps))
        print(mapName)

        # Distinct sample ids in first-seen order; the set makes the
        # duplicate check O(1) instead of a scan of the list.
        samples = []
        seen = set()
        for name in missingMaps[mapName]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            with open(obj['path'], 'U') as fin:
                header = fin.readline()
            # rstrip instead of [:-1]: a header without a trailing newline
            # must not lose the last character of its last id.
            for sample in header.rstrip("\n").split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % name)
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        # Integration ids, used as an insertion-ordered set of keys.
        intDic = {}
        for sample in samples:
            # TCGA uuid handling: ids not starting with "TCGA" are looked
            # up (lower-cased) as aliquot uuids and used as-is when unknown.
            if sample[0:4] != "TCGA":
                uuid = sample.lower()
                if uuid in aliquote_dic:
                    TCGAbarcode = aliquote_dic[uuid]
                else:
                    TCGAbarcode = sample
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:
                # id is on patient level, above integration level
                continue
            if intID not in intDic:
                intDic[intID] = ""

        outfile = outDir + cancer + "/" + var
        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ")"

    outfile = outDir + cancer + "/" + var
    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # member cohort -> combined cohort
        derivedOf = {
            "LUAD": "LUNG",
            "LUSC": "LUNG",
            "COAD": "COADREAD",
            "READ": "COADREAD",
            "GBM": "GBMLGG",
            "LGG": "GBMLGG",
        }
        if cancer in derivedOf:
            derived_cancer = derivedOf[cancer]
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)