def CAVMid(dir, outDir, cancer, log, REALRUN): print cancer, sys._getframe().f_code.co_name ignore = 1 bookDic = cgWalk(dir, ignore) existMaps = collectSampleMaps(bookDic) missingMaps = collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map] = existMaps[map] # all aliquote uuid dic aliquote_dic = TCGAUtil.uuid_Aliquot_all() sample_dic = TCGAUtil.uuid_Sample_all() if not os.path.exists(outDir): os.system("mkdir " + outDir) for map in missingMaps: print map sMap = SampleMapNew(None, map) for name in missingMaps[map]: samples = [] intDic = {} #keyed on CAVMid sampleDic = {} #keyd on original sample id obj = bookDic[name] print obj["name"] if obj['type'] in ["clinicalMatrix", "mutationVector"]: outfile = outDir + os.path.basename(obj['path']) os.system("cp " + obj['path'] + ".json " + outfile + ".json") fin = open(outfile + ".json", 'r') J = json.load(fin) fin.close() if J.has_key(":clinicalFeature"): cFobj = bookDic[J[":clinicalFeature"]] cFoutfile = outDir + os.path.basename(cFobj['path']) os.system("cp " + cFobj['path'] + " " + cFoutfile) os.system("cp " + cFobj['path'] + ".json " + cFoutfile + ".json") if REALRUN == -1: continue if REALRUN == 0 and obj['type'] == "mutationVector": continue fin = open(obj['path'], 'r') fin.readline() for line in fin.readlines(): sample = string.split(line, "\t")[0] if sample not in samples and sample != "": samples.append(sample) buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic) fin = open(obj['path'], 'r') fout = open(outfile, 'w') fout.write(fin.readline()) for line in fin.readlines(): data = string.split(line, "\t") sample = data[0] try: fout.write(sampleDic[sample] + "\t") fout.write(string.join(data[1:], "\t")) except: fout.write(line) fout.close() if obj['type'] == "genomicMatrix": fin = open(obj['path'], 'U') for sample in string.split(fin.readline()[:-1], "\t")[1:]: if sample == "": print name, "has bad empty sample id" sys.exit() 
samples.append(sample) fin.close() outfile = outDir + os.path.basename(obj['path']) os.system("cp " + obj['path'] + ".json " + outfile + ".json") if REALRUN != 1: continue buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic) process(obj['path'], outfile, samples, intDic)
def TCGASampleMap (dir, outDir, cancer,log, REALRUN): #print status print cancer, __name__ #if cancer in ["PANCAN","PANCAN12"]: # return ignore =1 bookDic = cgWalk(dir,ignore) existMaps = collectSampleMaps(bookDic) missingMaps= collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map]=existMaps[map] # all aliquote uuid dic aliquote_dic =TCGAUtil.uuid_Aliquot_all() sample_dic =TCGAUtil.uuid_Sample_all() #missingMaps --- actually this is all the maps for map in missingMaps: print map print missingMaps[map] sMap =SampleMapNew(None,map) #integration id intName= map+".integrationID" if intName in bookDic: fin = open(bookDic[intName]["path"],'r') integrationID=IntegrationId(intName, fin) fin.close() else: integrationID=IntegrationId(intName) samples =[] for name in missingMaps[map]: if REALRUN !=1: continue print name obj=bookDic[name] if obj['type']=="genomicMatrix": fin =open(obj['path'],'U') for sample in string.split(fin.readline()[:-1],"\t")[1:]: if sample =="": print name, "has bad empty sample id" if sample not in samples: samples.append(sample) fin.close() #elif obj['type']=="clinicalMatrix": # cMa = ClinicalMatrixNew(obj['path'],name) # for sample in cMa.getROWs(): # if sample not in samples: # samples.append(sample) elif obj['type'] in ["mutationVector","clinicalMatrix"]: path = obj['path'] os.system("cut -f 1 "+path+ " |sort |uniq > .tmp") fin=open('.tmp','r') fin.readline() for line in fin.readlines(): #if string.strip(line)=="": # break sample = string.strip(line) #string.split(line,'\t')[0] if sample =="": break if sample not in samples: samples.append(sample) else: continue for sample in samples: if REALRUN !=1: continue #TCGA uuid handling if sample[0:4]!="TCGA": if aliquote_dic.has_key(string.lower(sample)): TCGAbarcode = aliquote_dic[string.lower(sample)] else: print sample continue parent = TCGAbarcode child = sample sMap.addLink(parent,string.lower(child)) 
sMap.addLink(parent,string.upper(child)) sample = parent #do TCGA barcode trick parts= string.split(sample,"-") if len(parts)>3 and len(parts[3])==3: parts = parts[0:3]+ [parts[3][0:2],parts[3][2]]+parts[4:] #print parts """ parent = string.join(parts[0:3],"-") #parts[3] if len(parts)>3 and len(parts[3])==3: child=parent +"-" +parts[3][0:2] sMap.addLink(parent,child) parent=child child=string.join(parts[0:4],"-") sMap.addLink(parent,child) parent=child """ parent = string.join(parts[0:3],"-") for i in range (3,len(parts)): if i!=4: child = parent +"-" +parts[i] else: child = parent +parts[i] #add parent child sMap.addLink(parent,child) parent = child intID= TCGAUtil.barcode_IntegrationId(sample) integrationID.addId(intID) #output sampleMap if not os.path.exists( outDir ): os.makedirs( outDir ) if not os.path.exists( outDir +cancer+"/"): os.makedirs( outDir+cancer+"/" ) if REALRUN == 1: oHandle = open(outDir+cancer+"/"+map,"w") sMap.store(oHandle) #output integrationID if REALRUN ==1: oHandle = open(outDir+cancer+"/integrationID","w") integrationID.store(oHandle) oHandle.close() #output integrationID json oHandle = open(outDir+cancer+"/integrationID.json","w") J={} J['name']=intName J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer] J["sample_type"]="tumor" if cancer not in ["PANCAN","PANCAN12"]: J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer] else: J["primary_disease"]="cancer" #J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer] J['domain']="TCGA" J['owner']="TCGA" J["cgDataVersion"]=1 J['type']="integrationId" J["version"]= datetime.date.today().isoformat() oHandle.write( json.dumps( J, indent=-1 ) ) oHandle.close() #output json oHandle = open(outDir+cancer+"/"+map+".json","w") J['name']=map J['type']="sampleMap" J["version"]= datetime.date.today().isoformat() J["cgDataVersion"]=1 J[":integrationId"]=intName #add info for old clinical data if os.path.exists( outDir+cancer+"/oldClin.json" ): J[':oldClin']=cancer+"_oldClin" #special code if 
TCGAUtil.featurePriority.has_key(cancer) and len(TCGAUtil.featurePriority[cancer])>=5: J["VIS"]=5 #blackList in PAAD if J['name'] in ["TCGA.PAAD.sampleMap"]: J["blacklist"]= [ "TCGA-FQ-6551", "TCGA-FQ-6552", "TCGA-FQ-6553", "TCGA-FQ-6554", "TCGA-FQ-6555", "TCGA-FQ-6558", "TCGA-FQ-6559"] oHandle.write( json.dumps( J, indent=-1 ) ) return
# Top-level script statements: call the TCGAUtil uuid helpers once, in order.
# NOTE(review): presumably these calls populate lookup tables inside TCGAUtil;
# confirm against TCGAUtil before reordering or removing any of them.
import TCGAUtil

dic = TCGAUtil.uuid_Aliquot_all()
# `dic` is rebound here, so the aliquot result above is discarded.
dic = TCGAUtil.uuid_Sample_all()
# The return values of the two cell line helpers are ignored.
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
def RPPA (inDir, outDir, cancer, flog,REALRUN): print cancer, sys._getframe().f_code.co_name PATHPATTERN = "MDA_RPPA_Core" dataProducer= "MD Anderson Cancer Center TCGA proteome characterization center" garbage=[tmpDir] if os.path.exists( tmpDir ): os.system("rm -rf "+tmpDir+"*") else: os.system("mkdir "+tmpDir) #multiple files in dir mode lastRelease={} for file in os.listdir(inDir): #find the file if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1: pass else: continue if not os.path.exists(inDir +file+".md5"): print "file has no matching .md5 throw out", file continue #find lastest in each archive info = string.split(file,".") archive = info [-5] release = int(info [-4]) if not lastRelease.has_key(archive): lastRelease[archive]= release else: if lastRelease[archive]< release: lastRelease[archive]=release rootDir ="" lastDate=None remoteDataDirExample ="" for file in os.listdir(inDir): #find the file if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1: pass else: continue if not os.path.exists(inDir +file+".md5"): continue #find the file that is the lastest release for the archive info = string.split(file,".") archive = info [-5] release = int(info [-4]) if release != lastRelease[archive]: continue #file latest date newDate= datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime) if not lastDate: lastDate = newDate if lastDate < newDate: lastDate = newDate if remoteDataDirExample =="": remoteDataDirExample = file[:-7] #is tar.gz?, uncompress multiple file mode if string.find(file,".tar.gz")!=-1 and REALRUN : os.system("tar -xzf "+inDir+file +" -C "+tmpDir) rootDir =tmpDir #make sure there is data if REALRUN and (rootDir =="" or not os.path.exists(rootDir)): print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__ return #set output dir if not os.path.exists( outDir ): 
os.makedirs( outDir ) if not os.path.exists( outDir +cancer+"/"): os.makedirs( outDir+cancer+"/" ) cgFileName= "RPPA" #data processing multiple dirs mode if REALRUN: aliquote_dic =TCGAUtil.uuid_Aliquot_all() dataMatrix={} allSamples=[] probes=[] for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir+dataDir): sample ="" pattern ="protein_expression" if string.find(file,pattern)!=-1: infile = rootDir+dataDir+"/"+file sample = string.split(file,".")[5] if sample =="": continue # Test for barcode or UUID #throw out all normals and control Analyte if sample[0:4]!="TCGA": if aliquote_dic.has_key(string.lower(sample)): if TCGAUtil.UUID_CELLLINE.has_key(sample): print "control cell line ignore", sample continue else: print "unknow id:", sample continue else: sampleTypeCode = TCGAUtil.barcode_SampleType(sample) if sampleTypeCode == False: # likely a uuid continue elif sampleTypeCode in ["20"]: print "control cell line ignore", sample continue if sample not in allSamples: allSamples.append(sample) for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir+dataDir): sample ="" pattern ="protein_expression" if string.find(file,pattern)!=-1: infile = rootDir+dataDir+"/"+file sample = string.split(file,".")[5] if sample =="": continue if sample not in allSamples: continue valuePOS=1 process(dataMatrix,allSamples,sample, probes, cancer,infile,flog, valuePOS) outfile = outDir+cancer+"/"+cgFileName outputMatrix(dataMatrix, allSamples, probes, outfile) oHandle = open(outDir+cancer+"/"+cgFileName+".json","w") J={} #stable J["dataSubType"]="protein expression RPPA" J["redistribution"]= True J["dataProducer"]= dataProducer J["colNormalization"]=True J["PLATFORM"]= "M.D. 
Anderson Reverse Phase Protein Array Core platform" J["type"]= "genomicMatrix" J[":sampleMap"]="TCGA."+cancer+".sampleMap" #multiple dirs J["url"]=TCGAUtil.remoteBase \ +string.replace(inDir,TCGAUtil.localBase,"") J["version"]= datetime.date.today().isoformat() J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat() J["wrangling_procedure"]= "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository" J["label"]= "RPPA" J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") reverse phase protein array" J[":probeMap"]= "md_anderson_antibodies" J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer] J["sample_type"]=["tumor"] J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer] J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")" J['domain']="TCGA" J['owner']="TCGA" J["tags"]=["cancer"]+ TCGAUtil.tags[cancer] J["unit"]="normalized RPPA value" J["description"]= "TCGA "+ TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>" J["description"] = J["description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>" #change cgData J["name"]="TCGA_"+cancer+"_RPPA" name = trackName_fix(J['name']) if name ==False: message = "bad object name, need fix otherwise break loader, too long "+J["name"] print message flog.write(message+"\n") return else: J["name"]=name oHandle.write( json.dumps( J, indent=-1 ) ) oHandle.close() return
def RPPA(inDir, outDir, cancer, flog, REALRUN): print cancer, sys._getframe().f_code.co_name PATHPATTERN = "MDA_RPPA_Core" dataProducer = "MD Anderson Cancer Center TCGA proteome characterization center" garbage = [tmpDir] if os.path.exists(tmpDir): os.system("rm -rf " + tmpDir + "*") else: os.system("mkdir " + tmpDir) #multiple files in dir mode lastRelease = {} for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): print "file has no matching .md5 throw out", file continue #find lastest in each archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if not lastRelease.has_key(archive): lastRelease[archive] = release else: if lastRelease[archive] < release: lastRelease[archive] = release rootDir = "" lastDate = None remoteDataDirExample = "" for file in os.listdir(inDir): #find the file if string.find(file, PATHPATTERN) != -1 and string.find( file, LEVEL) != -1 and string.find( file, ".tar.gz") != -1 and string.find(file, "md5") == -1: pass else: continue if not os.path.exists(inDir + file + ".md5"): continue #find the file that is the lastest release for the archive info = string.split(file, ".") archive = info[-5] release = int(info[-4]) if release != lastRelease[archive]: continue #file latest date newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime) if not lastDate: lastDate = newDate if lastDate < newDate: lastDate = newDate if remoteDataDirExample == "": remoteDataDirExample = file[:-7] #is tar.gz?, uncompress multiple file mode if string.find(file, ".tar.gz") != -1 and REALRUN: os.system("tar -xzf " + inDir + file + " -C " + tmpDir) rootDir = tmpDir #make sure there is data if REALRUN and (rootDir == "" or not os.path.exists(rootDir)): print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__ 
return #set output dir if not os.path.exists(outDir): os.makedirs(outDir) if not os.path.exists(outDir + cancer + "/"): os.makedirs(outDir + cancer + "/") cgFileName = "RPPA" #data processing multiple dirs mode if REALRUN: aliquote_dic = TCGAUtil.uuid_Aliquot_all() dataMatrix = {} allSamples = [] probes = [] for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" pattern = "protein_expression" if string.find(file, pattern) != -1: infile = rootDir + dataDir + "/" + file sample = string.split(file, ".")[5] if sample == "": continue # Test for barcode or UUID #throw out all normals and control Analyte if sample[0:4] != "TCGA": if aliquote_dic.has_key(string.lower(sample)): if TCGAUtil.UUID_CELLLINE.has_key(sample): print "control cell line ignore", sample continue else: print "unknow id:", sample continue else: sampleTypeCode = TCGAUtil.barcode_SampleType(sample) if sampleTypeCode == False: # likely a uuid continue elif sampleTypeCode in ["20"]: print "control cell line ignore", sample continue if sample not in allSamples: allSamples.append(sample) for dataDir in os.listdir(rootDir): for file in os.listdir(rootDir + dataDir): sample = "" pattern = "protein_expression" if string.find(file, pattern) != -1: infile = rootDir + dataDir + "/" + file sample = string.split(file, ".")[5] if sample == "": continue if sample not in allSamples: continue valuePOS = 1 process(dataMatrix, allSamples, sample, probes, cancer, infile, flog, valuePOS) outfile = outDir + cancer + "/" + cgFileName outputMatrix(dataMatrix, allSamples, probes, outfile) oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w") J = {} #stable J["dataSubType"] = "protein expression RPPA" J["redistribution"] = True J["dataProducer"] = dataProducer J["colNormalization"] = True J["PLATFORM"] = "M.D. Anderson Reverse Phase Protein Array Core platform" J["type"] = "genomicMatrix" J[":sampleMap"] = "TCGA." 
+ cancer + ".sampleMap" #multiple dirs J["url"]=TCGAUtil.remoteBase \ +string.replace(inDir,TCGAUtil.localBase,"") J["version"] = datetime.date.today().isoformat() J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today( ).isoformat() J["wrangling_procedure"] = "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository" J["label"] = "RPPA" J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") reverse phase protein array" J[":probeMap"] = "md_anderson_antibodies" J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer] J["sample_type"] = ["tumor"] J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer] J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[ cancer] + " (" + cancer + ")" J['domain'] = "TCGA" J['owner'] = "TCGA" J["tags"] = ["cancer"] + TCGAUtil.tags[cancer] J["unit"] = "normalized RPPA value" J["description"] = "TCGA " + TCGAUtil.cancerOfficial[ cancer] + " (" + cancer + ") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>" J["description"] = J[ "description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>" #change cgData J["name"] = "TCGA_" + cancer + "_RPPA" name = trackName_fix(J['name']) if name == False: message = "bad object name, need fix otherwise break loader, too long " + J[ "name"] print message flog.write(message + "\n") return else: J["name"] = name oHandle.write(json.dumps(J, indent=-1)) oHandle.close() return
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived): print inDir print outDir if REALRUN: ignore = 1 bookDic = cgWalk(inDir, ignore) existMaps = collectSampleMaps(bookDic) missingMaps = collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map] = existMaps[map] # all aliquote uuid dic aliquote_dic = TCGAUtil.uuid_Aliquot_all() sample_dic = TCGAUtil.uuid_Sample_all() if len(missingMaps) != 1: return map = missingMaps.keys()[0] print map samples = [] for name in missingMaps[map]: obj = bookDic[name] if obj['type'] == "genomicMatrix": fin = open(obj['path'], 'U') for sample in string.split(fin.readline()[:-1], "\t")[1:]: if sample == "": print name, "has bad empty sample id" sys.exit() if sample not in samples: samples.append(sample) fin.close() #take too long """ if obj['type']=="mutationVector": fin =open(obj['path'],'U') fin.readline() while 1: line = fin.readline() if string.strip(line) =="": break sample = string.split(line,'\t')[0] if sample not in samples: samples.append(sample) print sample, obj['path'] fin.close() """ intDic = {} for sample in samples: #TCGA uuid handling uuid = sample TCGAbarcode = "" if uuid[0:4] != "TCGA": if aliquote_dic.has_key(string.lower(uuid)): TCGAbarcode = aliquote_dic[string.lower(uuid)] else: TCGAbarcode = uuid else: TCGAbarcode = sample intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode) if intID == None: # ids is on patient level above integration level continue if not intDic.has_key(intID): intDic[intID] = "" outfile = outDir + cancer + "/" + var fout = open(outfile, "w") fout.write("sample\t" + var + "\n") for intId in intDic: fout.write(intId + "\t" + value + "\n") fout.close() #data josn J = {} J["version"] = datetime.date.today().isoformat() J["name"] = "TCGA_" + cancer + "_" + var J["type"] = "clinicalMatrix" J["dataSubType"] = "phenotype" J[":sampleMap"] = "TCGA." 
+ cancer + ".sampleMap" J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[ cancer] + " (" + cancer + ")" outfile = outDir + cancer + "/" + var oHandle = open(outfile + ".json", "w") oHandle.write(json.dumps(J, indent=-1)) oHandle.close() if doDerived: if cancer in ["LUAD", "LUSC"]: derived_cancer = "LUNG" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN) if cancer in ["COAD", "READ"]: derived_cancer = "COADREAD" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN) if cancer in ["GBM", "LGG"]: derived_cancer = "GBMLGG" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
# Top-level script statements: call the TCGAUtil uuid helpers once, in order.
# NOTE(review): presumably these calls populate lookup tables inside TCGAUtil;
# confirm against TCGAUtil before reordering or removing any of them.
import TCGAUtil

dic=TCGAUtil.uuid_Aliquot_all()
# `dic` is rebound here, so the aliquot result above is discarded.
dic=TCGAUtil.uuid_Sample_all()
# The return values of the two cell line helpers are ignored.
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()
def cohort_variable (var, value, inDir, outDir, cancer, REALRUN, doDerived): print inDir print outDir if REALRUN: ignore =1 bookDic=cgWalk(inDir,ignore) existMaps = collectSampleMaps(bookDic) missingMaps= collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map]=existMaps[map] # all aliquote uuid dic aliquote_dic =TCGAUtil.uuid_Aliquot_all() sample_dic =TCGAUtil.uuid_Sample_all() if len(missingMaps)!=1: return map = missingMaps.keys()[0] print map samples =[] for name in missingMaps[map]: obj=bookDic[name] if obj['type']=="genomicMatrix": fin =open(obj['path'],'U') for sample in string.split(fin.readline()[:-1],"\t")[1:]: if sample =="": print name, "has bad empty sample id" sys.exit() if sample not in samples: samples.append(sample) fin.close() #take too long """ if obj['type']=="mutationVector": fin =open(obj['path'],'U') fin.readline() while 1: line = fin.readline() if string.strip(line) =="": break sample = string.split(line,'\t')[0] if sample not in samples: samples.append(sample) print sample, obj['path'] fin.close() """ intDic={} for sample in samples: #TCGA uuid handling uuid =sample TCGAbarcode ="" if uuid[0:4]!="TCGA": if aliquote_dic.has_key(string.lower(uuid)): TCGAbarcode = aliquote_dic[string.lower(uuid)] else: TCGAbarcode = uuid else: TCGAbarcode = sample intID= TCGAUtil.barcode_IntegrationId(TCGAbarcode) if intID == None: # ids is on patient level above integration level continue if not intDic.has_key(intID): intDic[intID]="" outfile = outDir+cancer+"/"+ var fout =open(outfile,"w") fout.write("sample\t"+var+"\n") for intId in intDic: fout.write(intId+"\t"+ value+"\n") fout.close() #data josn J={} J["version"]= datetime.date.today().isoformat() J["name"]="TCGA_"+cancer+"_"+var J["type"]= "clinicalMatrix" J["dataSubType"]="phenotype" J[":sampleMap"]="TCGA."+cancer+".sampleMap" J["cohort"]="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")" outfile = outDir+cancer+"/"+var oHandle = 
open(outfile +".json","w") oHandle.write( json.dumps( J, indent=-1 ) ) oHandle.close() if doDerived: if cancer in ["LUAD","LUSC"]: derived_cancer="LUNG" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN) if cancer in ["COAD","READ"]: derived_cancer="COADREAD" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN) if cancer in ["GBM","LGG"]: derived_cancer="GBMLGG" doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
def CAVMid (dir, outDir, cancer,log, REALRUN): print cancer, sys._getframe().f_code.co_name ignore =1 bookDic=cgWalk(dir,ignore) existMaps = collectSampleMaps(bookDic) missingMaps= collectMissingSampleMaps(bookDic) #removeExistMaps for map in existMaps: if map not in missingMaps: missingMaps[map]=existMaps[map] # all aliquote uuid dic aliquote_dic =TCGAUtil.uuid_Aliquot_all() sample_dic =TCGAUtil.uuid_Sample_all() if not os.path.exists (outDir): os.system("mkdir "+outDir) for map in missingMaps: print map sMap =SampleMapNew(None,map) for name in missingMaps[map]: samples =[] intDic={}#keyed on CAVMid sampleDic={} #keyd on original sample id obj=bookDic[name] print obj["name"] if obj['type'] in ["clinicalMatrix","mutationVector"]: outfile = outDir +os.path.basename(obj['path']) os.system("cp "+obj['path']+".json "+outfile+".json") fin = open (outfile+".json",'r') J=json.load(fin) fin.close() if J.has_key(":clinicalFeature"): cFobj= bookDic[J[":clinicalFeature"]] cFoutfile = outDir +os.path.basename(cFobj['path']) os.system("cp "+cFobj['path']+" "+cFoutfile) os.system("cp "+cFobj['path']+".json "+cFoutfile+".json") if REALRUN ==-1: continue if REALRUN ==0 and obj['type']=="mutationVector": continue fin = open(obj['path'],'r') fin.readline() for line in fin.readlines(): sample =string.split(line,"\t")[0] if sample not in samples and sample !="": samples.append(sample) buildSampleDic (samples, sMap, intDic, sampleDic, aliquote_dic) fin = open(obj['path'],'r') fout = open(outfile,'w') fout.write(fin.readline()) for line in fin.readlines(): data =string.split(line,"\t") sample =data[0] try: fout.write(sampleDic[sample]+"\t") fout.write(string.join(data[1:],"\t")) except: fout.write(line) fout.close() if obj['type']=="genomicMatrix": fin =open(obj['path'],'U') for sample in string.split(fin.readline()[:-1],"\t")[1:]: if sample =="": print name, "has bad empty sample id" sys.exit() samples.append(sample) fin.close() outfile = outDir +os.path.basename(obj['path']) 
os.system("cp "+obj['path']+".json "+outfile+".json") if REALRUN !=1: continue buildSampleDic (samples, sMap, intDic, sampleDic,aliquote_dic) process(obj['path'], outfile, samples, intDic)