def process(inDir, outDir, cancer, flog, PATHPATTERN, originCancer):
    """Re-export cleaned clinical matrices from ``inDir`` into ``outDir``.

    Every entry of ``inDir`` whose first six characters equal ``PATHPATTERN``
    and that has a ``<entry>.json`` sidecar with ``"type": "clinicalMatrix"``
    is loaded, stripped of a few columns and of placeholder values, and
    written to ``outDir + cancer + "/"`` together with an updated sidecar.
    When the sidecar names a ``:clinicalFeature``, the matching feature file
    found in ``inDir`` is re-exported as well.

    Args:
        inDir: Input directory. It is concatenated with file names, so it
            must end with a path separator.
        outDir: Output root, same trailing-separator requirement. It and the
            per-cancer sub-directory are created when missing.
        cancer: Cancer code used for the output sub-directory, the
            ``:sampleMap``/``cohort`` values and the priority lookup.
        flog: Object with a ``write`` method; receives the error message
            when an object name is rejected.
        PATHPATTERN: Prefix compared against the first six characters of
            each file name.
        originCancer: Cancer code the data came from. When it differs from
            ``cancer``, ``"_" + originCancer`` is appended to object names
            and output file names.

    Returns:
        None. Note that the function returns early, leaving the remaining
        files unprocessed, as soon as ``trackName_fix`` rejects a name.
    """
    # print status
    print("%s %s" % (cancer, __name__))

    # set output dir
    cancer_dir = outDir + cancer + "/"
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(cancer_dir):
        os.makedirs(cancer_dir)

    same_cohort = cancer == originCancer

    # Values blanked out of the matrix; the order is kept exactly as in the
    # original call sequence because replaceValue is applied cumulatively.
    blanked_values = (
        "null", "NULL", "Null", "NA",
        "[null]", "[NULL]", "[Null]", "[NA]",
        "[Not Available]", "[Not Reported]", "[Not Applicable]",
        "[Not Requested]", "[Completed]", "[Pending]", "Not Tested",
        "[]", ",\"", "\"", "'", "`", "|",
    )

    for fname in os.listdir(inDir):
        clinFeature = None
        clinFfile = ""
        clinFJ = None

        # find the clinicalMatrix files: right prefix and a .json sidecar
        if fname[0:6] != PATHPATTERN:
            continue
        if not os.path.exists(inDir + fname + ".json"):
            continue
        infile = inDir + fname

        # json file processing (validation)
        with open(infile + ".json") as fjson:
            J = json.load(fjson)
        if J["type"] != "clinicalMatrix":
            continue

        # clinFeature: look for the feature file this matrix refers to
        if ":clinicalFeature" in J:
            clinFname = J[":clinicalFeature"]
            for clinFfile in os.listdir(inDir):
                if not os.path.exists(inDir + clinFfile + ".json"):
                    continue
                with open(inDir + clinFfile + ".json") as fjson:
                    clinFJ = json.load(fjson)

                if (clinFJ["type"] != "clinicalFeature"
                        or clinFJ["name"] != clinFname):
                    continue

                print("%s %s" % (originCancer, cancer))
                if not same_cohort:
                    clinFname = clinFname + "_" + originCancer
                    clinFJ["name"] = clinFname
                clinFeature = ClinicalFeatureNew(inDir + clinFfile, clinFname)
                for feature in clinFeature.getFeatures():
                    if (cancer in TCGAUtil.featurePriority
                            and feature in TCGAUtil.featurePriority[cancer]):
                        priority = TCGAUtil.featurePriority[cancer][feature]
                        clinFeature.setFeaturePriority(feature, priority)
                        clinFeature.setFeatureVisibility(feature, "on")
                # first match wins; clinFfile/clinFJ stay bound to it
                break

        # data processing
        clinMatrix = ClinicalMatrixNew(infile, J["name"], False, clinFeature)
        clinMatrix.removeCols(["ethnicity", "race", "jewish_origin"])
        for value in blanked_values:
            clinMatrix.replaceValue(value, "")

        # json file processing (validation): the sidecar is read again from
        # disk before being edited, exactly as the original code did.
        with open(infile + ".json") as fjson:
            J = json.load(fjson)
        if not same_cohort:
            J['name'] = J['name'] + "_" + originCancer
        # NOTE(review): the source indentation was lost; :sampleMap and
        # cohort are assumed to be set unconditionally -- confirm.
        J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
        J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                       + " (" + cancer + ")")

        name = trackName_fix(J['name'])
        # The original comparison (== False) is kept on purpose so that the
        # set of values treated as "rejected" does not change.
        if name == False:  # noqa: E712
            message = ("bad object name, need fix otherwise break loader, "
                       "too long " + J["name"])
            print(message)
            flog.write(message + "\n")
            return
        J["name"] = name

        if not same_cohort and ":clinicalFeature" in J:
            J[":clinicalFeature"] = J[":clinicalFeature"] + "_" + originCancer
        J["cgDataVersion"] = 1

        # output matrix
        outfile = cancer_dir + fname
        if not same_cohort:
            outfile = outfile + "_" + originCancer
        with open(outfile, "w") as oHandle:
            clinMatrix.store(oHandle, validation=True)
        with open(outfile + ".json", "w") as fjson:
            json.dump(J, fjson, indent=-1)

        # output clinFeature
        if clinFeature:
            outfile = cancer_dir + clinFfile
            if not same_cohort:
                outfile = outfile + "_" + originCancer
            with open(outfile, 'w') as fout:
                clinFeature.store(fout)
            clinFJ["cgDataVersion"] = 1
            with open(outfile + ".json", "w") as fjson:
                json.dump(clinFJ, fjson, indent=-1)

    return
def process(inDir, outDir, dataDir, cancer, flog, PATHPATTERN, originCancer,
            REALRUN):
    """Convert downloaded TCGA clinical/biospecimen tables to cgData files.

    For every file in ``dataDir`` (``*.html`` excluded) whose name contains
    one of the known table patterns, the function

    1. rewrites the input file in place so that short rows are padded to the
       header width,
    2. for follow-up and slide tables of the origin cancer, runs the
       dedicated clean-up helper on the input file,
    3. builds a clinical matrix, drops selected columns and normalises
       placeholder values,
    4. writes the matrix, a ``*_clinicalFeature`` table and a ``.json``
       sidecar for each of the two into ``outDir + cancer + "/"``.

    Args:
        inDir: Directory whose path, relative to ``TCGAUtil.localBase``, is
            used to build the ``url`` metadata value.
        outDir: Output root; must end with a path separator. It and the
            per-cancer sub-directory are created when missing.
        dataDir: Directory holding the downloaded tables; must end with a
            path separator. The files in it are modified in place.
        cancer: Cancer code used for output paths, metadata and lookups in
            the ``TCGAUtil`` tables.
        flog: Object with a ``write`` method; receives one line per feature
            missing from ``TCGAUtil.featureLongTitle``.
        PATHPATTERN: String appended to the per-table name prefix when the
            dataset name is built.
        originCancer: Cancer code the data came from; appended to dataset
            names when it differs from ``cancer``.
        REALRUN: When false and a feature file already exists, that file is
            re-annotated from the ``TCGAUtil`` tables first.

    Returns:
        None.
    """
    # Local import: only this block needs it, and the module's import
    # section is outside this block.
    import shutil

    # print status
    print("%s %s" % (cancer, __name__))

    # set output dir
    cancer_dir = outDir + cancer + "/"
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(cancer_dir):
        os.makedirs(cancer_dir)

    same_cohort = cancer == originCancer
    today = datetime.date.today().isoformat()
    tmp_path = ".tmp"  # scratch file in the current working directory

    table_patterns = (
        "clinical_sample", "clinical_patient", "clinical_follow_up",
        "auxiliary", "biospecimen_slide", "biospecimen_sample",
    )
    # dataset-name prefix per table; follow-up tables use the file name
    name_prefix = {
        "clinical_sample": "clinSample",
        "clinical_patient": "clinPatient",
        "auxiliary": "clinAuxiliary",
        "biospecimen_slide": "bioSlide",
        "biospecimen_sample": "bioSample",
    }
    # Values blanked out of the matrix, in the original call order
    # (replaceValue is applied cumulatively, so the order matters).
    blanked_values = (
        "null", "NULL", "Null", "NA",
        "[null]", "[NULL]", "[Null]", "[NA]",
        "[Not Available]", "[Not Applicable]", "[Unknown]",
        "[Not Reported]", "[Not Requested]", "[Not Evaluated]",
        "[Completed]", "[Pending]", "Not Tested",
        "[]", ",\"", "\"", "'", "`", "||",
    )
    # spelling normalisations applied after the blanking pass
    normalised_values = (
        ("LUNG", "Lung"),
        ("MSS|MSS", "MSS"),
        ("Alive", "LIVING"),
        ("ALIVE", "LIVING"),
        ("alive", "LIVING"),
        ("Dead", "DECEASED"),
        ("DEAD", "DECEASED"),
        ("dead", "DECEASED"),
    )
    always_visible = (
        "gender", "age_at_initial_pathologic_diagnosis",
        "days_to_last_followup", "days_to_last_known_alive", "sample_type",
        "mononucleotide_and_dinucleotide_marker_panel_analysis_status",
        "percent_stromal_cells_BOTTOM", "percent_tumor_nuclei_BOTTOM",
    )

    # data processing
    for fname in os.listdir(dataDir):
        if fname[-5:] == ".html":
            continue

        for pattern in table_patterns:
            if pattern not in fname:
                continue

            followUpV = 0.0
            cgFileName = fname.replace(".txt", "")
            # the follow_up files carry a _vN.N version number
            # NOTE(review): the "." in this pattern is unescaped and so
            # matches any character; kept unchanged -- confirm intent.
            if cgFileName != re.sub(r'_v[1-9]+.[0-9]+', '', cgFileName):
                after_tag = cgFileName.split("follow_up_")[1]
                followUpV = after_tag.split("_" + cancer.lower())[0][1:]

            # the auxiliary file does not start with "clinical_"
            if cgFileName[0:9] != "clinical_":
                cgFileName = "clinical_" + cgFileName

            outfile = cancer_dir + cgFileName
            cFfile = outfile + "_clinicalFeature"

            if not REALRUN and os.path.exists(cFfile):
                tmpClinFeature = ClinicalFeatureNew(cFfile, "tmpName")
                for feature in tmpClinFeature.getFeatures():
                    if (cancer in TCGAUtil.featurePriority
                            and feature in TCGAUtil.featurePriority[cancer]):
                        priority = TCGAUtil.featurePriority[cancer][feature]
                        tmpClinFeature.setFeaturePriority(feature, priority)
                        tmpClinFeature.setFeatureVisibility(feature, "on")

                    stateOrder = None
                    if feature in TCGAUtil.featureStateOrder:
                        orders = TCGAUtil.featureStateOrder[feature]
                        if cancer in orders:
                            stateOrder = orders[cancer]
                        # an "ALL" entry overrides the cancer-specific one
                        if "ALL" in orders:
                            stateOrder = orders["ALL"]
                        # NOTE(review): nesting of this debug print was
                        # reconstructed from collapsed source -- confirm.
                        print(stateOrder)
                    if stateOrder:
                        tmpClinFeature.setFeatureValueType(feature,
                                                           "category")
                        tmpClinFeature.setFeatureStates(feature, stateOrder)
                        tmpClinFeature.setFeatureStateOrder(feature,
                                                            stateOrder)
                        tmpClinFeature.setFeatureStateOrderRelax(feature,
                                                                 "true")
                    if feature in TCGAUtil.valueType:
                        tmpClinFeature.setFeatureValueType(
                            feature, TCGAUtil.valueType[feature])
                with open(cFfile, 'w') as fout:
                    tmpClinFeature.store(fout)
                # NOTE(review): there is no early exit here; the regular
                # processing below still runs and rewrites cFfile whatever
                # REALRUN is -- confirm this is intended.

            infile = dataDir + fname

            # An empty file has no header row, so csv.DictReader reports
            # fieldnames=None and writing the header would raise. Skip it
            # before the rewrite instead of only after it.
            if os.path.getsize(infile) == 0:
                continue

            # Rows often have fewer fields than the header; a DictReader /
            # DictWriter round trip pads them with restval.
            with open(infile, 'r') as fin:
                reader = csv.DictReader(fin, delimiter="\t", restval="")
                with open(tmp_path, 'w') as fout:
                    writer = csv.DictWriter(fout, delimiter="\t",
                                            fieldnames=reader.fieldnames)
                    writer.writeheader()
                    writer.writerows(reader)
            # copy without a shell so paths with spaces or metacharacters
            # are safe and a failed copy raises instead of being ignored
            shutil.copyfile(tmp_path, infile)

            if pattern == "clinical_follow_up":
                print(fname)
                if same_cohort:
                    cleanupFollowUpFile(infile, tmp_path)
                    shutil.copyfile(tmp_path, infile)

            # slide file needs to be remade because columns are duplicated
            # as top or bottom
            if pattern == "biospecimen_slide":
                print(fname)
                if same_cohort:
                    cleanupSlideFile(infile, tmp_path)
                    shutil.copyfile(tmp_path, infile)

            # clinicalMatrix
            AllowDupCol = True
            if "biospecimen_" in pattern:
                SkipLines = [2]
            else:
                SkipLines = [1, 3]  # 1 based

            # the clean-up helpers may have left nothing behind
            if os.path.getsize(infile) == 0:
                continue

            if pattern == "biospecimen_slide":
                FirstColAuto = 0  # 0 based, already cleaned
            else:
                FirstColAuto = findIDCol(infile)
                if FirstColAuto == -1:
                    print("%s bad header line" % infile)
                    continue
            clinMatrix = ClinicalMatrixNew(infile, "foo", FirstColAuto, None,
                                           SkipLines, AllowDupCol)

            clinMatrix.removeCols(["ethnicity", "race", "jewish_origin"])

            if pattern in ("clinical_sample", "biospecimen_sample"):
                if "sample_type" in clinMatrix.getCOLs():
                    add_col_PseudoSample(clinMatrix, "sample_type")
                if "sample_type_id" in clinMatrix.getCOLs():
                    add_col_PseudoSample(clinMatrix, "sample_type_id")

            # remove all cols with uuid / day_of in their name; iterate a
            # copy in case getCOLs() hands back the list removeCols edits
            for col in list(clinMatrix.getCOLs()):
                if "uuid" in col or "UUID" in col or "day_of" in col:
                    clinMatrix.removeCols([col])

            for value in blanked_values:
                clinMatrix.replaceValue(value, "")
            clinMatrix.replaceValueWhole("|", "")
            for old, new in normalised_values:
                clinMatrix.replaceValue(old, new)

            with open(outfile, "w") as oHandle:
                clinMatrix.store(oHandle, validation=True)

            # clinicalFeature
            with open(cFfile, "w") as fout:
                fout.write("#feature\tattribute\tvalue\n")
                for feature in clinMatrix.getCOLs():
                    if feature not in TCGAUtil.featureLongTitle:
                        longTitle = feature
                        shortTitle = feature
                        message = ("Feature Not in dictionary" + "\t"
                                   + feature + "\t" + feature)
                        flog.write(message + "\n")
                    else:
                        longTitle = TCGAUtil.featureLongTitle[feature]
                        if feature in TCGAUtil.featureShortTitle:
                            shortTitle = TCGAUtil.featureShortTitle[feature]
                        else:
                            shortTitle = longTitle

                    fout.write(feature + "\tshortTitle\t" + shortTitle + "\n")
                    fout.write(feature + "\tlongTitle\t" + longTitle + "\n")

                    if "uuid" in feature or "UUID" in feature:
                        fout.write(feature + "\tvisibility\toff\n")

                    if feature in TCGAUtil.valueType:
                        fout.write(feature + "\tvalueType\t"
                                   + TCGAUtil.valueType[feature] + "\n")

                    stateOrder = None
                    if feature in TCGAUtil.featureStateOrder:
                        orders = TCGAUtil.featureStateOrder[feature]
                        if cancer in orders:
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = orders[cancer]
                        # an "ALL" entry overrides the cancer-specific one
                        if "ALL" in orders:
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = orders["ALL"]
                    if stateOrder:
                        for state in stateOrder:
                            fout.write(feature + "\tstate\t" + state + "\n")
                        fout.write(feature + "\tstateOrder\t\""
                                   + "\",\"".join(stateOrder) + "\"\n")
                        fout.write(feature + "\tstateOrderRelax\ttrue\n")

                    if (cancer in TCGAUtil.featurePriority
                            and feature in TCGAUtil.featurePriority[cancer]):
                        priority = TCGAUtil.featurePriority[cancer][feature]
                        fout.write(feature + "\tpriority\t" + str(priority)
                                   + "\n")
                        fout.write(feature + "\tvisibility\ton\n")

                    if feature in always_visible:
                        fout.write(feature + "\tvisibility\ton\n")

            # json
            J = {}
            cFJ = {}

            # stable dataset-name suffix
            if pattern == "clinical_follow_up":
                suffix = cgFileName
            else:
                suffix = name_prefix[pattern] + PATHPATTERN
            if not same_cohort:
                suffix = suffix + originCancer

            cohort = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                      + " (" + cancer + ")")

            J["cgDataVersion"] = 1
            J["redistribution"] = True
            J["dataProducer"] = "TCGA biospecimen core resource"
            J["url"] = (TCGAUtil.remoteBase
                        + inDir.replace(TCGAUtil.localBase, "")
                        + dataDir.replace(tmpDir, "")[:-1])
            J["version"] = today
            J["wrangler"] = ("cgData TCGAscript " + __name__
                             + " processed on " + today)
            J["dataSubType"] = "phenotype"

            # change description
            J["wrangling_procedure"] = "Clinical data download from TCGA DCC, processed at UCSC into cgData repository"
            J["description"] = ("This dataset is the TCGA "
                                + TCGAUtil.cancerHumanReadable[cancer]
                                + " (" + cancer + ") clinical data.")

            # change cgData
            J["name"] = "TCGA_" + cancer + "_" + suffix
            cFJ["name"] = J["name"] + "_clinFeat"
            cFJ["type"] = "clinicalFeature"
            J["type"] = "clinicalMatrix"
            J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
            J["cohort"] = cohort
            J[":clinicalFeature"] = cFJ["name"]

            if pattern == "clinical_follow_up":
                if same_cohort:
                    J["upToDate"] = str(followUpV)
                else:
                    J["upToDate"] = str(followUpV) + "_" + originCancer

            with open(outfile + ".json", "w") as oHandle:
                oHandle.write(json.dumps(J, indent=-1))
            with open(cFfile + ".json", "w") as oHandle:
                oHandle.write(json.dumps(cFJ, indent=-1))

    return
def process(inDir, outDir, cancer, flog, PATHPATTERN, originCancer):
    """Re-export cleaned clinical matrices from ``inDir`` into ``outDir``.

    Every entry of ``inDir`` whose first six characters equal ``PATHPATTERN``
    and that has a ``<entry>.json`` sidecar with ``"type": "clinicalMatrix"``
    is loaded, stripped of a few columns and of placeholder values, and
    written to ``outDir + cancer + "/"`` together with an updated sidecar.
    When the sidecar names a ``:clinicalFeature``, the matching feature file
    found in ``inDir`` is re-exported as well.

    Args:
        inDir: Input directory. It is concatenated with file names, so it
            must end with a path separator.
        outDir: Output root, same trailing-separator requirement. It and the
            per-cancer sub-directory are created when missing.
        cancer: Cancer code used for the output sub-directory, the
            ``:sampleMap``/``cohort`` values and the priority lookup.
        flog: Object with a ``write`` method; receives the error message
            when an object name is rejected.
        PATHPATTERN: Prefix compared against the first six characters of
            each file name.
        originCancer: Cancer code the data came from. When it differs from
            ``cancer``, ``"_" + originCancer`` is appended to object names
            and output file names.

    Returns:
        None. Note that the function returns early, leaving the remaining
        files unprocessed, as soon as ``trackName_fix`` rejects a name.
    """
    # print status
    print("%s %s" % (cancer, __name__))

    # set output dir
    target_dir = outDir + cancer + "/"
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    cross_cohort = cancer != originCancer

    # Values blanked out of the matrix; the order is kept exactly as in the
    # original call sequence because replaceValue is applied cumulatively.
    placeholders = (
        "null", "NULL", "Null", "NA",
        "[null]", "[NULL]", "[Null]", "[NA]",
        "[Not Available]", "[Not Reported]", "[Not Applicable]",
        "[Not Requested]", "[Completed]", "[Pending]", "Not Tested",
        "[]", ",\"", "\"", "'", "`", "|",
    )

    for entry in os.listdir(inDir):
        clinFeature = None
        clinFfile = ""
        clinFJ = None

        # find the clinicalMatrix files: right prefix and a .json sidecar
        if entry[0:6] != PATHPATTERN:
            continue
        if not os.path.exists(inDir + entry + ".json"):
            continue
        infile = inDir + entry

        # json file processing (validation)
        with open(infile + ".json") as fjson:
            J = json.load(fjson)
        if J["type"] != "clinicalMatrix":
            continue

        # clinFeature: look for the feature file this matrix refers to
        if ":clinicalFeature" in J:
            clinFname = J[":clinicalFeature"]
            for clinFfile in os.listdir(inDir):
                if not os.path.exists(inDir + clinFfile + ".json"):
                    continue
                with open(inDir + clinFfile + ".json") as fjson:
                    clinFJ = json.load(fjson)

                if (clinFJ["type"] != "clinicalFeature"
                        or clinFJ["name"] != clinFname):
                    continue

                print("%s %s" % (originCancer, cancer))
                if cross_cohort:
                    clinFname = clinFname + "_" + originCancer
                    clinFJ["name"] = clinFname
                clinFeature = ClinicalFeatureNew(inDir + clinFfile, clinFname)
                for feature in clinFeature.getFeatures():
                    if (cancer in TCGAUtil.featurePriority
                            and feature in TCGAUtil.featurePriority[cancer]):
                        priority = TCGAUtil.featurePriority[cancer][feature]
                        clinFeature.setFeaturePriority(feature, priority)
                        clinFeature.setFeatureVisibility(feature, "on")
                # first match wins; clinFfile/clinFJ stay bound to it
                break

        # data processing
        clinMatrix = ClinicalMatrixNew(infile, J["name"], False, clinFeature)
        clinMatrix.removeCols(["ethnicity", "race", "jewish_origin"])
        for placeholder in placeholders:
            clinMatrix.replaceValue(placeholder, "")

        # json file processing (validation): the sidecar is read again from
        # disk before being edited, exactly as the original code did.
        with open(infile + ".json") as fjson:
            J = json.load(fjson)
        if cross_cohort:
            J['name'] = J['name'] + "_" + originCancer
        # NOTE(review): the source indentation was lost; :sampleMap and
        # cohort are assumed to be set unconditionally -- confirm.
        J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
        J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer]
                       + " (" + cancer + ")")

        name = trackName_fix(J['name'])
        # The original comparison (== False) is kept on purpose so that the
        # set of values treated as "rejected" does not change.
        if name == False:  # noqa: E712
            message = ("bad object name, need fix otherwise break loader, "
                       "too long " + J["name"])
            print(message)
            flog.write(message + "\n")
            return
        J["name"] = name

        if cross_cohort and ":clinicalFeature" in J:
            J[":clinicalFeature"] = J[":clinicalFeature"] + "_" + originCancer
        J["cgDataVersion"] = 1

        # output matrix
        outfile = target_dir + entry
        if cross_cohort:
            outfile = outfile + "_" + originCancer
        with open(outfile, "w") as oHandle:
            clinMatrix.store(oHandle, validation=True)
        with open(outfile + ".json", "w") as fjson:
            json.dump(J, fjson, indent=-1)

        # output clinFeature
        if clinFeature:
            outfile = target_dir + clinFfile
            if cross_cohort:
                outfile = outfile + "_" + originCancer
            with open(outfile, 'w') as fout:
                clinFeature.store(fout)
            clinFJ["cgDataVersion"] = 1
            with open(outfile + ".json", "w") as fjson:
                json.dump(clinFJ, fjson, indent=-1)

    return