def flattenForClinicalFeature(sMap, bookDic):
    """Merge every clinicalFeature attached to a sample map into one object.

    Walks the dataset names that collectNamesBelongToSampleMap() reports for
    the sample map, loads the clinicalFeature referenced by each
    "clinicalMatrix" entry, concatenates their stored form into the scratch
    file ".tmp" and re-reads that file as a single ClinicalFeatureNew.

    Arguments:
        sMap    -- sample map object; getName() is called on it and it is
                   passed to sampleMapBaseName().
        bookDic -- dict keyed by dataset name.  Entries are indexed with
                   'type', 'path', 'name' and optionally ':clinicalFeature';
                   the sample map's own entry may carry "VIS".

    Returns:
        The merged clinical feature object, or 0 when the merged ".tmp"
        file does not pass isValid().

    Side effects:
        Overwrites ".tmp" and ".tmptmp" in the current working directory
        and shells out to `cat`.
    """
    sampleMap = sMap.getName()
    clinFeatures = []

    for name in collectNamesBelongToSampleMap(bookDic, sampleMap):
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureJSON = bookDic[obj[':clinicalFeature']]
            # Use the feature entry's own name.  Previously that name was
            # stored in a misspelled, never-read variable ("neme") and the
            # dataset key of the loop was passed instead.
            clinFeature = ClinicalFeatureNew(featureJSON['path'],
                                             featureJSON['name'])

        # The matrix object is not used afterwards; it is still constructed
        # because it is handed clinFeature (presumably checking or updating
        # it -- confirm against ClinicalMatrixNew).
        ClinicalMatrixNew(obj['path'], obj['name'], False, clinFeature)

        if clinFeature:
            clinFeatures.append(clinFeature)

    # Start from an empty ".tmp", then append each feature's stored form.
    open(".tmp", 'w').close()
    for clinF in clinFeatures:
        with open(".tmptmp", 'w') as fout:
            clinF.store(fout)
        os.system("cat .tmptmp >> .tmp")

    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
    # The handle is now closed on the invalid path as well (it used to leak).
    with open(".tmp", 'r') as fin:
        finalClinFeature = ClinicalFeatureNew(fin, jsonName)
        valid = finalClinFeature.isValid()
    if not valid:
        print("final clinFeature file .tmp is invalid")
        return 0

    # vis exceptions: a per-sample-map "VIS" entry overrides the default 4
    VIS_limit = 4
    if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
        VIS_limit = bookDic[sampleMap]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)

    finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureValueType("_PATIENT", "category")
    finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureValueType("_INTEGRATION", "category")
    return finalClinFeature
def flattenForClinicalFeature(sMap, bookDic):
    """Build one combined clinicalFeature object for a sample map.

    For each dataset name returned by collectNamesBelongToSampleMap() whose
    bookDic entry has type "clinicalMatrix", the referenced clinicalFeature
    (if any) is loaded.  All loaded features are stored one after another
    into the scratch file ".tmp", which is then parsed as a single
    ClinicalFeatureNew.

    Arguments:
        sMap    -- sample map object; getName() is called on it and it is
                   passed to sampleMapBaseName().
        bookDic -- dict keyed by dataset name.  Entries are indexed with
                   'type', 'path', 'name' and optionally ':clinicalFeature';
                   the sample map's own entry may carry "VIS".

    Returns:
        The combined clinical feature object, or 0 when the combined ".tmp"
        file does not pass isValid().

    Side effects:
        Overwrites ".tmp" and ".tmptmp" in the current working directory
        and shells out to `cat`.
    """
    sampleMap = sMap.getName()
    clinFeatures = []

    for name in collectNamesBelongToSampleMap(bookDic, sampleMap):
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureEntry = bookDic[obj[':clinicalFeature']]
            # Bug fix: the feature's name used to be assigned to a misspelled
            # variable ("neme") that was never read, so the dataset key of
            # the loop was passed as the feature name.
            clinFeature = ClinicalFeatureNew(featureEntry['path'],
                                             featureEntry['name'])

        # Constructed only for its effect on clinFeature, which is passed in
        # (presumably checked/updated there -- confirm against
        # ClinicalMatrixNew); the matrix itself is not used.
        ClinicalMatrixNew(obj['path'], obj['name'], False, clinFeature)

        if clinFeature:
            clinFeatures.append(clinFeature)

    # Truncate ".tmp", then append every feature's stored form to it.
    open(".tmp", 'w').close()
    for clinF in clinFeatures:
        with open(".tmptmp", 'w') as fout:
            clinF.store(fout)
        os.system("cat .tmptmp >> .tmp")

    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
    # `with` closes the handle on the invalid path too (it used to leak).
    with open(".tmp", 'r') as fin:
        finalClinFeature = ClinicalFeatureNew(fin, jsonName)
        valid = finalClinFeature.isValid()
    if not valid:
        print("final clinFeature file .tmp is invalid")
        return 0

    # vis exceptions: a per-sample-map "VIS" entry overrides the default 4
    VIS_limit = 4
    if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
        VIS_limit = bookDic[sampleMap]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)

    for feature, title in (("_PATIENT", "_PATIENT_ID"),
                           ("_INTEGRATION", "_SAMPLE_ID")):
        finalClinFeature.setFeatureShortTitle(feature, title)
        finalClinFeature.setFeatureLongTitle(feature, title)
        finalClinFeature.setFeatureValueType(feature, "category")
    return finalClinFeature
def process(inDir, outDir, dataDir, cancer, flog, PATHPATTERN, originCancer, REALRUN):
    """Convert TCGA clinical/biospecimen text files into cgData files.

    Every file in dataDir whose name contains one of the known patterns
    (clinical_sample, clinical_patient, clinical_follow_up, auxiliary,
    biospecimen_slide, biospecimen_sample) is cleaned up and written under
    outDir + cancer + "/" as four files: the clinical matrix, its
    "_clinicalFeature" file, and a ".json" metadata file for each.

    Arguments:
        inDir        -- only used to build J["url"] (TCGAUtil.localBase is
                        stripped from it).
        outDir       -- output root, created if missing.  It is concatenated
                        directly with cancer, so it presumably ends with "/"
                        -- confirm against callers.
        dataDir      -- directory whose files are processed; concatenated
                        directly with the file names.
        cancer       -- cancer code used in output paths, dataset names and
                        TCGAUtil lookups.
        flog         -- writable handle; receives one line per feature that
                        is missing from TCGAUtil.featureLongTitle.
        PATHPATTERN  -- string embedded in the dataset name suffix.
        originCancer -- when it differs from cancer it is appended to the
                        dataset names, and the follow-up / slide cleanup of
                        the input file is skipped.
        REALRUN      -- when false, an existing clinicalFeature file is first
                        refreshed from the TCGAUtil tables (see note below).

    Returns None.

    Side effects: rewrites the input files in place (through the scratch
    file ".tmp" and `cp`), prints progress to stdout, and reads the
    module-level name tmpDir, which is defined outside this block.
    """
    #print status
    print cancer, __name__

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    #data processing
    currentFollowUpV = 0.0  # never read again in this function
    for file in os.listdir(dataDir):
        if file[-5:] == ".html":
            continue
        for pattern in [
                "clinical_sample", "clinical_patient", "clinical_follow_up",
                "auxiliary", "biospecimen_slide", "biospecimen_sample"
        ]:
            if string.find(file, pattern) != -1:
                followUpV = 0.0
                cgFileName = string.replace(file, ".txt", "")
                # the follow_up files has -vn.n version number
                # NOTE(review): the "." in the pattern is unescaped, so it
                # matches any character, not only a literal dot.
                if cgFileName != re.sub(r'_v[1-9]+.[0-9]+', '', cgFileName):
                    # text between "follow_up_" and "_<cancer lower-case>",
                    # minus its first character (the leading "v")
                    followUpV = string.split(
                        string.split(cgFileName, "follow_up_")[1],
                        "_" + string.lower(cancer))[0][1:]
                # the auxillary file does not start with clin
                if cgFileName[0:9] != "clinical_":
                    cgFileName = "clinical_" + cgFileName
                outfile = outDir + cancer + "/" + cgFileName
                cFfile = outfile + "_clinicalFeature"

                # Not a real run: refresh priority, visibility, state order
                # and value type of an existing feature file from TCGAUtil.
                # NOTE(review): there is no `continue` after this branch, so
                # the full processing below still runs and rewrites cFfile --
                # confirm whether that is intended.
                if not REALRUN:
                    if os.path.exists(cFfile):
                        tmpClinFeature = ClinicalFeatureNew(cFfile, "tmpName")
                        features = tmpClinFeature.getFeatures()
                        for feature in features:
                            if TCGAUtil.featurePriority.has_key(cancer):
                                if TCGAUtil.featurePriority[cancer].has_key(
                                        feature):
                                    priority = TCGAUtil.featurePriority[
                                        cancer][feature]
                                    tmpClinFeature.setFeaturePriority(
                                        feature, priority)
                                    tmpClinFeature.setFeatureVisibility(
                                        feature, "on")
                            stateOrder = None
                            if TCGAUtil.featureStateOrder.has_key(feature):
                                if TCGAUtil.featureStateOrder[feature].has_key(
                                        cancer):
                                    stateOrder = TCGAUtil.featureStateOrder[
                                        feature][cancer]
                                # an "ALL" entry overrides the cancer-specific one
                                if TCGAUtil.featureStateOrder[feature].has_key(
                                        "ALL"):
                                    stateOrder = TCGAUtil.featureStateOrder[
                                        feature]["ALL"]
                                # NOTE(review): nesting of this debug print was
                                # reconstructed from flattened source -- confirm.
                                print stateOrder
                            if stateOrder:
                                tmpClinFeature.setFeatureValueType(
                                    feature, "category")
                                tmpClinFeature.setFeatureStates(
                                    feature, stateOrder)
                                tmpClinFeature.setFeatureStateOrder(
                                    feature, stateOrder)
                                tmpClinFeature.setFeatureStateOrderRelax(
                                    feature, "true")
                            # an explicit valueType entry is applied last
                            if TCGAUtil.valueType.has_key(feature):
                                tmpClinFeature.setFeatureValueType(
                                    feature, TCGAUtil.valueType[feature])
                        fout = open(cFfile, 'w')
                        tmpClinFeature.store(fout)
                        fout.close()

                infile = dataDir + file
                #infile often row read has fewer fields than the fieldnames sequence
                # use csv.DictReader and Writer to fix this
                # (restval="" fills the missing fields; the normalized copy
                # in ".tmp" then replaces the input file)
                fin = open(infile, 'r')
                reader = csv.DictReader(fin, delimiter="\t", restval="")
                fout = open(".tmp", 'w')
                writer = csv.DictWriter(fout, delimiter="\t", fieldnames=reader.fieldnames)
                writer.writer.writerow(reader.fieldnames)
                writer.writerows(reader)
                fout.close()
                fin.close()
                os.system("cp .tmp " + infile)

                # follow-up files get an extra cleanup pass, only for the
                # origin cancer
                if pattern == "clinical_follow_up":
                    print file
                    if cancer == originCancer:
                        cleanupFollowUpFile(infile, ".tmp")
                        os.system("cp .tmp " + infile)

                # slide file need to be remade due to the need to duplicate column as top or bottom
                if pattern == "biospecimen_slide":
                    print file
                    if cancer == originCancer:
                        cleanupSlideFile(infile, ".tmp")
                        os.system("cp .tmp " + infile)

                #clinicalMatrix
                AllowDupCol = True
                # line numbers handed to ClinicalMatrixNew; presumably header
                # lines to skip -- confirm against ClinicalMatrixNew
                if string.find(pattern, "biospecimen_") != -1:
                    SkipLines = [2]
                else:
                    SkipLines = [1, 3]  # 1based
                # nothing to convert for an empty file; try the next pattern
                if os.path.getsize(infile) == 0:
                    continue
                if pattern == "biospecimen_slide":
                    FirstColAuto = 0  #0 based, already cleaned
                    clinMatrix = ClinicalMatrixNew(infile, "foo", FirstColAuto, None, SkipLines, AllowDupCol)
                else:
                    FirstColAuto = findIDCol(infile)
                    if FirstColAuto == -1:
                        print infile, "bad header line"
                        continue
                    else:
                        clinMatrix = ClinicalMatrixNew(infile, "foo", FirstColAuto, None, SkipLines, AllowDupCol)
                clinMatrix.removeCols(["ethnicity", "race", "jewish_origin"])  #,"patient_id"])
                if pattern == "clinical_sample" or pattern == "biospecimen_sample":
                    if "sample_type" in clinMatrix.getCOLs():
                        add_col_PseudoSample(clinMatrix, "sample_type")
                    if "sample_type_id" in clinMatrix.getCOLs():
                        add_col_PseudoSample(clinMatrix, "sample_type_id")
                #remove all cols with uuid
                # (columns whose name contains "day_of" are dropped as well)
                features = clinMatrix.getCOLs()
                for f in features:
                    if string.find(f, "uuid") != -1 or string.find(
                            f, "UUID") != -1 or string.find(f, "day_of") != -1:
                        clinMatrix.removeCols([f])

                # blank out the various "no data" placeholders
                clinMatrix.replaceValue("null", "")
                clinMatrix.replaceValue("NULL", "")
                clinMatrix.replaceValue("Null", "")
                clinMatrix.replaceValue("NA", "")
                clinMatrix.replaceValue("[null]", "")
                clinMatrix.replaceValue("[NULL]", "")
                clinMatrix.replaceValue("[Null]", "")
                clinMatrix.replaceValue("[NA]", "")
                clinMatrix.replaceValue("[Not Available]", "")
                clinMatrix.replaceValue("[Not Applicable]", "")
                clinMatrix.replaceValue("[Unknown]", "")
                clinMatrix.replaceValue("[Not Reported]", "")
                clinMatrix.replaceValue("[Not Requested]", "")
                clinMatrix.replaceValue("[Not Evaluated]", "")
                clinMatrix.replaceValue("[Completed]", "")
                clinMatrix.replaceValue("[Pending]", "")
                clinMatrix.replaceValue("Not Tested", "")
                clinMatrix.replaceValue("[]", "")
                # strip stray quoting and separator characters
                clinMatrix.replaceValue(",\"", "")
                clinMatrix.replaceValue("\"", "")
                clinMatrix.replaceValue("'", "")
                clinMatrix.replaceValue("`", "")
                clinMatrix.replaceValue("||", "")
                clinMatrix.replaceValueWhole("|", "")
                # normalize inconsistent spellings found in the BCR files
                clinMatrix.replaceValue("LUNG", "Lung")  #stupid BCR
                clinMatrix.replaceValue("MSS|MSS", "MSS")  #stupid BCR
                clinMatrix.replaceValue("Alive", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("ALIVE", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("alive", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("Dead", "DECEASED")  #stupid BCR
                clinMatrix.replaceValue("DEAD", "DECEASED")  #stupid BCR
                clinMatrix.replaceValue("dead", "DECEASED")  #stupid BCR

                oHandle = open(outfile, "w")
                clinMatrix.store(oHandle, validation=True)
                oHandle.close()

                #clinicalFeature
                # written as tab-separated "feature / attribute / value" rows
                fout = open(cFfile, "w")
                fout.write("#feature\tattribute\tvalue\n")
                cFeatures = clinMatrix.getCOLs()
                for feature in cFeatures:
                    if not TCGAUtil.featureLongTitle.has_key(feature):
                        # unknown feature: use its own name as title, log it
                        longTitle = feature
                        shortTitle = feature
                        message = "Feature Not in dictionary" + "\t" + feature + "\t" + feature
                        flog.write(message + "\n")
                    else:
                        longTitle = TCGAUtil.featureLongTitle[feature]
                        if TCGAUtil.featureShortTitle.has_key(feature):
                            shortTitle = TCGAUtil.featureShortTitle[feature]
                        else:
                            shortTitle = TCGAUtil.featureLongTitle[feature]
                    fout.write(feature + "\tshortTitle\t" + shortTitle + "\n")
                    fout.write(feature + "\tlongTitle\t" + longTitle + "\n")
                    if string.find(feature, "uuid") != -1 or string.find(
                            feature, "UUID") != -1:
                        fout.write(feature + "\tvisibility\toff\n")
                    if TCGAUtil.valueType.has_key(feature):
                        fout.write(feature + "\tvalueType\t" +
                                   TCGAUtil.valueType[feature] + "\n")
                    stateOrder = None
                    if TCGAUtil.featureStateOrder.has_key(feature):
                        # when both keys exist the valueType row is written
                        # twice and the "ALL" order wins
                        if TCGAUtil.featureStateOrder[feature].has_key(cancer):
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = TCGAUtil.featureStateOrder[feature][
                                cancer]
                        if TCGAUtil.featureStateOrder[feature].has_key("ALL"):
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = TCGAUtil.featureStateOrder[feature][
                                "ALL"]
                    if stateOrder:
                        for state in stateOrder:
                            fout.write(feature + "\tstate\t" + state + "\n")
                        # stateOrder value is a quoted, comma-separated list
                        fout.write(feature + "\tstateOrder\t\"" +
                                   string.join(stateOrder, "\",\"") + "\"\n")
                        fout.write(feature + "\tstateOrderRelax\ttrue\n")
                    if TCGAUtil.featurePriority.has_key(cancer):
                        if TCGAUtil.featurePriority[cancer].has_key(feature):
                            priority = TCGAUtil.featurePriority[cancer][
                                feature]
                            fout.write(feature + "\tpriority\t" +
                                       str(priority) + "\n")
                            fout.write(feature + "\tvisibility\ton\n")
                    # features that are always made visible
                    if feature in [
                            "gender", "age_at_initial_pathologic_diagnosis",
                            "days_to_last_followup",
                            "days_to_last_known_alive", "sample_type",
                            "mononucleotide_and_dinucleotide_marker_panel_analysis_status",
                            "percent_stromal_cells_BOTTOM",
                            "percent_tumor_nuclei_BOTTOM"
                    ]:
                        fout.write(feature + "\tvisibility\ton\n")
                fout.close()

                #json
                J = {}
                cFJ = {}
                oHandle = open(outfile + ".json", "w")
                #stable
                # dataset-name suffix per pattern; originCancer is appended
                # when this cancer is not the origin cancer
                if pattern == "clinical_sample":
                    if cancer != originCancer:
                        suffix = "clinSample" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinSample" + PATHPATTERN
                if pattern == "clinical_patient":
                    if cancer != originCancer:
                        suffix = "clinPatient" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinPatient" + PATHPATTERN
                if pattern == "clinical_follow_up":
                    if cancer != originCancer:
                        suffix = cgFileName + originCancer
                    else:
                        suffix = cgFileName
                if pattern == "auxiliary":
                    if cancer != originCancer:
                        suffix = "clinAuxiliary" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinAuxiliary" + PATHPATTERN
                if pattern == "biospecimen_slide":
                    if cancer != originCancer:
                        suffix = "bioSlide" + PATHPATTERN + originCancer
                    else:
                        suffix = "bioSlide" + PATHPATTERN
                if pattern == "biospecimen_sample":
                    if cancer != originCancer:
                        suffix = "bioSample" + PATHPATTERN + originCancer
                    else:
                        suffix = "bioSample" + PATHPATTERN

                J["cgDataVersion"] = 1
                J["redistribution"] = True
                J["dataProducer"] = "TCGA biospecimen core resource"
                # tmpDir is not defined in this function; presumably a
                # module-level global -- confirm
                J["url"]=TCGAUtil.remoteBase \
                    +string.replace(inDir,TCGAUtil.localBase,"") \
                    + string.replace(dataDir,tmpDir,"")[:-1]
                J["version"] = datetime.date.today().isoformat()
                J["wrangler"] = "cgData TCGAscript " + __name__ + " processed on " + datetime.date.today().isoformat()
                J["dataSubType"] = "phenotype"
                #change description
                J["wrangling_procedure"] = "Clinical data download from TCGA DCC, processed at UCSC into cgData repository"
                J["description"] = "This dataset is the TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ") clinical data."

                #change cgData
                J["name"] = "TCGA_" + cancer + "_" + suffix
                cFJ["name"] = J["name"] + "_clinFeat"
                cFJ["type"] = "clinicalFeature"
                J["type"] = "clinicalMatrix"
                J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
                J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ")"
                J[":clinicalFeature"] = cFJ["name"]
                # follow-up datasets record the version parsed from the
                # file name in "upToDate"
                if pattern == "clinical_follow_up":
                    if cancer != originCancer:
                        J["upToDate"] = str(followUpV) + "_" + originCancer  #"Yes"
                    else:
                        J["upToDate"] = str(followUpV)  #"Yes"
                oHandle.write(json.dumps(J, indent=-1))
                oHandle.close()

                oHandle = open(cFfile + ".json", "w")
                oHandle.write(json.dumps(cFJ, indent=-1))
                oHandle.close()
    return
def flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples):
    """Flatten all clinical matrices of one sample map into a single matrix.

    Steps, in order:
      1. Create an empty matrix with one row per sample-map node.
      2. Order the "clinicalMatrix" datasets: entries carrying 'upToDate' go
         first (largest key first), then entries with neither flag, then
         entries whose 'outOfDate' is yes/Yes/YES.
      3. Merge each dataset's columns, JSON and clinicalFeature.
      4. Derive _TIME_TO_EVENT / _EVENT columns from features whose sameAs
         points at them.
      5. Push clinical data down from each root, optionally keep only rows
         with genomic data, add rows for genomic ids, validate.
      6. Drop black-listed rows and the columns reported by
         findBadColsNotRemove(), add the _PATIENT and _INTEGRATION columns.
      7. Fill in the remaining clinicalFeature attributes.

    Arguments:
        sMap               -- sample map object.
        bookDic            -- dict keyed by dataset name; entries are indexed
                              with 'type', 'path', 'name' and optional
                              ':clinicalFeature', 'outOfDate', 'upToDate'.
        onlyGenomicSamples -- when true, rows without genomic data are
                              removed from the matrix.

    Returns:
        (finalClinMatrix, finalClinMatrixJSON, finalClinFeature,
        finalClinFeatureJSON) on success; False when a dataset cannot be
        merged; 0 on the other failures (invalid merged feature file,
        push-down, validation or addColRoot failure).

    Side effects:
        Overwrites ".tmp" and ".tmptmp" in the current working directory,
        shells out to `cat`, prints progress, and may update in place the
        bookDic entry of the first clinicalFeature it meets (that entry is
        reused as finalClinFeatureJSON).
    """
    sampleMap = sMap.getName()
    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalMatrix")
    finalClinMatrix = ClinicalMatrixNew(None, jsonName)
    finalClinMatrixJSON = {}
    finalClinMatrixJSON["name"] = jsonName
    finalClinMatrixJSON["type"] = "clinicalMatrix"
    finalClinMatrixJSON["path"] = ""
    finalClinMatrixJSON[":sampleMap"] = sampleMap
    clinFeatures = []
    finalClinFeatureJSON = None
    finalClinFeature = None

    # add all ids to sMap (the return value was never used)
    checkIdsAllIn(sMap, bookDic)

    # build initial clinical matrix with sampleMap ids, all with empty data
    if not finalClinMatrix.addNewRows(sMap.getNodes(), {}):
        print("fail to add all initial ids from sampleMap")

    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)

    # ordered list of the clinicalMatrix datasets only
    datasetsOrdered = []
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            if 'outOfDate' in obj and obj['outOfDate'] in ["yes", "Yes", "YES"]:
                datasetsOrdered.append(name)
            elif 'outOfDate' not in obj and 'upToDate' not in obj:
                datasetsOrdered.insert(0, name)

    upToDateSets = {}
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix" and 'upToDate' in obj:
            upToDateSets[obj['upToDate']] = name
    # inserting at the front in ascending key order leaves the largest key
    # at index 0
    for version in sorted(upToDateSets):
        datasetsOrdered.insert(0, upToDateSets[version])

    for name in datasetsOrdered:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureEntry = bookDic[obj[':clinicalFeature']]
            # Bug fix: the feature's name used to be assigned to a misspelled
            # variable ("neme") that was never read, so the dataset key of
            # the loop was passed as the feature name.
            clinFeature = ClinicalFeatureNew(featureEntry['path'],
                                             featureEntry['name'])

        # get matrix obj
        cMatrix = ClinicalMatrixNew(obj['path'], obj['name'], False,
                                    clinFeature)

        if finalClinMatrix is None:
            finalClinMatrix = cMatrix
        # cgDataMergeJSON's result is stored below; this guard covers the
        # case where it handed back None
        if finalClinMatrixJSON is None:
            finalClinMatrixJSON = obj

        # merge final and cMatrix
        if finalClinMatrix != cMatrix:
            print("name= %s" % (cMatrix.getName(),))
            r = finalClinMatrix.addNewCols(cMatrix, validation=True)
            if r != True:
                print("Fail to merge")
                return False

        # add clinFeature
        if clinFeature:
            clinFeatures.append(clinFeature)

        # merge finalClinMatrixJSON with new json
        if finalClinMatrixJSON != obj:
            jsonName = trackName_fix(
                sampleMapBaseName(sMap) + "_clinicalMatrix")
            finalClinMatrixJSON = cgDataMergeJSON(finalClinMatrixJSON, obj,
                                                  jsonName)

        # final ClinFeature json: the first feature's JSON is taken as-is,
        # later ones only refresh version/type/name on that same dict
        if clinFeature:
            clinFeatureJSON = bookDic[obj[':clinicalFeature']]
            if finalClinFeatureJSON is None:
                finalClinFeatureJSON = clinFeatureJSON
            else:
                jsonName = trackName_fix(
                    sampleMapBaseName(sMap) + "_clinicalFeature")
                finalClinFeatureJSON["version"] = datetime.date.today().isoformat()
                finalClinFeatureJSON["type"] = "clinicalFeature"
                finalClinFeatureJSON["name"] = jsonName

    # final clinicalFeature: concatenate every stored feature into ".tmp"
    if finalClinFeatureJSON:
        open(".tmp", 'w').close()
        for clinF in clinFeatures:
            with open(".tmptmp", 'w') as fout:
                clinF.store(fout)
            os.system("cat .tmptmp >> .tmp")
        # `with` closes the handle on the invalid path too (it used to leak)
        with open(".tmp", 'r') as fin:
            finalClinFeature = ClinicalFeatureNew(
                fin, finalClinFeatureJSON['name'])
            valid = finalClinFeature.isValid()
        if not valid:
            print("final clinFeature file .tmp is invalid")
            return 0

    # SURVIVAL analysis data
    foundE = 0
    foundT = 0
    if finalClinFeature:
        for feature in finalClinFeature.getFeatures():
            sameAs = finalClinFeature.getFeatureSameAs(feature)

            if sameAs == "_TIME_TO_EVENT":
                # only one feature may be mapped to _TIME_TO_EVENT
                if foundT == 1:
                    print("ERROR there is already _TIME_TO_EVENT")
                    continue
                # the matrix must not already have a _TIME_TO_EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _TIME_TO_EVENT in matrix")
                    continue
                # the parent feature must hold float values
                GOOD = 1
                if finalClinMatrix.isTypeFloat(feature) != True:
                    print("ERROR _TIME_TO_EVENT parent feature values are not correct %s"
                          % (finalClinMatrix.getColStates(feature),))
                    GOOD = 0
                if GOOD:
                    foundT = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "float")

            if sameAs == "_EVENT":
                # only one feature may be mapped to _EVENT
                if foundE == 1:
                    print("ERROR there is already _EVENT")
                    continue
                # the matrix must not already have an _EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _EVENT in matrix")
                    continue
                # data check: two states, or three when exactly one is ""
                GOOD = 1
                states = finalClinMatrix.getColStates(feature)
                if len(states) not in [2, 3]:
                    GOOD = 0
                if len(states) == 3 and states.count('') != 1:
                    GOOD = 0
                if GOOD:
                    foundE = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "category")

    # clinical data push down
    for root in sMap.allRoots():
        r = finalClinMatrix.pushToChildren(root, sMap)
        if r != True:
            print("Fail to push down")
            return 0
    print("after clinical push down %s %s"
          % (sampleMap, finalClinMatrix.getROWnum()))

    # collect all genomic data
    keepSamples = getAllGenomicIds(sMap, bookDic)

    # Rows without genomic data are removed because of a mysql enum
    # limitation; the original note says this step should go away once the
    # display side checks the value range per track/dataset.
    if onlyGenomicSamples:
        print("genomic sample count %s" % (len(keepSamples),))
        success = finalClinMatrix.onlyKeepRows(keepSamples)
        if not success:
            print("fail to remove extra rows")
        else:
            print("after keeping sample with genomic data %s"
                  % (finalClinMatrix.getROWnum(),))

    # add to the clinical matrix any samples with genomic data but no
    # clinical data
    emptyData = {}
    for col in finalClinMatrix.getCOLs():
        emptyData[col] = ""
    success = finalClinMatrix.addNewRows(keepSamples, emptyData)
    if not success:
        print("fail to add new roows")
    else:
        print("after adding all genomic data %s"
              % (finalClinMatrix.getROWnum(),))

    if finalClinMatrix.validate() != True:
        # Bug fix: a leftover "cMatrix = oldCMatrix" here referenced a name
        # this function never defines and was a dead store before returning.
        print("Fail to validate")
        return 0
    # end of collecting all genomic data

    # remove blacklist samples and all their descendants
    badList = badListSelfAndDescendants(sMap, bookDic)
    if badList != []:
        finalClinMatrix.removeRows(badList, True)
        print("after remove badList %s" % (finalClinMatrix.getROWnum(),))

    # identify and remove empty features
    badFeatures = finalClinMatrix.findBadColsNotRemove()
    print("emptye features: %s" % (badFeatures,))
    finalBadFeatures = badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print("remove features %s" % (finalBadFeatures,))

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) is None:
        print("Fail to addColRoot")
        return 0

    # add _INTEGRATION col
    intList = []
    if sampleMap in bookDic and ":integrationId" in bookDic[sampleMap]:
        intName = bookDic[sampleMap][":integrationId"]
        # the handle used to stay open; close it once the list is read
        with open(bookDic[intName]["path"], "r") as fin:
            intId = IntegrationId(intName, fin)
            intList = intId.getList()
    finalClinMatrix.addColIntegration(sMap, intList)

    # final ClinFeature json: no dataset had a clinicalFeature, start empty
    if finalClinFeatureJSON is None:
        jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
        finalClinFeatureJSON = {}
        finalClinFeatureJSON["version"] = datetime.date.today().isoformat()
        finalClinFeatureJSON["type"] = "clinicalFeature"
        finalClinFeatureJSON["name"] = jsonName
        finalClinFeatureJSON["path"] = ""
        finalClinFeature = ClinicalFeatureNew(None,
                                              finalClinFeatureJSON["name"])

    # final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)
        # fill in value types, missing features, short and long titles
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        finalClinFeature.fillInTitles()

        # priority visibility; a per-sample-map "VIS" entry overrides 4
        VIS_limit = 4
        if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
            VIS_limit = bookDic[sampleMap]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)

        finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT", "category")
        finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    print("%s %s" % (sampleMap, finalClinMatrix.getROWnum()))
    return finalClinMatrix, finalClinMatrixJSON, finalClinFeature, finalClinFeatureJSON
def flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples):
    """Flatten all clinical matrices of one sample map into a single matrix.

    Steps, in order:
      1. Create an empty matrix with one row per sample-map node.
      2. Order the "clinicalMatrix" datasets: entries carrying 'upToDate' go
         first (largest key first), then entries with neither flag, then
         entries whose 'outOfDate' is yes/Yes/YES.
      3. Merge each dataset's columns, JSON and clinicalFeature.
      4. Derive _TIME_TO_EVENT / _EVENT columns from features whose sameAs
         points at them.
      5. Push clinical data down from each root, optionally keep only rows
         with genomic data, add rows for genomic ids, validate.
      6. Drop black-listed rows and the columns reported by
         findBadColsNotRemove(), add the _PATIENT and _INTEGRATION columns.
      7. Fill in the remaining clinicalFeature attributes.

    Arguments:
        sMap               -- sample map object.
        bookDic            -- dict keyed by dataset name; entries are indexed
                              with 'type', 'path', 'name' and optional
                              ':clinicalFeature', 'outOfDate', 'upToDate'.
        onlyGenomicSamples -- when true, rows without genomic data are
                              removed from the matrix.

    Returns:
        (finalClinMatrix, finalClinMatrixJSON, finalClinFeature,
        finalClinFeatureJSON) on success; False when a dataset cannot be
        merged; 0 on the other failures (invalid merged feature file,
        push-down, validation or addColRoot failure).

    Side effects:
        Overwrites ".tmp" and ".tmptmp" in the current working directory,
        shells out to `cat`, prints progress, and may update in place the
        bookDic entry of the first clinicalFeature it meets (that entry is
        reused as finalClinFeatureJSON).
    """
    sampleMap = sMap.getName()
    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalMatrix")
    finalClinMatrix = ClinicalMatrixNew(None, jsonName)
    finalClinMatrixJSON = {}
    finalClinMatrixJSON["name"] = jsonName
    finalClinMatrixJSON["type"] = "clinicalMatrix"
    finalClinMatrixJSON["path"] = ""
    finalClinMatrixJSON[":sampleMap"] = sampleMap
    clinFeatures = []
    finalClinFeatureJSON = None
    finalClinFeature = None

    # add all ids to sMap (the return value was never used)
    checkIdsAllIn(sMap, bookDic)

    # build initial clinical matrix with sampleMap ids, all with empty data
    if not finalClinMatrix.addNewRows(sMap.getNodes(), {}):
        print("fail to add all initial ids from sampleMap")

    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)

    # ordered list of the clinicalMatrix datasets only
    datasetsOrdered = []
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            if 'outOfDate' in obj and obj['outOfDate'] in ["yes", "Yes", "YES"]:
                datasetsOrdered.append(name)
            elif 'outOfDate' not in obj and 'upToDate' not in obj:
                datasetsOrdered.insert(0, name)

    upToDateSets = {}
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix" and 'upToDate' in obj:
            upToDateSets[obj['upToDate']] = name
    # inserting at the front in ascending key order leaves the largest key
    # at index 0
    for version in sorted(upToDateSets):
        datasetsOrdered.insert(0, upToDateSets[version])

    for name in datasetsOrdered:
        obj = bookDic[name]
        if obj['type'] != "clinicalMatrix":
            continue

        clinFeature = None
        if ':clinicalFeature' in obj:
            featureEntry = bookDic[obj[':clinicalFeature']]
            # Bug fix: the feature's name used to be assigned to a misspelled
            # variable ("neme") that was never read, so the dataset key of
            # the loop was passed as the feature name.
            clinFeature = ClinicalFeatureNew(featureEntry['path'],
                                             featureEntry['name'])

        # get matrix obj
        cMatrix = ClinicalMatrixNew(obj['path'], obj['name'], False,
                                    clinFeature)

        if finalClinMatrix is None:
            finalClinMatrix = cMatrix
        # cgDataMergeJSON's result is stored below; this guard covers the
        # case where it handed back None
        if finalClinMatrixJSON is None:
            finalClinMatrixJSON = obj

        # merge final and cMatrix
        if finalClinMatrix != cMatrix:
            print("name= %s" % (cMatrix.getName(),))
            r = finalClinMatrix.addNewCols(cMatrix, validation=True)
            if r != True:
                print("Fail to merge")
                return False

        # add clinFeature
        if clinFeature:
            clinFeatures.append(clinFeature)

        # merge finalClinMatrixJSON with new json
        if finalClinMatrixJSON != obj:
            jsonName = trackName_fix(
                sampleMapBaseName(sMap) + "_clinicalMatrix")
            finalClinMatrixJSON = cgDataMergeJSON(finalClinMatrixJSON, obj,
                                                  jsonName)

        # final ClinFeature json: the first feature's JSON is taken as-is,
        # later ones only refresh version/type/name on that same dict
        if clinFeature:
            clinFeatureJSON = bookDic[obj[':clinicalFeature']]
            if finalClinFeatureJSON is None:
                finalClinFeatureJSON = clinFeatureJSON
            else:
                jsonName = trackName_fix(
                    sampleMapBaseName(sMap) + "_clinicalFeature")
                finalClinFeatureJSON["version"] = datetime.date.today().isoformat()
                finalClinFeatureJSON["type"] = "clinicalFeature"
                finalClinFeatureJSON["name"] = jsonName

    # final clinicalFeature: concatenate every stored feature into ".tmp"
    if finalClinFeatureJSON:
        open(".tmp", 'w').close()
        for clinF in clinFeatures:
            with open(".tmptmp", 'w') as fout:
                clinF.store(fout)
            os.system("cat .tmptmp >> .tmp")
        # `with` closes the handle on the invalid path too (it used to leak)
        with open(".tmp", 'r') as fin:
            finalClinFeature = ClinicalFeatureNew(
                fin, finalClinFeatureJSON['name'])
            valid = finalClinFeature.isValid()
        if not valid:
            print("final clinFeature file .tmp is invalid")
            return 0

    # SURVIVAL analysis data
    foundE = 0
    foundT = 0
    if finalClinFeature:
        for feature in finalClinFeature.getFeatures():
            sameAs = finalClinFeature.getFeatureSameAs(feature)

            if sameAs == "_TIME_TO_EVENT":
                # only one feature may be mapped to _TIME_TO_EVENT
                if foundT == 1:
                    print("ERROR there is already _TIME_TO_EVENT")
                    continue
                # the matrix must not already have a _TIME_TO_EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _TIME_TO_EVENT in matrix")
                    continue
                # the parent feature must hold float values
                GOOD = 1
                if finalClinMatrix.isTypeFloat(feature) != True:
                    print("ERROR _TIME_TO_EVENT parent feature values are not correct %s"
                          % (finalClinMatrix.getColStates(feature),))
                    GOOD = 0
                if GOOD:
                    foundT = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "float")

            if sameAs == "_EVENT":
                # only one feature may be mapped to _EVENT
                if foundE == 1:
                    print("ERROR there is already _EVENT")
                    continue
                # the matrix must not already have an _EVENT column
                if sameAs in finalClinMatrix.getCOLs():
                    print("ERROR there is already _EVENT in matrix")
                    continue
                # data check: two states, or three when exactly one is ""
                GOOD = 1
                states = finalClinMatrix.getColStates(feature)
                if len(states) not in [2, 3]:
                    GOOD = 0
                if len(states) == 3 and states.count('') != 1:
                    GOOD = 0
                if GOOD:
                    foundE = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "category")

    # clinical data push down
    for root in sMap.allRoots():
        r = finalClinMatrix.pushToChildren(root, sMap)
        if r != True:
            print("Fail to push down")
            return 0
    print("after clinical push down %s %s"
          % (sampleMap, finalClinMatrix.getROWnum()))

    # collect all genomic data
    keepSamples = getAllGenomicIds(sMap, bookDic)

    # Rows without genomic data are removed because of a mysql enum
    # limitation; the original note says this step should go away once the
    # display side checks the value range per track/dataset.
    if onlyGenomicSamples:
        print("genomic sample count %s" % (len(keepSamples),))
        success = finalClinMatrix.onlyKeepRows(keepSamples)
        if not success:
            print("fail to remove extra rows")
        else:
            print("after keeping sample with genomic data %s"
                  % (finalClinMatrix.getROWnum(),))

    # add to the clinical matrix any samples with genomic data but no
    # clinical data
    emptyData = {}
    for col in finalClinMatrix.getCOLs():
        emptyData[col] = ""
    success = finalClinMatrix.addNewRows(keepSamples, emptyData)
    if not success:
        print("fail to add new roows")
    else:
        print("after adding all genomic data %s"
              % (finalClinMatrix.getROWnum(),))

    if finalClinMatrix.validate() != True:
        # Bug fix: a leftover "cMatrix = oldCMatrix" here referenced a name
        # this function never defines and was a dead store before returning.
        print("Fail to validate")
        return 0
    # end of collecting all genomic data

    # remove blacklist samples and all their descendants
    badList = badListSelfAndDescendants(sMap, bookDic)
    if badList != []:
        finalClinMatrix.removeRows(badList, True)
        print("after remove badList %s" % (finalClinMatrix.getROWnum(),))

    # identify and remove empty features
    badFeatures = finalClinMatrix.findBadColsNotRemove()
    print("emptye features: %s" % (badFeatures,))
    finalBadFeatures = badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print("remove features %s" % (finalBadFeatures,))

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) is None:
        print("Fail to addColRoot")
        return 0

    # add _INTEGRATION col
    intList = []
    if sampleMap in bookDic and ":integrationId" in bookDic[sampleMap]:
        intName = bookDic[sampleMap][":integrationId"]
        # the handle used to stay open; close it once the list is read
        with open(bookDic[intName]["path"], "r") as fin:
            intId = IntegrationId(intName, fin)
            intList = intId.getList()
    finalClinMatrix.addColIntegration(sMap, intList)

    # final ClinFeature json: no dataset had a clinicalFeature, start empty
    if finalClinFeatureJSON is None:
        jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
        finalClinFeatureJSON = {}
        finalClinFeatureJSON["version"] = datetime.date.today().isoformat()
        finalClinFeatureJSON["type"] = "clinicalFeature"
        finalClinFeatureJSON["name"] = jsonName
        finalClinFeatureJSON["path"] = ""
        finalClinFeature = ClinicalFeatureNew(None,
                                              finalClinFeatureJSON["name"])

    # final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)
        # fill in value types, missing features, short and long titles
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        finalClinFeature.fillInTitles()

        # priority visibility; a per-sample-map "VIS" entry overrides 4
        VIS_limit = 4
        if sampleMap in bookDic and "VIS" in bookDic[sampleMap]:
            VIS_limit = bookDic[sampleMap]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)

        finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT", "category")
        finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    print("%s %s" % (sampleMap, finalClinMatrix.getROWnum()))
    return finalClinMatrix, finalClinMatrixJSON, finalClinFeature, finalClinFeatureJSON