Esempio n. 1
0
def flattenForClinicalFeature(sMap, bookDic):
    clinFeatures=[]
    finalClinFeature=None
    sampleMap = sMap.getName()
    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    for name in datasets:  
        obj= bookDic[name]
        if obj['type']=="clinicalMatrix":
            clinFeature=None
            #clinFeature obj
            if obj.has_key(':clinicalFeature'):
                path=  bookDic[obj[':clinicalFeature']]['path']
                neme = bookDic[obj[':clinicalFeature']]['name']
                clinFeature = ClinicalFeatureNew(path, name)

            #get matrix obj
            path = obj['path']
            name = obj['name']
            cMatrix = ClinicalMatrixNew(path,name,False, clinFeature)

            if clinFeature:
                clinFeatures.append(clinFeature)

    fout = open(".tmp",'w')
    fout.close()
    for clinF in clinFeatures:
        fout = open(".tmptmp",'w')
        clinF.store(fout)
        fout.close()
        os.system("cat .tmptmp >> .tmp")
    fin = open(".tmp",'r')
    jsonName=  trackName_fix(sampleMapBaseName(sMap)+"_clinicalFeature")
    finalClinFeature =ClinicalFeatureNew(fin,jsonName)
    if not finalClinFeature.isValid():
        print "final clinFeature file .tmp is invalid"
        return 0
    fin.close()

    #vis exceptions
    VIS_limit=4
    if bookDic.has_key(sMap.getName()) and bookDic[sMap.getName()].has_key("VIS"):
        VIS_limit= bookDic[sMap.getName()]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)
    finalClinFeature.setFeatureShortTitle("_PATIENT","_PATIENT_ID")
    finalClinFeature.setFeatureLongTitle("_PATIENT","_PATIENT_ID")
    finalClinFeature.setFeatureValueType("_PATIENT","category")
    finalClinFeature.setFeatureShortTitle("_INTEGRATION","_SAMPLE_ID")
    finalClinFeature.setFeatureLongTitle("_INTEGRATION","_SAMPLE_ID")
    finalClinFeature.setFeatureValueType("_INTEGRATION","category")
    return finalClinFeature
def process(inDir, outDir, dataDir, cancer, flog, PATHPATTERN, originCancer,
            REALRUN):
    #print status
    print cancer, __name__

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    #data processing
    currentFollowUpV = 0.0
    for file in os.listdir(dataDir):
        if file[-5:] == ".html":
            continue
        for pattern in [
                "clinical_sample", "clinical_patient", "clinical_follow_up",
                "auxiliary", "biospecimen_slide", "biospecimen_sample"
        ]:
            if string.find(file, pattern) != -1:
                followUpV = 0.0
                cgFileName = string.replace(file, ".txt", "")
                # the follow_up files has -vn.n version number
                if cgFileName != re.sub(r'_v[1-9]+.[0-9]+', '', cgFileName):
                    followUpV = string.split(
                        string.split(cgFileName, "follow_up_")[1],
                        "_" + string.lower(cancer))[0][1:]
                # the auxillary file does not start with clin
                if cgFileName[0:9] != "clinical_":
                    cgFileName = "clinical_" + cgFileName

                outfile = outDir + cancer + "/" + cgFileName
                cFfile = outfile + "_clinicalFeature"

                if not REALRUN:
                    if os.path.exists(cFfile):
                        tmpClinFeature = ClinicalFeatureNew(cFfile, "tmpName")
                        features = tmpClinFeature.getFeatures()
                        for feature in features:
                            if TCGAUtil.featurePriority.has_key(cancer):
                                if TCGAUtil.featurePriority[cancer].has_key(
                                        feature):
                                    priority = TCGAUtil.featurePriority[
                                        cancer][feature]
                                    tmpClinFeature.setFeaturePriority(
                                        feature, priority)
                                    tmpClinFeature.setFeatureVisibility(
                                        feature, "on")

                            stateOrder = None
                            if TCGAUtil.featureStateOrder.has_key(feature):
                                if TCGAUtil.featureStateOrder[feature].has_key(
                                        cancer):
                                    stateOrder = TCGAUtil.featureStateOrder[
                                        feature][cancer]
                                if TCGAUtil.featureStateOrder[feature].has_key(
                                        "ALL"):
                                    stateOrder = TCGAUtil.featureStateOrder[
                                        feature]["ALL"]
                                print stateOrder
                            if stateOrder:
                                tmpClinFeature.setFeatureValueType(
                                    feature, "category")
                                tmpClinFeature.setFeatureStates(
                                    feature, stateOrder)
                                tmpClinFeature.setFeatureStateOrder(
                                    feature, stateOrder)
                                tmpClinFeature.setFeatureStateOrderRelax(
                                    feature, "true")

                            if TCGAUtil.valueType.has_key(feature):
                                tmpClinFeature.setFeatureValueType(
                                    feature, TCGAUtil.valueType[feature])

                        fout = open(cFfile, 'w')
                        tmpClinFeature.store(fout)
                        fout.close()

                infile = dataDir + file
                #infile often row read has fewer fields than the fieldnames sequence
                # use csv.DictReader and Writer to fix this
                fin = open(infile, 'r')
                reader = csv.DictReader(fin, delimiter="\t", restval="")
                fout = open(".tmp", 'w')
                writer = csv.DictWriter(fout,
                                        delimiter="\t",
                                        fieldnames=reader.fieldnames)
                writer.writer.writerow(reader.fieldnames)
                writer.writerows(reader)
                fout.close()
                fin.close()
                os.system("cp .tmp " + infile)

                if pattern == "clinical_follow_up":
                    print file
                    if cancer == originCancer:
                        cleanupFollowUpFile(infile, ".tmp")
                        os.system("cp .tmp " + infile)

                # slide file need to be remade due to the need to duplicate column as top or bottom
                if pattern == "biospecimen_slide":
                    print file
                    if cancer == originCancer:
                        cleanupSlideFile(infile, ".tmp")
                        os.system("cp .tmp " + infile)

                #clinicalMatrix
                AllowDupCol = True
                if string.find(pattern, "biospecimen_") != -1:
                    SkipLines = [2]
                else:
                    SkipLines = [1, 3]  # 1based

                if os.path.getsize(infile) == 0:
                    continue

                if pattern == "biospecimen_slide":
                    FirstColAuto = 0  #0 based,  already cleaned
                    clinMatrix = ClinicalMatrixNew(infile, "foo", FirstColAuto,
                                                   None, SkipLines,
                                                   AllowDupCol)
                else:
                    FirstColAuto = findIDCol(infile)
                    if FirstColAuto == -1:
                        print infile, "bad header line"
                        continue
                    else:
                        clinMatrix = ClinicalMatrixNew(infile, "foo",
                                                       FirstColAuto, None,
                                                       SkipLines, AllowDupCol)

                clinMatrix.removeCols(["ethnicity", "race",
                                       "jewish_origin"])  #,"patient_id"])

                if pattern == "clinical_sample" or pattern == "biospecimen_sample":
                    if "sample_type" in clinMatrix.getCOLs():
                        add_col_PseudoSample(clinMatrix, "sample_type")
                    if "sample_type_id" in clinMatrix.getCOLs():
                        add_col_PseudoSample(clinMatrix, "sample_type_id")

                #remove all cols with uuid
                features = clinMatrix.getCOLs()
                for f in features:
                    if string.find(f, "uuid") != -1 or string.find(
                            f, "UUID") != -1 or string.find(f, "day_of") != -1:
                        clinMatrix.removeCols([f])

                clinMatrix.replaceValue("null", "")
                clinMatrix.replaceValue("NULL", "")
                clinMatrix.replaceValue("Null", "")
                clinMatrix.replaceValue("NA", "")
                clinMatrix.replaceValue("[null]", "")
                clinMatrix.replaceValue("[NULL]", "")
                clinMatrix.replaceValue("[Null]", "")
                clinMatrix.replaceValue("[NA]", "")
                clinMatrix.replaceValue("[Not Available]", "")
                clinMatrix.replaceValue("[Not Applicable]", "")
                clinMatrix.replaceValue("[Unknown]", "")
                clinMatrix.replaceValue("[Not Reported]", "")
                clinMatrix.replaceValue("[Not Requested]", "")
                clinMatrix.replaceValue("[Not Evaluated]", "")
                clinMatrix.replaceValue("[Completed]", "")
                clinMatrix.replaceValue("[Pending]", "")
                clinMatrix.replaceValue("Not Tested", "")
                clinMatrix.replaceValue("[]", "")
                clinMatrix.replaceValue(",\"", "")
                clinMatrix.replaceValue("\"", "")
                clinMatrix.replaceValue("'", "")
                clinMatrix.replaceValue("`", "")
                clinMatrix.replaceValue("||", "")
                clinMatrix.replaceValueWhole("|", "")
                clinMatrix.replaceValue("LUNG", "Lung")  #stupid BCR
                clinMatrix.replaceValue("MSS|MSS", "MSS")  #stupid BCR
                clinMatrix.replaceValue("Alive", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("ALIVE", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("alive", "LIVING")  #stupid BCR
                clinMatrix.replaceValue("Dead", "DECEASED")  #stupid BCR
                clinMatrix.replaceValue("DEAD", "DECEASED")  #stupid BCR
                clinMatrix.replaceValue("dead", "DECEASED")  #stupid BCR

                oHandle = open(outfile, "w")
                clinMatrix.store(oHandle, validation=True)
                oHandle.close()

                #clinicalFeature

                fout = open(cFfile, "w")
                fout.write("#feature\tattribute\tvalue\n")
                cFeatures = clinMatrix.getCOLs()
                for feature in cFeatures:
                    if not TCGAUtil.featureLongTitle.has_key(feature):
                        longTitle = feature
                        shortTitle = feature
                        message = "Feature Not in dictionary" + "\t" + feature + "\t" + feature
                        flog.write(message + "\n")
                    else:
                        longTitle = TCGAUtil.featureLongTitle[feature]
                        if TCGAUtil.featureShortTitle.has_key(feature):
                            shortTitle = TCGAUtil.featureShortTitle[feature]
                        else:
                            shortTitle = TCGAUtil.featureLongTitle[feature]

                    fout.write(feature + "\tshortTitle\t" + shortTitle + "\n")
                    fout.write(feature + "\tlongTitle\t" + longTitle + "\n")
                    if string.find(feature, "uuid") != -1 or string.find(
                            feature, "UUID") != -1:
                        fout.write(feature + "\tvisibility\toff\n")
                    if TCGAUtil.valueType.has_key(feature):
                        fout.write(feature + "\tvalueType\t" +
                                   TCGAUtil.valueType[feature] + "\n")
                    stateOrder = None
                    if TCGAUtil.featureStateOrder.has_key(feature):
                        if TCGAUtil.featureStateOrder[feature].has_key(cancer):
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = TCGAUtil.featureStateOrder[feature][
                                cancer]
                        if TCGAUtil.featureStateOrder[feature].has_key("ALL"):
                            fout.write(feature + "\tvalueType\tcategory\n")
                            stateOrder = TCGAUtil.featureStateOrder[feature][
                                "ALL"]
                        if stateOrder:
                            for state in stateOrder:
                                fout.write(feature + "\tstate\t" + state +
                                           "\n")
                            fout.write(feature + "\tstateOrder\t\"" +
                                       string.join(stateOrder, "\",\"") +
                                       "\"\n")
                            fout.write(feature + "\tstateOrderRelax\ttrue\n")

                    if TCGAUtil.featurePriority.has_key(cancer):
                        if TCGAUtil.featurePriority[cancer].has_key(feature):
                            priority = TCGAUtil.featurePriority[cancer][
                                feature]
                            fout.write(feature + "\tpriority\t" +
                                       str(priority) + "\n")
                            fout.write(feature + "\tvisibility\ton\n")

                    if feature in [
                            "gender", "age_at_initial_pathologic_diagnosis",
                            "days_to_last_followup",
                            "days_to_last_known_alive", "sample_type",
                            "mononucleotide_and_dinucleotide_marker_panel_analysis_status",
                            "percent_stromal_cells_BOTTOM",
                            "percent_tumor_nuclei_BOTTOM"
                    ]:
                        fout.write(feature + "\tvisibility\ton\n")
                fout.close()

                #json
                J = {}
                cFJ = {}

                oHandle = open(outfile + ".json", "w")
                #stable
                if pattern == "clinical_sample":
                    if cancer != originCancer:
                        suffix = "clinSample" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinSample" + PATHPATTERN
                if pattern == "clinical_patient":
                    if cancer != originCancer:
                        suffix = "clinPatient" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinPatient" + PATHPATTERN
                if pattern == "clinical_follow_up":
                    if cancer != originCancer:
                        suffix = cgFileName + originCancer
                    else:
                        suffix = cgFileName
                if pattern == "auxiliary":
                    if cancer != originCancer:
                        suffix = "clinAuxiliary" + PATHPATTERN + originCancer
                    else:
                        suffix = "clinAuxiliary" + PATHPATTERN
                if pattern == "biospecimen_slide":
                    if cancer != originCancer:
                        suffix = "bioSlide" + PATHPATTERN + originCancer
                    else:
                        suffix = "bioSlide" + PATHPATTERN
                if pattern == "biospecimen_sample":
                    if cancer != originCancer:
                        suffix = "bioSample" + PATHPATTERN + originCancer
                    else:
                        suffix = "bioSample" + PATHPATTERN
                J["cgDataVersion"] = 1
                J["redistribution"] = True
                J["dataProducer"] = "TCGA biospecimen core resource"
                J["url"]=TCGAUtil.remoteBase \
                          +string.replace(inDir,TCGAUtil.localBase,"") \
                          + string.replace(dataDir,tmpDir,"")[:-1]
                J["version"] = datetime.date.today().isoformat()
                J["wrangler"] = "cgData TCGAscript " + __name__ + " processed on " + datetime.date.today(
                ).isoformat()
                J["dataSubType"] = "phenotype"
                #change description
                J["wrangling_procedure"] = "Clinical data download from TCGA DCC, processed at UCSC into cgData repository"
                J["description"] = "This dataset is the TCGA " + TCGAUtil.cancerHumanReadable[
                    cancer] + " (" + cancer + ") clinical data."

                #change cgData
                J["name"] = "TCGA_" + cancer + "_" + suffix

                cFJ["name"] = J["name"] + "_clinFeat"

                cFJ["type"] = "clinicalFeature"
                J["type"] = "clinicalMatrix"
                J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
                J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
                    cancer] + " (" + cancer + ")"
                J[":clinicalFeature"] = cFJ["name"]
                if pattern == "clinical_follow_up":
                    if cancer != originCancer:
                        J["upToDate"] = str(
                            followUpV) + "_" + originCancer  #"Yes"
                    else:
                        J["upToDate"] = str(followUpV)  #"Yes"
                oHandle.write(json.dumps(J, indent=-1))
                oHandle.close()

                oHandle = open(cFfile + ".json", "w")
                oHandle.write(json.dumps(cFJ, indent=-1))
                oHandle.close()
    return
Esempio n. 3
0
def process(inDir, outDir, cancer, flog, PATHPATTERN, originCancer):
    #print status
    print cancer, __name__

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    for file in os.listdir(inDir):
        clinMatrix = None
        clinFeature = None
        clinFfile = ""

        #find the file
        #clinMatrix

        if file[0:6] == PATHPATTERN and os.path.exists(inDir + file + ".json"):
            pass
        else:
            continue

        infile = inDir + file

        #json file processing (validation)
        fjson = open(infile + ".json", "U")
        J = json.load(fjson)
        fjson.close()

        if J["type"] != "clinicalMatrix":
            continue

        #clinFeature
        if J.has_key(":clinicalFeature"):
            clinFname = J[":clinicalFeature"]

            for clinFfile in os.listdir(inDir):
                #find the file
                if not os.path.exists(inDir + clinFfile + ".json"):
                    continue

                fjson = open(inDir + clinFfile + ".json", "U")
                clinFJ = json.load(fjson)
                fjson.close()

                #data processing
                if clinFJ["type"] == "clinicalFeature" and clinFJ[
                        "name"] == clinFname:
                    print originCancer, cancer
                    if cancer != originCancer:
                        clinFname = clinFname + "_" + originCancer
                        clinFJ["name"] = clinFname
                    clinFeature = ClinicalFeatureNew(inDir + clinFfile,
                                                     clinFname)
                    for feature in clinFeature.getFeatures():
                        if TCGAUtil.featurePriority.has_key(cancer):
                            if TCGAUtil.featurePriority[cancer].has_key(
                                    feature):
                                priority = TCGAUtil.featurePriority[cancer][
                                    feature]
                                clinFeature.setFeaturePriority(
                                    feature, priority)
                                clinFeature.setFeatureVisibility(feature, "on")
                    break

        #data processing
        clinMatrix = ClinicalMatrixNew(infile, J["name"], False, clinFeature)
        clinMatrix.removeCols(["ethnicity", "race", "jewish_origin"])
        clinMatrix.replaceValue("null", "")
        clinMatrix.replaceValue("NULL", "")
        clinMatrix.replaceValue("Null", "")
        clinMatrix.replaceValue("NA", "")
        clinMatrix.replaceValue("[null]", "")
        clinMatrix.replaceValue("[NULL]", "")
        clinMatrix.replaceValue("[Null]", "")
        clinMatrix.replaceValue("[NA]", "")
        clinMatrix.replaceValue("[Not Available]", "")
        clinMatrix.replaceValue("[Not Reported]", "")
        clinMatrix.replaceValue("[Not Applicable]", "")
        clinMatrix.replaceValue("[Not Requested]", "")
        clinMatrix.replaceValue("[Completed]", "")
        clinMatrix.replaceValue("[Pending]", "")
        clinMatrix.replaceValue("Not Tested", "")
        clinMatrix.replaceValue("[]", "")
        clinMatrix.replaceValue(",\"", "")
        clinMatrix.replaceValue("\"", "")
        clinMatrix.replaceValue("'", "")
        clinMatrix.replaceValue("`", "")
        clinMatrix.replaceValue("|", "")

        #if cancer != originCancer:
        #    clinMatrix.addOneColWithSameValue("cohort",originCancer)

        #json file processing (validation)
        fjson = open(infile + ".json", "U")
        J = json.load(fjson)
        fjson.close()
        if cancer != originCancer:
            J['name'] = J['name'] + "_" + originCancer

        J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
        J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
            cancer] + " (" + cancer + ")"
        name = trackName_fix(J['name'])
        if name == False:
            message = "bad object name, need fix otherwise break loader, too long " + J[
                "name"]
            print message
            flog.write(message + "\n")
            return
        else:
            J["name"] = name

        if cancer != originCancer and J.has_key(":clinicalFeature"):
            J[":clinicalFeature"] = J[":clinicalFeature"] + "_" + originCancer

        J["cgDataVersion"] = 1

        #output matrix
        if cancer != originCancer:
            outfile = outDir + cancer + "/" + file + "_" + originCancer
        else:
            outfile = outDir + cancer + "/" + file

        oHandle = open(outfile, "w")
        clinMatrix.store(oHandle, validation=True)
        oHandle.close()

        fjson = open(outfile + ".json", "w")
        json.dump(J, fjson, indent=-1)
        fjson.close()

        #output clinFeature
        if clinFeature:
            if cancer != originCancer:
                outfile = outDir + cancer + "/" + clinFfile + "_" + originCancer
            else:
                outfile = outDir + cancer + "/" + clinFfile
            fout = open(outfile, 'w')
            clinFeature.store(fout)
            fout.close()

            clinFJ["cgDataVersion"] = 1
            fjson = open(outfile + ".json", "w")
            json.dump(clinFJ, fjson, indent=-1)
            fjson.close()
    return
Esempio n. 4
0
def flattenEachSampleMap(sMap, bookDic,onlyGenomicSamples):
    sampleMap = sMap.getName()
    
    jsonName= trackName_fix(sampleMapBaseName(sMap)+"_clinicalMatrix")
    finalClinMatrix= ClinicalMatrixNew(None,jsonName)
    finalClinMatrixJSON={}
    finalClinMatrixJSON["name"]=jsonName
    finalClinMatrixJSON["type"]="clinicalMatrix"
    finalClinMatrixJSON["path"]=""
    finalClinMatrixJSON[":sampleMap"]=sampleMap

    clinFeatures=[]
    finalClinFeatureJSON=None
    finalClinFeature=None

    # add all ids to sMap
    sMapChanged= checkIdsAllIn(sMap, bookDic)

    #build initial clinical Matrix with sampleMap ids, all with empty data
    emptyData={}
    success = finalClinMatrix.addNewRows(sMap.getNodes(),emptyData)
    if not success:
        print "fail to add all initial ids from sampleMap"

    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    datasetsOrdered =[] #only the ClinicalMatrix ordered list
    for name in datasets:  
        obj= bookDic[name]
        if obj['type']=="clinicalMatrix":
            if obj.has_key('outOfDate') and obj['outOfDate'] in ["yes", "Yes","YES"]:
                datasetsOrdered.append(name)
            elif not obj.has_key('outOfDate') and  not obj.has_key('upToDate'):
                datasetsOrdered.insert(0,name)
                
    upToDateSets={}
    for name in datasets:  
        obj= bookDic[name]
        if obj['type']=="clinicalMatrix":
            if obj.has_key('upToDate') :
                upToDateSets[obj['upToDate']]=name
    
    keys= upToDateSets.keys()
    keys.sort()
    for version in keys:
        name = upToDateSets [version]
        datasetsOrdered.insert(0,name)

    for name in datasetsOrdered:  
        obj= bookDic[name]
        if obj['type']=="clinicalMatrix":
            clinFeature=None
            #clinFeature obj
            if obj.has_key(':clinicalFeature'):
                path=  bookDic[obj[':clinicalFeature']]['path']
                neme = bookDic[obj[':clinicalFeature']]['name']
                clinFeature = ClinicalFeatureNew(path, name)

            #get matrix obj
            path = obj['path']
            name = obj['name']

            cMatrix = ClinicalMatrixNew(path,name,False, clinFeature)
            
            if finalClinMatrix==None:
                finalClinMatrix= cMatrix
                
            if finalClinMatrixJSON==None:
                finalClinMatrixJSON= obj

            #merge final and cMatrix
            if finalClinMatrix != cMatrix:
                print "name=",cMatrix.getName()
                r = finalClinMatrix.addNewCols(cMatrix,validation=True)
                if r!=True:
                    print "Fail to merge"
                    return False

            #add clinFeature
            if clinFeature:
                clinFeatures.append(clinFeature)
            
            #merge finalClinMatrixJSON with new json
            if finalClinMatrixJSON != obj:
                jsonName= trackName_fix(sampleMapBaseName(sMap)+"_clinicalMatrix")
                finalClinMatrixJSON= cgDataMergeJSON(finalClinMatrixJSON, obj, jsonName)

            # final ClinFeature json
            if clinFeature:
                clinFeatureJSON = bookDic[obj[':clinicalFeature']]
                if finalClinFeatureJSON==None:
                    finalClinFeatureJSON= clinFeatureJSON
                else:
                    jsonName=  trackName_fix(sampleMapBaseName(sMap)+"_clinicalFeature")
                    finalClinFeatureJSON["version"]=datetime.date.today().isoformat() 
                    finalClinFeatureJSON["type"]="clinicalFeature"
                    finalClinFeatureJSON["name"]=jsonName
            
    #final clinicalFeature
    if finalClinFeatureJSON:
        fout = open(".tmp",'w')
        fout.close()
        for clinF in clinFeatures:
            fout = open(".tmptmp",'w')
            clinF.store(fout)
            fout.close()
            os.system("cat .tmptmp >> .tmp")
        fin = open(".tmp",'r')
        finalClinFeature =ClinicalFeatureNew(fin,finalClinFeatureJSON['name'])
        if not finalClinFeature.isValid():
            print "final clinFeature file .tmp is invalid"
            return 0
        fin.close()
    
    #SURVIVAL analysis data
    foundE=0
    foundT=0
    if finalClinFeature:
        features= finalClinFeature.getFeatures()
        for feature in features:
            sameAs = finalClinFeature.getFeatureSameAs(feature)
            if sameAs =="_TIME_TO_EVENT":
                #check there is only one parameter is set to be _TIME_TO_EVENT
                if foundT==1:
                    print "ERROR there is already _TIME_TO_EVENT"
                    continue
                #check matrix does not have _TIME_TO_EVNET
                if sameAs in finalClinMatrix.getCOLs():
                    print "ERROR there is already _TIME_TO_EVENT in matrix"
                    continue
                #data check need to check these are floats or "" in both clinFeature and clinMatrix 
                GOOD=1
                if finalClinMatrix.isTypeFloat(feature)!= True:
                    print "ERROR _TIME_TO_EVENT parent feature values are not correct", finalClinMatrix.getColStates(feature)
                    GOOD=0
                if GOOD:
                    foundT=1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs,"float")
                    
            if sameAs =="_EVENT":
                #check there is only one parameter is set to be _EVENT
                if foundE==1:
                    print "ERROR there is already _EVENT"
                    continue
                #check matrix does not have _EVNET
                if sameAs in finalClinMatrix.getCOLs():
                    print "ERROR there is already _EVENT in matrix"
                    continue
                #data check
                GOOD=1
                states= finalClinMatrix.getColStates(feature)
                """
                for state in states:
                    if state not in [0,1,"0","1",""]:
                        print "ERROR _EVENT values are not correct", state
                        GOOD=0
                        break
                """
                if len(states) not in [2,3]:
                    GOOD=0
                if len(states)==3 and states.count('')!=1:
                    GOOD=0

                if GOOD:
                    foundE=1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs,"category")
                    #finalClinFeature.setFeatureStates(sameAs,["0","1"])
                    #finalClinFeature.setFeatureStateOrder(sameAs,["0","1"])

    #clinical data push down
    roots = sMap.allRoots()
    for root in roots:
        r = finalClinMatrix.pushToChildren (root,sMap)
        if r != True:
            print "Fail to push down"
            return 0
    print "after clinical push down", sampleMap,finalClinMatrix.getROWnum()
    
    # collect all genomic data
    keepSamples  = getAllGenomicIds(sMap, bookDic)

    # removing rows without genomic data from  clinical data matrix due to mysql enum limitation
    # should remove this step after the display functionality is done better, currently cgb clinical data range in feature control panel shows the full range of clinical data without checking if the specific track/dataset has the full value range.
    if onlyGenomicSamples:
        print "genomic sample count", len(keepSamples)
        success= finalClinMatrix.onlyKeepRows(keepSamples)
        if not success:
            print "fail to remove extra rows"
        else:
            print "after keeping sample with genomic data", finalClinMatrix.getROWnum()
    
    #add to the clinical matrix any samples with genomic data but no clinical data
    emptyData={}
    for col in finalClinMatrix.getCOLs():
        emptyData[col]=""
    success = finalClinMatrix.addNewRows(keepSamples,emptyData)
    if not success:
        print "fail to add new roows"
    else:
        print "after adding all genomic data", finalClinMatrix.getROWnum()

    if finalClinMatrix.validate() != True:
        print "Fail to validate"
        cMatrix = oldCMatrix
        return 0
    # end of collecting all genomic data
    
    #code to remove blacklist samples and all its descendants

    badList= badListSelfAndDescendants (sMap, bookDic)
    if badList!=[]:
        #remove badList samples
        finalClinMatrix.removeRows(badList, True)
        print "after remove badList", finalClinMatrix.getROWnum()
        
    #identify empty features 
    badFeatures= finalClinMatrix.findBadColsNotRemove()

    print "emptye features:", badFeatures

    #finalBadFeatures=[]
    #if finalClinFeature:  ###########  don't understand this
    #    for feature in badFeatures:
    #        #get short label
    #        shortTitle = finalClinFeature.getShortTitle(feature)
    #        if not shortTitle:
    #            print feature,"remove"
    #            finalBadFeatures.append(feature)
    #        else:
    #            print shortTitle,"not remove"
    #else:
    #    finalBadFeatures =badFeatures[:]
        
    #remove bad features
    finalBadFeatures= badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print "remove features", finalBadFeatures

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) == None:
        print "Fail to addColRoot"
        return 0
            
    # add _INTEGRATION col
    intList=[]
    if bookDic.has_key(sampleMap) and bookDic[sampleMap].has_key(":integrationId"):
        intName=bookDic[sampleMap][":integrationId"]
        fin= open(bookDic[intName]["path"],"r")
        intId = IntegrationId(intName,fin)
        intList = intId.getList()
    finalClinMatrix.addColIntegration(sMap,intList)
    
                
    # final ClinFeature json
    if finalClinFeatureJSON==None:
        jsonName=  trackName_fix(sampleMapBaseName(sMap)+"_clinicalFeature")
        finalClinFeatureJSON= {}
        finalClinFeatureJSON["version"]=datetime.date.today().isoformat() 
        finalClinFeatureJSON["type"]="clinicalFeature"
        finalClinFeatureJSON["name"]=jsonName
        finalClinFeatureJSON["path"]=""
        finalClinFeature = ClinicalFeatureNew (None, finalClinFeatureJSON["name"])

    #final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)
        #clinicalFeature fillin ValueType
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        #clinicalFeature fillin missing features
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        #clinicalFeature fillin short and long titles
        finalClinFeature.fillInTitles()
        #clinicalFeature fillin priority visibility

        #vis exceptions
        VIS_limit=4
        if bookDic.has_key(sMap.getName()) and bookDic[sMap.getName()].has_key("VIS"):
            VIS_limit= bookDic[sMap.getName()]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)
        
        finalClinFeature.setFeatureShortTitle("_PATIENT","_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT","_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT","category")
        finalClinFeature.setFeatureShortTitle("_INTEGRATION","_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION","_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION","category")
        
    print sampleMap,finalClinMatrix.getROWnum()
    return finalClinMatrix,finalClinMatrixJSON, finalClinFeature, finalClinFeatureJSON
#http://www.tutorialspoint.com/python/python_command_line_arguments.htm
#https://docs.python.org/3.1/library/getopt.html
try:
    opts, args = getopt.getopt(sys.argv[1:], "", ["run"])
except getopt.GetoptError:
    print "python curatedPhenotype.py originalClinFeature(optional) --run"
    sys.exit()

output = "newClinFeature"
clinFeature = None

if len(args) != 0:
    clinFeatureFile = args[0]
    if os.path.exists(clinFeatureFile):
        clinFeature = ClinicalFeatureNew(clinFeatureFile, 'feature')
    else:
        print args[0], "does not exist"
        sys.exit()
else:
    clinFeature = ClinicalFeatureNew(None, 'feature')
    fout = open(output + ".json", 'w')
    J = {}
    J["type"] = "clinicalFeature"
    fout.write(json.dumps(J, indent=2))
    fout.close()

curatedPhenotypeClinFeature(clinFeature)

fout = open(output, 'w')
clinFeature.store(fout)
Esempio n. 6
0
#http://www.tutorialspoint.com/python/python_command_line_arguments.htm
#https://docs.python.org/3.1/library/getopt.html
try:
    opts, args = getopt.getopt(sys.argv[1:],"",["run"])
except getopt.GetoptError:
    print "python curatedPhenotype.py originalClinFeature(optional) --run"
    sys.exit()


output = "newClinFeature"
clinFeature = None

if len(args)!=0:
    clinFeatureFile = args[0]
    if os.path.exists(clinFeatureFile):
        clinFeature = ClinicalFeatureNew(clinFeatureFile,'feature')
    else:
        print args[0],"does not exist"
        sys.exit()
else:
    clinFeature = ClinicalFeatureNew(None,'feature')
    fout = open(output+".json",'w')
    J={}
    J["type"]="clinicalFeature"
    fout.write(json.dumps(J, indent=2))
    fout.close()

curatedPhenotypeClinFeature(clinFeature)

fout = open(output,'w')
clinFeature.store(fout)
Esempio n. 7
0
def process (inDir,outDir,cancer,flog,PATHPATTERN,originCancer):
    #print status
    print cancer, __name__

    #set output dir
    if not os.path.exists( outDir ):
        os.makedirs( outDir )
    if not os.path.exists( outDir +cancer+"/"):
        os.makedirs( outDir+cancer+"/" )

    for file in os.listdir(inDir):
        clinMatrix = None
        clinFeature =None
        clinFfile=""

        #find the file
        #clinMatrix

        if file[0:6]== PATHPATTERN and os.path.exists(inDir+ file+".json") :
            pass
        else:
            continue

        infile = inDir+file

        #json file processing (validation)
        fjson= open(infile+".json","U")
        J =json.load(fjson)
        fjson.close()

        if J["type"]!="clinicalMatrix":
            continue

        #clinFeature
        if J.has_key(":clinicalFeature"):
            clinFname = J[":clinicalFeature"]
       
            for clinFfile in os.listdir(inDir):
                #find the file
                if not os.path.exists(inDir+ clinFfile+".json"):
                    continue

                fjson= open(inDir+clinFfile+".json","U")
                clinFJ =json.load(fjson)
                fjson.close()

                #data processing
                if clinFJ["type"]=="clinicalFeature" and clinFJ["name"]==clinFname:
                    print originCancer, cancer
                    if cancer != originCancer:
                        clinFname= clinFname+"_"+originCancer
                        clinFJ["name"]=clinFname
                    clinFeature= ClinicalFeatureNew(inDir+clinFfile,clinFname)
                    for feature in clinFeature.getFeatures():
                        if TCGAUtil.featurePriority.has_key(cancer):
                            if TCGAUtil.featurePriority[cancer].has_key(feature):
                                priority= TCGAUtil.featurePriority[cancer][feature]
                                clinFeature.setFeaturePriority(feature, priority)
                                clinFeature.setFeatureVisibility(feature, "on")
                    break
                
        #data processing
        clinMatrix = ClinicalMatrixNew(infile, J["name"], False, clinFeature)
        clinMatrix.removeCols(["ethnicity","race","jewish_origin"])
        clinMatrix.replaceValue("null","")
        clinMatrix.replaceValue("NULL","")
        clinMatrix.replaceValue("Null","")
        clinMatrix.replaceValue("NA","")
        clinMatrix.replaceValue("[null]","")
        clinMatrix.replaceValue("[NULL]","")
        clinMatrix.replaceValue("[Null]","")
        clinMatrix.replaceValue("[NA]","")
        clinMatrix.replaceValue("[Not Available]","")
        clinMatrix.replaceValue("[Not Reported]","")
        clinMatrix.replaceValue("[Not Applicable]","")
        clinMatrix.replaceValue("[Not Requested]","")
        clinMatrix.replaceValue("[Completed]","")
        clinMatrix.replaceValue("[Pending]","")
        clinMatrix.replaceValue("Not Tested","")
        clinMatrix.replaceValue("[]","")
        clinMatrix.replaceValue(",\"","")
        clinMatrix.replaceValue("\"","")
        clinMatrix.replaceValue("'","")
        clinMatrix.replaceValue("`","")
        clinMatrix.replaceValue("|","")
        
        #if cancer != originCancer:
        #    clinMatrix.addOneColWithSameValue("cohort",originCancer)

        #json file processing (validation)
        fjson= open(infile+".json","U")
        J =json.load(fjson)
        fjson.close()
        if cancer != originCancer:
            J['name'] = J['name'] +"_"+originCancer

        J[":sampleMap"]="TCGA."+cancer+".sampleMap"
        J["cohort"]="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"
        name = trackName_fix(J['name'])
        if name ==False:
            message = "bad object name, need fix otherwise break loader, too long "+J["name"]
            print message
            flog.write(message+"\n")
            return
        else:
            J["name"]=name

        if cancer != originCancer and J.has_key(":clinicalFeature"):
            J[":clinicalFeature"] =  J[":clinicalFeature"] +"_"+originCancer

        J["cgDataVersion"]=1

        #output matrix
        if cancer != originCancer:
            outfile = outDir+cancer+"/"+file+"_"+originCancer
        else:
            outfile = outDir+cancer+"/"+file
            
        oHandle = open(outfile,"w")
        clinMatrix.store(oHandle, validation=True)
        oHandle.close()

        fjson = open(outfile+".json","w")
        json.dump(J, fjson, indent=-1)
        fjson.close()

        #output clinFeature 
        if clinFeature:
            if cancer != originCancer:
                outfile = outDir+cancer+"/"+clinFfile+"_"+originCancer
            else:
                outfile = outDir+cancer+"/"+clinFfile
            fout=open(outfile,'w')
            clinFeature.store(fout)
            fout.close()

            clinFJ["cgDataVersion"]=1
            fjson = open(outfile+".json","w")
            json.dump(clinFJ, fjson, indent=-1)
            fjson.close()
    return
def flattenForClinicalFeature(sMap, bookDic):
    clinFeatures = []
    finalClinFeature = None
    sampleMap = sMap.getName()
    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            clinFeature = None
            #clinFeature obj
            if obj.has_key(':clinicalFeature'):
                path = bookDic[obj[':clinicalFeature']]['path']
                neme = bookDic[obj[':clinicalFeature']]['name']
                clinFeature = ClinicalFeatureNew(path, name)

            #get matrix obj
            path = obj['path']
            name = obj['name']
            cMatrix = ClinicalMatrixNew(path, name, False, clinFeature)

            if clinFeature:
                clinFeatures.append(clinFeature)

    fout = open(".tmp", 'w')
    fout.close()
    for clinF in clinFeatures:
        fout = open(".tmptmp", 'w')
        clinF.store(fout)
        fout.close()
        os.system("cat .tmptmp >> .tmp")
    fin = open(".tmp", 'r')
    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
    finalClinFeature = ClinicalFeatureNew(fin, jsonName)
    if not finalClinFeature.isValid():
        print "final clinFeature file .tmp is invalid"
        return 0
    fin.close()

    #vis exceptions
    VIS_limit = 4
    if bookDic.has_key(
            sMap.getName()) and bookDic[sMap.getName()].has_key("VIS"):
        VIS_limit = bookDic[sMap.getName()]["VIS"]
    finalClinFeature.fillInPriorityVisibility(VIS_limit)
    finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
    finalClinFeature.setFeatureValueType("_PATIENT", "category")
    finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
    finalClinFeature.setFeatureValueType("_INTEGRATION", "category")
    return finalClinFeature
def flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples):
    sampleMap = sMap.getName()

    jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalMatrix")
    finalClinMatrix = ClinicalMatrixNew(None, jsonName)
    finalClinMatrixJSON = {}
    finalClinMatrixJSON["name"] = jsonName
    finalClinMatrixJSON["type"] = "clinicalMatrix"
    finalClinMatrixJSON["path"] = ""
    finalClinMatrixJSON[":sampleMap"] = sampleMap

    clinFeatures = []
    finalClinFeatureJSON = None
    finalClinFeature = None

    # add all ids to sMap
    sMapChanged = checkIdsAllIn(sMap, bookDic)

    #build initial clinical Matrix with sampleMap ids, all with empty data
    emptyData = {}
    success = finalClinMatrix.addNewRows(sMap.getNodes(), emptyData)
    if not success:
        print "fail to add all initial ids from sampleMap"

    datasets = collectNamesBelongToSampleMap(bookDic, sampleMap)
    datasetsOrdered = []  #only the ClinicalMatrix ordered list
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            if obj.has_key('outOfDate') and obj['outOfDate'] in [
                    "yes", "Yes", "YES"
            ]:
                datasetsOrdered.append(name)
            elif not obj.has_key('outOfDate') and not obj.has_key('upToDate'):
                datasetsOrdered.insert(0, name)

    upToDateSets = {}
    for name in datasets:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            if obj.has_key('upToDate'):
                upToDateSets[obj['upToDate']] = name

    keys = upToDateSets.keys()
    keys.sort()
    for version in keys:
        name = upToDateSets[version]
        datasetsOrdered.insert(0, name)

    for name in datasetsOrdered:
        obj = bookDic[name]
        if obj['type'] == "clinicalMatrix":
            clinFeature = None
            #clinFeature obj
            if obj.has_key(':clinicalFeature'):
                path = bookDic[obj[':clinicalFeature']]['path']
                neme = bookDic[obj[':clinicalFeature']]['name']
                clinFeature = ClinicalFeatureNew(path, name)

            #get matrix obj
            path = obj['path']
            name = obj['name']

            cMatrix = ClinicalMatrixNew(path, name, False, clinFeature)

            if finalClinMatrix == None:
                finalClinMatrix = cMatrix

            if finalClinMatrixJSON == None:
                finalClinMatrixJSON = obj

            #merge final and cMatrix
            if finalClinMatrix != cMatrix:
                print "name=", cMatrix.getName()
                r = finalClinMatrix.addNewCols(cMatrix, validation=True)
                if r != True:
                    print "Fail to merge"
                    return False

            #add clinFeature
            if clinFeature:
                clinFeatures.append(clinFeature)

            #merge finalClinMatrixJSON with new json
            if finalClinMatrixJSON != obj:
                jsonName = trackName_fix(
                    sampleMapBaseName(sMap) + "_clinicalMatrix")
                finalClinMatrixJSON = cgDataMergeJSON(finalClinMatrixJSON, obj,
                                                      jsonName)

            # final ClinFeature json
            if clinFeature:
                clinFeatureJSON = bookDic[obj[':clinicalFeature']]
                if finalClinFeatureJSON == None:
                    finalClinFeatureJSON = clinFeatureJSON
                else:
                    jsonName = trackName_fix(
                        sampleMapBaseName(sMap) + "_clinicalFeature")
                    finalClinFeatureJSON["version"] = datetime.date.today(
                    ).isoformat()
                    finalClinFeatureJSON["type"] = "clinicalFeature"
                    finalClinFeatureJSON["name"] = jsonName

    #final clinicalFeature
    if finalClinFeatureJSON:
        fout = open(".tmp", 'w')
        fout.close()
        for clinF in clinFeatures:
            fout = open(".tmptmp", 'w')
            clinF.store(fout)
            fout.close()
            os.system("cat .tmptmp >> .tmp")
        fin = open(".tmp", 'r')
        finalClinFeature = ClinicalFeatureNew(fin,
                                              finalClinFeatureJSON['name'])
        if not finalClinFeature.isValid():
            print "final clinFeature file .tmp is invalid"
            return 0
        fin.close()

    #SURVIVAL analysis data
    foundE = 0
    foundT = 0
    if finalClinFeature:
        features = finalClinFeature.getFeatures()
        for feature in features:
            sameAs = finalClinFeature.getFeatureSameAs(feature)
            if sameAs == "_TIME_TO_EVENT":
                #check there is only one parameter is set to be _TIME_TO_EVENT
                if foundT == 1:
                    print "ERROR there is already _TIME_TO_EVENT"
                    continue
                #check matrix does not have _TIME_TO_EVNET
                if sameAs in finalClinMatrix.getCOLs():
                    print "ERROR there is already _TIME_TO_EVENT in matrix"
                    continue
                #data check need to check these are floats or "" in both clinFeature and clinMatrix
                GOOD = 1
                if finalClinMatrix.isTypeFloat(feature) != True:
                    print "ERROR _TIME_TO_EVENT parent feature values are not correct", finalClinMatrix.getColStates(
                        feature)
                    GOOD = 0
                if GOOD:
                    foundT = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "float")

            if sameAs == "_EVENT":
                #check there is only one parameter is set to be _EVENT
                if foundE == 1:
                    print "ERROR there is already _EVENT"
                    continue
                #check matrix does not have _EVNET
                if sameAs in finalClinMatrix.getCOLs():
                    print "ERROR there is already _EVENT in matrix"
                    continue
                #data check
                GOOD = 1
                states = finalClinMatrix.getColStates(feature)
                """
                for state in states:
                    if state not in [0,1,"0","1",""]:
                        print "ERROR _EVENT values are not correct", state
                        GOOD=0
                        break
                """
                if len(states) not in [2, 3]:
                    GOOD = 0
                if len(states) == 3 and states.count('') != 1:
                    GOOD = 0

                if GOOD:
                    foundE = 1
                    finalClinMatrix.addNewColfromOld(sameAs, feature)
                    finalClinFeature.setFeatureValueType(sameAs, "category")
                    #finalClinFeature.setFeatureStates(sameAs,["0","1"])
                    #finalClinFeature.setFeatureStateOrder(sameAs,["0","1"])

    #clinical data push down
    roots = sMap.allRoots()
    for root in roots:
        r = finalClinMatrix.pushToChildren(root, sMap)
        if r != True:
            print "Fail to push down"
            return 0
    print "after clinical push down", sampleMap, finalClinMatrix.getROWnum()

    # collect all genomic data
    keepSamples = getAllGenomicIds(sMap, bookDic)

    # removing rows without genomic data from  clinical data matrix due to mysql enum limitation
    # should remove this step after the display functionality is done better, currently cgb clinical data range in feature control panel shows the full range of clinical data without checking if the specific track/dataset has the full value range.
    if onlyGenomicSamples:
        print "genomic sample count", len(keepSamples)
        success = finalClinMatrix.onlyKeepRows(keepSamples)
        if not success:
            print "fail to remove extra rows"
        else:
            print "after keeping sample with genomic data", finalClinMatrix.getROWnum(
            )

    #add to the clinical matrix any samples with genomic data but no clinical data
    emptyData = {}
    for col in finalClinMatrix.getCOLs():
        emptyData[col] = ""
    success = finalClinMatrix.addNewRows(keepSamples, emptyData)
    if not success:
        print "fail to add new roows"
    else:
        print "after adding all genomic data", finalClinMatrix.getROWnum()

    if finalClinMatrix.validate() != True:
        print "Fail to validate"
        cMatrix = oldCMatrix
        return 0
    # end of collecting all genomic data

    #code to remove blacklist samples and all its descendants

    badList = badListSelfAndDescendants(sMap, bookDic)
    if badList != []:
        #remove badList samples
        finalClinMatrix.removeRows(badList, True)
        print "after remove badList", finalClinMatrix.getROWnum()

    #identify empty features
    badFeatures = finalClinMatrix.findBadColsNotRemove()

    print "emptye features:", badFeatures

    #finalBadFeatures=[]
    #if finalClinFeature:  ###########  don't understand this
    #    for feature in badFeatures:
    #        #get short label
    #        shortTitle = finalClinFeature.getShortTitle(feature)
    #        if not shortTitle:
    #            print feature,"remove"
    #            finalBadFeatures.append(feature)
    #        else:
    #            print shortTitle,"not remove"
    #else:
    #    finalBadFeatures =badFeatures[:]

    #remove bad features
    finalBadFeatures = badFeatures
    finalClinMatrix.removeCols(finalBadFeatures)
    print "remove features", finalBadFeatures

    # add _PATIENT col
    if finalClinMatrix.addColRoot(sMap) == None:
        print "Fail to addColRoot"
        return 0

    # add _INTEGRATION col
    intList = []
    if bookDic.has_key(sampleMap) and bookDic[sampleMap].has_key(
            ":integrationId"):
        intName = bookDic[sampleMap][":integrationId"]
        fin = open(bookDic[intName]["path"], "r")
        intId = IntegrationId(intName, fin)
        intList = intId.getList()
    finalClinMatrix.addColIntegration(sMap, intList)

    # final ClinFeature json
    if finalClinFeatureJSON == None:
        jsonName = trackName_fix(sampleMapBaseName(sMap) + "_clinicalFeature")
        finalClinFeatureJSON = {}
        finalClinFeatureJSON["version"] = datetime.date.today().isoformat()
        finalClinFeatureJSON["type"] = "clinicalFeature"
        finalClinFeatureJSON["name"] = jsonName
        finalClinFeatureJSON["path"] = ""
        finalClinFeature = ClinicalFeatureNew(None,
                                              finalClinFeatureJSON["name"])

    #final clinicalFeature
    if finalClinFeature:
        finalClinFeature.removeFeatures(finalBadFeatures)
        finalClinFeature.cleanState()
        finalClinFeature.checkFeatureWithMatrix(finalClinMatrix)
        #clinicalFeature fillin ValueType
        finalClinFeature.fillInValueTypeWithMatrix(finalClinMatrix)
        #clinicalFeature fillin missing features
        finalClinFeature.fillInFeaturesWithMatrix(finalClinMatrix)
        #clinicalFeature fillin short and long titles
        finalClinFeature.fillInTitles()
        #clinicalFeature fillin priority visibility

        #vis exceptions
        VIS_limit = 4
        if bookDic.has_key(
                sMap.getName()) and bookDic[sMap.getName()].has_key("VIS"):
            VIS_limit = bookDic[sMap.getName()]["VIS"]
        finalClinFeature.fillInPriorityVisibility(VIS_limit)

        finalClinFeature.setFeatureShortTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureLongTitle("_PATIENT", "_PATIENT_ID")
        finalClinFeature.setFeatureValueType("_PATIENT", "category")
        finalClinFeature.setFeatureShortTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureLongTitle("_INTEGRATION", "_SAMPLE_ID")
        finalClinFeature.setFeatureValueType("_INTEGRATION", "category")

    print sampleMap, finalClinMatrix.getROWnum()
    return finalClinMatrix, finalClinMatrixJSON, finalClinFeature, finalClinFeatureJSON