def add_col_PseudoSample(clinMatrix,
                         col):
    """Copy one clinical column from sample rows onto their pseudo-sample rows.

    For every row of ``clinMatrix`` that has a non-empty value in ``col``,
    the value is written to the row named by
    ``TCGAUtil.barcode_IntegrationId(row)``.  That row is created when it
    does not exist yet.  Row ids are assumed to be TCGA barcodes.

    Args:
        clinMatrix: clinical matrix object; it is modified in place.
        col: name of the column to propagate.

    Returns nothing.  A message is printed when ``clinMatrix.validate()``
    returns ``False`` after the update.
    """
    # Iterate over a snapshot because rows can be added inside the loop.
    # NOTE(review): whether getROWs() returns a live container is not visible
    # from here; the copy is harmless either way.
    for row in list(clinMatrix.getROWs()):
        value = clinMatrix.getDATA(row, col)
        if value is None or value == "":
            continue

        # assuming sample ids are TCGA barcodes
        integration_id = TCGAUtil.barcode_IntegrationId(row)
        if integration_id is None:
            # No integration id for this row (other code in this file treats
            # None as "id is above the integration level"), so there is no
            # pseudo-sample row to fill in.
            continue

        if clinMatrix.hasRow(integration_id):
            clinMatrix.setDATA(integration_id, col, value)
        else:
            clinMatrix.addNewRows([integration_id], {col: value})

    # Only an explicit False is reported as a failure.
    if clinMatrix.validate() == False:
        print("add pseudoSample clinical infor %s fail" % (col,))
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Register samples in the sample map and index them by integration id.

    Args:
        samples: iterable of sample ids; ids that do not start with "TCGA"
            are looked up (lower-cased) in ``aliquote_dic``.
        sMap: sample map object; receives parent/child links via ``addLink``.
        intDic: dict updated in place, integration id -> list of original ids.
        sampleDic: dict updated in place, original id -> integration id.
        aliquote_dic: dict mapping a lower-cased id to its TCGA barcode.

    Ids that are neither barcodes nor present in ``aliquote_dic`` are printed
    and skipped.
    """
    for original_id in samples:
        barcode = original_id

        # TCGA uuid handling
        if original_id[0:4] != "TCGA":
            key = original_id.lower()
            if key not in aliquote_dic:
                # Unknown id: report it and move on.  Falling through would
                # use a barcode that is either unbound or left over from a
                # previous sample.
                print(original_id)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, original_id)

        # Link each barcode prefix to the next longer one.
        parts = barcode.split("-")
        parent = "-".join(parts[0:3])

        if len(parts) > 3 and len(parts[3]) == 3:
            # A 3-character fourth field is linked in two steps: first its
            # 2-character prefix, then the full field.
            for child in (parent + "-" + parts[3][0:2], "-".join(parts[0:4])):
                sMap.addLink(parent, child)
                parent = child
        # NOTE(review): when the fourth field is not 3 characters long it is
        # not linked at all and later fields hang directly off the 3-field
        # parent -- confirm this is intended.

        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(barcode)
        intDic.setdefault(intID, []).append(original_id)
        sampleDic[original_id] = intID
# Example no. 3
# 0
def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Register samples in the sample map and index them by integration id.

    Args:
        samples: iterable of sample ids; ids that do not start with "TCGA"
            are looked up (lower-cased) in ``aliquote_dic``.
        sMap: sample map object; receives parent/child links via ``addLink``.
        intDic: dict updated in place, integration id -> list of original ids.
        sampleDic: dict updated in place, original id -> integration id.
        aliquote_dic: dict mapping a lower-cased id to its TCGA barcode.

    Ids that are neither barcodes nor present in ``aliquote_dic`` are printed
    and skipped.
    """
    for original_id in samples:
        barcode = original_id

        # TCGA uuid handling
        if original_id[0:4] != "TCGA":
            key = original_id.lower()
            if key not in aliquote_dic:
                # Unknown id: report it and move on.  Falling through would
                # use a barcode that is either unbound or left over from a
                # previous sample.
                print(original_id)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, original_id)

        # Link each barcode prefix to the next longer one.
        parts = barcode.split("-")
        parent = "-".join(parts[0:3])

        if len(parts) > 3 and len(parts[3]) == 3:
            # A 3-character fourth field is linked in two steps: first its
            # 2-character prefix, then the full field.
            for child in (parent + "-" + parts[3][0:2], "-".join(parts[0:4])):
                sMap.addLink(parent, child)
                parent = child
        # NOTE(review): when the fourth field is not 3 characters long it is
        # not linked at all and later fields hang directly off the 3-field
        # parent -- confirm this is intended.

        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(barcode)
        intDic.setdefault(intID, []).append(original_id)
        sampleDic[original_id] = intID
def _header_sample_ids(path, name):
    """Return the sample ids found in the header row of a genomic matrix.

    The header is tab separated and its first field (the row-label column) is
    dropped.  Empty ids are reported with ``name`` but still returned.
    """
    with open(path, 'U') as fin:
        # rstrip instead of [:-1]: a header without a trailing newline must
        # not lose the last character of its last sample id.
        header = fin.readline().rstrip("\r\n")
    ids = header.split("\t")[1:]
    for sample in ids:
        if sample == "":
            print("%s has bad empty sample id" % (name,))
    return ids


def _first_column_sample_ids(path):
    """Return the sorted, de-duplicated first-column values of a tab file.

    The first line of the file is treated as the header and skipped; blank
    ids are ignored.
    """
    ids = set()
    with open(path, 'r') as fin:
        next(fin, None)  # header row
        for line in fin:
            sample = line.split("\t", 1)[0].strip()
            if sample != "":
                ids.add(sample)
    return sorted(ids)


def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    """Build sample maps, integration-id lists and their JSON metadata.

    Walks ``dir`` with ``cgWalk`` and, for every sample map it reports
    (existing or missing), gathers the sample ids of the datasets attached to
    that map, links every id to its shorter barcode prefixes in a
    ``SampleMapNew`` and records its integration id.

    Args:
        dir: directory handed to ``cgWalk``.
        outDir: output root.  Files go to ``outDir + cancer + "/"``; paths are
            built by plain concatenation, so it is expected to end with a
            path separator.
        cancer: cancer code; used as sub-directory name and as key into the
            ``TCGAUtil`` lookup tables.
        log: unused.
        REALRUN: datasets are read and the sample-map / integration-id files
            are written only when this equals 1.  The two JSON files are
            written regardless.
    """
    # print status
    print("%s %s" % (cancer, __name__))

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    # Merge the existing maps into the missing ones: after this loop the dict
    # holds every sample map, each mapped to the names of its datasets.
    existMaps = collectSampleMaps(bookDic)
    allMaps = collectMissingSampleMaps(bookDic)
    for map_name in existMaps:
        if map_name not in allMaps:
            allMaps[map_name] = existMaps[map_name]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    # NOTE(review): the result is never used in this function; the call is
    # kept only in case the loader has side effects -- confirm and drop.
    sample_dic = TCGAUtil.uuid_Sample_all()

    for map_name in allMaps:
        dataset_names = allMaps[map_name]
        print(map_name)
        print(dataset_names)
        sMap = SampleMapNew(None, map_name)

        # integration id
        intName = map_name + ".integrationID"
        if intName in bookDic:
            with open(bookDic[intName]["path"], 'r') as fin:
                integrationID = IntegrationId(intName, fin)
        else:
            integrationID = IntegrationId(intName)

        if REALRUN == 1:
            # Collect the distinct sample ids of all datasets, keeping the
            # order in which they are first seen.
            samples = []
            seen = set()
            for name in dataset_names:
                print(name)
                obj = bookDic[name]
                if obj['type'] == "genomicMatrix":
                    found = _header_sample_ids(obj['path'], name)
                elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                    found = _first_column_sample_ids(obj['path'])
                else:
                    continue
                for sample in found:
                    if sample not in seen:
                        seen.add(sample)
                        samples.append(sample)

            for sample in samples:
                # TCGA uuid handling
                if sample[0:4] != "TCGA":
                    key = sample.lower()
                    if key not in aliquote_dic:
                        print(sample)
                        continue
                    barcode = aliquote_dic[key]
                    # Register both spellings of the id under its barcode.
                    sMap.addLink(barcode, sample.lower())
                    sMap.addLink(barcode, sample.upper())
                    sample = barcode

                # Split a 3-character fourth field into its 2-character
                # prefix and its last character so each gets its own level.
                parts = sample.split("-")
                field_split = len(parts) > 3 and len(parts[3]) == 3
                if field_split:
                    parts = (parts[0:3] + [parts[3][0:2], parts[3][2]] +
                             parts[4:])

                parent = "-".join(parts[0:3])
                for i in range(3, len(parts)):
                    if field_split and i == 4:
                        # The split-off last character is glued back on
                        # without a dash.  Only valid when the split actually
                        # happened; otherwise index 4 is an ordinary field.
                        child = parent + parts[i]
                    else:
                        child = parent + "-" + parts[i]
                    sMap.addLink(parent, child)
                    parent = child

                integrationID.addId(TCGAUtil.barcode_IntegrationId(sample))

        # makedirs also creates outDir itself when it is missing
        cancer_dir = outDir + cancer + "/"
        if not os.path.exists(cancer_dir):
            os.makedirs(cancer_dir)

        if REALRUN == 1:
            # output sampleMap
            with open(cancer_dir + map_name, "w") as oHandle:
                sMap.store(oHandle)

            # output integrationID
            # NOTE(review): this file name does not depend on the map, so
            # with several maps each one overwrites the previous file.
            with open(cancer_dir + "integrationID", "w") as oHandle:
                integrationID.store(oHandle)

        today = datetime.date.today().isoformat()

        # output integrationID json
        J = {}
        J['name'] = intName
        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"
        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = today
        with open(cancer_dir + "integrationID.json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

        # output sampleMap json: same dict, so every key above that is not
        # overridden here is carried over.
        J['name'] = map_name
        J['type'] = "sampleMap"
        J[":integrationId"] = intName

        # add info for old clinical data
        if os.path.exists(cancer_dir + "oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"

        # special code
        if (cancer in TCGAUtil.featurePriority and
                len(TCGAUtil.featurePriority[cancer]) >= 5):
            J["VIS"] = 5

        # blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = ["TCGA-FQ-6551",
                              "TCGA-FQ-6552",
                              "TCGA-FQ-6553",
                              "TCGA-FQ-6554",
                              "TCGA-FQ-6555",
                              "TCGA-FQ-6558",
                              "TCGA-FQ-6559"]

        with open(cancer_dir + map_name + ".json", "w") as oHandle:
            oHandle.write(json.dumps(J, indent=-1))

    return
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a constant clinical variable for a cohort plus its JSON metadata.

    Args:
        var: variable name; also the output file name under
            ``outDir + cancer + "/"``.
        value: string written as the variable's value for every sample.
        inDir: directory handed to ``cgWalk``.
        outDir: output root; paths are built by plain concatenation, so it is
            expected to end with a path separator.
        cancer: cancer code; used in paths, names and ``TCGAUtil`` lookups.
        REALRUN: when true, the data file is (re)generated from the genomic
            matrices found under ``inDir``.
        doDerived: when true, ``doDerivedCancer`` is also invoked for the
            combined cohort that ``cancer`` belongs to, if any.

    Returns nothing.  When ``REALRUN`` is true and ``inDir`` does not yield
    exactly one sample map, the function returns early without writing
    anything.  An empty sample id in a matrix header terminates the program
    via ``sys.exit()``.
    """
    print(inDir)
    print(outDir)

    outfile = outDir + cancer + "/" + var

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        # Merge the existing maps into the missing ones so the dict holds
        # every sample map, each mapped to the names of its datasets.
        existMaps = collectSampleMaps(bookDic)
        allMaps = collectMissingSampleMaps(bookDic)
        for map_name in existMaps:
            if map_name not in allMaps:
                allMaps[map_name] = existMaps[map_name]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): the result is never used in this function; the call
        # is kept only in case the loader has side effects -- confirm.
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(allMaps) != 1:
            return

        map_name = next(iter(allMaps))
        print(map_name)

        # Distinct sample ids in first-seen order.  Only genomic matrices are
        # scanned; reading mutationVector files was disabled as too slow.
        samples = []
        seen = set()
        for name in allMaps[map_name]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            with open(obj['path'], 'U') as fin:
                # rstrip instead of [:-1]: a header without a trailing
                # newline must not lose the last character of its last id.
                header = fin.readline().rstrip("\r\n")
            for sample in header.split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        intDic = {}
        for sample in samples:
            # TCGA uuid handling: ids that are not barcodes are translated
            # through the aliquot table when possible, otherwise used as is.
            if sample[0:4] != "TCGA":
                barcode = aliquote_dic.get(sample.lower(), sample)
            else:
                barcode = sample

            intID = TCGAUtil.barcode_IntegrationId(barcode)
            if intID is None:  # ids is on patient level above integration level
                continue
            if intID not in intDic:
                intDic[intID] = ""

        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer] +
                   " (" + cancer + ")")

    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # cancer code -> combined cohort it contributes to
        derived_of = {
            "LUAD": "LUNG",
            "LUSC": "LUNG",
            "COAD": "COADREAD",
            "READ": "COADREAD",
            "GBM": "GBMLGG",
            "LGG": "GBMLGG",
        }
        derived_cancer = derived_of.get(cancer)
        if derived_cancer is not None:
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
# Example no. 6
# 0
def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a constant clinical variable for a cohort plus its JSON metadata.

    Args:
        var: variable name; also the output file name under
            ``outDir + cancer + "/"``.
        value: string written as the variable's value for every sample.
        inDir: directory handed to ``cgWalk``.
        outDir: output root; paths are built by plain concatenation, so it is
            expected to end with a path separator.
        cancer: cancer code; used in paths, names and ``TCGAUtil`` lookups.
        REALRUN: when true, the data file is (re)generated from the genomic
            matrices found under ``inDir``.
        doDerived: when true, ``doDerivedCancer`` is also invoked for the
            combined cohort that ``cancer`` belongs to, if any.

    Returns nothing.  When ``REALRUN`` is true and ``inDir`` does not yield
    exactly one sample map, the function returns early without writing
    anything.  An empty sample id in a matrix header terminates the program
    via ``sys.exit()``.
    """
    print(inDir)
    print(outDir)

    outfile = outDir + cancer + "/" + var

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        # Merge the existing maps into the missing ones so the dict holds
        # every sample map, each mapped to the names of its datasets.
        existMaps = collectSampleMaps(bookDic)
        allMaps = collectMissingSampleMaps(bookDic)
        for map_name in existMaps:
            if map_name not in allMaps:
                allMaps[map_name] = existMaps[map_name]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): the result is never used in this function; the call
        # is kept only in case the loader has side effects -- confirm.
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(allMaps) != 1:
            return

        map_name = next(iter(allMaps))
        print(map_name)

        # Distinct sample ids in first-seen order.  Only genomic matrices are
        # scanned; reading mutationVector files was disabled as too slow.
        samples = []
        seen = set()
        for name in allMaps[map_name]:
            obj = bookDic[name]
            if obj['type'] != "genomicMatrix":
                continue
            with open(obj['path'], 'U') as fin:
                # rstrip instead of [:-1]: a header without a trailing
                # newline must not lose the last character of its last id.
                header = fin.readline().rstrip("\r\n")
            for sample in header.split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        intDic = {}
        for sample in samples:
            # TCGA uuid handling: ids that are not barcodes are translated
            # through the aliquot table when possible, otherwise used as is.
            if sample[0:4] != "TCGA":
                barcode = aliquote_dic.get(sample.lower(), sample)
            else:
                barcode = sample

            intID = TCGAUtil.barcode_IntegrationId(barcode)
            if intID is None:  # ids is on patient level above integration level
                continue
            if intID not in intDic:
                intDic[intID] = ""

        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = ("TCGA " + TCGAUtil.cancerHumanReadable[cancer] +
                   " (" + cancer + ")")

    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # cancer code -> combined cohort it contributes to
        derived_of = {
            "LUAD": "LUNG",
            "LUSC": "LUNG",
            "COAD": "COADREAD",
            "READ": "COADREAD",
            "GBM": "GBMLGG",
            "LGG": "GBMLGG",
        }
        derived_cancer = derived_of.get(cancer)
        if derived_cancer is not None:
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)