Python TCGAUtil.uuid_Aliquot_all Examples

Example #1

0

Show file

File: CAVM_TCGA.py Project: jianguozhouzunyimedicaluniversity/wrangle

def CAVMid(dir, outDir, cancer, log, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.system("mkdir " + outDir)

    for map in missingMaps:
        print map
        sMap = SampleMapNew(None, map)
        for name in missingMaps[map]:
            samples = []
            intDic = {}  #keyed on CAVMid
            sampleDic = {}  #keyd on original sample id
            obj = bookDic[name]

            print obj["name"]

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                fin = open(outfile + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json " + cFoutfile +
                              ".json")

                if REALRUN == -1:
                    continue

                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                fin = open(obj['path'], 'r')
                fin.readline()
                for line in fin.readlines():
                    sample = string.split(line, "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                fin = open(obj['path'], 'r')
                fout = open(outfile, 'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data = string.split(line, "\t")
                    sample = data[0]
                    try:
                        fout.write(sampleDic[sample] + "\t")
                        fout.write(string.join(data[1:], "\t"))
                    except:
                        fout.write(line)
                fout.close()

            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)

                fin.close()

                outfile = outDir + os.path.basename(obj['path'])

                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)

Example #2

0

Show file

File: TCGASampleMap.py Project: jianguozhouzunyimedicaluniversity/wrangle

def TCGASampleMap (dir, outDir, cancer,log, REALRUN):
    #print status
    print cancer, __name__

    #if cancer in ["PANCAN","PANCAN12"]:
    #    return

    ignore =1
    bookDic = cgWalk(dir,ignore)
    
    existMaps = collectSampleMaps(bookDic)
    missingMaps=  collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map]=existMaps[map]
        
    # all aliquote uuid dic
    aliquote_dic =TCGAUtil.uuid_Aliquot_all()
    sample_dic =TCGAUtil.uuid_Sample_all()
    
    #missingMaps --- actually this is all the maps
    for map in missingMaps:
        print map
        print missingMaps[map]
        sMap =SampleMapNew(None,map)

        #integration id
        intName= map+".integrationID"
        if intName in bookDic:
            fin = open(bookDic[intName]["path"],'r')
            integrationID=IntegrationId(intName, fin)
            fin.close()
        else:
            integrationID=IntegrationId(intName)

        samples =[]
        for name in missingMaps[map]:
            if REALRUN !=1:
                continue
            print name
            obj=bookDic[name]
            if obj['type']=="genomicMatrix":
                fin =open(obj['path'],'U')
                for sample in string.split(fin.readline()[:-1],"\t")[1:]:
                    if sample =="":
                        print name, "has bad empty sample id"
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            #elif obj['type']=="clinicalMatrix":
            #    cMa = ClinicalMatrixNew(obj['path'],name)
            #    for sample in cMa.getROWs():
            #        if sample not in samples:
            #            samples.append(sample)
            elif obj['type'] in ["mutationVector","clinicalMatrix"]:
                path = obj['path']
                os.system("cut -f 1 "+path+ " |sort |uniq > .tmp")
                fin=open('.tmp','r')
                fin.readline()
                for line in fin.readlines():
                    #if string.strip(line)=="":
                    #    break
                    sample = string.strip(line) #string.split(line,'\t')[0]
                    if sample =="":
                        break
                    if sample not in samples:
                        samples.append(sample)

            else:
                continue

        for sample in samples:
            if REALRUN !=1:
                continue
            #TCGA uuid handling
            if sample[0:4]!="TCGA": 
                if aliquote_dic.has_key(string.lower(sample)):
                    TCGAbarcode = aliquote_dic[string.lower(sample)]
                else:
                    print sample
                    continue
                parent = TCGAbarcode
                child = sample
                sMap.addLink(parent,string.lower(child))
                sMap.addLink(parent,string.upper(child))
                sample = parent

            #do TCGA barcode trick
            parts= string.split(sample,"-")
            if len(parts)>3 and len(parts[3])==3:
                parts = parts[0:3]+ [parts[3][0:2],parts[3][2]]+parts[4:]
                #print parts

            """
            parent = string.join(parts[0:3],"-")
            #parts[3]
            if len(parts)>3 and len(parts[3])==3:
                child=parent +"-" +parts[3][0:2]
                sMap.addLink(parent,child)
                parent=child
                child=string.join(parts[0:4],"-")
                sMap.addLink(parent,child)
                parent=child
            """
            parent = string.join(parts[0:3],"-")
            for i in range (3,len(parts)):
                if i!=4:
                    child = parent +"-" +parts[i]
                else:
                    child = parent +parts[i]
                #add parent child
                sMap.addLink(parent,child)
                parent = child
                
            intID= TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)
            
        #output sampleMap
        if not os.path.exists( outDir ):
            os.makedirs( outDir )
        if not os.path.exists( outDir +cancer+"/"):
                os.makedirs( outDir+cancer+"/" )

        if REALRUN == 1:
            oHandle = open(outDir+cancer+"/"+map,"w")
            sMap.store(oHandle)

        #output integrationID
        if REALRUN ==1:
            oHandle = open(outDir+cancer+"/integrationID","w")
            integrationID.store(oHandle)
            oHandle.close()
        
        #output integrationID json
        oHandle = open(outDir+cancer+"/integrationID.json","w")
        J={}
        J['name']=intName

        J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
        J["sample_type"]="tumor"
        if cancer not in ["PANCAN","PANCAN12"]:
            J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"]="cancer"
            
        #J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]
        J['domain']="TCGA"
        J['owner']="TCGA"
        
        J["cgDataVersion"]=1
        J['type']="integrationId"
        J["version"]= datetime.date.today().isoformat()
        oHandle.write( json.dumps( J, indent=-1 ) )
        oHandle.close()
        
        #output json
        oHandle = open(outDir+cancer+"/"+map+".json","w")
        J['name']=map
        J['type']="sampleMap"
        J["version"]= datetime.date.today().isoformat()
        J["cgDataVersion"]=1
        J[":integrationId"]=intName

        #add info for old clinical data
        if os.path.exists( outDir+cancer+"/oldClin.json" ):
            J[':oldClin']=cancer+"_oldClin" 

        #special code
        if TCGAUtil.featurePriority.has_key(cancer) and len(TCGAUtil.featurePriority[cancer])>=5:
            J["VIS"]=5
        
        #blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"]= [ "TCGA-FQ-6551",
                              "TCGA-FQ-6552",
                              "TCGA-FQ-6553",
                              "TCGA-FQ-6554",
                              "TCGA-FQ-6555",
                              "TCGA-FQ-6558",
                              "TCGA-FQ-6559"]
            
        oHandle.write( json.dumps( J, indent=-1 ) )

        
    return

Example #3

0

Show file

import TCGAUtil

# Call each TCGAUtil UUID helper once (appears to be a smoke test -- confirm).
# `dic` is rebound by the second call, so only the uuid_Sample_all() result
# stays bound; the return values of the two cellline calls are discarded.
dic = TCGAUtil.uuid_Aliquot_all()
dic = TCGAUtil.uuid_Sample_all()
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()

Example #4

0

Show file

File: RPPA.py Project: jingchunzhu/cgDataNew

def RPPA (inDir, outDir, cancer, flog,REALRUN):
    print cancer, sys._getframe().f_code.co_name

    PATHPATTERN = "MDA_RPPA_Core"    
    dataProducer= "MD Anderson Cancer Center TCGA proteome characterization center"
    
    garbage=[tmpDir]

    if os.path.exists( tmpDir ):
        os.system("rm -rf "+tmpDir+"*")
    else:
        os.system("mkdir "+tmpDir)

    #multiple files in dir mode
    lastRelease={}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue
        
        if not os.path.exists(inDir +file+".md5"):
            print "file has no matching .md5 throw out", file
            continue
            
        #find lastest in each archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive]= release
        else:
            if lastRelease[archive]< release:
                lastRelease[archive]=release
                

    rootDir =""
    lastDate=None
    remoteDataDirExample =""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue

        if not os.path.exists(inDir +file+".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate=  datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate
            
        if remoteDataDirExample =="":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if string.find(file,".tar.gz")!=-1 and REALRUN :
            os.system("tar -xzf "+inDir+file +" -C "+tmpDir) 
            rootDir =tmpDir
            
    #make sure there is data
    if REALRUN and (rootDir =="" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists( outDir ):
        os.makedirs( outDir )
    if not os.path.exists( outDir +cancer+"/"):
        os.makedirs( outDir+cancer+"/" )

    cgFileName= "RPPA"
    
    #data processing multiple dirs mode
    if REALRUN:
        aliquote_dic =TCGAUtil.uuid_Aliquot_all()
        dataMatrix={}
        allSamples=[]
        probes=[]
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""
                pattern ="protein_expression"
                if string.find(file,pattern)!=-1:
                    infile = rootDir+dataDir+"/"+file
                    sample = string.split(file,".")[5]
                if sample =="":
                    continue            
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4]!="TCGA":
                    if aliquote_dic.has_key(string.lower(sample)):
                        if TCGAUtil.UUID_CELLLINE.has_key(sample):
                            print "control cell line ignore", sample
                            continue
                    else:
                        print "unknow id:", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False: # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue
                if sample not in allSamples:
                    allSamples.append(sample)

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""
                pattern ="protein_expression"
                if string.find(file,pattern)!=-1:
                    infile = rootDir+dataDir+"/"+file
                    sample = string.split(file,".")[5]
                if sample =="":
                    continue
                if sample not in allSamples:
                    continue
                valuePOS=1
                process(dataMatrix,allSamples,sample, probes, cancer,infile,flog, valuePOS)

    
        outfile = outDir+cancer+"/"+cgFileName
        outputMatrix(dataMatrix, allSamples, probes, outfile)

    oHandle = open(outDir+cancer+"/"+cgFileName+".json","w")
    
    J={}
    #stable
    J["dataSubType"]="protein expression RPPA"
    J["redistribution"]= True
    J["dataProducer"]= dataProducer
    J["colNormalization"]=True
    J["PLATFORM"]= "M.D. Anderson Reverse Phase Protein Array Core platform"
    J["type"]= "genomicMatrix" 
    J[":sampleMap"]="TCGA."+cancer+".sampleMap"
    
    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"]= datetime.date.today().isoformat()
    J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat()
    J["wrangling_procedure"]= "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository"
    J["label"]= "RPPA"
    J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") reverse phase protein array"
    
    J[":probeMap"]= "md_anderson_antibodies"

    J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
    J["sample_type"]=["tumor"]
    J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"
    J['domain']="TCGA"
    J['owner']="TCGA"
    J["tags"]=["cancer"]+ TCGAUtil.tags[cancer]
    J["unit"]="normalized RPPA value"
    J["description"]= "TCGA "+ TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>"
    
    J["description"] = J["description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>"
    
    #change cgData
    J["name"]="TCGA_"+cancer+"_RPPA"
    name = trackName_fix(J['name'])
    if name ==False:
        message = "bad object name, need fix otherwise break loader, too long "+J["name"]
        print message
        flog.write(message+"\n")
        return
    else:
        J["name"]=name        
        
    oHandle.write( json.dumps( J, indent=-1 ) )
    oHandle.close()
    
    return

Example #5

0

Show file

File: RPPA.py Project: jianguozhouzunyimedicaluniversity/wrangle

def RPPA(inDir, outDir, cancer, flog, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    PATHPATTERN = "MDA_RPPA_Core"
    dataProducer = "MD Anderson Cancer Center TCGA proteome characterization center"

    garbage = [tmpDir]

    if os.path.exists(tmpDir):
        os.system("rm -rf " + tmpDir + "*")
    else:
        os.system("mkdir " + tmpDir)

    #multiple files in dir mode
    lastRelease = {}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            print "file has no matching .md5 throw out", file
            continue

        #find lastest in each archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive] = release
        else:
            if lastRelease[archive] < release:
                lastRelease[archive] = release

    rootDir = ""
    lastDate = None
    remoteDataDirExample = ""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate

        if remoteDataDirExample == "":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if string.find(file, ".tar.gz") != -1 and REALRUN:
            os.system("tar -xzf " + inDir + file + " -C " + tmpDir)
            rootDir = tmpDir

    #make sure there is data
    if REALRUN and (rootDir == "" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    cgFileName = "RPPA"

    #data processing multiple dirs mode
    if REALRUN:
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        dataMatrix = {}
        allSamples = []
        probes = []
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                pattern = "protein_expression"
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    sample = string.split(file, ".")[5]
                if sample == "":
                    continue
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4] != "TCGA":
                    if aliquote_dic.has_key(string.lower(sample)):
                        if TCGAUtil.UUID_CELLLINE.has_key(sample):
                            print "control cell line ignore", sample
                            continue
                    else:
                        print "unknow id:", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False:  # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue
                if sample not in allSamples:
                    allSamples.append(sample)

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                pattern = "protein_expression"
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    sample = string.split(file, ".")[5]
                if sample == "":
                    continue
                if sample not in allSamples:
                    continue
                valuePOS = 1
                process(dataMatrix, allSamples, sample, probes, cancer, infile,
                        flog, valuePOS)

        outfile = outDir + cancer + "/" + cgFileName
        outputMatrix(dataMatrix, allSamples, probes, outfile)

    oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w")

    J = {}
    #stable
    J["dataSubType"] = "protein expression RPPA"
    J["redistribution"] = True
    J["dataProducer"] = dataProducer
    J["colNormalization"] = True
    J["PLATFORM"] = "M.D. Anderson Reverse Phase Protein Array Core platform"
    J["type"] = "genomicMatrix"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"

    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"] = datetime.date.today().isoformat()
    J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today(
    ).isoformat()
    J["wrangling_procedure"] = "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository"
    J["label"] = "RPPA"
    J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") reverse phase protein array"

    J[":probeMap"] = "md_anderson_antibodies"

    J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
    J["sample_type"] = ["tumor"]
    J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"
    J['domain'] = "TCGA"
    J['owner'] = "TCGA"
    J["tags"] = ["cancer"] + TCGAUtil.tags[cancer]
    J["unit"] = "normalized RPPA value"
    J["description"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>"

    J["description"] = J[
        "description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>"

    #change cgData
    J["name"] = "TCGA_" + cancer + "_RPPA"
    name = trackName_fix(J['name'])
    if name == False:
        message = "bad object name, need fix otherwise break loader, too long " + J[
            "name"]
        print message
        flog.write(message + "\n")
        return
    else:
        J["name"] = name

    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    return

Example #6

0

Show file

File: cohort.py Project: jianguozhouzunyimedicaluniversity/wrangle

def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    print inDir
    print outDir

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        #removeExistMaps
        for map in existMaps:
            if map not in missingMaps:
                missingMaps[map] = existMaps[map]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        map = missingMaps.keys()[0]
        print map
        samples = []
        for name in missingMaps[map]:
            obj = bookDic[name]

            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    if sample not in samples:
                        samples.append(sample)
                fin.close()

                #take too long
                """
            if obj['type']=="mutationVector":
                fin =open(obj['path'],'U')
                fin.readline()
                while 1:
                    line = fin.readline()
                    if string.strip(line) =="":
                        break
                    sample = string.split(line,'\t')[0]
                    if sample not in samples:
                        samples.append(sample)
                        print sample, obj['path']
                fin.close()
            """
        intDic = {}
        for sample in samples:
            #TCGA uuid handling
            uuid = sample
            TCGAbarcode = ""
            if uuid[0:4] != "TCGA":
                if aliquote_dic.has_key(string.lower(uuid)):
                    TCGAbarcode = aliquote_dic[string.lower(uuid)]
                else:
                    TCGAbarcode = uuid
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID == None:  # ids is on patient level above integration level
                continue
            if not intDic.has_key(intID):
                intDic[intID] = ""

        outfile = outDir + cancer + "/" + var
        fout = open(outfile, "w")
        fout.write("sample\t" + var + "\n")
        for intId in intDic:
            fout.write(intId + "\t" + value + "\n")
        fout.close()

    #data josn
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"

    outfile = outDir + cancer + "/" + var
    oHandle = open(outfile + ".json", "w")
    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    if doDerived:
        if cancer in ["LUAD", "LUSC"]:
            derived_cancer = "LUNG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["COAD", "READ"]:
            derived_cancer = "COADREAD"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["GBM", "LGG"]:
            derived_cancer = "GBMLGG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)

Example #7

0

Show file

File: test.py Project: jingchunzhu/cgDataNew

import TCGAUtil

# Call each TCGAUtil UUID helper once (appears to be a smoke test -- confirm).
# `dic` is rebound by the second call, so only the uuid_Sample_all() result
# stays bound; the return values of the two cellline calls are discarded.
dic=TCGAUtil.uuid_Aliquot_all()
dic=TCGAUtil.uuid_Sample_all()
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()

Example #8

0

Show file

File: cohort.py Project: jingchunzhu/cgDataNew

def cohort_variable (var, value, inDir, outDir, cancer, REALRUN, doDerived):
    print inDir
    print outDir

    if REALRUN:
        ignore =1
        bookDic=cgWalk(inDir,ignore)
        
        existMaps = collectSampleMaps(bookDic)
        missingMaps=  collectMissingSampleMaps(bookDic)

        #removeExistMaps
        for map in existMaps:
            if map not in missingMaps:
                missingMaps[map]=existMaps[map]
        
        # all aliquote uuid dic
        aliquote_dic =TCGAUtil.uuid_Aliquot_all()
        sample_dic =TCGAUtil.uuid_Sample_all()

        if len(missingMaps)!=1:
            return

        map = missingMaps.keys()[0]
        print map
        samples =[]
        for name in missingMaps[map]:
            obj=bookDic[name]
            
            if obj['type']=="genomicMatrix":
                fin =open(obj['path'],'U')
                for sample in string.split(fin.readline()[:-1],"\t")[1:]:
                    if sample =="":
                        print name, "has bad empty sample id"
                        sys.exit()
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            
            #take too long
                """
            if obj['type']=="mutationVector":
                fin =open(obj['path'],'U')
                fin.readline()
                while 1:
                    line = fin.readline()
                    if string.strip(line) =="":
                        break
                    sample = string.split(line,'\t')[0]
                    if sample not in samples:
                        samples.append(sample)
                        print sample, obj['path']
                fin.close()
            """
        intDic={}
        for sample in samples:
            #TCGA uuid handling
            uuid =sample
            TCGAbarcode =""
            if uuid[0:4]!="TCGA": 
                if aliquote_dic.has_key(string.lower(uuid)):
                    TCGAbarcode = aliquote_dic[string.lower(uuid)]
                else:
                    TCGAbarcode =  uuid
            else:
                TCGAbarcode = sample

            intID= TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID == None: # ids is on patient level above integration level
                continue 
            if not intDic.has_key(intID):
                intDic[intID]=""

        outfile = outDir+cancer+"/"+ var
        fout =open(outfile,"w")
        fout.write("sample\t"+var+"\n")
        for intId in intDic:
            fout.write(intId+"\t"+ value+"\n")
        fout.close()

    #data josn
    J={}
    J["version"]= datetime.date.today().isoformat()
    J["name"]="TCGA_"+cancer+"_"+var
    J["type"]= "clinicalMatrix" 
    J["dataSubType"]="phenotype"
    J[":sampleMap"]="TCGA."+cancer+".sampleMap"
    J["cohort"]="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"

    outfile = outDir+cancer+"/"+var
    oHandle = open(outfile +".json","w")
    oHandle.write( json.dumps( J, indent=-1 ) )
    oHandle.close()

    if doDerived:
        if cancer in ["LUAD","LUSC"]:
            derived_cancer="LUNG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["COAD","READ"]:
            derived_cancer="COADREAD"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["GBM","LGG"]:
            derived_cancer="GBMLGG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)

Example #9

0

Show file

File: CAVM_TCGA.py Project: jingchunzhu/cgDataNew

def CAVMid (dir, outDir, cancer,log, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    ignore =1
    bookDic=cgWalk(dir,ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps=  collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map]=existMaps[map]
        
    # all aliquote uuid dic
    aliquote_dic =TCGAUtil.uuid_Aliquot_all()
    sample_dic =TCGAUtil.uuid_Sample_all()

    if not os.path.exists (outDir):
        os.system("mkdir "+outDir)
        
    for map in missingMaps:
        print map
        sMap =SampleMapNew(None,map)
        for name in missingMaps[map]:
            samples =[]
            intDic={}#keyed on CAVMid
            sampleDic={} #keyd on original sample id
            obj=bookDic[name]

            print obj["name"]

            if obj['type'] in ["clinicalMatrix","mutationVector"]:
                outfile = outDir +os.path.basename(obj['path'])
                os.system("cp "+obj['path']+".json "+outfile+".json")

                fin = open (outfile+".json",'r')
                J=json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj= bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir +os.path.basename(cFobj['path'])
                    os.system("cp "+cFobj['path']+" "+cFoutfile)
                    os.system("cp "+cFobj['path']+".json "+cFoutfile+".json")
                
                if REALRUN ==-1:
                    continue

                if REALRUN ==0 and obj['type']=="mutationVector":
                    continue

                fin = open(obj['path'],'r')
                fin.readline()
                for line in fin.readlines():
                    sample =string.split(line,"\t")[0]
                    if sample not in samples and sample !="":
                        samples.append(sample)
                buildSampleDic (samples, sMap, intDic, sampleDic, aliquote_dic)

                fin = open(obj['path'],'r')
                fout = open(outfile,'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data =string.split(line,"\t")
                    sample =data[0]
                    try:
                        fout.write(sampleDic[sample]+"\t")
                        fout.write(string.join(data[1:],"\t"))
                    except:
                        fout.write(line)
                fout.close()
                
            if obj['type']=="genomicMatrix":
                fin =open(obj['path'],'U')
                for sample in string.split(fin.readline()[:-1],"\t")[1:]:
                    if sample =="":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)

                fin.close()

                outfile = outDir +os.path.basename(obj['path'])

                os.system("cp "+obj['path']+".json "+outfile+".json")

                if REALRUN !=1:
                    continue
                
                buildSampleDic (samples, sMap, intDic, sampleDic,aliquote_dic)
                process(obj['path'], outfile, samples, intDic)