Python TCGAUtil Examples

Programming Language: Python

Class/Type: TCGAUtil

Examples at hotexamples.com: 23

Python TCGAUtil - 23 examples found. These are the top-rated real-world Python examples of TCGAUtil, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

barcode_SampleType(7)

uuid_Aliquot_all(5)

barcode_IntegrationId(4)

uuid_Sample_all(4)

is_barcode(1)

uuid_cellline(1)

uuid_normal_cellline(1)

Example #1

Show file

def process(samples,cancer,infile,flog, mapping,fout):
    """Append the segments of one single-sample file to ``fout``.

    The first field of the second line of ``infile`` is the raw sample id.
    It is translated through ``mapping``; ids missing from ``mapping`` and
    ids already present in ``samples`` are reported to ``flog`` and stdout
    and the file is skipped.  Samples rejected by the barcode/UUID filter are
    skipped silently.  For an accepted sample the id is appended to
    ``samples`` and every data row is written to ``fout`` as
    ``sample, chrom, start, end, segMean`` (tab separated).

    Parameters:
        samples: list of sample ids accepted so far (appended to).
        cancer: cohort label, used only in log messages.
        infile: path of a tab-separated file with one header line.
        flog: writable log handle.
        mapping: dict-like, raw sample id -> output sample id.
        fout: writable output handle.
    """
    # one sample a file: the first data row is enough to identify it
    fin = open(infile, 'U')
    try:
        fin.readline()  # header
        firstRow = fin.readline()
    finally:
        fin.close()

    sample = firstRow.strip().split("\t")[0]
    if sample not in mapping:
        message = "ERROR sample not in sdrf = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print(message)
        return
    sample = mapping[sample]

    if sample in samples:
        message = "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print(message)
        return

    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4] != "TCGA":
        if sample in TCGAUtil.UUID_NORMAL_CELLLINE:
            return
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            return
        elif sampleTypeCode in ["10","11","20"]:
            # check TCGAUtil for codes
            return

    samples.append(sample)

    # Second pass over the file: emit every segment row.  The handle is now
    # closed in a ``finally`` (the original left it open).
    fin = open(infile, 'U')
    try:
        fin.readline()  # header
        for line in fin.readlines():
            # rstrip instead of line[:-1]: a final row without a trailing
            # newline must not lose the last character of segMean
            row = line.rstrip("\n")
            if not row:
                continue
            rowSample, chrom, start, end, numMark, segMean = row.split("\t")
            rowSample = mapping[rowSample]
            if chrom == "23":
                chrom = "chrX"
            elif chrom == "24":
                chrom = "chrY"
            elif chrom == "M":
                continue
            else:
                chrom = "chr" + chrom
            # coordinates may be written as floats; emit them as integers
            start = str(int(float(start)))
            end = str(int(float(end)))
            fout.write(rowSample+"\t"+chrom+"\t"+start+"\t"+end+"\t"+segMean+"\n")
    finally:
        fin.close()

Example #2

Show file

def process(samples,cancer,infile,flog, mapping,fout):
    """Append the segments of one single-sample file to ``fout``.

    The first field of the second line of ``infile`` is the raw sample id.
    It is translated through ``mapping``; ids missing from ``mapping`` and
    ids already present in ``samples`` are reported to ``flog`` and stdout
    and the file is skipped.  Samples rejected by the barcode/UUID filter are
    skipped silently.  For an accepted sample the id is appended to
    ``samples`` and every data row is written to ``fout`` as
    ``sample, chrom, start, end, segMean`` (tab separated).

    Parameters:
        samples: list of sample ids accepted so far (appended to).
        cancer: cohort label, used only in log messages.
        infile: path of a tab-separated file with one header line.
        flog: writable log handle.
        mapping: dict-like, raw sample id -> output sample id.
        fout: writable output handle.
    """
    # one sample a file: the first data row is enough to identify it
    fin = open(infile, 'U')
    try:
        fin.readline()  # header
        firstRow = fin.readline()
    finally:
        fin.close()

    sample = firstRow.strip().split("\t")[0]
    if sample not in mapping:
        message = "ERROR sample not in sdrf = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print(message)
        return
    sample = mapping[sample]

    if sample in samples:
        message = "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print(message)
        return

    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4] != "TCGA":
        if sample in TCGAUtil.UUID_NORMAL_CELLLINE:
            return
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            return
        elif sampleTypeCode in ["10","11","20"]:
            # check TCGAUtil for codes
            return

    samples.append(sample)

    # Second pass over the file: emit every segment row.  The handle is now
    # closed in a ``finally`` (the original left it open).
    fin = open(infile, 'U')
    try:
        fin.readline()  # header
        for line in fin.readlines():
            # rstrip instead of line[:-1]: a final row without a trailing
            # newline must not lose the last character of segMean
            row = line.rstrip("\n")
            if not row:
                continue
            rowSample, chrom, start, end, numMark, segMean = row.split("\t")
            rowSample = mapping[rowSample]
            if chrom == "23":
                chrom = "chrX"
            elif chrom == "24":
                chrom = "chrY"
            elif chrom == "M":
                continue
            else:
                chrom = "chr" + chrom
            # coordinates may be written as floats; emit them as integers
            start = str(int(float(start)))
            end = str(int(float(end)))
            fout.write(rowSample+"\t"+chrom+"\t"+start+"\t"+end+"\t"+segMean+"\n")
    finally:
        fin.close()

Example #3

Show file

File: Methylation450.py Project: jianguozhouzunyimedicaluniversity/wrangle

def process(c, dataMatrix, allSamples, samples, probes, cancer, infile, flog,
            BETA_POS, offset, maxLength):
    # one sample a file
    fin = open(infile, 'U')
    line = fin.readline()

    sample = string.split(line[:-1], "\t")[BETA_POS]
    if sample in allSamples or sample in samples:
        fin.close()
        message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__
        flog.write(message + "\n")
        print message
        return c

    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4] != "TCGA":
        if TCGAUtil.UUID_CELLLINE.has_key(sample):
            print "control cell line ignore", sample
            fin.close()
            return c
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False:  # likely a uuid
            fin.close()
            return c
        elif sampleTypeCode in ["20"]:
            print "control cell line ignore", sample
            fin.close()
            return c

    p = len(samples)
    samples[sample] = p
    allSamples[sample] = ""
    c = c + 1

    fin.readline()

    for line in fin.readlines():
        data = string.split(line[:-1], "\t")
        probe = data[0]
        value = data[BETA_POS]

        if probe not in probes:
            p = len(probes)
            probes[probe] = p
            l = []
            for j in range(0, maxLength):
                l.append("")
            dataMatrix.append(l)

        if value not in ["", "null", "NULL", "Null", "NA"]:
            value = float(value) + offset
            x = probes[probe]
            y = samples[sample]
            dataMatrix[x][y] = value

    fin.close()
    return c

Example #4

Show file

def process(c, dataMatrix,allSamples, samples, probes, cancer,infile,flog, BETA_POS, offset, maxLength):
    # one sample a file
    fin=open(infile,'U')    
    line = fin.readline()

    sample = string.split(line[:-1],"\t")[BETA_POS]
    if sample in allSamples or sample in samples:
        fin.close()
        message =  "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print message
        return c

    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4]!="TCGA":
        if TCGAUtil.UUID_CELLLINE.has_key(sample):
            print "control cell line ignore", sample
            fin.close()
            return c
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False: # likely a uuid
            fin.close()
            return c
        elif sampleTypeCode in ["20"]:
            print "control cell line ignore", sample
            fin.close()
            return c

    p=len(samples)
    samples[sample]=p
    allSamples[sample]=""
    c= c+1

    fin.readline()
    
    for line in fin.readlines():
        data =string.split(line[:-1],"\t")
        probe = data[0]
        value= data[BETA_POS]

        if probe not in probes:
            p=len(probes)
            probes[probe]=p
            l=[]
            for j in range (0,maxLength):
                l.append("")    
            dataMatrix.append(l)
        
        if value not in ["","null","NULL","Null","NA"]:
            value = float(value)+ offset
            x=probes[probe]
            y=samples[sample]
            dataMatrix[x][y]=value

    fin.close()
    return c

Example #5

Show file

File: Affy.py Project: jianguozhouzunyimedicaluniversity/wrangle

def process(dataMatrix,samples,cancer,infile,mapping, flog):
    # one sample a file
    fin=open(infile,'U')
    line = fin.readline()

    sample = string.split(string.strip(line),"\t")[1]
    if mapping.has_key(sample):
        sample=mapping[sample]
    else:
        fin.close()
        message =  "ERROR sample not in sdrf = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print message
        return

    if sample in samples:
        fin.close()
        message =  "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print message
        return

    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4]!="TCGA":
        if TCGAUtil.UUID_CELLLINE.has_key(sample):
            print "control cell line ignore", sample
            fin.close()
            return
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False: # likely a uuid
            fin.close()
            return
        elif sampleTypeCode in ["20"]:
            fin.close()
            print "control cell line ignore", sample
            return

    samples.append(sample)
    fin.readline()

    for line in fin.readlines():
        hugo,value =string.split(line[:-1],"\t")
        if not dataMatrix.has_key(hugo):
            dataMatrix[hugo]={}
        if value not in ["","null","NULL","Null","NA"]:
            dataMatrix[hugo][sample]=value
        else:
            dataMatrix[hugo][sample]="NA"

    fin.close()
    return

Example #6

Show file

File: Clinical.py Project: jianguozhouzunyimedicaluniversity/wrangle

def add_col_PseudoSample(clinMatrix,
                         col):  # add sample type informatin to pseudo samples
    rows = clinMatrix.getROWs()
    for row in rows:
        st = clinMatrix.getDATA(row, col)
        if st != None and st != "":
            #assuming sample ids are TCGA barcode
            integration_id = TCGAUtil.barcode_IntegrationId(row)
            if clinMatrix.hasRow(integration_id):
                clinMatrix.setDATA(integration_id, col, st)
            else:
                clinMatrix.addNewRows([integration_id], {col: st})

    r = clinMatrix.validate()
    if r == False:
        print "add pseudoSample clinical infor", col, "fail"

Example #7

Show file

File: Methylation.py Project: jianguozhouzunyimedicaluniversity/wrangle

def betaMean(total, count,samples, probes,cancer,infile,flog):
    # one sample a file
    fin=open(infile,'r')
    line = fin.readline()
    sample = string.split(string.strip(line),"\t")[BETA_POS]
    if sample in samples:
        fin.close()
        message =  "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print message
        return total, count


    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4]!="TCGA":
        print sample
        if TCGAUtil.UUID_CELLLINE.has_key(sample):
            print "control cell line ignore", sample
            fin.close()
            return total,count
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False: # likely a uuid
            fin.close()
            return
        elif sampleTypeCode in ["20"]:
            print "control cell line ignore", sample
            fin.close()
            return total,count

    p=len(samples)
    samples[sample]=p
    
    fin.readline()

    for line in fin.readlines():
        probe,value =string.split(line[:-1],"\t")[0:BETA_POS+1]
        if probe not in probes:
            p = len(probes)
            probes[probe]= p
        if value in ["","null","NULL","Null","NA"]:
            continue
        value = float(value)
        total = total +value
        count = count +1
    fin.close()
    return total, count

Example #8

Show file

def betaMean(total, count,samples, probes,cancer,infile,flog):
    # one sample a file
    fin=open(infile,'r')
    line = fin.readline()
    sample = string.split(string.strip(line),"\t")[BETA_POS]
    if sample in samples:
        fin.close()
        message =  "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__
        flog.write(message+"\n")
        print message
        return total, count


    # Test for barcode or UUID     #throw out all normals and control Analyte
    if sample[0:4]!="TCGA":
        print sample
        if TCGAUtil.UUID_CELLLINE.has_key(sample):
            print "control cell line ignore", sample
            fin.close()
            return total,count
    else:
        sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
        if sampleTypeCode == False: # likely a uuid
            fin.close()
            return
        elif sampleTypeCode in ["20"]:
            print "control cell line ignore", sample
            fin.close()
            return total,count

    p=len(samples)
    samples[sample]=p
    
    fin.readline()

    for line in fin.readlines():
        probe,value =string.split(line[:-1],"\t")[0:BETA_POS+1]
        if probe not in probes:
            p = len(probes)
            probes[probe]= p
        if value in ["","null","NULL","Null","NA"]:
            continue
        value = float(value)
        total = total +value
        count = count +1
    fin.close()
    return total, count

Example #9

Show file

File: Clinical.py Project: jianguozhouzunyimedicaluniversity/wrangle

def uuid_2_barcode(
        clinMatrix, uuidcol, mapDic,
        flog):  #convert uuid to barcode, if uuid not found, remove the sample
    rows = clinMatrix.getROWs()
    removeSamples = []
    for row in rows:
        uuid = clinMatrix.getDATA(row, uuidcol)
        if TCGAUtil.is_barcode(uuid) == True:
            continue
        try:
            barcode = mapDic[string.lower(uuid)]
            clinMatrix.replaceValueInCol(uuidcol, uuid, barcode)
        except KeyError:
            removeSamples.append(row)
            print uuid, "not found"
            flog.write(uuid + " not found\n")

    if len(removeSamples) > 0:
        r = clinMatrix.removeRows(removeSamples, True)
        if not r:
            print "fail to validate"
    clinMatrix.replaceColName(uuidcol, "tcgaBarCode")

Example #10

Show file

File: CAVM_TCGA.py Project: jingchunzhu/cgDataNew

def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Register every sample in the sample map and the id dictionaries.

    For each id in ``samples``:
      * an id that does not start with ``TCGA`` is treated as a uuid and
        translated to a barcode via ``aliquote_dic`` (keyed on the
        lower-cased uuid); a barcode -> uuid link is added to ``sMap``;
      * parent -> child links are added to ``sMap`` for each successively
        longer prefix of the barcode;
      * ``intDic[integration id]`` collects the original ids and
        ``sampleDic[original id]`` stores the integration id returned by
        ``TCGAUtil.barcode_IntegrationId``.

    A uuid that is missing from ``aliquote_dic`` is printed and skipped.
    Previously the code fell through and used ``TCGAbarcode`` while it was
    either unbound (crash on the first sample) or still holding the barcode
    of the previous sample (silent mis-mapping).
    """
    for sample in samples:
        #TCGA uuid handling
        uuid = sample
        if sample[0:4] != "TCGA":
            key = sample.lower()
            if key not in aliquote_dic:
                print(sample)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, sample)
            sample = barcode

        #do TCGA barcode trick
        parts = sample.split("-")
        parent = "-".join(parts[0:3])

        #parts[3]: a three-character field yields two levels, first its
        #two-character prefix and then the full field
        if len(parts) > 3 and len(parts[3]) == 3:
            child = parent + "-" + parts[3][0:2]
            sMap.addLink(parent, child)
            parent = child
            child = "-".join(parts[0:4])
            sMap.addLink(parent, child)
            parent = child

        # every remaining field adds one more level
        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(sample)
        intDic.setdefault(intID, []).append(uuid)
        sampleDic[uuid] = intID

Example #11

Show file

File: CAVM_TCGA.py Project: jianguozhouzunyimedicaluniversity/wrangle

def buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic):
    """Register every sample in the sample map and the id dictionaries.

    For each id in ``samples``:
      * an id that does not start with ``TCGA`` is treated as a uuid and
        translated to a barcode via ``aliquote_dic`` (keyed on the
        lower-cased uuid); a barcode -> uuid link is added to ``sMap``;
      * parent -> child links are added to ``sMap`` for each successively
        longer prefix of the barcode;
      * ``intDic[integration id]`` collects the original ids and
        ``sampleDic[original id]`` stores the integration id returned by
        ``TCGAUtil.barcode_IntegrationId``.

    A uuid that is missing from ``aliquote_dic`` is printed and skipped.
    Previously the code fell through and used ``TCGAbarcode`` while it was
    either unbound (crash on the first sample) or still holding the barcode
    of the previous sample (silent mis-mapping).
    """
    for sample in samples:
        #TCGA uuid handling
        uuid = sample
        if sample[0:4] != "TCGA":
            key = sample.lower()
            if key not in aliquote_dic:
                print(sample)
                continue
            barcode = aliquote_dic[key]
            sMap.addLink(barcode, sample)
            sample = barcode

        #do TCGA barcode trick
        parts = sample.split("-")
        parent = "-".join(parts[0:3])

        #parts[3]: a three-character field yields two levels, first its
        #two-character prefix and then the full field
        if len(parts) > 3 and len(parts[3]) == 3:
            child = parent + "-" + parts[3][0:2]
            sMap.addLink(parent, child)
            parent = child
            child = "-".join(parts[0:4])
            sMap.addLink(parent, child)
            parent = child

        # every remaining field adds one more level
        for part in parts[4:]:
            child = parent + "-" + part
            sMap.addLink(parent, child)
            parent = child

        intID = TCGAUtil.barcode_IntegrationId(sample)
        intDic.setdefault(intID, []).append(uuid)
        sampleDic[uuid] = intID

Example #12

Show file

File: CAVM_TCGA.py Project: jingchunzhu/cgDataNew

def CAVMid (dir, outDir, cancer,log, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    ignore =1
    bookDic=cgWalk(dir,ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps=  collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map]=existMaps[map]
        
    # all aliquote uuid dic
    aliquote_dic =TCGAUtil.uuid_Aliquot_all()
    sample_dic =TCGAUtil.uuid_Sample_all()

    if not os.path.exists (outDir):
        os.system("mkdir "+outDir)
        
    for map in missingMaps:
        print map
        sMap =SampleMapNew(None,map)
        for name in missingMaps[map]:
            samples =[]
            intDic={}#keyed on CAVMid
            sampleDic={} #keyd on original sample id
            obj=bookDic[name]

            print obj["name"]

            if obj['type'] in ["clinicalMatrix","mutationVector"]:
                outfile = outDir +os.path.basename(obj['path'])
                os.system("cp "+obj['path']+".json "+outfile+".json")

                fin = open (outfile+".json",'r')
                J=json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj= bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir +os.path.basename(cFobj['path'])
                    os.system("cp "+cFobj['path']+" "+cFoutfile)
                    os.system("cp "+cFobj['path']+".json "+cFoutfile+".json")
                
                if REALRUN ==-1:
                    continue

                if REALRUN ==0 and obj['type']=="mutationVector":
                    continue

                fin = open(obj['path'],'r')
                fin.readline()
                for line in fin.readlines():
                    sample =string.split(line,"\t")[0]
                    if sample not in samples and sample !="":
                        samples.append(sample)
                buildSampleDic (samples, sMap, intDic, sampleDic, aliquote_dic)

                fin = open(obj['path'],'r')
                fout = open(outfile,'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data =string.split(line,"\t")
                    sample =data[0]
                    try:
                        fout.write(sampleDic[sample]+"\t")
                        fout.write(string.join(data[1:],"\t"))
                    except:
                        fout.write(line)
                fout.close()
                
            if obj['type']=="genomicMatrix":
                fin =open(obj['path'],'U')
                for sample in string.split(fin.readline()[:-1],"\t")[1:]:
                    if sample =="":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)

                fin.close()

                outfile = outDir +os.path.basename(obj['path'])

                os.system("cp "+obj['path']+".json "+outfile+".json")

                if REALRUN !=1:
                    continue
                
                buildSampleDic (samples, sMap, intDic, sampleDic,aliquote_dic)
                process(obj['path'], outfile, samples, intDic)

Example #13

Show file

File: cohort.py Project: jianguozhouzunyimedicaluniversity/wrangle

def cohort_variable(var, value, inDir, outDir, cancer, REALRUN, doDerived):
    print inDir
    print outDir

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        #removeExistMaps
        for map in existMaps:
            if map not in missingMaps:
                missingMaps[map] = existMaps[map]

        # all aliquote uuid dic
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        sample_dic = TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        map = missingMaps.keys()[0]
        print map
        samples = []
        for name in missingMaps[map]:
            obj = bookDic[name]

            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    if sample not in samples:
                        samples.append(sample)
                fin.close()

                #take too long
                """
            if obj['type']=="mutationVector":
                fin =open(obj['path'],'U')
                fin.readline()
                while 1:
                    line = fin.readline()
                    if string.strip(line) =="":
                        break
                    sample = string.split(line,'\t')[0]
                    if sample not in samples:
                        samples.append(sample)
                        print sample, obj['path']
                fin.close()
            """
        intDic = {}
        for sample in samples:
            #TCGA uuid handling
            uuid = sample
            TCGAbarcode = ""
            if uuid[0:4] != "TCGA":
                if aliquote_dic.has_key(string.lower(uuid)):
                    TCGAbarcode = aliquote_dic[string.lower(uuid)]
                else:
                    TCGAbarcode = uuid
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID == None:  # ids is on patient level above integration level
                continue
            if not intDic.has_key(intID):
                intDic[intID] = ""

        outfile = outDir + cancer + "/" + var
        fout = open(outfile, "w")
        fout.write("sample\t" + var + "\n")
        for intId in intDic:
            fout.write(intId + "\t" + value + "\n")
        fout.close()

    #data josn
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"

    outfile = outDir + cancer + "/" + var
    oHandle = open(outfile + ".json", "w")
    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    if doDerived:
        if cancer in ["LUAD", "LUSC"]:
            derived_cancer = "LUNG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["COAD", "READ"]:
            derived_cancer = "COADREAD"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)
        if cancer in ["GBM", "LGG"]:
            derived_cancer = "GBMLGG"
            doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)

Example #14

Show file

def geneRPKM(inDir, outDir, cancer, flog, PATHPATTERN, suffix, namesuffix,
             dataProducer, REALRUN, clean):
    garbage = [tmpDir]
    os.system("rm -rf tmp_*")
    if os.path.exists(tmpDir):
        if clean:
            os.system("rm -rf " + tmpDir + "*")
    else:
        os.system("mkdir " + tmpDir)

    #multiple files in dir mode
    lastRelease = {}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            print "file has no matching .md5 throw out", file
            continue

        #find lastest in each archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive] = release
        else:
            if lastRelease[archive] < release:
                lastRelease[archive] = release

    rootDir = ""
    lastDate = None
    remoteDataDirExample = ""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate

        if remoteDataDirExample == "":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if not clean:
            rootDir = tmpDir
        elif string.find(file, ".tar.gz") != -1 and REALRUN and clean:
            os.system("tar -xzf " + inDir + file + " -C " + tmpDir)
            rootDir = tmpDir

    #make sure there is data
    if REALRUN and (rootDir == "" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    cgFileName = namesuffix

    #data processing multiple dirs mode
    if REALRUN:
        #hg19 or not
        pattern = ".hg19.mirbase20.isoform.quantification"
        HG19 = 0
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                if string.find(file, pattern) != -1:
                    HG19 = 1
                    break
            if HG19:
                break
        if HG19:
            pattern = ".hg19.mirbase20.isoform.quantification"
        else:
            pattern = ".isoform.quantification"

        allSamples = {}
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                #bcgsc gene
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0][:15]
                    else:
                        print "please check how to identify sample name"

                if sample == "":
                    continue
                if sample in allSamples:
                    print len(allSamples)
                    message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__ + file
                    flog.write(message + "\n")
                    print message
                    continue

                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4] != "TCGA":
                    if TCGAUtil.UUID_CELLLINE.has_key(sample):
                        print "control cell line ignore", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False:  # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue

                p = len(allSamples)
                allSamples[sample] = p

        c = 0
        dataMatrix = []
        mapping = {}
        tmpSamples = {}
        genes = {}
        oldgenes = {}
        files = []
        GOOD = 1
        BATCH = 2500
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0][:15]
                    else:
                        print "please check how to identify sample name"
                    valuePOS = 3
                    LOG2 = 1

                if sample == "":
                    continue
                if sample in tmpSamples:  #duplicated samples
                    continue
                if sample not in allSamples:
                    continue

                p = len(tmpSamples)
                tmpSamples[sample] = p

                c = c + 1
                process(dataMatrix, mapping, tmpSamples, sample, genes, cancer,
                        infile, flog, valuePOS, LOG2, BATCH)

                if (c % BATCH) == 0:
                    tmpout = "tmp_" + str(int(c / float(BATCH)))
                    r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes,
                                     tmpout, flog)
                    if r:
                        GOOD = 0

                    dataMatrix = []
                    tmpSamples = {}
                    oldgenes = copy.deepcopy(genes)
                    genes = {}
                    files.append(tmpout)

        if (c % BATCH) != 0:
            tmpout = "tmp_" + str(int(c / float(BATCH)) + 1)
            files.append(tmpout)
            r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout,
                             flog)
            if r:
                GOOD = 0

        #paste all together
        outfile = outDir + cancer + "/" + cgFileName
        if GOOD:
            os.system("paste -d \'\' " + string.join(files, " ") + " > " +
                      outfile)
        for file in files:
            os.system("rm " + file)
        if not GOOD:
            sys.exit()

        #probeMap
        probefile = outDir + cancer + "/" + namesuffix + ".probeMap"
        outputProbeMap(probefile, mapping)

        #transcript datafile
        datafile = outDir + cancer + "/" + cgFileName
        if not os.path.exists(datafile):
            return

        #gene datafile
        genefile = outDir + cancer + "/" + cgFileName + "_gene"
        os.system(
            "python ../support/genomicMatrixToGeneMatrix_memInEfficient.py " +
            datafile + " " + probefile + ' ' + genefile + " add 1 1")

    #probeMap json
    probefile = outDir + cancer + "/" + namesuffix + ".probeMap"
    fout = open(probefile + ".json", "w")
    J = {}
    J['type'] = 'probeMap'
    J['assembly'] = 'hg19'
    J['name'] = cancer + '_' + namesuffix + ".probeMap"
    fout.write(json.dumps(J, indent=-1))
    fout.close()

    oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w")
    J = {}
    #stable
    J["cgDataVersion"] = 1
    J["redistribution"] = True
    J["dataProducer"] = dataProducer
    J["colNormalization"] = True
    J["PLATFORM"] = PATHPATTERN
    J["type"] = "genomicMatrix"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"

    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"] = datetime.date.today().isoformat()
    J["wrangler"] = "cgData TCGAscript " + __name__ + " processed on " + datetime.date.today(
    ).isoformat()
    J["unit"] = "log2(RPM+1)"
    J[":probeMap"] = cancer + '_' + namesuffix + ".probeMap"

    if PATHPATTERN in ["IlluminaHiSeq_miRNASeq"]:
        platformTitle = "Illumina HiSeq 2000 RNA Sequencing platform"
    if PATHPATTERN in ["IlluminaGA_miRNASeq"]:
        platformTitle = " Illumina Genome Analyzer RNA Sequencing platform"

    #change description
    J["description"] = ""
    J["dataSubType"] = "miRNA isoform expression RNAseq"
    J["label"] = suffix
    J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") miRNA isoform expression by RNAseq (" + suffix + ")"
    J["description"]= J["description"] +"TCGA "+ TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") miRNA expression by RNAseq. The miRNA expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
        " Level 3 interpreted level data was downloaded from TCGA data coordination center. Download data is in the unit of reads per million mapped reads (RPM). This dataset shows the miRNA transcription estimates in log2 (RPM). For more information see: http://nar.oxfordjournals.org/content/early/2015/08/13/nar.gkv808.full ."

    J["description"] = J["description"] +\
                       "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each miRNA to zero by independently subtracting the mean of each miRNA across the cohort on the fly. Users can view the original non-normalized values by adjusting visualization settings."
    J["description"] = J["description"] + "<br><br>"

    J["wrangling_procedure"] = "Level_3 Data (file names: *.isoform.quantification.txt) download from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."

    J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
    J["sample_type"] = ["tumor"]
    J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"
    J['domain'] = "TCGA"
    J['tags'] = ["cancer"] + TCGAUtil.tags[cancer]
    J['owner'] = "TCGA"
    J['gdata_tags'] = ["transcription", "miRNA"]

    #change cgData
    J["name"] = "TCGA_" + cancer + "_" + namesuffix
    name = trackName_fix(J['name'])
    if name == False:
        message = "bad object name, need fix otherwise break loader, too long " + J[
            "name"]
        print message
        flog.write(message + "\n")
        return
    else:
        J["name"] = name

    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    #gene datafile json
    genefile = outDir + cancer + "/" + cgFileName + "_gene"
    oHandle = open(genefile + ".json", "w")
    J.pop("name")
    J.pop(":probeMap")
    J.pop("description")
    J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") miRNA mature strand expression by RNAseq (" + suffix + ")"
    J["dataSubType"] = "miRNA mature strand expression RNAseq"
    J["label"] = suffix
    J["wrangling_procedure"] = "Level_3 Data (file names: *.isoform.quantification.txt) download from TCGA DCC, for each sample, all isoform expression for the same miRNA mature strand are added together, log2(total_RPM +1) transformed, and deposited at UCSC into Xena repository."
    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    return

Example #15

Show file

File: TCGASampleMap.py Project: jianguozhouzunyimedicaluniversity/wrangle

def TCGASampleMap (dir, outDir, cancer,log, REALRUN):
    #print status
    print cancer, __name__

    #if cancer in ["PANCAN","PANCAN12"]:
    #    return

    ignore =1
    bookDic = cgWalk(dir,ignore)
    
    existMaps = collectSampleMaps(bookDic)
    missingMaps=  collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map]=existMaps[map]
        
    # all aliquote uuid dic
    aliquote_dic =TCGAUtil.uuid_Aliquot_all()
    sample_dic =TCGAUtil.uuid_Sample_all()
    
    #missingMaps --- actually this is all the maps
    for map in missingMaps:
        print map
        print missingMaps[map]
        sMap =SampleMapNew(None,map)

        #integration id
        intName= map+".integrationID"
        if intName in bookDic:
            fin = open(bookDic[intName]["path"],'r')
            integrationID=IntegrationId(intName, fin)
            fin.close()
        else:
            integrationID=IntegrationId(intName)

        samples =[]
        for name in missingMaps[map]:
            if REALRUN !=1:
                continue
            print name
            obj=bookDic[name]
            if obj['type']=="genomicMatrix":
                fin =open(obj['path'],'U')
                for sample in string.split(fin.readline()[:-1],"\t")[1:]:
                    if sample =="":
                        print name, "has bad empty sample id"
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            #elif obj['type']=="clinicalMatrix":
            #    cMa = ClinicalMatrixNew(obj['path'],name)
            #    for sample in cMa.getROWs():
            #        if sample not in samples:
            #            samples.append(sample)
            elif obj['type'] in ["mutationVector","clinicalMatrix"]:
                path = obj['path']
                os.system("cut -f 1 "+path+ " |sort |uniq > .tmp")
                fin=open('.tmp','r')
                fin.readline()
                for line in fin.readlines():
                    #if string.strip(line)=="":
                    #    break
                    sample = string.strip(line) #string.split(line,'\t')[0]
                    if sample =="":
                        break
                    if sample not in samples:
                        samples.append(sample)

            else:
                continue

        for sample in samples:
            if REALRUN !=1:
                continue
            #TCGA uuid handling
            if sample[0:4]!="TCGA": 
                if aliquote_dic.has_key(string.lower(sample)):
                    TCGAbarcode = aliquote_dic[string.lower(sample)]
                else:
                    print sample
                    continue
                parent = TCGAbarcode
                child = sample
                sMap.addLink(parent,string.lower(child))
                sMap.addLink(parent,string.upper(child))
                sample = parent

            #do TCGA barcode trick
            parts= string.split(sample,"-")
            if len(parts)>3 and len(parts[3])==3:
                parts = parts[0:3]+ [parts[3][0:2],parts[3][2]]+parts[4:]
                #print parts

            """
            parent = string.join(parts[0:3],"-")
            #parts[3]
            if len(parts)>3 and len(parts[3])==3:
                child=parent +"-" +parts[3][0:2]
                sMap.addLink(parent,child)
                parent=child
                child=string.join(parts[0:4],"-")
                sMap.addLink(parent,child)
                parent=child
            """
            parent = string.join(parts[0:3],"-")
            for i in range (3,len(parts)):
                if i!=4:
                    child = parent +"-" +parts[i]
                else:
                    child = parent +parts[i]
                #add parent child
                sMap.addLink(parent,child)
                parent = child
                
            intID= TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)
            
        #output sampleMap
        if not os.path.exists( outDir ):
            os.makedirs( outDir )
        if not os.path.exists( outDir +cancer+"/"):
                os.makedirs( outDir+cancer+"/" )

        if REALRUN == 1:
            oHandle = open(outDir+cancer+"/"+map,"w")
            sMap.store(oHandle)

        #output integrationID
        if REALRUN ==1:
            oHandle = open(outDir+cancer+"/integrationID","w")
            integrationID.store(oHandle)
            oHandle.close()
        
        #output integrationID json
        oHandle = open(outDir+cancer+"/integrationID.json","w")
        J={}
        J['name']=intName

        J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
        J["sample_type"]="tumor"
        if cancer not in ["PANCAN","PANCAN12"]:
            J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"]="cancer"
            
        #J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]
        J['domain']="TCGA"
        J['owner']="TCGA"
        
        J["cgDataVersion"]=1
        J['type']="integrationId"
        J["version"]= datetime.date.today().isoformat()
        oHandle.write( json.dumps( J, indent=-1 ) )
        oHandle.close()
        
        #output json
        oHandle = open(outDir+cancer+"/"+map+".json","w")
        J['name']=map
        J['type']="sampleMap"
        J["version"]= datetime.date.today().isoformat()
        J["cgDataVersion"]=1
        J[":integrationId"]=intName

        #add info for old clinical data
        if os.path.exists( outDir+cancer+"/oldClin.json" ):
            J[':oldClin']=cancer+"_oldClin" 

        #special code
        if TCGAUtil.featurePriority.has_key(cancer) and len(TCGAUtil.featurePriority[cancer])>=5:
            J["VIS"]=5
        
        #blackList in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"]= [ "TCGA-FQ-6551",
                              "TCGA-FQ-6552",
                              "TCGA-FQ-6553",
                              "TCGA-FQ-6554",
                              "TCGA-FQ-6555",
                              "TCGA-FQ-6558",
                              "TCGA-FQ-6559"]
            
        oHandle.write( json.dumps( J, indent=-1 ) )

        
    return

Example #16

Show file

def geneRPKM(inDir, outDir, cancer, flog, PATHPATTERN, suffix, namesuffix,
             dataProducer, REALRUN, clean):
    garbage = [tmpDir]
    os.system("rm -rf tmp_*")
    if os.path.exists(tmpDir):
        if clean:
            os.system("rm -rf " + tmpDir + "*")
    else:
        os.system("mkdir " + tmpDir)

    #multiple files in dir mode
    lastRelease = {}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            print "file has no matching .md5 throw out", file
            continue

        #find lastest in each archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive] = release
        else:
            if lastRelease[archive] < release:
                lastRelease[archive] = release

    rootDir = ""
    lastDate = None
    remoteDataDirExample = ""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate

        if remoteDataDirExample == "":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if not clean:
            rootDir = tmpDir
        elif string.find(file, ".tar.gz") != -1 and REALRUN and clean:
            os.system("tar -xzf " + inDir + file + " -C " + tmpDir)
            rootDir = tmpDir

    #make sure there is data
    if REALRUN and (rootDir == "" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    cgFileName = namesuffix

    #data processing multiple dirs mode
    if REALRUN:
        allSamples = {}

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""

                #v2 bcgsc gene
                pattern = ".gene.quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") == -1:
                    #check if there is .v2
                    if string.find(file, ".v2.") == -1:
                        V2 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".v2") != -1:
                                V2 = 1
                                break
                        if V2:
                            continue

                    if string.find(file, ".hg19.") == -1:
                        HG19 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".hg19.") != -1:
                                HG19 = 1
                                break
                        if HG19:
                            continue

                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0]
                    else:
                        print "please check how to identify sample name"

                #v2 bcgsc exon
                pattern = ".exon.quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") != -1:
                    #check if there is .v2
                    if string.find(file, ".v2.") == -1:
                        V2 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".v2.") != -1:
                                V2 = 1
                                break
                        if V2:
                            continue

                    if string.find(file, ".hg19.") == -1:
                        HG19 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".hg19.") != -1:
                                HG19 = 1
                                break
                        if HG19:
                            continue

                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0]
                    else:
                        print "please check how to identify sample name"

                #v2
                pattern = "rsem.genes.normalized_results"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") == -1:
                    infile = rootDir + dataDir + "/" + file
                    # unc stupid sample name in file name
                    if dataProducer == "University of North Carolina TCGA genome characterization center":
                        sample = string.split(file, ".")[2]
                    else:
                        print "please check how to identify sample name"
                #v2 exon from unc
                pattern = "bt.exon_quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") != -1:
                    infile = rootDir + dataDir + "/" + file
                    # unc stupid sample name in file name
                    if dataProducer == "University of North Carolina TCGA genome characterization center":
                        sample = string.split(file, ".")[2]
                    else:
                        print "please check how to identify sample name"

                if sample == "":
                    continue
                if sample in allSamples:
                    print len(allSamples)
                    message = "ERROR duplicated sample = " + sample + " " + cancer + " " + __name__ + file
                    flog.write(message + "\n")
                    print message
                    continue
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4] != "TCGA":
                    if TCGAUtil.UUID_CELLLINE.has_key(sample):
                        print "control cell line ignore", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False:  # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue

                p = len(allSamples)
                allSamples[sample] = p

        c = 0
        dataMatrix = []
        tmpSamples = {}
        genes = {}
        oldgenes = {}
        files = []
        GOOD = 1
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                #bcgsc v1 and 2
                pattern = "gene.quantification"
                altpattern = ".v2.gene.quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") == -1:
                    #check if there is .v2
                    if string.find(file, ".v2.") == -1:
                        V2 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, altpattern) != -1:
                                V2 = 1
                                break
                        if V2:
                            continue
                    if string.find(file, ".hg19.") == -1:
                        HG19 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".hg19.") != -1:
                                HG19 = 1
                                break
                        if HG19:
                            continue

                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0]
                    else:
                        print "please check how to identify sample name"
                    valuePOS = 3
                    LOG2 = 1
                    RANK = 0

                #bcgsc exon v1 and v2
                pattern = "exon.quantification"
                altpattern = ".v2.exon.quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") != -1:
                    #check if there is .v2
                    if string.find(file, ".v2.") == -1:
                        V2 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, altpattern) != -1:
                                V2 = 1
                                break
                        if V2:
                            continue

                    if string.find(file, ".hg19.") == -1:
                        HG19 = 0
                        for file2 in os.listdir(rootDir + dataDir):
                            if string.find(file2, ".hg19.") != -1:
                                HG19 = 1
                                break
                        if HG19:
                            continue

                    infile = rootDir + dataDir + "/" + file
                    # bcgsc stupid sample name in file name
                    if dataProducer == "British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file, ".")[0]
                    else:
                        print "please check how to identify sample name"
                    valuePOS = 3
                    LOG2 = 1
                    RANK = 0

                #v2
                pattern = "rsem.genes.normalized_results"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") == -1:
                    infile = rootDir + dataDir + "/" + file
                    # unc stupid sample name in file name
                    if dataProducer == "University of North Carolina TCGA genome characterization center":
                        sample = string.split(file, ".")[2]
                    else:
                        print "please check how to identify sample name"
                    if string.find(
                            namesuffix, "percentile"
                    ) != -1:  #generated percentileRANK based data
                        RANK = 1
                    else:
                        RANK = 0
                    valuePOS = 1
                    LOG2 = 1

                #v2 exon from unc
                pattern = "bt.exon_quantification"
                if string.find(file, pattern) != -1 and string.find(
                        namesuffix, "exon") != -1:
                    infile = rootDir + dataDir + "/" + file
                    # unc stupid sample name in file name
                    if dataProducer == "University of North Carolina TCGA genome characterization center":
                        sample = string.split(file, ".")[2]
                    else:
                        print "please check how to identify sample name"
                    valuePOS = 3
                    LOG2 = 1
                    RANK = 0

                if sample == "":
                    continue
                if sample in tmpSamples:  #duplicated samples
                    continue
                if sample not in allSamples:
                    continue

                p = len(tmpSamples)
                tmpSamples[sample] = p

                c = c + 1
                #print c
                if RANK:
                    process_percentileRANK(dataMatrix, tmpSamples, sample,
                                           genes, cancer, infile, flog,
                                           valuePOS, 250)
                else:
                    process(dataMatrix, tmpSamples, sample, genes, cancer,
                            infile, flog, valuePOS, LOG2, 250)

                if (c % 250) == 0:
                    tmpout = "tmp_" + str(int(c / 250.0))
                    r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes,
                                     tmpout, flog)
                    if r:
                        GOOD = 0

                    dataMatrix = []
                    tmpSamples = {}
                    oldgenes = copy.deepcopy(genes)
                    genes = {}
                    files.append(tmpout)

        if (c % 250) != 0:
            tmpout = "tmp_" + str(int(c / 250.0) + 1)
            files.append(tmpout)
            r = outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout,
                             flog)
            if r:
                GOOD = 0

        #paste all together
        outfile = outDir + cancer + "/" + cgFileName
        if GOOD:
            os.system("paste -d \'\' " + string.join(files, " ") + " > " +
                      outfile)
        for file in files:
            os.system("rm " + file)
        if not GOOD:
            sys.exit()

    datafile = outDir + cancer + "/" + cgFileName
    if not os.path.exists(datafile):
        return

    oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w")

    J = {}
    #stable
    J["redistribution"] = True
    J["groupTitle"] = "TCGA " + TCGAUtil.cancerGroupTitle[cancer]
    J["dataProducer"] = dataProducer
    J["colNormalization"] = True
    J["PLATFORM"] = PATHPATTERN
    J["type"] = "genomicMatrix"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"

    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"] = datetime.date.today().isoformat()
    J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today(
    ).isoformat()

    if string.find(PATHPATTERN, "IlluminaHiSeq") != -1:  #IlluminaHiSeq
        platformTitle = "Illumina HiSeq 2000 RNA Sequencing platform"
    elif string.find(PATHPATTERN, "IlluminaGA") != -1:  #IlluminaGA
        platformTitle = " Illumina Genome Analyzer RNA Sequencing platform"
    assert platformTitle

    #change description
    J["description"] = ""
    J["RNAtype"] = "polyA+"
    if string.find(namesuffix, "total") != -1:  #totalRNA
        J["RNAtype"] = "total RNA"
    EXONGENE = "GENE"
    if string.find(namesuffix, "exon") != -1:  #exon
        EXONGENE = "EXON"

    if EXONGENE == "GENE":  #gene
        J[":probeMap"] = "hugo"
        J["dataSubType"] = "gene expression RNAseq"

        if cancer not in ["OV", "STAD"]:
            J["label"] = suffix
            J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[
                    "RNAtype"] + " " + suffix + ")"
        else:
            if dataProducer == "University of North Carolina TCGA genome characterization center":
                J["label"] = suffix + " UNC"
                J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                    cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[
                        "RNAtype"] + " " + suffix + " UNC)"
            else:
                J["label"] = suffix + " BC"
                J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                    cancer] + " (" + cancer + ") gene expression by RNAseq (" + J[
                        "RNAtype"] + " " + suffix + " BC)"

        J["notes"] = "the probeMap is hugo for the short term, however probably around 10% of the gene symbols are not HUGO names, but ENTRE genes"

        if string.find(namesuffix, "percentile") != -1:  #percentile
            J["description"] = J[
                "description"] + "For each sample, we rank genes RSEM values between 0% to 100%. This dataset is gene expression estimation in percentile rank, which higher value representing higher expression. The dataset can be used to compare this RNAseq data  with other cohorts when the other data is processed in the same way (i.e. percentile ranking)."
        else:  #basic
            J["description"]= J["description"] + "The gene expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the gene-level transcription estimates, "

    else:  #exon
        J["dataSubType"] = "exon expression RNAseq"
        J[":probeMap"] = "unc_RNAseq_exon.hg19"

        if cancer not in ["OV", "STAD"]:
            J["label"] = suffix
            J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[
                    "RNAtype"] + " " + suffix + ")"
        else:
            if dataProducer == "University of North Carolina TCGA genome characterization center":
                J["label"] = suffix + " UNC"
                J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                    cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[
                        "RNAtype"] + " " + suffix + " UNC)"
            else:
                J["label"] = suffix + " BC"
                J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
                    cancer] + " (" + cancer + ") exon expression by RNAseq (" + J[
                        "RNAtype"] + " " + suffix + " BC)"


        J["description"]= J["description"] +" The exon expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                          " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the exon-level transcription estimates, "

    #wrangling stuff
    if PATHPATTERN in ["IlluminaHiSeq_RNASeqV2", "IlluminaGA_RNASeqV2"
                       ] and string.find(namesuffix, "exon") == -1:
        if string.find(namesuffix, "percentile") == -1:  #basic
            J["description"] = J[
                "description"] + "as in log2(x+1) transformed RSEM normalized count."
            J["unit"] = "log2(norm_count+1)"
            J["wrangling_procedure"] = "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository"
        else:  #percentile
            J["unit"] = "percentile rank"
            J["wrangling_procedure"] = "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, percentile ranked, and processed at UCSC into Xena repository."

    elif string.find(namesuffix, "exon") != -1:  #exon
        J["description"] = J[
            "description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"] = "Level_3 data (file names: *.exon_quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"] = "log2(RPKM+1)"
    else:
        J["description"] = J[
            "description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"] = "Level_3 data (file names: *.gene.quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"] = "log2(RPKM+1)"

    #mapping to genomics region
    if string.find(namesuffix, "exon") == -1:  #gene
        J["description"] = J[
            "description"] + " Genes are mapped onto the human genome coordinates using UCSC Xena HUGO probeMap (see ID/Gene mapping link below for details)."
    else:  #exon
        J["description"] = J[
            "description"] + " Exons are mapped onto the human genome coordinates using UCSC Xena unc_RNAseq_exon probeMap (see ID/Gene mapping link below for details."

    #reference
    if dataProducer == "University of North Carolina TCGA genome characterization center":
        J["description"] = J["description"] +\
                           " Reference to method description from "+dataProducer+": <a href=\"" + TCGAUtil.remoteBase +string.replace(inDir,TCGAUtil.localBase,"") +remoteDataDirExample+"/DESCRIPTION.txt\" target=\"_blank\"><u>DCC description</u></a>"

    # comparison
    if string.find(namesuffix, "exon") == -1:  # gene
        if string.find(namesuffix, "percentile") != -1:  #percentile gene
            J["description"] = J[
                "description"] + "<br><br>For comparing data within this cohort, we recommend to use the \"gene expression RNAseq\" dataset. For questions regarding the gene expression of this particular cohort in relation to other types tumors, you can use the pancan normalized version of the \"gene expression RNAseq\" data. For comparing with data outside TCGA, we recommend using the percentile version if the non-TCGA data is normalized by percentile ranking. For more information, please see our Data FAQ: <a href=https://docs.google.com/document/d/1q-7Tkzd7pci4Rz-_IswASRMRzYrbgx1FTTfAWOyHbmk/edit?usp=sharing target=\"_blank\"><u>here</u></a>."

    #viz setting
    J["description"] = J["description"] +\
                       "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each gene or exon to zero by independently subtracting the mean of each gene or exon on the fly. Users can view the original non-normalized values by adjusting visualization settings."
    J["description"] = J["description"] + "<br><br>"

    J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
    J["sample_type"] = ["tumor"]
    J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"
    J['tags'] = ["cancer"] + TCGAUtil.tags[cancer]
    J['owner'] = "TCGA"
    J['gdata_tags'] = ["transcription"]

    #change cgData
    J["name"] = "TCGA_" + cancer + "_exp_" + namesuffix
    name = trackName_fix(J['name'])
    if name == False:
        message = "bad object name, need fix otherwise break loader, too long " + J[
            "name"]
        print message
        flog.write(message + "\n")
        return
    else:
        J["name"] = name

    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    return

Example #17

Show file

File: RPPA.py Project: jianguozhouzunyimedicaluniversity/wrangle

def RPPA(inDir, outDir, cancer, flog, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    PATHPATTERN = "MDA_RPPA_Core"
    dataProducer = "MD Anderson Cancer Center TCGA proteome characterization center"

    garbage = [tmpDir]

    if os.path.exists(tmpDir):
        os.system("rm -rf " + tmpDir + "*")
    else:
        os.system("mkdir " + tmpDir)

    #multiple files in dir mode
    lastRelease = {}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            print "file has no matching .md5 throw out", file
            continue

        #find lastest in each archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive] = release
        else:
            if lastRelease[archive] < release:
                lastRelease[archive] = release

    rootDir = ""
    lastDate = None
    remoteDataDirExample = ""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file, PATHPATTERN) != -1 and string.find(
                file, LEVEL) != -1 and string.find(
                    file, ".tar.gz") != -1 and string.find(file, "md5") == -1:
            pass
        else:
            continue

        if not os.path.exists(inDir + file + ".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file, ".")
        archive = info[-5]
        release = int(info[-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate = datetime.date.fromtimestamp(os.stat(inDir + file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate

        if remoteDataDirExample == "":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if string.find(file, ".tar.gz") != -1 and REALRUN:
            os.system("tar -xzf " + inDir + file + " -C " + tmpDir)
            rootDir = tmpDir

    #make sure there is data
    if REALRUN and (rootDir == "" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    if not os.path.exists(outDir + cancer + "/"):
        os.makedirs(outDir + cancer + "/")

    cgFileName = "RPPA"

    #data processing multiple dirs mode
    if REALRUN:
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        dataMatrix = {}
        allSamples = []
        probes = []
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                pattern = "protein_expression"
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    sample = string.split(file, ".")[5]
                if sample == "":
                    continue
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4] != "TCGA":
                    if aliquote_dic.has_key(string.lower(sample)):
                        if TCGAUtil.UUID_CELLLINE.has_key(sample):
                            print "control cell line ignore", sample
                            continue
                    else:
                        print "unknow id:", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False:  # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue
                if sample not in allSamples:
                    allSamples.append(sample)

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir + dataDir):
                sample = ""
                pattern = "protein_expression"
                if string.find(file, pattern) != -1:
                    infile = rootDir + dataDir + "/" + file
                    sample = string.split(file, ".")[5]
                if sample == "":
                    continue
                if sample not in allSamples:
                    continue
                valuePOS = 1
                process(dataMatrix, allSamples, sample, probes, cancer, infile,
                        flog, valuePOS)

        outfile = outDir + cancer + "/" + cgFileName
        outputMatrix(dataMatrix, allSamples, probes, outfile)

    oHandle = open(outDir + cancer + "/" + cgFileName + ".json", "w")

    J = {}
    #stable
    J["dataSubType"] = "protein expression RPPA"
    J["redistribution"] = True
    J["dataProducer"] = dataProducer
    J["colNormalization"] = True
    J["PLATFORM"] = "M.D. Anderson Reverse Phase Protein Array Core platform"
    J["type"] = "genomicMatrix"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"

    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"] = datetime.date.today().isoformat()
    J["wrangler"] = "Xena TCGAscript " + __name__ + " processed on " + datetime.date.today(
    ).isoformat()
    J["wrangling_procedure"] = "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository"
    J["label"] = "RPPA"
    J["longTitle"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") reverse phase protein array"

    J[":probeMap"] = "md_anderson_antibodies"

    J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
    J["sample_type"] = ["tumor"]
    J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[
        cancer] + " (" + cancer + ")"
    J['domain'] = "TCGA"
    J['owner'] = "TCGA"
    J["tags"] = ["cancer"] + TCGAUtil.tags[cancer]
    J["unit"] = "normalized RPPA value"
    J["description"] = "TCGA " + TCGAUtil.cancerOfficial[
        cancer] + " (" + cancer + ") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>"

    J["description"] = J[
        "description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>"

    #change cgData
    J["name"] = "TCGA_" + cancer + "_RPPA"
    name = trackName_fix(J['name'])
    if name == False:
        message = "bad object name, need fix otherwise break loader, too long " + J[
            "name"]
        print message
        flog.write(message + "\n")
        return
    else:
        J["name"] = name

    oHandle.write(json.dumps(J, indent=-1))
    oHandle.close()

    return

Example #18

Show file

import TCGAUtil

# Smoke-test script: call each TCGAUtil UUID helper once, in this order.
# The second assignment overwrites the first, so only the result of
# uuid_Sample_all() remains bound to `dic`; nothing reads it afterwards.
dic = TCGAUtil.uuid_Aliquot_all()
dic = TCGAUtil.uuid_Sample_all()
# Return values are discarded below.
# NOTE(review): presumably these two calls are made for their side effects
# inside TCGAUtil (e.g. filling its cell-line lookup tables) -- confirm there.
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()

Example #19

Show file

File: test.py Project: jingchunzhu/cgDataNew

import TCGAUtil

# Smoke-test script: call each TCGAUtil UUID helper once, in this order.
# The second assignment overwrites the first, so only the result of
# uuid_Sample_all() remains bound to `dic`; nothing reads it afterwards.
dic=TCGAUtil.uuid_Aliquot_all()
dic=TCGAUtil.uuid_Sample_all()
# Return values are discarded below.
# NOTE(review): presumably these two calls are made for their side effects
# inside TCGAUtil (e.g. filling its cell-line lookup tables) -- confirm there.
TCGAUtil.uuid_normal_cellline()
TCGAUtil.uuid_cellline()

Example #20

Show file

File: RPPA.py Project: jingchunzhu/cgDataNew

def RPPA (inDir, outDir, cancer, flog,REALRUN):
    print cancer, sys._getframe().f_code.co_name

    PATHPATTERN = "MDA_RPPA_Core"    
    dataProducer= "MD Anderson Cancer Center TCGA proteome characterization center"
    
    garbage=[tmpDir]

    if os.path.exists( tmpDir ):
        os.system("rm -rf "+tmpDir+"*")
    else:
        os.system("mkdir "+tmpDir)

    #multiple files in dir mode
    lastRelease={}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue
        
        if not os.path.exists(inDir +file+".md5"):
            print "file has no matching .md5 throw out", file
            continue
            
        #find lastest in each archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive]= release
        else:
            if lastRelease[archive]< release:
                lastRelease[archive]=release
                

    rootDir =""
    lastDate=None
    remoteDataDirExample =""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue

        if not os.path.exists(inDir +file+".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate=  datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate
            
        if remoteDataDirExample =="":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if string.find(file,".tar.gz")!=-1 and REALRUN :
            os.system("tar -xzf "+inDir+file +" -C "+tmpDir) 
            rootDir =tmpDir
            
    #make sure there is data
    if REALRUN and (rootDir =="" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists( outDir ):
        os.makedirs( outDir )
    if not os.path.exists( outDir +cancer+"/"):
        os.makedirs( outDir+cancer+"/" )

    cgFileName= "RPPA"
    
    #data processing multiple dirs mode
    if REALRUN:
        aliquote_dic =TCGAUtil.uuid_Aliquot_all()
        dataMatrix={}
        allSamples=[]
        probes=[]
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""
                pattern ="protein_expression"
                if string.find(file,pattern)!=-1:
                    infile = rootDir+dataDir+"/"+file
                    sample = string.split(file,".")[5]
                if sample =="":
                    continue            
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4]!="TCGA":
                    if aliquote_dic.has_key(string.lower(sample)):
                        if TCGAUtil.UUID_CELLLINE.has_key(sample):
                            print "control cell line ignore", sample
                            continue
                    else:
                        print "unknow id:", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False: # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue
                if sample not in allSamples:
                    allSamples.append(sample)

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""
                pattern ="protein_expression"
                if string.find(file,pattern)!=-1:
                    infile = rootDir+dataDir+"/"+file
                    sample = string.split(file,".")[5]
                if sample =="":
                    continue
                if sample not in allSamples:
                    continue
                valuePOS=1
                process(dataMatrix,allSamples,sample, probes, cancer,infile,flog, valuePOS)

    
        outfile = outDir+cancer+"/"+cgFileName
        outputMatrix(dataMatrix, allSamples, probes, outfile)

    oHandle = open(outDir+cancer+"/"+cgFileName+".json","w")
    
    J={}
    #stable
    J["dataSubType"]="protein expression RPPA"
    J["redistribution"]= True
    J["dataProducer"]= dataProducer
    J["colNormalization"]=True
    J["PLATFORM"]= "M.D. Anderson Reverse Phase Protein Array Core platform"
    J["type"]= "genomicMatrix" 
    J[":sampleMap"]="TCGA."+cancer+".sampleMap"
    
    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"]= datetime.date.today().isoformat()
    J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat()
    J["wrangling_procedure"]= "Level_3 Data (file names: *.protein_expression.*) download from TCGA DCC, and processed at UCSC into Xena repository"
    J["label"]= "RPPA"
    J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") reverse phase protein array"
    
    J[":probeMap"]= "md_anderson_antibodies"

    J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
    J["sample_type"]=["tumor"]
    J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"
    J['domain']="TCGA"
    J['owner']="TCGA"
    J["tags"]=["cancer"]+ TCGAUtil.tags[cancer]
    J["unit"]="normalized RPPA value"
    J["description"]= "TCGA "+ TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") protein expression by reverse phase protein array (RPPA).<br><br> The data was generated and processed at the MD Anderson Cancer Center TCGA proteome characterization center RPPA core. Level 3 interpreted level data was downloaded from TCGA data coordination center.<br><br>"
    
    J["description"] = J["description"] + "Data normalization from the MDACC RPPA core: <a href=\"http://bioinformatics.mdanderson.org/main/TCPA:Overview\" target=\"_blank\"><u>under section How are the RPPA data processed</u></a>.<br>"
    
    #change cgData
    J["name"]="TCGA_"+cancer+"_RPPA"
    name = trackName_fix(J['name'])
    if name ==False:
        message = "bad object name, need fix otherwise break loader, too long "+J["name"]
        print message
        flog.write(message+"\n")
        return
    else:
        J["name"]=name        
        
    oHandle.write( json.dumps( J, indent=-1 ) )
    oHandle.close()
    
    return

Example #21

Show file

File: CAVM_TCGA.py Project: jianguozhouzunyimedicaluniversity/wrangle

def CAVMid(dir, outDir, cancer, log, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    #removeExistMaps
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.system("mkdir " + outDir)

    for map in missingMaps:
        print map
        sMap = SampleMapNew(None, map)
        for name in missingMaps[map]:
            samples = []
            intDic = {}  #keyed on CAVMid
            sampleDic = {}  #keyd on original sample id
            obj = bookDic[name]

            print obj["name"]

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                fin = open(outfile + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json " + cFoutfile +
                              ".json")

                if REALRUN == -1:
                    continue

                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                fin = open(obj['path'], 'r')
                fin.readline()
                for line in fin.readlines():
                    sample = string.split(line, "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                fin = open(obj['path'], 'r')
                fout = open(outfile, 'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data = string.split(line, "\t")
                    sample = data[0]
                    try:
                        fout.write(sampleDic[sample] + "\t")
                        fout.write(string.join(data[1:], "\t"))
                    except:
                        fout.write(line)
                fout.close()

            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)

                fin.close()

                outfile = outDir + os.path.basename(obj['path'])

                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)

Example #22

Show file

File: RNAseq.py Project: jingchunzhu/cgDataNew

def geneRPKM (inDir, outDir, cancer,flog,PATHPATTERN,suffix, namesuffix, dataProducer,REALRUN,clean):
    garbage=[tmpDir]
    os.system("rm -rf tmp_*") 
    if os.path.exists( tmpDir ):
        if clean:
            os.system("rm -rf "+tmpDir+"*")
    else:
        os.system("mkdir "+tmpDir)

    #multiple files in dir mode
    lastRelease={}
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue
        
        if not os.path.exists(inDir +file+".md5"):
            print "file has no matching .md5 throw out", file
            continue
            
        #find lastest in each archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if not lastRelease.has_key(archive):
            lastRelease[archive]= release
        else:
            if lastRelease[archive]< release:
                lastRelease[archive]=release
                

    rootDir =""
    lastDate=None
    remoteDataDirExample =""
    for file in os.listdir(inDir):
        #find the file
        if string.find(file,PATHPATTERN)!=-1 and string.find(file,LEVEL)!=-1 and string.find(file,".tar.gz")!=-1 and string.find(file,"md5")==-1:
            pass
        else:
            continue

        if not os.path.exists(inDir +file+".md5"):
            continue

        #find the file that is the lastest release for the archive
        info = string.split(file,".")
        archive = info [-5] 
        release = int(info [-4])

        if release != lastRelease[archive]:
            continue

        #file latest date
        newDate=  datetime.date.fromtimestamp(os.stat(inDir+file).st_mtime)
        if not lastDate:
            lastDate = newDate
        if lastDate < newDate:
            lastDate = newDate
            
        if remoteDataDirExample =="":
            remoteDataDirExample = file[:-7]

        #is tar.gz?, uncompress multiple file mode
        if not clean:
            rootDir =tmpDir
        elif string.find(file,".tar.gz")!=-1 and REALRUN and clean:
            os.system("tar -xzf "+inDir+file +" -C "+tmpDir) 
            rootDir =tmpDir
            
    #make sure there is data
    if REALRUN and (rootDir =="" or not os.path.exists(rootDir)):
        print "ERROR expect data, but wrong dirpath", rootDir, cancer, __name__
        return

    #set output dir
    if not os.path.exists( outDir ):
        os.makedirs( outDir )
    if not os.path.exists( outDir +cancer+"/"):
        os.makedirs( outDir+cancer+"/" )

    cgFileName= namesuffix 

    #data processing multiple dirs mode
    if REALRUN:
        allSamples={}

        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample =""

                #v2 bcgsc gene
                pattern =".gene.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".v2")!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"
                
                #v2 bcgsc exon
                pattern =".exon.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".v2.")!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"

                #v2
                pattern ="rsem.genes.normalized_results"
                if string.find(file,pattern)!=-1  and string.find(namesuffix,"exon")==-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"
                #v2 exon from unc
                pattern ="bt.exon_quantification" 
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"

                if sample=="":
                    continue
                if sample in allSamples:
                    print len(allSamples)
                    message =  "ERROR duplicated sample = "+ sample+ " " +cancer+" "+ __name__ +file
                    flog.write(message+"\n")
                    print message
                    continue
                # Test for barcode or UUID     #throw out all normals and control Analyte
                if sample[0:4]!="TCGA":
                    if TCGAUtil.UUID_CELLLINE.has_key(sample):
                        print "control cell line ignore", sample
                        continue
                else:
                    sampleTypeCode = TCGAUtil.barcode_SampleType(sample)
                    if sampleTypeCode == False: # likely a uuid
                        continue
                    elif sampleTypeCode in ["20"]:
                        print "control cell line ignore", sample
                        continue

                p=len(allSamples)
                allSamples[sample]=p
                    
        c=0
        dataMatrix=[]
        tmpSamples={}
        genes={}
        oldgenes={}
        files=[]
        GOOD=1
        for dataDir in os.listdir(rootDir):
            for file in os.listdir(rootDir+dataDir):
                sample=""
                #bcgsc v1 and 2
                pattern ="gene.quantification" 
                altpattern =".v2.gene.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,altpattern)!=-1:
                                V2=1
                                break
                        if V2:
                            continue
                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"
                    valuePOS=3
                    LOG2=1
                    RANK=0

                #bcgsc exon v1 and v2
                pattern ="exon.quantification" 
                altpattern =".v2.exon.quantification"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    #check if there is .v2
                    if string.find(file,".v2.")==-1:
                        V2=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,altpattern)!=-1:
                                V2=1
                                break
                        if V2:
                            continue

                    if string.find(file,".hg19.")==-1:
                        HG19=0
                        for file2 in os.listdir(rootDir+dataDir):
                            if string.find(file2,".hg19.")!=-1:
                                HG19=1
                                break
                        if HG19:
                            continue

                    infile = rootDir+dataDir+"/"+file
                    # bcgsc stupid sample name in file name
                    if dataProducer=="British Columbia Cancer Agency TCGA genome characterization center":
                        sample = string.split(file,".")[0]
                    else:
                        print "please check how to identify sample name"
                    valuePOS=3
                    LOG2=1
                    RANK=0


                #v2
                pattern ="rsem.genes.normalized_results"
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")==-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"
                    if string.find(namesuffix,"percentile") !=-1: #generated percentileRANK based data
                        RANK=1
                    else:
                        RANK=0
                    valuePOS=1
                    LOG2=1

                #v2 exon from unc
                pattern ="bt.exon_quantification" 
                if string.find(file,pattern)!=-1 and string.find(namesuffix,"exon")!=-1:
                    infile = rootDir+dataDir+"/"+file
                    # unc stupid sample name in file name
                    if dataProducer =="University of North Carolina TCGA genome characterization center":
                        sample = string.split(file,".")[2]
                    else:
                        print "please check how to identify sample name"
                    valuePOS=3
                    LOG2=1
                    RANK=0

                if sample=="":
                    continue
                if sample in tmpSamples: #duplicated samples
                    continue
                if sample not in allSamples:
                    continue

                p=len(tmpSamples)
                tmpSamples[sample]=p
                
                c=c+1
                #print c
                if RANK:
                    process_percentileRANK(dataMatrix,tmpSamples,sample,genes, cancer,infile,flog, valuePOS,250)
                else:
                    process(dataMatrix,tmpSamples,sample,genes, cancer,infile,flog, valuePOS,LOG2,250)

                if (c % 250)==0:
                    tmpout="tmp_"+ str(int(c/250.0))
                    r =outputMatrix(dataMatrix, tmpSamples, genes, oldgenes, tmpout, flog)
                    if r:
                        GOOD=0
                        
                    dataMatrix=[]
                    tmpSamples={}
                    oldgenes=copy.deepcopy(genes)
                    genes ={}
                    files.append(tmpout)
                    
        if (c % 250)!=0:
            tmpout= "tmp_"+ str(int(c/250.0)+1)
            files.append(tmpout)
            r= outputMatrix(dataMatrix, tmpSamples, genes, oldgenes,tmpout, flog)
            if r:
                GOOD=0
                
        #paste all together
        outfile = outDir+cancer+"/"+cgFileName
        if GOOD:
            os.system("paste -d \'\' "+string.join(files," ")+" > "+ outfile)
        for file in files: 
            os.system("rm "+ file) 
        if not GOOD:
            sys.exit()
    
    datafile= outDir+cancer+"/"+cgFileName
    if not os.path.exists(datafile):
        return

    oHandle = open(outDir+cancer+"/"+cgFileName+".json","w")
    
    J={}
    #stable    
    J["redistribution"]= True
    J["groupTitle"]="TCGA "+TCGAUtil.cancerGroupTitle[cancer]
    J["dataProducer"]= dataProducer
    J["colNormalization"]=True
    J["PLATFORM"]= PATHPATTERN
    J["type"]= "genomicMatrix" 
    J[":sampleMap"]="TCGA."+cancer+".sampleMap"
    
    #multiple dirs
    J["url"]=TCGAUtil.remoteBase \
              +string.replace(inDir,TCGAUtil.localBase,"")
    J["version"]= datetime.date.today().isoformat()
    J["wrangler"]= "Xena TCGAscript "+ __name__ +" processed on "+ datetime.date.today().isoformat()

    if string.find(PATHPATTERN, "IlluminaHiSeq")!=-1: #IlluminaHiSeq
        platformTitle ="Illumina HiSeq 2000 RNA Sequencing platform"
    elif string.find(PATHPATTERN, "IlluminaGA")!=-1: #IlluminaGA
        platformTitle =" Illumina Genome Analyzer RNA Sequencing platform"
    assert platformTitle

    #change description
    J["description"]=""
    J["RNAtype"]="polyA+"
    if string.find(namesuffix, "total")!=-1: #totalRNA
        J["RNAtype"]= "total RNA"
    EXONGENE= "GENE"
    if string.find(namesuffix, "exon")!=-1: #exon
        EXONGENE = "EXON"

    if EXONGENE == "GENE": #gene
        J[":probeMap"]= "hugo"
        J["dataSubType"]="gene expression RNAseq"

        if cancer not in ["OV", "STAD"]:
            J["label"]= suffix
            J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+")"
        else:
            if dataProducer =="University of North Carolina TCGA genome characterization center":
                J["label"]= suffix +" UNC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" UNC)"
            else:
                J["label"]= suffix+" BC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") gene expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" BC)"
                
        J["notes"]= "the probeMap is hugo for the short term, however probably around 10% of the gene symbols are not HUGO names, but ENTRE genes"
        
        if  string.find(namesuffix, "percentile") != -1:  #percentile
            J["description"]= J["description"] + "For each sample, we rank genes RSEM values between 0% to 100%. This dataset is gene expression estimation in percentile rank, which higher value representing higher expression. The dataset can be used to compare this RNAseq data  with other cohorts when the other data is processed in the same way (i.e. percentile ranking)."
        else:  #basic
            J["description"]= J["description"] + "The gene expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the gene-level transcription estimates, "

    else: #exon
        J["dataSubType"]="exon expression RNAseq"
        J[":probeMap"]= "unc_RNAseq_exon.hg19" 

        if cancer not in [ "OV","STAD"]:
            J["label"]= suffix
            J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+")"
        else:
            if dataProducer =="University of North Carolina TCGA genome characterization center":
                J["label"]= suffix+" UNC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" UNC)"
            else:
                J["label"]= suffix+" BC"
                J["longTitle"]="TCGA "+TCGAUtil.cancerOfficial[cancer]+" ("+cancer+") exon expression by RNAseq ("+ J["RNAtype"] + " "+ suffix+" BC)"


        J["description"]= J["description"] +" The exon expression profile was measured experimentally using the "+platformTitle+" by the "+ dataProducer +"." + \
                          " Level 3 data was downloaded from TCGA data coordination center. This dataset shows the exon-level transcription estimates, "

    #wrangling stuff
    if PATHPATTERN in [ "IlluminaHiSeq_RNASeqV2","IlluminaGA_RNASeqV2"] and string.find(namesuffix, "exon")==-1:
        if  string.find(namesuffix, "percentile")==-1: #basic
            J["description"] = J["description"] + "as in log2(x+1) transformed RSEM normalized count."
            J["unit"]="log2(norm_count+1)"
            J["wrangling_procedure"]= "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository"
        else: #percentile
            J["unit"]="percentile rank"
            J["wrangling_procedure"]= "Level_3 data (file names: *.rsem.genes.normalized_results) are downloaded from TCGA DCC, percentile ranked, and processed at UCSC into Xena repository."
            
    elif string.find(namesuffix, "exon")!=-1: #exon
        J["description"] = J["description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"]= "Level_3 data (file names: *.exon_quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"]="log2(RPKM+1)"
    else:
        J["description"] = J["description"] + "as in RPKM values (Reads Per Kilobase of exon model per Million mapped reads)."
        J["wrangling_procedure"]= "Level_3 data (file names: *.gene.quantification.txt) are downloaded from TCGA DCC, log2(x+1) transformed, and processed at UCSC into Xena repository."
        J["unit"]="log2(RPKM+1)"

    #mapping to genomics region
    if string.find(namesuffix, "exon")==-1: #gene
        J["description"] = J["description"] + " Genes are mapped onto the human genome coordinates using UCSC Xena HUGO probeMap (see ID/Gene mapping link below for details)."
    else: #exon
        J["description"] = J["description"] + " Exons are mapped onto the human genome coordinates using UCSC Xena unc_RNAseq_exon probeMap (see ID/Gene mapping link below for details."
    
    #reference
    if dataProducer =="University of North Carolina TCGA genome characterization center":
        J["description"] = J["description"] +\
                           " Reference to method description from "+dataProducer+": <a href=\"" + TCGAUtil.remoteBase +string.replace(inDir,TCGAUtil.localBase,"") +remoteDataDirExample+"/DESCRIPTION.txt\" target=\"_blank\"><u>DCC description</u></a>"
    
    # comparison 
    if string.find(namesuffix, "exon")==-1: # gene
        if  string.find(namesuffix, "percentile")!=-1: #percentile gene
            J["description"]= J["description"] +"<br><br>For comparing data within this cohort, we recommend to use the \"gene expression RNAseq\" dataset. For questions regarding the gene expression of this particular cohort in relation to other types tumors, you can use the pancan normalized version of the \"gene expression RNAseq\" data. For comparing with data outside TCGA, we recommend using the percentile version if the non-TCGA data is normalized by percentile ranking. For more information, please see our Data FAQ: <a href=https://docs.google.com/document/d/1q-7Tkzd7pci4Rz-_IswASRMRzYrbgx1FTTfAWOyHbmk/edit?usp=sharing target=\"_blank\"><u>here</u></a>."
        
    #viz setting
    J["description"] = J["description"] +\
                       "<br><br>In order to more easily view the differential expression between samples, we set the default view to center each gene or exon to zero by independently subtracting the mean of each gene or exon on the fly. Users can view the original non-normalized values by adjusting visualization settings."
    J["description"] = J["description"] +"<br><br>"

    J["anatomical_origin"]= TCGAUtil.anatomical_origin[cancer]
    J["sample_type"]=["tumor"]
    J["primary_disease"]=TCGAUtil.cancerGroupTitle[cancer]
    J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]+" ("+cancer+")"
    J['tags']=["cancer"]+ TCGAUtil.tags[cancer]
    J['owner']="TCGA"
    J['gdata_tags'] =["transcription"]
    
    #change cgData
    J["name"]="TCGA_"+cancer+"_exp_"+namesuffix
    name = trackName_fix(J['name'])
    if name ==False:
        message = "bad object name, need fix otherwise break loader, too long "+J["name"]
        print message
        flog.write(message+"\n")
        return
    else:
        J["name"]=name        
        
    oHandle.write( json.dumps( J, indent=-1 ) )
    oHandle.close()
    
    return

Example #23

Show file

File: cohort.py Project: jingchunzhu/cgDataNew

def cohort_variable (var, value, inDir, outDir, cancer, REALRUN, doDerived):
    """Write a constant-valued phenotype column for a cohort, plus its JSON metadata.

    When REALRUN is truthy, the objects found by cgWalk(inDir) are grouped by
    sample map, the header line of every "genomicMatrix" object in the single
    resulting map is read to collect sample ids, each id is converted to an
    integration id via TCGAUtil.barcode_IntegrationId, and a two-column
    tab-separated file (integration id, value) is written to
    outDir + cancer + "/" + var.  A companion "<var>.json" clinicalMatrix
    descriptor is then written next to it (also when REALRUN is falsy).

    Parameters:
        var       -- column name; also used as the output file name.
        value     -- string written in the second column of every row.
        inDir     -- directory handed to cgWalk.
        outDir    -- output base path; it is concatenated directly with
                     cancer, so it must already end with a path separator.
        cancer    -- cohort code; must be a key of TCGAUtil.cancerHumanReadable.
        REALRUN   -- truthy to scan inputs and (re)write the data file.
        doDerived -- truthy to also call doDerivedCancer for the combined
                     cohorts (LUNG, COADREAD, GBMLGG) that include cancer.

    Returns None.  If REALRUN is truthy and the number of sample maps is not
    exactly one, the function returns early WITHOUT writing the data file or
    the JSON.  Calls sys.exit() if a genomicMatrix header holds an empty
    sample id.
    """
    print(inDir)
    print(outDir)

    outfile = outDir + cancer + "/" + var

    if REALRUN:
        ignore = 1
        bookDic = cgWalk(inDir, ignore)

        existMaps = collectSampleMaps(bookDic)
        missingMaps = collectMissingSampleMaps(bookDic)

        # Merge the existing maps into missingMaps; entries already present
        # in missingMaps are left untouched.
        for mapName in existMaps:
            if mapName not in missingMaps:
                missingMaps[mapName] = existMaps[mapName]

        # aliquot uuid -> barcode lookup
        aliquote_dic = TCGAUtil.uuid_Aliquot_all()
        # NOTE(review): the result of this call was never used by this
        # function; the call itself is retained in case it has side effects
        # inside TCGAUtil -- confirm and remove if it does not.
        TCGAUtil.uuid_Sample_all()

        if len(missingMaps) != 1:
            return

        mapName = next(iter(missingMaps))
        print(mapName)

        samples = []      # sample ids in first-seen order
        seen = set()      # same ids, for O(1) duplicate checks
        for name in missingMaps[mapName]:
            obj = bookDic[name]

            # Only genomicMatrix objects are scanned.  mutationVector objects
            # are deliberately skipped: reading every row of those files to
            # collect sample ids takes too long.
            if obj['type'] != "genomicMatrix":
                continue

            with open(obj['path'], 'U') as fin:
                # Strip only the line terminator, so a header without a
                # trailing newline does not lose the last character of its
                # final sample id.
                header = fin.readline().rstrip("\r\n")

            # First column is the row-id label; the rest are sample ids.
            for sample in header.split("\t")[1:]:
                if sample == "":
                    print("%s has bad empty sample id" % (name,))
                    sys.exit()
                if sample not in seen:
                    seen.add(sample)
                    samples.append(sample)

        intDic = {}
        for sample in samples:
            # Ids that do not start with "TCGA" are treated as uuids and
            # translated through the aliquot table (lower-cased key); unknown
            # ids are passed through unchanged.
            if sample[0:4] != "TCGA":
                TCGAbarcode = aliquote_dic.get(sample.lower(), sample)
            else:
                TCGAbarcode = sample

            intID = TCGAUtil.barcode_IntegrationId(TCGAbarcode)
            if intID is None:  # id is on patient level, above integration level
                continue
            if intID not in intDic:
                intDic[intID] = ""

        with open(outfile, "w") as fout:
            fout.write("sample\t" + var + "\n")
            for intId in intDic:
                fout.write(intId + "\t" + value + "\n")

    # data json
    J = {}
    J["version"] = datetime.date.today().isoformat()
    J["name"] = "TCGA_" + cancer + "_" + var
    J["type"] = "clinicalMatrix"
    J["dataSubType"] = "phenotype"
    J[":sampleMap"] = "TCGA." + cancer + ".sampleMap"
    J["cohort"] = "TCGA " + TCGAUtil.cancerHumanReadable[cancer] + " (" + cancer + ")"

    with open(outfile + ".json", "w") as oHandle:
        oHandle.write(json.dumps(J, indent=-1))

    if doDerived:
        # (member cohorts, combined cohort) pairs, checked in this order.
        derived_groups = (
            (("LUAD", "LUSC"), "LUNG"),
            (("COAD", "READ"), "COADREAD"),
            (("GBM", "LGG"), "GBMLGG"),
        )
        for members, derived_cancer in derived_groups:
            if cancer in members:
                doDerivedCancer(var, outDir, cancer, derived_cancer, REALRUN)