# Shared imports for the examples below. The cgData helpers (cgWalk,
# collectSampleMaps, collectMissingSampleMaps, checkIdsAllIn, SampleMapNew,
# ClinicalMatrixNew, IntegrationId, etc.) are assumed to come from the
# surrounding cgData modules and are not shown here.
import copy
import datetime
import json
import os
import string
import sys

import TCGAUtil


def convertCAVM(inDir, outD, REALRUN, CAVM, TCGA, MAPID=1):
    bookDic = {}
    sampleMaps = {}
    bookDic = cgWalk(inDir, 0)

    if not os.path.exists(outD):
        os.system("mkdir " + outD)

    if not bookDic:
        print "repo has problem"
        return 0

    sampleMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    allMaps = sampleMaps.keys()
    allMaps.extend(missingMaps.keys())

    for sampleMap in allMaps:
        print sampleMap
        outDir = outD + sampleMap + "/"
        if not os.path.exists(outDir):
            os.system("mkdir " + outDir)

        path = bookDic[sampleMap]['path']
        """
        if string.find(os.path.abspath(path), "/inside/home/jzhu/cgDataJing/scripts/data_flatten/") ==-1:
            print "ignore "+path
            continue
        """
        if sampleMap in missingMaps:
            #construct an empty sampleMap
            sMap = SampleMapNew(None, sampleMap)
            #fill sMap with individual nodes, no connection
            changed = checkIdsAllIn(sMap, bookDic)
            #build connection
        else:
            name = bookDic[sampleMap]['name']
            fin = open(path, 'r')
            sMap = SampleMapNew(fin, name)
            if not sMap.getName():
                print "Fail to initiate", name
                return 0
            fin.close()

        #cohort sampleMap json
        sMapJ = {}
        fin = open(bookDic[sampleMap]['path'] + ".json", 'r')
        sMapJ = json.loads(fin.read())
        fin.close()

        #integration list
        integrationList = []
        rootDic = {}
        clinFile = ""
        clinMatrix = None

        #cohort
        COHORT = ""
        cohortPath = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1],
            "/") + "/cohort.json"
        if os.path.exists(cohortPath):
            fin = open(cohortPath, 'r')
            cohortJ = json.loads(fin.read())
            fin.close()
            COHORT = cohortJ["name"]

        for name in sampleMaps[sampleMap]:
            obj = bookDic[name]
            if obj['type'] == "clinicalMatrix":
                clinFile = outDir + os.path.basename(obj['path'])

                #JSON
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()

                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']

                J["label"] = "Phenotypes"
                if CAVM:
                    J.pop(':sampleMap')
                    if J.has_key("dataSubType"):
                        if J.has_key(":dataSubType"):
                            J.pop(':dataSubType')
                    else:
                        if J.has_key(":dataSubType"):
                            J["dataSubType"] = J[":dataSubType"]
                            J.pop(':dataSubType')

                fout = open(clinFile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if REALRUN != 0 and REALRUN != 1:
                    continue

                if clinMatrix != None:
                    print "only one clinical matrix is allowed"
                    sys.exit()

                fin = open(obj['path'], 'U')
                fout = open(clinFile, 'w')
                line = fin.readline()
                fout.write(line)

                samples = []
                for line in fin.readlines():
                    if MAPID:
                        sample = string.split(line[:-1], "\t")[-1]
                    else:
                        sample = string.split(line[:-1], "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                        fout.write(sample + "\t")
                        if MAPID:
                            fout.write(
                                string.join(
                                    string.split(line[:-1], "\t")[1:], "\t"))
                        else:
                            fout.write(
                                string.join(
                                    string.split(line[:-1], "\t")[1:-1], "\t")
                                + "\t" + sample)
                        fout.write("\n")
                fin.close()
                fout.close()
                integrationList = copy.deepcopy(samples)

                #clinicalFeature
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    outfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + "  " + outfile)
                    os.system("cp " + cFobj['path'] + ".json " + outfile +
                              ".json")

                #sampleMap data mapping information #cgData 1
                if not CAVM:
                    os.system("cp " + bookDic[sampleMap]['path'] + " " +
                              outDir + "sampleMap")

                #only expect one clinical matrix
                clinMatrix = ClinicalMatrixNew(clinFile, "clinMatrix")
                break

        for name in sampleMaps[sampleMap]:
            obj = bookDic[name]
            if obj['type'] in ["genomicSegment", "mutationVector"]:
                path = obj['path']
                print path

                outfile = outDir + os.path.basename(obj['path'])
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()

                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']

                if CAVM:
                    J.pop(':sampleMap')
                    if J.has_key("dataSubType"):
                        if J.has_key(":dataSubType"):
                            J.pop(':dataSubType')
                    else:
                        if J.has_key(":dataSubType"):
                            J["dataSubType"] = J[":dataSubType"]
                            J.pop(':dataSubType')

                fout = open(outfile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if REALRUN == 1:
                    fin = open(path, 'r')
                    fout = open(outDir + os.path.basename(path), 'w')
                    for line in fin.readlines():
                        data = string.split(line, '\t')
                        sample = data[0]
                        if rootDic.has_key(sample):
                            root = rootDic[sample]
                        else:
                            root = sMap.getIntegrationId(
                                sample, integrationList)
                            if not root:
                                root = sample
                            rootDic[sample] = root
                        fout.write(root + "\t" + string.join(data[1:], '\t'))
                    fin.close()
                    fout.close()

            if obj['type'] == "genomicMatrix":
                print obj['name']
                #JSON
                outfile = outDir + os.path.basename(obj['path'])
                fin = open(obj['path'] + ".json", 'r')
                J = json.load(fin)
                fin.close()

                if COHORT:
                    J["cohort"] = COHORT
                else:
                    J['cohort'] = J[':sampleMap']

                if CAVM:
                    J.pop(':sampleMap')
                    if J.has_key("dataSubType"):
                        if J.has_key(":dataSubType"):
                            J.pop(':dataSubType')
                    else:
                        if J.has_key(":dataSubType"):
                            J["dataSubType"] = J[":dataSubType"]
                            J.pop(':dataSubType')

                fout = open(outfile + ".json", 'w')
                fout.write(json.dumps(J, indent=-1))
                fout.close()

                if J.has_key('anatomical_origin'):
                    sMapJ['anatomical_origin'] = J['anatomical_origin']
                if J.has_key('primary_disease'):
                    sMapJ['primary_disease'] = J['primary_disease']
                if J.has_key('domain'):
                    sMapJ['domain'] = J['domain']
                if J.has_key('sample_type'):
                    sMapJ['sample_type'] = J['sample_type']
                if J.has_key('tags'):
                    sMapJ['tags'] = J['tags']

                if REALRUN != 1 and REALRUN != 0:
                    continue

                # add to clinMatrix the id mappings
                mappingCol = "_GENOMIC_ID_" + obj['name']
                clinMatrix.addOneColWithSameValue(mappingCol, "")

                # determine whether more than one sample column maps to the same _INTEGRATION id
                roots = {}
                findDup = 0
                fin = open(obj['path'], 'U')
                samples = string.split(fin.readline()[:-1], "\t")[1:]
                for i in range(0, len(samples)):
                    sample = samples[i]
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    if rootDic.has_key(sample):
                        root = rootDic[sample]
                    else:
                        root = sMap.getIntegrationId(sample, integrationList)
                        if not root:
                            root = sample
                        rootDic[sample] = root

                    genomic_Id = clinMatrix.getDATA(root, mappingCol)
                    if genomic_Id is None or genomic_Id == "":
                        clinMatrix.setDATA(root, mappingCol, sample)
                    else:
                        genomic_Id = string.split(genomic_Id, ",")
                        if sample not in genomic_Id:
                            genomic_Id.append(sample)
                            genomic_Id = string.join(genomic_Id, ',')
                            #print sample, genomic_Id
                            clinMatrix.setDATA(root, mappingCol, genomic_Id)

                    if roots.has_key(root):
                        roots[root].append(i)
                        findDup = 1
                    else:
                        roots[root] = [i]
                fin.close()

                if REALRUN != 1:
                    continue

                #probemap for genomic segment
                #if J.has_key(':genomicSegment'):
                if J.has_key(':probeMap'):
                    if bookDic.has_key(J[':probeMap']):
                        probeMap = bookDic[J[':probeMap']]['path']
                        os.system("cp " + probeMap + " " + outDir +
                                  os.path.basename(probeMap))
                        os.system("cp " + probeMap + ".json " + outDir +
                                  os.path.basename(probeMap) + ".json")

                #figure out whether there are duplicate probe ids
                findDupProbe = []
                process = os.popen(
                    "r=$(cut -f 1  " + obj['path'] +
                    " | more +2 | sort |uniq -c | sed -e 's/ *//' -e 's/ /\t/' | sort -n |cut -f 1 |sort -un|tail -n 1); if [ $r -ne \"1\" ]; then echo $r ; fi"
                )
                r = process.read()
                if r:
                    print string.strip(r), obj['path']
                    process = os.popen(
                        "cut -f 1  " + obj['path'] +
                        " | more +2 | sort |uniq -c | sed -e 's/ *//' -e 's/ /\t/' | sort -n |  grep -vP ^1'\t' | cut -f 2 |sort"
                    )
                    r = process.read()
                    dupProbes = string.split(r, "\n")
                    print len(dupProbes)
                    for probe in dupProbes:
                        findDupProbe.append(probe)

                #genomic data no dup
                fout = open(outfile, 'w')
                fin = open(obj['path'], 'U')
                if findDup == 0 and findDupProbe == []:
                    data = string.split(fin.readline()[:-1], "\t")
                    samples = data[1:]
                    fout.write(data[0])
                    for sample in samples:
                        if rootDic.has_key(sample):
                            root = rootDic[sample]
                        else:
                            root = sMap.getIntegrationId(
                                sample, integrationList)
                            if not root:
                                root = sample
                            rootDic[sample] = root
                        fout.write('\t' + root)
                    fout.write('\n')

                    if TCGA:
                        fin.close()
                        fout.close()
                        os.system("cat " + obj['path'] + " |sed 1d >> " +
                                  outfile)
                        #os.system("more +2 "+obj['path']+" >> "+outfile)
                    else:
                        while 1:
                            line = fin.readline()
                            if line == "":
                                break
                            line = string.replace(line, "\tnan\t", "\tNA\t")
                            line = string.replace(line, "\tNAN\t", "\tNA\t")
                            line = string.replace(line, "\tNaN\t", "\tNA\t")
                            fout.write(line)
                        fin.close()
                        fout.close()

                #genomic data with dup
                else:
                    print "genomic with dup", obj['path']
                    data = string.split(fin.readline()[:-1], "\t")
                    fout.write(data[0])
                    for root in roots:
                        fout.write('\t' + root)
                    fout.write('\n')

                    dupDic = {}
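                    # Per data row: columns that map to the same integration id
                    # (their indexes are collected in roots[root]) are averaged,
                    # skipping NaN/non-numeric values; rows whose probe id is
                    # duplicated are buffered in dupDic and averaged across the
                    # duplicate rows after this pass over the file.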
                    while 1:
                        duplist = []
                        line = fin.readline()[:-1]
                        if line == "":
                            break

                        data = string.split(line, "\t")

                        if data[0] not in findDupProbe:
                            fout.write(data[0])
                        else:
                            if data[0] not in dupDic:
                                dupDic[data[0]] = []

                        values = data[1:]

                        for root in roots:
                            if len(roots[root]) != 1:
                                total = "NA"
                                n = 0
                                for i in roots[root]:
                                    if values[i] in ["nan", "NAN", "NaN"]:
                                        pass
                                    else:
                                        try:
                                            float(values[i])
                                            if total == "NA":
                                                total = float(values[i])
                                            else:
                                                total = total + float(
                                                    values[i])
                                            n = n + 1
                                        except:
                                            pass
                                if total != "NA":
                                    average = str(total / n)
                                else:
                                    average = "NA"
                            else:
                                if values[roots[root][0]] in [
                                        "nan", "NAN", "NaN"
                                ]:
                                    average = "NA"
                                else:
                                    try:
                                        float(values[roots[root][0]])
                                        average = values[roots[root][0]]
                                    except:
                                        average = "NA"
                            if data[0] not in findDupProbe:
                                fout.write('\t' + average)
                            else:
                                duplist.append(average)
                        if data[0] not in findDupProbe:
                            fout.write('\n')
                        else:
                            dupDic[data[0]].append(duplist[:])

                    if dupDic != {}:
                        for probe in dupDic:
                            fout.write(probe)
                            k = len(dupDic[probe][0])
                            valList = []
                            nList = []
                            for i in range(0, k):
                                valList.append("NA")
                                nList.append(0)

                            for vals in dupDic[probe]:
                                for i in range(0, k):
                                    try:
                                        float(vals[i])
                                        if valList[i] == "NA":
                                            valList[i] = float(vals[i])
                                        else:
                                            valList[i] = valList[i] + float(
                                                vals[i])
                                        nList[i] = nList[i] + 1
                                    except:
                                        pass
                            for i in range(0, k):
                                try:
                                    float(valList[i])
                                    fout.write(
                                        "\t" +
                                        str(float(valList[i]) / nList[i]))
                                except:
                                    fout.write("\tNA")
                            fout.write("\n")
                    fin.close()
                    fout.close()

        #final clinical matrix output
        if REALRUN == 0 or REALRUN == 1:
            fout = open(clinFile, 'w')
            clinMatrix.store(fout)
            fout.close()

        #sampleMap json #cgData1
        if not CAVM:
            outfile = outDir + "sampleMap.json"
            fout = open(outfile, 'w')
            fout.write(json.dumps(sMapJ, indent=-1))
            fout.close()

        #cohort json cp or create
        cohortPath = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1],
            "/") + "/cohort.json"
        if os.path.exists(cohortPath):
            os.system("cp " + cohortPath + " " + outDir)
        else:
            outfile = outDir + "cohort.json"
            fout = open(outfile, 'w')
            cohortJ = {}
            cohortJ["type"] = "cohort"
            cohortJ["name"] = sampleMap
            fout.write(json.dumps(cohortJ, indent=-1))
            fout.close()
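
# The three metadata blocks in convertCAVM above (clinicalMatrix,
# genomicSegment/mutationVector, genomicMatrix) repeat the same JSON
# normalization. A minimal sketch of that shared logic, factored out for
# readability; this helper is an illustration only and is not part of the
# original module.
def _normalizeMetaJson(J, COHORT, CAVM):
    # pick the cohort name: an explicit cohort.json wins, otherwise fall
    # back to the sampleMap name recorded in the metadata
    if COHORT:
        J["cohort"] = COHORT
    else:
        J["cohort"] = J[":sampleMap"]
    if CAVM:
        # CAVM output drops the cgData-1 style keys: remove ':sampleMap'
        # and promote ':dataSubType' to 'dataSubType' unless the plain key
        # already exists
        J.pop(":sampleMap")
        if J.has_key("dataSubType"):
            if J.has_key(":dataSubType"):
                J.pop(":dataSubType")
        elif J.has_key(":dataSubType"):
            J["dataSubType"] = J[":dataSubType"]
            J.pop(":dataSubType")
    return J
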
def runFlatten(inDir, outDir, REALRUN, onlyGenomicSamples, SMAPNAME=None):
    dir = inDir
    bookDic = {}
    sampleMaps = {}
    ignore = 0
    bookDic = cgWalk(dir, ignore)
    if not bookDic:
        print "repo has problem"
        return 0
    sampleMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    allMaps = sampleMaps.keys()
    allMaps.extend(missingMaps.keys())

    for sampleMap in allMaps:
        if SMAPNAME and SMAPNAME != sampleMap:
            print "skip", sampleMap
            continue

        print sampleMap
        path = bookDic[sampleMap]['path']
        if os.path.abspath(path) in [ \
            "/inside/home/jzhu/cgDataJing/scripts/data/public/TCGA/PANCAN/TCGA.PANCAN.sampleMap", \
                "/inside/home/jzhu/cgDataJing/scripts/data/public/TCGA/PANCAN12/TCGA.PANCAN12.sampleMap" ]:
            print "ignore " + path
            continue

        if sampleMap in missingMaps:
            #construct an empty sampleMap
            sMap = SampleMapNew(None, sampleMap)
            #fill sMap with individual nodes, no connection
            changed = checkIdsAllIn(sMap, bookDic)
            #build connection
        else:
            name = bookDic[sampleMap]['name']
            fin = open(path, 'r')
            sMap = SampleMapNew(fin, name)
            if not sMap.getName():
                print "Fail to initiate", name
                return 0
            fin.close()
            changed = checkIdsAllIn(sMap, bookDic)

        if REALRUN in [0, 1]:
            r = flattenEachSampleMap(sMap, bookDic, onlyGenomicSamples)
            if r == False:
                return 0
            finalClinicalMatrix, finalClinicalMatrixJSON, finalClinFeature, finalClinFeatureJSON = r
            if finalClinicalMatrix.getROWnum() != 0:
                outputEachSampleMapRelated(outDir, bookDic, sMap,
                                           finalClinicalMatrix,
                                           finalClinicalMatrixJSON,
                                           finalClinFeature,
                                           finalClinFeatureJSON, REALRUN)
        if REALRUN == -2:
            finalClinFeature = flattenForClinicalFeature(sMap, bookDic)
            outputForClinFeature(outDir, sMap, finalClinFeature)

        cpGenomicEachSample(REALRUN, outDir, bookDic, sMap)
        cpProbeMaps(REALRUN, outDir, bookDic, sMap)

        #cpCohort if exists
        path = string.join(
            string.split(bookDic[sampleMap]['path'], "/")[0:-1],
            "/") + "/cohort.json"
        if os.path.exists(path):
            dataPackageDir = outDir + sampleMapBaseName(sMap)
            os.system("cp " + path + " " + dataPackageDir + "/")

    return 1
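
# A minimal driver sketch (an assumption, not part of the original scripts):
# how runFlatten and convertCAVM might be invoked. Judging from the code
# above, REALRUN == 1 rewrites the data files, REALRUN == 0 rebuilds only the
# clinical matrix and JSON metadata, and REALRUN == -2 (runFlatten) only
# regenerates clinical features. The directory names below are made up.
if __name__ == "__main__":
    repoDir = "./cgData_repo/"     # hypothetical input cgData repository
    flatDir = "./data_flatten/"    # hypothetical flattened output
    cavmDir = "./data_cavm/"       # hypothetical CAVM output

    runFlatten(repoDir, flatDir, 1, onlyGenomicSamples=False)
    convertCAVM(flatDir, cavmDir, 1, CAVM=1, TCGA=0)
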
def TCGASampleMap(dir, outDir, cancer, log, REALRUN):
    #print status
    print cancer, __name__

    #if cancer in ["PANCAN","PANCAN12"]:
    #    return

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    #merge existMaps into missingMaps so the loop below covers every map
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquot uuid dictionaries
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()
    
    #missingMaps --- actually this is all the maps
    for map in missingMaps:
        print map
        print missingMaps[map]
        sMap = SampleMapNew(None, map)

        #integration id
        intName = map + ".integrationID"
        if intName in bookDic:
            fin = open(bookDic[intName]["path"], 'r')
            integrationID = IntegrationId(intName, fin)
            fin.close()
        else:
            integrationID = IntegrationId(intName)

        samples = []
        for name in missingMaps[map]:
            if REALRUN != 1:
                continue
            print name
            obj = bookDic[name]
            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            #elif obj['type']=="clinicalMatrix":
            #    cMa = ClinicalMatrixNew(obj['path'],name)
            #    for sample in cMa.getROWs():
            #        if sample not in samples:
            #            samples.append(sample)
            elif obj['type'] in ["mutationVector", "clinicalMatrix"]:
                path = obj['path']
                os.system("cut -f 1 " + path + " |sort |uniq > .tmp")
                fin = open('.tmp', 'r')
                fin.readline()
                for line in fin.readlines():
                    #if string.strip(line)=="":
                    #    break
                    sample = string.strip(line)  #string.split(line,'\t')[0]
                    if sample == "":
                        break
                    if sample not in samples:
                        samples.append(sample)
                fin.close()
            else:
                continue

        for sample in samples:
            if REALRUN != 1:
                continue
            #TCGA uuid handling
            if sample[0:4] != "TCGA":
                if aliquote_dic.has_key(string.lower(sample)):
                    TCGAbarcode = aliquote_dic[string.lower(sample)]
                else:
                    print sample
                    continue
                parent = TCGAbarcode
                child = sample
                sMap.addLink(parent, string.lower(child))
                sMap.addLink(parent, string.upper(child))
                sample = parent

            #do TCGA barcode trick: split the barcode into its levels and
            #link each level to its parent
            parts = string.split(sample, "-")
            if len(parts) > 3 and len(parts[3]) == 3:
                parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]
                #print parts

            """
            parent = string.join(parts[0:3],"-")
            #parts[3]
            if len(parts)>3 and len(parts[3])==3:
                child=parent +"-" +parts[3][0:2]
                sMap.addLink(parent,child)
                parent=child
                child=string.join(parts[0:4],"-")
                sMap.addLink(parent,child)
                parent=child
            """
            parent = string.join(parts[0:3], "-")
            for i in range(3, len(parts)):
                if i != 4:
                    child = parent + "-" + parts[i]
                else:
                    child = parent + parts[i]
                #add parent child
                sMap.addLink(parent, child)
                parent = child

            intID = TCGAUtil.barcode_IntegrationId(sample)
            integrationID.addId(intID)

        #output sampleMap
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        if not os.path.exists(outDir + cancer + "/"):
            os.makedirs(outDir + cancer + "/")

        if REALRUN == 1:
            oHandle = open(outDir + cancer + "/" + map, "w")
            sMap.store(oHandle)
            oHandle.close()

        #output integrationID
        if REALRUN == 1:
            oHandle = open(outDir + cancer + "/integrationID", "w")
            integrationID.store(oHandle)
            oHandle.close()
        
        #output integrationID json
        oHandle = open(outDir + cancer + "/integrationID.json", "w")
        J = {}
        J['name'] = intName

        J["anatomical_origin"] = TCGAUtil.anatomical_origin[cancer]
        J["sample_type"] = "tumor"
        if cancer not in ["PANCAN", "PANCAN12"]:
            J["primary_disease"] = TCGAUtil.cancerGroupTitle[cancer]
        else:
            J["primary_disease"] = "cancer"

        #J["cohort"] ="TCGA "+TCGAUtil.cancerHumanReadable[cancer]
        J['domain'] = "TCGA"
        J['owner'] = "TCGA"

        J["cgDataVersion"] = 1
        J['type'] = "integrationId"
        J["version"] = datetime.date.today().isoformat()
        oHandle.write(json.dumps(J, indent=-1))
        oHandle.close()

        #output sampleMap json
        oHandle = open(outDir + cancer + "/" + map + ".json", "w")
        J['name'] = map
        J['type'] = "sampleMap"
        J["version"] = datetime.date.today().isoformat()
        J["cgDataVersion"] = 1
        J[":integrationId"] = intName

        #add info for old clinical data
        if os.path.exists(outDir + cancer + "/oldClin.json"):
            J[':oldClin'] = cancer + "_oldClin"

        #special code
        if TCGAUtil.featurePriority.has_key(cancer) and \
           len(TCGAUtil.featurePriority[cancer]) >= 5:
            J["VIS"] = 5

        #blacklist in PAAD
        if J['name'] in ["TCGA.PAAD.sampleMap"]:
            J["blacklist"] = ["TCGA-FQ-6551",
                              "TCGA-FQ-6552",
                              "TCGA-FQ-6553",
                              "TCGA-FQ-6554",
                              "TCGA-FQ-6555",
                              "TCGA-FQ-6558",
                              "TCGA-FQ-6559"]

        oHandle.write(json.dumps(J, indent=-1))
        oHandle.close()

    return
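
# A standalone sketch of the "TCGA barcode trick" used in TCGASampleMap above:
# a 3-character sample/vial suffix (e.g. "01A") is split into sample type
# ("01") and vial ("A"), and each successive barcode level is linked to its
# parent. Illustration only; TCGASampleMap records these pairs via
# sMap.addLink().
def barcodeLinks(barcode):
    parts = string.split(barcode, "-")
    if len(parts) > 3 and len(parts[3]) == 3:
        parts = parts[0:3] + [parts[3][0:2], parts[3][2]] + parts[4:]
    links = []
    parent = string.join(parts[0:3], "-")
    for i in range(3, len(parts)):
        if i != 4:
            child = parent + "-" + parts[i]
        else:
            child = parent + parts[i]  # the vial letter attaches without a dash
        links.append((parent, child))
        parent = child
    return links

# e.g. barcodeLinks("TCGA-AB-1234-01A") yields
# [("TCGA-AB-1234", "TCGA-AB-1234-01"), ("TCGA-AB-1234-01", "TCGA-AB-1234-01A")]
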
def CAVMid(dir, outDir, cancer, log, REALRUN):
    print cancer, sys._getframe().f_code.co_name

    ignore = 1
    bookDic = cgWalk(dir, ignore)

    existMaps = collectSampleMaps(bookDic)
    missingMaps = collectMissingSampleMaps(bookDic)

    #merge existMaps into missingMaps so the loop below covers every map
    for map in existMaps:
        if map not in missingMaps:
            missingMaps[map] = existMaps[map]

    # all aliquote uuid dic
    aliquote_dic = TCGAUtil.uuid_Aliquot_all()
    sample_dic = TCGAUtil.uuid_Sample_all()

    if not os.path.exists(outDir):
        os.system("mkdir " + outDir)

    for map in missingMaps:
        print map
        sMap = SampleMapNew(None, map)
        for name in missingMaps[map]:
            samples = []
            intDic = {}  #keyed on CAVMid
            sampleDic = {}  #keyed on original sample id
            obj = bookDic[name]

            print obj["name"]

            if obj['type'] in ["clinicalMatrix", "mutationVector"]:
                outfile = outDir + os.path.basename(obj['path'])
                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                fin = open(outfile + ".json", 'r')
                J = json.load(fin)
                fin.close()
                if J.has_key(":clinicalFeature"):
                    cFobj = bookDic[J[":clinicalFeature"]]
                    cFoutfile = outDir + os.path.basename(cFobj['path'])
                    os.system("cp " + cFobj['path'] + " " + cFoutfile)
                    os.system("cp " + cFobj['path'] + ".json " + cFoutfile +
                              ".json")

                if REALRUN == -1:
                    continue

                if REALRUN == 0 and obj['type'] == "mutationVector":
                    continue

                fin = open(obj['path'], 'r')
                fin.readline()
                for line in fin.readlines():
                    sample = string.split(line, "\t")[0]
                    if sample not in samples and sample != "":
                        samples.append(sample)
                fin.close()
                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)

                fin = open(obj['path'], 'r')
                fout = open(outfile, 'w')
                fout.write(fin.readline())
                for line in fin.readlines():
                    data = string.split(line, "\t")
                    sample = data[0]
                    try:
                        fout.write(sampleDic[sample] + "\t")
                        fout.write(string.join(data[1:], "\t"))
                    except:
                        fout.write(line)
                fin.close()
                fout.close()

            if obj['type'] == "genomicMatrix":
                fin = open(obj['path'], 'U')
                for sample in string.split(fin.readline()[:-1], "\t")[1:]:
                    if sample == "":
                        print name, "has bad empty sample id"
                        sys.exit()
                    samples.append(sample)

                fin.close()

                outfile = outDir + os.path.basename(obj['path'])

                os.system("cp " + obj['path'] + ".json " + outfile + ".json")

                if REALRUN != 1:
                    continue

                buildSampleDic(samples, sMap, intDic, sampleDic, aliquote_dic)
                process(obj['path'], outfile, samples, intDic)
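
# A minimal sketch (assumption) of the aliquot-UUID handling seen in
# TCGASampleMap above and presumably performed by buildSampleDic (not shown):
# sample ids that are not TCGA barcodes are resolved through the dictionary
# returned by TCGAUtil.uuid_Aliquot_all(), and both case variants of the UUID
# are linked to the resolved barcode.
def resolveAliquotUuid(sample, aliquote_dic, sMap):
    if sample[0:4] == "TCGA":
        return sample  # already a barcode
    if not aliquote_dic.has_key(string.lower(sample)):
        return None    # unknown UUID; TCGASampleMap just prints and skips it
    barcode = aliquote_dic[string.lower(sample)]
    sMap.addLink(barcode, string.lower(sample))
    sMap.addLink(barcode, string.upper(sample))
    return barcode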