def removeNonTumorSamples(dataD): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [0] * numCol tumorList = [] numOutCol = 0 for jj in range(numCol): aCode = dataD['colLabels'][jj] tumorCode = '' # if the barcode is not even long enough to specify the sample type, # we will just assume that we keep it ... if (len(aCode) < 15): tumorCode = aCode else: # if the barcode is at least 15 characters long, then we parse it # ... aCode = miscTCGA.fixTCGAbarcode(aCode) (site, patient, sample, vial, portion, analyte, plate, center) = miscTCGA.parseTCGAbarcode(aCode) try: iSample = int(sample) except: iSample = -1 if (sample != aCode): print " what is going on here ??? ", aCode sys.exit(-1) if (iSample > 0 and iSample < 10): tumorCode = miscTCGA.sampleLevelCode(aCode) if (tumorCode != ''): if (tumorCode not in tumorList): tumorList += [tumorCode] keepCols[jj] = 1 numOutCol += 1 else: print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? " print aCode, tumorCode # print tumorList print " --> keeping only the first one " # sys.exit(-1) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] # NOTE: this next bit may no longer be necessary ... # and also set the shortened TCGA barcodes as labels ... if (len(tumorList) != len(dataD['dataMatrix'][0])): print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! " print len(tumorList) tsvIO.lookAtDataD(dataD) sys.exit(-1) dataD['colLabels'] = tumorList print " now using shortened barcodes .. ", dataD['colLabels'][:5] return (dataD)
def removeNonTumorSamples(dataD): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [0] * numCol tumorList = [] numOutCol = 0 for jj in range(numCol): aCode = dataD['colLabels'][jj] tumorCode = '' # hack to not mess with ITMI samples ... if (aCode.startswith("ITMI-")): tumorCode = aCode # if the barcode is not even long enough to specify the sample type, # we will just assume that we keep it ... elif (len(aCode) < 16): tumorCode = aCode else: # if the barcode is at least 16 characters long, then we parse it # ... if (aCode.startswith("ITMI-")): doNothing = 1 else: aCode = miscTCGA.fixTCGAbarcode(aCode) (site, patient, sample, vial, portion, analyte, plate, center) = miscTCGA.parseTCGAbarcode(aCode) try: iSample = int(sample) except: iSample = -1 if (sample != aCode): print " what is going on here ??? ", aCode sys.exit(-1) if (iSample > 0 and iSample < 10): tumorCode = miscTCGA.sampleLevelCode(aCode) if (tumorCode != ''): if (tumorCode not in tumorList): tumorList += [tumorCode] keepCols[jj] = 1 numOutCol += 1 else: print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? " print aCode, tumorCode # print tumorList print " --> keeping only the first one " # sys.exit(-1) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][ jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] # NOTE: this next bit may no longer be necessary ... # and also set the shortened TCGA barcodes as labels ... if (len(tumorList) != len(dataD['dataMatrix'][0])): print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! " print len(tumorList) tsvIO.lookAtDataD(dataD) sys.exit(-1) dataD['colLabels'] = tumorList print " now using shortened barcodes .. ", dataD['colLabels'][:5] return (dataD)
# Driver fragment of filterTSVbySampList.py (whitespace-collapsed onto one line).
# Reading the statements in order, it: echoes inFile, outFile and listInfo;
# loads the matrix with tsvIO.readTSV(inFile) and exits with -1 if the result
# equals {}; calls tsvIO.lookAtDataD on it; then, for each entry of listInfo,
# loads a sample list from the file named in element [0] via
# readSampleListFromFile and stores elements [1] and [2] in listBW and listLS,
# printing the list size and first sample when the list is non-empty.
# NOTE(review): the fragment stops at a dangling "else:"; the rest of the loop
# body is not visible here. The meaning of listBW / listLS cannot be
# determined from this fragment -- confirm against the full script.
print " " print " in filterTSVbySampList.py ... " print " input file : ", inFile print " output file : ", outFile print " list info : ", listInfo print " " # print " " # print " ***************************************************************** " # print " calling readTSV ... ", inFile dataD = tsvIO.readTSV(inFile) if (dataD == {}): sys.exit(-1) tsvIO.lookAtDataD(dataD) # print " " # print " reading sample list ... " numLists = len(listInfo) listDetails = [0] * numLists listBW = [0] * numLists listLS = [0] * numLists for iList in range(numLists): print " --> loading sample list #%d from <%s> " % ((iList + 1), listInfo[iList][0]) listDetails[iList] = readSampleListFromFile(listInfo[iList][0]) listBW[iList] = listInfo[iList][1] listLS[iList] = listInfo[iList][2] if (len(listDetails[iList]) > 0): print " %4d samples found, eg <%s> " % (len(listDetails[iList]), listDetails[iList][0]) else:
outFile = sys.argv[4] corrThresh = abs(float(sys.argv[5])) if (not othrType.startswith("N:")): othrType = "N:" + othrType ## corrThresh = 0.40 ## corrThresh = 0.20 print " " print " " print " ************* " print " in methCorr : " print " ***************************************************************** " print " calling readTSV ... ", methFile methD = tsvIO.readTSV(methFile) tsvIO.lookAtDataD(methD) print " calling readTSV ... ", othrFile othrD = tsvIO.readTSV(othrFile) tsvIO.lookAtDataD(othrD) try: methRowLabels = methD['rowLabels'] methColLabels = methD['colLabels'] methDataMatrix = methD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numMethRow = len(methRowLabels) numMethCol = len(methColLabels)
# Per-file processing fragment (whitespace-collapsed onto one line; the
# enclosing loop that the "continue" belongs to is not visible here).
# Reading the statements in order, it: sets skipFile when testD, its
# 'rowLabels' or its 'colLabels' is empty, or when inspecting them raises;
# skips the file in that case; otherwise calls tsvIO.lookAtDataD(testD),
# runs removeDuplicateSamples(testD, 16, 0) and looks at the result again.
# NOTE(review): the bare "except:" also traps KeyboardInterrupt/SystemExit;
# consider narrowing it to (KeyError, TypeError) once the full loop is in view.
# NOTE(review): the fragment ends on a dangling "if (1):".
## check to see if we actually have any data ... skipFile = 0 try: if (len(testD) == 0): skipFile = 1 if (len(testD['rowLabels']) == 0): skipFile = 1 if (len(testD['colLabels']) == 0): skipFile = 1 except: print " ERROR in looking at data from <%s> ??? " % (aFile) skipFile = 1 if (skipFile): print " --> nothing found ??? continuing ... " continue print " first look ... " tsvIO.lookAtDataD(testD) if (1): # TCGA-CJ-4635-01A-02R # the first 12 characters identify the patient # the first 16 characters identify the sample # now we check for duplicates at the sample level ... print " " print " checking for duplicates ... " testD = removeDuplicateSamples(testD, 16, 0) print " second look ... " tsvIO.lookAtDataD(testD) if (1):
if (1): if (len(sys.argv) != 3): print " Usage : %s <input file> <output file> " % sys.argv[0] sys.exit(-1) inFile = sys.argv[1] outFile = sys.argv[2] print " " print " " print " **************** " print " in jitterTSV : " print " " print " ***************************************************************** " print " calling readTSV ... ", inFile inD = tsvIO.readTSV(inFile) tsvIO.lookAtDataD(inD) rowLabels = inD['rowLabels'] colLabels = inD['colLabels'] dataMatrix = inD['dataMatrix'] numRow = len(rowLabels) numCol = len(colLabels) outD = {} outD['dataType'] = inD['dataType'] outD['rowLabels'] = rowLabels outD['colLabels'] = colLabels numRow = len(rowLabels) numCol = len(colLabels)
## colMaxNAfrac = 0.90 ## rowMaxNAfrac = 0.90 for aFile in inFileList: print " " print " ***************************************************************** " print " calling readTSV ... ", aFile testD = tsvIO.readTSV(aFile) if (len(testD) == 0): print " --> nothing found ??? continuing ... " continue tsvIO.lookAtDataD(testD) if (1): # TCGA-CJ-4635-01A-02R # the first 12 characters identify the patient # the first 15 characters identify the sample # now we check for duplicates at the sample level ... print " " print " checking for duplicates ... " testD = removeDuplicateSamples(testD, 15, 0) if (0): print " " print " calling removeNonTumorSamples ... "
# Driver fragment of filterTSVbySampList.py (whitespace-collapsed onto one line).
# Reading the statements in order, it: echoes inFile, outFile and listInfo;
# loads the matrix with tsvIO.readTSV(inFile) and exits with -1 if the result
# equals {}; calls tsvIO.lookAtDataD on it; then, for each entry of listInfo,
# loads a sample list from the file named in element [0] via
# readSampleListFromFile and stores elements [1] and [2] in listBW and listLS.
# NOTE(review): the fragment is cut off in the middle of the final print
# expression ("% (len("); the rest of the loop body is not visible here. The
# meaning of listBW / listLS cannot be determined from this fragment --
# confirm against the full script.
print " " print " in filterTSVbySampList.py ... " print " input file : ", inFile print " output file : ", outFile print " list info : ", listInfo print " " # print " " # print " ***************************************************************** " # print " calling readTSV ... ", inFile dataD = tsvIO.readTSV(inFile) if (dataD == {}): sys.exit(-1) tsvIO.lookAtDataD(dataD) # print " " # print " reading sample list ... " numLists = len(listInfo) listDetails = [0] * numLists listBW = [0] * numLists listLS = [0] * numLists for iList in range(numLists): print " --> loading sample list #%d from <%s> " % ( (iList + 1), listInfo[iList][0]) listDetails[iList] = readSampleListFromFile(listInfo[iList][0]) listBW[iList] = listInfo[iList][1] listLS[iList] = listInfo[iList][2] if (len(listDetails[iList]) > 0): print " %4d samples found, eg <%s> " % (len(
tsvFile = sys.argv[1] outFile = sys.argv[2] dThresh = int ( sys.argv[3] ) minCount = int ( sys.argv[4] ) corrThresh = float ( sys.argv[5] ) print " " print " " print " ************** " print " in methCorr3 : ", dThresh, minCount, corrThresh print " ***************************************************************** " print ' (a) TIME ', time.asctime(time.localtime(time.time())) print " calling readTSV ... ", tsvFile tsvD = tsvIO.readTSV(tsvFile) tsvIO.lookAtDataD(tsvD) try: tsvRowLabels = tsvD['rowLabels'] tsvColLabels = tsvD['colLabels'] tsvDataMatrix = tsvD['dataMatrix'] except: print " no valid METH feature matrix " sys.exit(-1) numRow = len(tsvRowLabels) numCol = len(tsvColLabels) keepMeth = {} keepGexp = {}
if (len(sys.argv) != 3): print " Usage : %s <input file> <output file> " % sys.argv[0] print " ERROR -- bad command line arguments " sys.exit(-1) inFile = sys.argv[1] outFile = sys.argv[2] print " " print " " print " **************** " print " in jitterTSV : " print " " print " ***************************************************************** " print " calling readTSV ... ", inFile inD = tsvIO.readTSV(inFile) tsvIO.lookAtDataD(inD) rowLabels = inD['rowLabels'] colLabels = inD['colLabels'] dataMatrix = inD['dataMatrix'] numRow = len(rowLabels) numCol = len(colLabels) outD = {} outD['dataType'] = inD['dataType'] outD['rowLabels'] = rowLabels outD['colLabels'] = colLabels numRow = len(rowLabels) numCol = len(colLabels)