def checkFeatures(dataD): print " " print " in checkFeatures ... " # the feature matrix has thousands of features x hundreds of patients rowLabels = dataD['rowLabels'] colLabels = dataD['colLabels'] numRow = len(rowLabels) numCol = len(colLabels) dataMatrix = dataD['dataMatrix'] print " %d rows x %d columns " % (numRow, numCol) # print rowLabels[:5] # print rowLabels[-5:] rmRowList = [] for iRow in range(numRow): featName = rowLabels[iRow] # watch out for "bad" feature names, eg 1-Dec or unknown if (badFeatureName(featName)): rmRowList += [iRow] continue uVals = [] for iCol in range(numCol): if (dataMatrix[iRow][iCol] != "NA"): if (dataMatrix[iRow][iCol] != NA_VALUE): if (dataMatrix[iRow][iCol] not in uVals): uVals += [dataMatrix[iRow][iCol]] if (len(uVals) < 2): rmRowList += [iRow] # print " will remove row #%d <%s> " % ( iRow, featName ), uVals else: if (len(uVals) == 2): uVals.sort() if (featName[0] != "B"): if (uVals == [0, 1]): newName = "B" + featName[1:] print " fixing feature name : <%s> --> <%s> " % ( featName, newName), uVals rowLabels[iRow] = newName dataD['rowLabels'] = rowLabels print " --> removing %d uniform features " % len(rmRowList) newD = tsvIO.filter_dataMatrix(dataD, rmRowList, []) return (newD)
def checkFeatures(dataD): print " " print " in checkFeatures ... " # the feature matrix has thousands of features x hundreds of patients rowLabels = dataD['rowLabels'] colLabels = dataD['colLabels'] numRow = len(rowLabels) numCol = len(colLabels) dataMatrix = dataD['dataMatrix'] print " %d rows x %d columns " % (numRow, numCol) # print rowLabels[:5] # print rowLabels[-5:] rmRowList = [] for iRow in range(numRow): featName = rowLabels[iRow] # watch out for "bad" feature names, eg 1-Dec or unknown if (badFeatureName(featName)): rmRowList += [iRow] continue uVals = [] for iCol in range(numCol): if (dataMatrix[iRow][iCol] != "NA"): if (dataMatrix[iRow][iCol] != NA_VALUE): if (dataMatrix[iRow][iCol] not in uVals): uVals += [dataMatrix[iRow][iCol]] if (len(uVals) < 2): rmRowList += [iRow] # print " will remove row #%d <%s> " % ( iRow, featName ), uVals else: if (len(uVals) == 2): uVals.sort() if (featName[0] != "B"): if (uVals == [0, 1]): newName = "B" + featName[1:] print " fixing feature name : <%s> --> <%s> " % (featName, newName), uVals rowLabels[iRow] = newName dataD['rowLabels'] = rowLabels print " --> removing %d uniform features " % len(rmRowList) newD = tsvIO.filter_dataMatrix(dataD, rmRowList, []) return (newD)
def removeIdenticalFeatures(inD, featType): print " in removeIdenticalFeatures ... <%s> " % featType rowLabels = inD['rowLabels'] dataMatrix = inD['dataMatrix'] nRowIn = len(dataMatrix) nColIn = len(dataMatrix[0]) rmRowList = [] iRow = 0 while (iRow < nRowIn): print " " print " working on feature # %d " % iRow iFeatname = rowLabels[iRow] if (featType != "ANY"): if (iFeatname.find(featType) < 0): iRow += 1 continue if (iFeatname.find(":ja_") > 0 ): iRow += 1 continue curTokens = iFeatname.split(':') iGeneName = curTokens[2] jRow = iRow + 1 done = 0 # check to see if this feature is a ~constant~ if (constantVec(dataMatrix[iRow])): rmRowList += [iRow] print " --> adding constant iRow (%d) to rmRowList <%s> " % (iRow, iFeatname) done = 1 while not done: if (jRow >= nRowIn): done = 1 continue if (jRow in rmRowList): jRow += 1 continue jFeatname = rowLabels[jRow] if (featType != "ANY"): if (jFeatname.find(featType) < 0): jRow += 1 continue if (jFeatname.find(":ja_") > 0): jRow += 1 continue curTokens = jFeatname.split(':') jGeneName = curTokens[2] ## HERE this makes things MUCH faster, but may not uncover ## all identical features ... if ( 1 ): if (jGeneName != iGeneName): done = 1 continue identFeat = 1 ## print " --> comparing %d and %d ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname) for iCol in range(nColIn): if (dataMatrix[iRow][iCol] != dataMatrix[jRow][iCol]): identFeat = 0 if (identFeat): print " --> identical !!! %d and %d ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname) newName = chooseMostSpecific([iFeatname, jFeatname]) if (newName == iFeatname): if (jRow not in rmRowList): rmRowList += [jRow] print " --> adding jRow (%d) to rmRowList <%s> " % (jRow, jFeatname) elif (newName == jFeatname): if (iRow not in rmRowList): rmRowList += [iRow] print " --> adding iRow (%d) to rmRowList <%s> " % (iRow, iFeatname) print " --> skipping to next iRow " done = 1 continue jRow += 1 iRow += 1 print len(rmRowList) # print rmRowList outD = tsvIO.filter_dataMatrix(inD, rmRowList, []) print " returning ... " return (outD)
# now we have tested this gene against all nearby methylation probes # ... if (jNeg >= 0): if (maxNeg <= -corrThresh): keepMeth[jNeg] = 1 print " (b) keeping Spearman rho \t %.3f " % maxNeg if (jPos >= 0): if (maxPos >= corrThresh): keepMeth[jPos] = 1 print " (c) keeping Spearman rho \t %.3f " % maxPos # and at this point we have tested all genes!!! print " length of keepMeth dictionary : ", len(keepMeth) print " original number of rows in methylation file : ", numMethRow rmMethRowList = [] for jRow in range(numMethRow): if (jRow not in keepMeth.keys()): rmMethRowList += [jRow] print " number of rows to be removed : ", len(rmMethRowList) methD = tsvIO.filter_dataMatrix(methD, rmMethRowList, []) sortRowFlag = 0 sortColFlag = 0 tsvIO.writeTSV_dataMatrix(methD, sortRowFlag, sortColFlag, outFile) # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
refGeneDict, cytoDict, forceFlag, nameChangeFlag ) # check that the feature names are still unique ... print " --> verify that the feature names are unique ... " ( newLabels, rmList ) = tsvIO.uniqueFeatureLabels(annotD['rowLabels'], annotD['dataMatrix']) print " back from tsvIO.uniqueFeatureLabels " # quick sanity check that labels are still what I think they are ... for ii in range(len(newLabels)): if (not (newLabels[ii] == annotD['rowLabels'][ii])): print " " print " BAILING !!! ", newLabels[ii], annotD['rowLabels'][ii] print " " sys.exit(-1) # remove any 'extra' features that need removing ... if ( len(rmList) > 0 ): print " --> need to remove these rows ", rmList tmpD = tsvIO.filter_dataMatrix ( annotD, rmList, [] ) annotD = tmpD # and write the matrix back out print " --> calling tsvIO.writeTSV_dataMatrix ... " tsvIO.writeTSV_dataMatrix(annotD, 0, 0, outFile) print " " print " DONE " print " " # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
print " " print " handling ", aCat rmColList = [] for iC in range(numCol): if (catVec[iC] != aCat): rmColList += [iC] numRm = len(rmColList) numKp = numCol - numRm if (numKp < 10): print " --> too few columns remaining ... skipping this category ... (%d) " % numKp else: outD = tsvIO.filter_dataMatrix(dataD, [], rmColList) # make sure that we are not left with any features that are all-NA ... # or nearly all 0 ... (adding this 10sep12) dataMatrix = outD['dataMatrix'] outLabels = outD['rowLabels'] numRowOut = len(dataMatrix) numColOut = len(dataMatrix[0]) # print numRowOut, numColOut rmRowList = [] rmTypes = {} for iRow in range(numRowOut): curType = outLabels[iRow][2:6] allNA = 1 iCol = 0 while (allNA == 1 and iCol < numColOut):
def removeDuplicateSamples(dataD, barcodeLen, firstLast=0): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [1] * numCol numOutCol = 0 sampleDict = {} for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode.startswith("ITMI-")): doNothing = 1 elif (len(aCode) > barcodeLen): aCode = miscTCGA.fixTCGAbarcode(aCode) aCode = aCode[:barcodeLen] if (aCode not in sampleDict.keys()): sampleDict[aCode] = 0 sampleDict[aCode] += 1 allKeys = sampleDict.keys() allKeys.sort() for aKey in allKeys: if (sampleDict[aKey] > 1): dupList = [] print " duplicate columns for sample <%s> ??? " % aKey for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode.startswith(aKey)): print " ", aCode dupList += [aCode] dupList.sort() # print dupList ## ['TCGA-A3-3308-01A-01T-0860-13', 'TCGA-A3-3308-01A-02R-1324-13'] # now decide which one to keep ... we prefer "R" over "T" # apparently ... keepCode = "NA" for kk in range(len(dupList)): aCode = dupList[kk] if (len(aCode) > 19): if (aCode[19] == "R"): keepCode = aCode # if there isn't one with an "R", just take either the first or # the last one, based on the firstLast flag ... if (keepCode == "NA"): if (firstLast == 0): keepCode = dupList[-1] elif (firstLast == 1): keepCode = dupList[0] else: keepCode = dupList[0] # now create the rmList by removing the keepCode from the dupList # ... rmList = [] for aCode in dupList: if (aCode != keepCode): rmList += [aCode] # and finally set the flags to drop the duplicates ... for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode in rmList): keepCols[jj] = 0 print " --> will drop column %d <%s> " % (jj, aCode) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] return (dataD)
print " " print " " for iRow in range(len(skipRowList)): print " REMOVING FEATURE %s " % rowLabels[skipRowList[iRow]] print " " print " " if (len(skipRowList) > int(0.90 * numRow)): print " " print " WARNING !!! more than 90% of the data is going to be lost ??? !!! " print " " # sys.exit(-1) print " " print " calling filter_dataMatrix ... " outD = tsvIO.filter_dataMatrix(dataD, skipRowList, []) # now build up the list of actual features that we have ... curRowLabels = outD['rowLabels'] numRow = len(curRowLabels) print " now we have %d output features " % numRow # and put together a new ordered list for the output ... # (here we need the 'strict' feature names) print " building up new outputOrder vector " outputOrder = [] for jRow in range(len(orderedWhiteList)): aFeat = orderedWhiteList[jRow] if (is_in_list(aFeat, curRowLabels, 'strict')): outputOrder += [aFeat] elif (is_in_list(aFeat, curRowLabels, 'loose')):
print " " print " " for iCol in range(len(skipColList)): print " REMOVING SAMPLE %s " % colLabels[skipColList[iCol]] print " " print " " if (len(skipColList) > int(0.90 * numCol)): print " " print " WARNING !!! more than 90% of the data is going to be lost ??? !!! " print " " # sys.exit(-1) # print " " # print " calling filter_dataMatrix ... " outD = tsvIO.filter_dataMatrix(dataD, [], skipColList) # make sure that we are not left with any features that are all-NA ... dataMatrix = outD['dataMatrix'] numRow = len(dataMatrix) numCol = len(dataMatrix[0]) skipRowList = [] for iRow in range(numRow): allNA = 1 iCol = 0 while (allNA == 1 and iCol < numCol): if (dataMatrix[iRow][iCol] != NA_VALUE): allNA = 0 iCol += 1 if (allNA): skipRowList += [iRow]
def removeIdenticalFeatures(inD, featType): print " in removeIdenticalFeatures ... <%s> " % featType rowLabels = inD['rowLabels'] dataMatrix = inD['dataMatrix'] nRowIn = len(dataMatrix) nColIn = len(dataMatrix[0]) rmRowList = [] iRow = 0 while (iRow < nRowIn): print " " print " working on feature # %d " % iRow iFeatname = rowLabels[iRow] if (featType != "ANY"): if (iFeatname.find(featType) < 0): iRow += 1 continue if (iFeatname.find(":ja_") > 0): iRow += 1 continue curTokens = iFeatname.split(':') iGeneName = curTokens[2] jRow = iRow + 1 done = 0 # check to see if this feature is a ~constant~ if (constantVec(dataMatrix[iRow])): rmRowList += [iRow] print " --> adding constant iRow (%d) to rmRowList <%s> " % ( iRow, iFeatname) done = 1 while not done: if (jRow >= nRowIn): done = 1 continue if (jRow in rmRowList): jRow += 1 continue jFeatname = rowLabels[jRow] if (featType != "ANY"): if (jFeatname.find(featType) < 0): jRow += 1 continue if (jFeatname.find(":ja_") > 0): jRow += 1 continue curTokens = jFeatname.split(':') jGeneName = curTokens[2] ## HERE this makes things MUCH faster, but may not uncover ## all identical features ... if (1): if (jGeneName != iGeneName): done = 1 continue identFeat = 1 ## print " --> comparing %d and %d ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname) for iCol in range(nColIn): if (dataMatrix[iRow][iCol] != dataMatrix[jRow][iCol]): identFeat = 0 if (identFeat): print " --> identical !!! %d and %d ( %s and %s ) " % ( iRow, jRow, iFeatname, jFeatname) newName = chooseMostSpecific([iFeatname, jFeatname]) if (newName == iFeatname): if (jRow not in rmRowList): rmRowList += [jRow] print " --> adding jRow (%d) to rmRowList <%s> " % ( jRow, jFeatname) elif (newName == jFeatname): if (iRow not in rmRowList): rmRowList += [iRow] print " --> adding iRow (%d) to rmRowList <%s> " % ( iRow, iFeatname) print " --> skipping to next iRow " done = 1 continue jRow += 1 iRow += 1 print len(rmRowList) # print rmRowList outD = tsvIO.filter_dataMatrix(inD, rmRowList, []) print " returning ... " return (outD)
print " " print " " for iRow in range(len(skipRowList)): print " REMOVING FEATURE %s " % rowLabels[skipRowList[iRow]] print " " print " " if (len(skipRowList) > int(0.90 * numRow)): print " " print " WARNING !!! more than 90% of the data is going to be lost ??? !!! " print " " # sys.exit(-1) print " " print " calling filter_dataMatrix ... " outD = tsvIO.filter_dataMatrix(dataD, skipRowList, []) # now build up the list of actual features that we have ... curRowLabels = outD['rowLabels'] numRow = len(curRowLabels) print " now we have %d output features " % numRow # and put together a new ordered list for the output ... # (here we need the 'strict' feature names) print " building up new outputOrder vector " outputOrder = [] for jRow in range(len(orderedWhiteList)): aFeat = orderedWhiteList[jRow] if ( is_in_list ( aFeat, curRowLabels, 'strict' ) ): outputOrder += [ aFeat ] elif ( is_in_list ( aFeat, curRowLabels, 'loose' ) ):
print ' (c) TIME ', time.asctime(time.localtime(time.time())) # and at this point we have tested all genes!!! print " length of keepMeth dictionary : ", len(keepMeth) print " length of keepGexp dictionary : ", len(keepGexp) print " original number of rows in input TSV file : ", numRow rmRowList = [] for jRow in range(numRow): if (tsvRowLabels[jRow].lower().find("platform") < 0 ): if (jRow not in keepMeth.keys()): if (jRow not in keepGexp.keys()): rmRowList += [jRow] print " number of rows to be removed : ", len(rmRowList) tsvD = tsvIO.filter_dataMatrix(tsvD, rmRowList, []) print ' (d) TIME ', time.asctime(time.localtime(time.time())) sortRowFlag = 0 sortColFlag = 0 tsvIO.writeTSV_dataMatrix(tsvD, sortRowFlag, sortColFlag, outFile) print ' (e) TIME ', time.asctime(time.localtime(time.time())) print " DONE !!! " # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-# # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
def removeDuplicateSamples(dataD, barcodeLen, firstLast=0): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [1] * numCol numOutCol = 0 sampleDict = {} for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode.startswith("ITMI-")): doNothing = 1 else: aCode = miscTCGA.fixTCGAbarcode(aCode) if (len(aCode) < barcodeLen): aCode = miscTCGA.get_barcode16(aCode) if (aCode not in sampleDict.keys()): sampleDict[aCode] = 0 sampleDict[aCode] += 1 allKeys = sampleDict.keys() allKeys.sort() for aKey in allKeys: if (sampleDict[aKey] > 1): dupList = [] print " duplicate columns for sample <%s> ??? " % aKey for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode.startswith(aKey)): print " ", aCode dupList += [aCode] dupList.sort() # print dupList ## ['TCGA-A3-3308-01A-01T-0860-13', 'TCGA-A3-3308-01A-02R-1324-13'] # now decide which one to keep ... we prefer "R" over "T" # apparently ... keepCode = "NA" for kk in range(len(dupList)): aCode = dupList[kk] if (len(aCode) > 19): if (aCode[19] == "R"): keepCode = aCode # if there isn't one with an "R", just take either the first or # the last one, based on the firstLast flag ... if (keepCode == "NA"): if (firstLast == 0): keepCode = dupList[-1] elif (firstLast == 1): keepCode = dupList[0] else: keepCode = dupList[0] # now create the rmList by removing the keepCode from the dupList # ... rmList = [] for aCode in dupList: if (aCode != keepCode): rmList += [aCode] # and finally set the flags to drop the duplicates ... for jj in range(numCol): aCode = dataD['colLabels'][jj] if (aCode in rmList): keepCols[jj] = 0 print " --> will drop column %d <%s> " % (jj, aCode) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][ jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] return (dataD)
nzHist[numNZ] += 1 if (numNZ < minNZC): rmRowList += [iRow] print " --> number of rows to be skipped : %d out of %d " % (len(rmRowList), numRow) print " number of rows remaining : %d " % (numRow - len(rmRowList)) print " " print " histogram of NZ counts : " for ii in range(len(nzHist)): if (nzHist[ii] > 0): print " %4d %12d " % (ii, nzHist[ii]) print " " print " " newD = tsvIO.filter_dataMatrix(testD, rmRowList, []) tsvIO.lookAtDataD(newD) if (newD['dataType'] == ""): newD['dataType'] = "B:GNAB" colLabels = newD['colLabels'] for ii in range(len(colLabels)): aLabel = colLabels[ii] if (aLabel.find("TUMOR") > 0): print " ERROR ??? how did this get here ??? ", aLabel sys.exit(-1) print " " print " ready to write output file ... ", outFile tsvIO.writeTSV_dataMatrix(newD, 0, 0, outFile)
def removeNonTumorSamples(dataD): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [0] * numCol tumorList = [] numOutCol = 0 for jj in range(numCol): aCode = dataD['colLabels'][jj] tumorCode = '' # hack to not mess with ITMI samples ... if (aCode.startswith("ITMI-")): tumorCode = aCode # if the barcode is not even long enough to specify the sample type, # we will just assume that we keep it ... elif (len(aCode) < 16): tumorCode = aCode else: # if the barcode is at least 16 characters long, then we parse it # ... if (aCode.startswith("ITMI-")): doNothing = 1 else: aCode = miscTCGA.fixTCGAbarcode(aCode) (site, patient, sample, vial, portion, analyte, plate, center) = miscTCGA.parseTCGAbarcode(aCode) try: iSample = int(sample) except: iSample = -1 if (sample != aCode): print " what is going on here ??? ", aCode sys.exit(-1) if (iSample > 0 and iSample < 10): tumorCode = miscTCGA.sampleLevelCode(aCode) if (tumorCode != ''): if (tumorCode not in tumorList): tumorList += [tumorCode] keepCols[jj] = 1 numOutCol += 1 else: print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? " print aCode, tumorCode # print tumorList print " --> keeping only the first one " # sys.exit(-1) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][ jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] # NOTE: this next bit may no longer be necessary ... # and also set the shortened TCGA barcodes as labels ... if (len(tumorList) != len(dataD['dataMatrix'][0])): print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! " print len(tumorList) tsvIO.lookAtDataD(dataD) sys.exit(-1) dataD['colLabels'] = tumorList print " now using shortened barcodes .. ", dataD['colLabels'][:5] return (dataD)
testD = dropSampleTypeFromBarcodes(testD) tsvIO.lookAtDataD(testD) else: print " " print " dropping details (beyond sample type) at the end of the barcodes ... " testD = dropDetailsFromBarcodes(testD) tsvIO.lookAtDataD(testD) if (0): print " " print " at the individual input file level, remove rows and then columns with too many missing values ... " skipRowList = tsvIO.getSkipList(rowMaxNAfrac, testD, 'row') if (skipRowList != []): testD = tsvIO.filter_dataMatrix(testD, skipRowList, []) tsvIO.lookAtDataD(testD) skipColList = tsvIO.getSkipList(colMaxNAfrac, testD, 'col') if (skipColList != []): testD = tsvIO.filter_dataMatrix(testD, [], skipColList) tsvIO.lookAtDataD(testD) # finally, add this dictionary to our list of input data sets ... inputData += [testD] tokenList = testD['dataType'].split(':') if (len(tokenList) != 2): if (testD['dataType'] == "NA"): testD['dataType'] = "M:MISC" tokenList = testD['dataType'].split(':')
def removeNonTumorSamples(dataD): ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] ) numCol = len(dataD['colLabels']) keepCols = [0] * numCol tumorList = [] numOutCol = 0 for jj in range(numCol): aCode = dataD['colLabels'][jj] tumorCode = '' # if the barcode is not even long enough to specify the sample type, # we will just assume that we keep it ... if (len(aCode) < 15): tumorCode = aCode else: # if the barcode is at least 15 characters long, then we parse it # ... aCode = miscTCGA.fixTCGAbarcode(aCode) (site, patient, sample, vial, portion, analyte, plate, center) = miscTCGA.parseTCGAbarcode(aCode) try: iSample = int(sample) except: iSample = -1 if (sample != aCode): print " what is going on here ??? ", aCode sys.exit(-1) if (iSample > 0 and iSample < 10): tumorCode = miscTCGA.sampleLevelCode(aCode) if (tumorCode != ''): if (tumorCode not in tumorList): tumorList += [tumorCode] keepCols[jj] = 1 numOutCol += 1 else: print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? " print aCode, tumorCode # print tumorList print " --> keeping only the first one " # sys.exit(-1) rmColList = [] for jj in range(numCol): if (keepCols[jj] == 0): rmColList += [jj] print " will remove sample <%s> " % dataD['colLabels'][jj] # filter out the columns we don't want ... dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList) print " back from filter_dataMatrix ... ", dataD['colLabels'][:5] # NOTE: this next bit may no longer be necessary ... # and also set the shortened TCGA barcodes as labels ... if (len(tumorList) != len(dataD['dataMatrix'][0])): print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! " print len(tumorList) tsvIO.lookAtDataD(dataD) sys.exit(-1) dataD['colLabels'] = tumorList print " now using shortened barcodes .. ", dataD['colLabels'][:5] return (dataD)
# check that the feature names are still unique ... print " --> verify that the feature names are unique ... " (newLabels, rmList) = tsvIO.uniqueFeatureLabels(annotD['rowLabels'], annotD['dataMatrix']) print " back from tsvIO.uniqueFeatureLabels " # quick sanity check that labels are still what I think they are ... for ii in range(len(newLabels)): if (not (newLabels[ii] == annotD['rowLabels'][ii])): print " " print " BAILING !!! ", newLabels[ii], annotD['rowLabels'][ii] print " " sys.exit(-1) # remove any 'extra' features that need removing ... if (len(rmList) > 0): print " --> need to remove these rows ", rmList tmpD = tsvIO.filter_dataMatrix(annotD, rmList, []) annotD = tmpD # and write the matrix back out print " --> calling tsvIO.writeTSV_dataMatrix ... " tsvIO.writeTSV_dataMatrix(annotD, 0, 0, outFile) print " " print " DONE " print " " # -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#