Esempio n. 1
0
def checkFeatures(dataD):

    print " "
    print " in checkFeatures ... "

    # the feature matrix has thousands of features x hundreds of patients
    rowLabels = dataD['rowLabels']
    colLabels = dataD['colLabels']
    numRow = len(rowLabels)
    numCol = len(colLabels)
    dataMatrix = dataD['dataMatrix']
    print " %d rows x %d columns " % (numRow, numCol)
    # print rowLabels[:5]
    # print rowLabels[-5:]

    rmRowList = []

    for iRow in range(numRow):
        featName = rowLabels[iRow]

        # watch out for "bad" feature names, eg 1-Dec or unknown
        if (badFeatureName(featName)):
            rmRowList += [iRow]
        continue

        uVals = []
        for iCol in range(numCol):
            if (dataMatrix[iRow][iCol] != "NA"):
                if (dataMatrix[iRow][iCol] != NA_VALUE):
                    if (dataMatrix[iRow][iCol] not in uVals):
                        uVals += [dataMatrix[iRow][iCol]]
        if (len(uVals) < 2):
            rmRowList += [iRow]
            # print " will remove row #%d <%s> " % ( iRow, featName ), uVals
        else:
            if (len(uVals) == 2):
                uVals.sort()
                if (featName[0] != "B"):
                    if (uVals == [0, 1]):
                        newName = "B" + featName[1:]
                        print " fixing feature name : <%s> --> <%s> " % (
                            featName, newName), uVals
                        rowLabels[iRow] = newName

    dataD['rowLabels'] = rowLabels

    print " --> removing %d uniform features " % len(rmRowList)
    newD = tsvIO.filter_dataMatrix(dataD, rmRowList, [])

    return (newD)
Esempio n. 2
0
def checkFeatures(dataD):

    print " "
    print " in checkFeatures ... "

    # the feature matrix has thousands of features x hundreds of patients
    rowLabels = dataD['rowLabels']
    colLabels = dataD['colLabels']
    numRow = len(rowLabels)
    numCol = len(colLabels)
    dataMatrix = dataD['dataMatrix']
    print " %d rows x %d columns " % (numRow, numCol)
    # print rowLabels[:5]
    # print rowLabels[-5:]

    rmRowList = []

    for iRow in range(numRow):
        featName = rowLabels[iRow]

        # watch out for "bad" feature names, eg 1-Dec or unknown
        if (badFeatureName(featName)):
            rmRowList += [iRow]
        continue

        uVals = []
        for iCol in range(numCol):
            if (dataMatrix[iRow][iCol] != "NA"):
                if (dataMatrix[iRow][iCol] != NA_VALUE):
                    if (dataMatrix[iRow][iCol] not in uVals):
                        uVals += [dataMatrix[iRow][iCol]]
        if (len(uVals) < 2):
            rmRowList += [iRow]
            # print " will remove row #%d <%s> " % ( iRow, featName ), uVals
        else:
            if (len(uVals) == 2):
                uVals.sort()
                if (featName[0] != "B"):
                    if (uVals == [0, 1]):
                        newName = "B" + featName[1:]
                        print " fixing feature name : <%s> --> <%s> " % (featName, newName), uVals
                        rowLabels[iRow] = newName

    dataD['rowLabels'] = rowLabels

    print " --> removing %d uniform features " % len(rmRowList)
    newD = tsvIO.filter_dataMatrix(dataD, rmRowList, [])

    return (newD)
Esempio n. 3
0
def removeIdenticalFeatures(inD, featType):

    print " in removeIdenticalFeatures ... <%s> " % featType

    rowLabels = inD['rowLabels']
    dataMatrix = inD['dataMatrix']

    nRowIn = len(dataMatrix)
    nColIn = len(dataMatrix[0])

    rmRowList = []

    iRow = 0
    while (iRow < nRowIn):

        print " "
        print " working on feature # %d " % iRow

        iFeatname = rowLabels[iRow]
        if (featType != "ANY"):
            if (iFeatname.find(featType) < 0):
                iRow += 1
                continue

        if (iFeatname.find(":ja_") > 0 ):
            iRow += 1
            continue

        curTokens = iFeatname.split(':')
        iGeneName = curTokens[2]

        jRow = iRow + 1
        done = 0

        # check to see if this feature is a ~constant~
        if (constantVec(dataMatrix[iRow])):
            rmRowList += [iRow]
            print "         --> adding constant iRow (%d) to rmRowList <%s> " % (iRow, iFeatname)
            done = 1

        while not done:

            if (jRow >= nRowIn):
                done = 1
                continue

            if (jRow in rmRowList):
                jRow += 1
                continue

            jFeatname = rowLabels[jRow]
            if (featType != "ANY"):
                if (jFeatname.find(featType) < 0):
                    jRow += 1
                    continue

            if (jFeatname.find(":ja_") > 0):
                jRow += 1
                continue

            curTokens = jFeatname.split(':')
            jGeneName = curTokens[2]

            ## HERE this makes things MUCH faster, but may not uncover
            ## all identical features ...
            if ( 1 ):
                if (jGeneName != iGeneName):
                    done = 1
                    continue

            identFeat = 1
            ## print "     --> comparing %d and %d  ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname)
            for iCol in range(nColIn):
                if (dataMatrix[iRow][iCol] != dataMatrix[jRow][iCol]):
                    identFeat = 0

            if (identFeat):
                print "     --> identical !!! %d and %d  ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname)
                newName = chooseMostSpecific([iFeatname, jFeatname])
                if (newName == iFeatname):
                    if (jRow not in rmRowList):
                        rmRowList += [jRow]
                        print "         --> adding jRow (%d) to rmRowList <%s> " % (jRow, jFeatname)
                elif (newName == jFeatname):
                    if (iRow not in rmRowList):
                        rmRowList += [iRow]
                        print "         --> adding iRow (%d) to rmRowList <%s> " % (iRow, iFeatname)
                    print "         --> skipping to next iRow "
                    done = 1
                    continue

            jRow += 1

        iRow += 1

    print len(rmRowList)
    # print rmRowList

    outD = tsvIO.filter_dataMatrix(inD, rmRowList, [])

    print " returning ... "
    return (outD)
Esempio n. 4
0
        # now we have tested this gene against all nearby methylation probes
        # ...
        if (jNeg >= 0):
            if (maxNeg <= -corrThresh):
                keepMeth[jNeg] = 1
                print " (b) keeping Spearman rho \t %.3f " % maxNeg
        if (jPos >= 0):
            if (maxPos >= corrThresh):
                keepMeth[jPos] = 1
                print " (c) keeping Spearman rho \t %.3f " % maxPos

    # and at this point we have tested all genes!!!
    print " length of keepMeth dictionary : ", len(keepMeth)
    print " original number of rows in methylation file : ", numMethRow

    rmMethRowList = []

    for jRow in range(numMethRow):
        if (jRow not in keepMeth.keys()):
            rmMethRowList += [jRow]
    print " number of rows to be removed : ", len(rmMethRowList)

    methD = tsvIO.filter_dataMatrix(methD, rmMethRowList, [])

    sortRowFlag = 0
    sortColFlag = 0

    tsvIO.writeTSV_dataMatrix(methD, sortRowFlag, sortColFlag, outFile)

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Esempio n. 5
0
        refGeneDict, cytoDict, forceFlag, nameChangeFlag )

    # check that the feature names are still unique ...
    print " --> verify that the feature names are unique ... "
    ( newLabels, rmList ) = tsvIO.uniqueFeatureLabels(annotD['rowLabels'], annotD['dataMatrix'])
    print "     back from tsvIO.uniqueFeatureLabels "

    # quick sanity check that labels are still what I think they are ...
    for ii in range(len(newLabels)):
        if (not (newLabels[ii] == annotD['rowLabels'][ii])):
            print " "
            print " BAILING !!! ", newLabels[ii], annotD['rowLabels'][ii]
            print " "
            sys.exit(-1)

    # remove any 'extra' features that need removing ...
    if ( len(rmList) > 0 ):
        print "     --> need to remove these rows ", rmList
        tmpD = tsvIO.filter_dataMatrix ( annotD, rmList, [] )
        annotD = tmpD

    # and write the matrix back out
    print " --> calling tsvIO.writeTSV_dataMatrix ... "
    tsvIO.writeTSV_dataMatrix(annotD, 0, 0, outFile)

    print " "
    print " DONE "
    print " "

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Esempio n. 6
0
        print " "
        print " handling ", aCat

        rmColList = []
        for iC in range(numCol):
            if (catVec[iC] != aCat):
                rmColList += [iC]

        numRm = len(rmColList)
        numKp = numCol - numRm
        if (numKp < 10):
            print " --> too few columns remaining ... skipping this category ... (%d) " % numKp

        else:

            outD = tsvIO.filter_dataMatrix(dataD, [], rmColList)

            # make sure that we are not left with any features that are all-NA ...
            # or nearly all 0 ... (adding this 10sep12)
            dataMatrix = outD['dataMatrix']
            outLabels = outD['rowLabels']
            numRowOut = len(dataMatrix)
            numColOut = len(dataMatrix[0])
            # print numRowOut, numColOut
            rmRowList = []
            rmTypes = {}
            for iRow in range(numRowOut):
                curType = outLabels[iRow][2:6]
                allNA = 1
                iCol = 0
                while (allNA == 1 and iCol < numColOut):
Esempio n. 7
0
def removeDuplicateSamples(dataD, barcodeLen, firstLast=0):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [1] * numCol
    numOutCol = 0

    sampleDict = {}

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]

        if (aCode.startswith("ITMI-")):
            doNothing = 1

        elif (len(aCode) > barcodeLen):
            aCode = miscTCGA.fixTCGAbarcode(aCode)
            aCode = aCode[:barcodeLen]

        if (aCode not in sampleDict.keys()):
            sampleDict[aCode] = 0

        sampleDict[aCode] += 1

    allKeys = sampleDict.keys()
    allKeys.sort()

    for aKey in allKeys:

        if (sampleDict[aKey] > 1):
            dupList = []
            print " duplicate columns for sample <%s> ??? " % aKey
            for jj in range(numCol):
                aCode = dataD['colLabels'][jj]
                if (aCode.startswith(aKey)):
                    print "        ", aCode
                    dupList += [aCode]
            dupList.sort()
            # print dupList

            ## ['TCGA-A3-3308-01A-01T-0860-13', 'TCGA-A3-3308-01A-02R-1324-13']
            # now decide which one to keep ... we prefer "R" over "T"
            # apparently ...
            keepCode = "NA"
            for kk in range(len(dupList)):
                aCode = dupList[kk]
                if (len(aCode) > 19):
                    if (aCode[19] == "R"):
                        keepCode = aCode

            # if there isn't one with an "R", just take either the first or
            # the last one, based on the firstLast flag ...
            if (keepCode == "NA"):
                if (firstLast == 0):
                    keepCode = dupList[-1]
                elif (firstLast == 1):
                    keepCode = dupList[0]
                else:
                    keepCode = dupList[0]

            # now create the rmList by removing the keepCode from the dupList
            # ...
            rmList = []
            for aCode in dupList:
                if (aCode != keepCode):
                    rmList += [aCode]

            # and finally set the flags to drop the duplicates ...
            for jj in range(numCol):
                aCode = dataD['colLabels'][jj]
                if (aCode in rmList):
                    keepCols[jj] = 0
                    print "         --> will drop column %d <%s> " % (jj, aCode)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    return (dataD)
Esempio n. 8
0
        print " "
        print " "
        for iRow in range(len(skipRowList)):
            print "    REMOVING FEATURE %s " % rowLabels[skipRowList[iRow]]
        print " "
        print " "

    if (len(skipRowList) > int(0.90 * numRow)):
        print " "
        print " WARNING !!! more than 90% of the data is going to be lost ??? !!! "
        print " "
        # sys.exit(-1)

    print " "
    print " calling filter_dataMatrix ... "
    outD = tsvIO.filter_dataMatrix(dataD, skipRowList, [])

    # now build up the list of actual features that we have ...
    curRowLabels = outD['rowLabels']
    numRow = len(curRowLabels)
    print " now we have %d output features " % numRow

    # and put together a new ordered list for the output ...
    # (here we need the 'strict' feature names)
    print " building up new outputOrder vector "
    outputOrder = []
    for jRow in range(len(orderedWhiteList)):
        aFeat = orderedWhiteList[jRow]
        if (is_in_list(aFeat, curRowLabels, 'strict')):
            outputOrder += [aFeat]
        elif (is_in_list(aFeat, curRowLabels, 'loose')):
Esempio n. 9
0
        print " "
        print " "
        for iCol in range(len(skipColList)):
            print "    REMOVING SAMPLE %s " % colLabels[skipColList[iCol]]
        print " "
        print " "

    if (len(skipColList) > int(0.90 * numCol)):
        print " "
        print " WARNING !!! more than 90% of the data is going to be lost ??? !!! "
        print " "
        # sys.exit(-1)

    # print " "
    # print " calling filter_dataMatrix ... "
    outD = tsvIO.filter_dataMatrix(dataD, [], skipColList)

    # make sure that we are not left with any features that are all-NA ...
    dataMatrix = outD['dataMatrix']
    numRow = len(dataMatrix)
    numCol = len(dataMatrix[0])
    skipRowList = []
    for iRow in range(numRow):
        allNA = 1
        iCol = 0
        while (allNA == 1 and iCol < numCol):
            if (dataMatrix[iRow][iCol] != NA_VALUE):
                allNA = 0
            iCol += 1
        if (allNA):
            skipRowList += [iRow]
Esempio n. 10
0
def removeIdenticalFeatures(inD, featType):

    print " in removeIdenticalFeatures ... <%s> " % featType

    rowLabels = inD['rowLabels']
    dataMatrix = inD['dataMatrix']

    nRowIn = len(dataMatrix)
    nColIn = len(dataMatrix[0])

    rmRowList = []

    iRow = 0
    while (iRow < nRowIn):

        print " "
        print " working on feature # %d " % iRow

        iFeatname = rowLabels[iRow]
        if (featType != "ANY"):
            if (iFeatname.find(featType) < 0):
                iRow += 1
                continue

        if (iFeatname.find(":ja_") > 0):
            iRow += 1
            continue

        curTokens = iFeatname.split(':')
        iGeneName = curTokens[2]

        jRow = iRow + 1
        done = 0

        # check to see if this feature is a ~constant~
        if (constantVec(dataMatrix[iRow])):
            rmRowList += [iRow]
            print "         --> adding constant iRow (%d) to rmRowList <%s> " % (
                iRow, iFeatname)
            done = 1

        while not done:

            if (jRow >= nRowIn):
                done = 1
                continue

            if (jRow in rmRowList):
                jRow += 1
                continue

            jFeatname = rowLabels[jRow]
            if (featType != "ANY"):
                if (jFeatname.find(featType) < 0):
                    jRow += 1
                    continue

            if (jFeatname.find(":ja_") > 0):
                jRow += 1
                continue

            curTokens = jFeatname.split(':')
            jGeneName = curTokens[2]

            ## HERE this makes things MUCH faster, but may not uncover
            ## all identical features ...
            if (1):
                if (jGeneName != iGeneName):
                    done = 1
                    continue

            identFeat = 1
            ## print "     --> comparing %d and %d  ( %s and %s ) " % (iRow, jRow, iFeatname, jFeatname)
            for iCol in range(nColIn):
                if (dataMatrix[iRow][iCol] != dataMatrix[jRow][iCol]):
                    identFeat = 0

            if (identFeat):
                print "     --> identical !!! %d and %d  ( %s and %s ) " % (
                    iRow, jRow, iFeatname, jFeatname)
                newName = chooseMostSpecific([iFeatname, jFeatname])
                if (newName == iFeatname):
                    if (jRow not in rmRowList):
                        rmRowList += [jRow]
                        print "         --> adding jRow (%d) to rmRowList <%s> " % (
                            jRow, jFeatname)
                elif (newName == jFeatname):
                    if (iRow not in rmRowList):
                        rmRowList += [iRow]
                        print "         --> adding iRow (%d) to rmRowList <%s> " % (
                            iRow, iFeatname)
                    print "         --> skipping to next iRow "
                    done = 1
                    continue

            jRow += 1

        iRow += 1

    print len(rmRowList)
    # print rmRowList

    outD = tsvIO.filter_dataMatrix(inD, rmRowList, [])

    print " returning ... "
    return (outD)
Esempio n. 11
0
        print " "
        print " "
        for iRow in range(len(skipRowList)):
            print "    REMOVING FEATURE %s " % rowLabels[skipRowList[iRow]]
        print " "
        print " "

    if (len(skipRowList) > int(0.90 * numRow)):
        print " "
        print " WARNING !!! more than 90% of the data is going to be lost ??? !!! "
        print " "
        # sys.exit(-1)

    print " "
    print " calling filter_dataMatrix ... "
    outD = tsvIO.filter_dataMatrix(dataD, skipRowList, [])

    # now build up the list of actual features that we have ... 
    curRowLabels = outD['rowLabels']
    numRow = len(curRowLabels)
    print " now we have %d output features " % numRow

    # and put together a new ordered list for the output ...
    # (here we need the 'strict' feature names)
    print " building up new outputOrder vector "
    outputOrder = []
    for jRow in range(len(orderedWhiteList)):
        aFeat = orderedWhiteList[jRow]
        if ( is_in_list ( aFeat, curRowLabels, 'strict' ) ):
            outputOrder += [ aFeat ]
        elif ( is_in_list ( aFeat, curRowLabels, 'loose' ) ):
Esempio n. 12
0
        print " "
        print " "
        for iCol in range(len(skipColList)):
            print "    REMOVING SAMPLE %s " % colLabels[skipColList[iCol]]
        print " "
        print " "

    if (len(skipColList) > int(0.90 * numCol)):
        print " "
        print " WARNING !!! more than 90% of the data is going to be lost ??? !!! "
        print " "
        # sys.exit(-1)

    # print " "
    # print " calling filter_dataMatrix ... "
    outD = tsvIO.filter_dataMatrix(dataD, [], skipColList)

    # make sure that we are not left with any features that are all-NA ...
    dataMatrix = outD['dataMatrix']
    numRow = len(dataMatrix)
    numCol = len(dataMatrix[0])
    skipRowList = []
    for iRow in range(numRow):
        allNA = 1
        iCol = 0
        while (allNA == 1 and iCol < numCol):
            if (dataMatrix[iRow][iCol] != NA_VALUE):
                allNA = 0
            iCol += 1
        if (allNA):
            skipRowList += [iRow]
Esempio n. 13
0
        print " "
        print " handling ", aCat

        rmColList = []
        for iC in range(numCol):
            if (catVec[iC] != aCat):
                rmColList += [iC]

        numRm = len(rmColList)
        numKp = numCol - numRm
        if (numKp < 10):
            print " --> too few columns remaining ... skipping this category ... (%d) " % numKp

        else:

            outD = tsvIO.filter_dataMatrix(dataD, [], rmColList)

            # make sure that we are not left with any features that are all-NA ...
            # or nearly all 0 ... (adding this 10sep12)
            dataMatrix = outD['dataMatrix']
            outLabels = outD['rowLabels']
            numRowOut = len(dataMatrix)
            numColOut = len(dataMatrix[0])
            # print numRowOut, numColOut
            rmRowList = []
            rmTypes = {}
            for iRow in range(numRowOut):
                curType = outLabels[iRow][2:6]
                allNA = 1
                iCol = 0
                while (allNA == 1 and iCol < numColOut):
Esempio n. 14
0
    print ' (c) TIME ', time.asctime(time.localtime(time.time()))

    # and at this point we have tested all genes!!!
    print " length of keepMeth dictionary : ", len(keepMeth)
    print " length of keepGexp dictionary : ", len(keepGexp)
    print " original number of rows in input TSV file : ", numRow

    rmRowList = []

    for jRow in range(numRow):
        if (tsvRowLabels[jRow].lower().find("platform") < 0 ):
            if (jRow not in keepMeth.keys()):
                if (jRow not in keepGexp.keys()):
                    rmRowList += [jRow]
    print " number of rows to be removed : ", len(rmRowList)

    tsvD = tsvIO.filter_dataMatrix(tsvD, rmRowList, [])

    print ' (d) TIME ', time.asctime(time.localtime(time.time()))

    sortRowFlag = 0
    sortColFlag = 0

    tsvIO.writeTSV_dataMatrix(tsvD, sortRowFlag, sortColFlag, outFile)

    print ' (e) TIME ', time.asctime(time.localtime(time.time()))
    print " DONE !!! "

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
Esempio n. 15
0
def removeDuplicateSamples(dataD, barcodeLen, firstLast=0):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [1] * numCol
    numOutCol = 0

    sampleDict = {}

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]

        if (aCode.startswith("ITMI-")):
            doNothing = 1

        else:
            aCode = miscTCGA.fixTCGAbarcode(aCode)
            if (len(aCode) < barcodeLen):
                aCode = miscTCGA.get_barcode16(aCode)

        if (aCode not in sampleDict.keys()):
            sampleDict[aCode] = 0

        sampleDict[aCode] += 1

    allKeys = sampleDict.keys()
    allKeys.sort()

    for aKey in allKeys:

        if (sampleDict[aKey] > 1):
            dupList = []
            print " duplicate columns for sample <%s> ??? " % aKey
            for jj in range(numCol):
                aCode = dataD['colLabels'][jj]
                if (aCode.startswith(aKey)):
                    print "        ", aCode
                    dupList += [aCode]
            dupList.sort()
            # print dupList

            ## ['TCGA-A3-3308-01A-01T-0860-13', 'TCGA-A3-3308-01A-02R-1324-13']
            # now decide which one to keep ... we prefer "R" over "T"
            # apparently ...
            keepCode = "NA"
            for kk in range(len(dupList)):
                aCode = dupList[kk]
                if (len(aCode) > 19):
                    if (aCode[19] == "R"):
                        keepCode = aCode

            # if there isn't one with an "R", just take either the first or
            # the last one, based on the firstLast flag ...
            if (keepCode == "NA"):
                if (firstLast == 0):
                    keepCode = dupList[-1]
                elif (firstLast == 1):
                    keepCode = dupList[0]
                else:
                    keepCode = dupList[0]

            # now create the rmList by removing the keepCode from the dupList
            # ...
            rmList = []
            for aCode in dupList:
                if (aCode != keepCode):
                    rmList += [aCode]

            # and finally set the flags to drop the duplicates ...
            for jj in range(numCol):
                aCode = dataD['colLabels'][jj]
                if (aCode in rmList):
                    keepCols[jj] = 0
                    print "         --> will drop column %d <%s> " % (jj,
                                                                      aCode)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][
                jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    return (dataD)
Esempio n. 16
0
            nzHist[numNZ] += 1
            if (numNZ < minNZC):
                rmRowList += [iRow]

    print " --> number of rows to be skipped : %d out of %d " % (len(rmRowList), numRow)
    print "     number of rows remaining : %d " % (numRow - len(rmRowList))

    print " "
    print " histogram of NZ counts : "
    for ii in range(len(nzHist)):
        if (nzHist[ii] > 0):
            print " %4d  %12d " % (ii, nzHist[ii])
    print " "
    print " "

    newD = tsvIO.filter_dataMatrix(testD, rmRowList, [])
    tsvIO.lookAtDataD(newD)

    if (newD['dataType'] == ""):
        newD['dataType'] = "B:GNAB"

    colLabels = newD['colLabels']
    for ii in range(len(colLabels)):
        aLabel = colLabels[ii]
        if (aLabel.find("TUMOR") > 0):
            print " ERROR ??? how did this get here ??? ", aLabel
            sys.exit(-1)

    print " "
    print " ready to write output file ... ", outFile
    tsvIO.writeTSV_dataMatrix(newD, 0, 0, outFile)
Esempio n. 17
0
def removeNonTumorSamples(dataD):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [0] * numCol
    tumorList = []
    numOutCol = 0

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]
        tumorCode = ''

        # hack to not mess with ITMI samples ...
        if (aCode.startswith("ITMI-")):
            tumorCode = aCode

        # if the barcode is not even long enough to specify the sample type,
        # we will just assume that we keep it ...
        elif (len(aCode) < 16):
            tumorCode = aCode

        else:
            # if the barcode is at least 16 characters long, then we parse it
            # ...
            if (aCode.startswith("ITMI-")):
                doNothing = 1
            else:
                aCode = miscTCGA.fixTCGAbarcode(aCode)
                (site, patient, sample, vial, portion, analyte, plate,
                 center) = miscTCGA.parseTCGAbarcode(aCode)
                try:
                    iSample = int(sample)
                except:
                    iSample = -1
                    if (sample != aCode):
                        print " what is going on here ??? ", aCode
                        sys.exit(-1)
                if (iSample > 0 and iSample < 10):
                    tumorCode = miscTCGA.sampleLevelCode(aCode)

        if (tumorCode != ''):
            if (tumorCode not in tumorList):
                tumorList += [tumorCode]
                keepCols[jj] = 1
                numOutCol += 1
            else:
                print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? "
                print aCode, tumorCode
                # print tumorList
                print "          --> keeping only the first one "
                # sys.exit(-1)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][
                jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    # NOTE: this next bit may no longer be necessary ...
    # and also set the shortened TCGA barcodes as labels ...
    if (len(tumorList) != len(dataD['dataMatrix'][0])):
        print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! "
        print len(tumorList)
        tsvIO.lookAtDataD(dataD)
        sys.exit(-1)

    dataD['colLabels'] = tumorList
    print " now using shortened barcodes .. ", dataD['colLabels'][:5]

    return (dataD)
Esempio n. 18
0
                testD = dropSampleTypeFromBarcodes(testD)
                tsvIO.lookAtDataD(testD)

            else:
                print " "
                print " dropping details (beyond sample type) at the end of the barcodes ... "
                testD = dropDetailsFromBarcodes(testD)
                tsvIO.lookAtDataD(testD)

        if (0):

            print " "
            print " at the individual input file level, remove rows and then columns with too many missing values ... "
            skipRowList = tsvIO.getSkipList(rowMaxNAfrac, testD, 'row')
            if (skipRowList != []):
                testD = tsvIO.filter_dataMatrix(testD, skipRowList, [])
            tsvIO.lookAtDataD(testD)

            skipColList = tsvIO.getSkipList(colMaxNAfrac, testD, 'col')
            if (skipColList != []):
                testD = tsvIO.filter_dataMatrix(testD, [], skipColList)
            tsvIO.lookAtDataD(testD)

        # finally, add this dictionary to our list of input data sets ...
        inputData += [testD]

        tokenList = testD['dataType'].split(':')
        if (len(tokenList) != 2):
            if (testD['dataType'] == "NA"):
                testD['dataType'] = "M:MISC"
                tokenList = testD['dataType'].split(':')
Esempio n. 19
0
def removeNonTumorSamples(dataD):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [0] * numCol
    tumorList = []
    numOutCol = 0

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]
        tumorCode = ''

        # if the barcode is not even long enough to specify the sample type,
        # we will just assume that we keep it ...
        if (len(aCode) < 15):
            tumorCode = aCode

        else:
            # if the barcode is at least 15 characters long, then we parse it
            # ...
            aCode = miscTCGA.fixTCGAbarcode(aCode)
            (site, patient, sample, vial, portion, analyte,
             plate, center) = miscTCGA.parseTCGAbarcode(aCode)
            try:
                iSample = int(sample)
            except:
                iSample = -1
                if (sample != aCode):
                    print " what is going on here ??? ", aCode
                    sys.exit(-1)
            if (iSample > 0 and iSample < 10):
                tumorCode = miscTCGA.sampleLevelCode(aCode)

        if (tumorCode != ''):
            if (tumorCode not in tumorList):
                tumorList += [tumorCode]
                keepCols[jj] = 1
                numOutCol += 1
            else:
                print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? "
                print aCode, tumorCode
                # print tumorList
                print "          --> keeping only the first one "
                # sys.exit(-1)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    # NOTE: this next bit may no longer be necessary ...
    # and also set the shortened TCGA barcodes as labels ...
    if (len(tumorList) != len(dataD['dataMatrix'][0])):
        print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! "
        print len(tumorList)
        tsvIO.lookAtDataD(dataD)
        sys.exit(-1)

    dataD['colLabels'] = tumorList
    print " now using shortened barcodes .. ", dataD['colLabels'][:5]

    return (dataD)
Esempio n. 20
0
    # check that the feature names are still unique ...
    print " --> verify that the feature names are unique ... "
    (newLabels, rmList) = tsvIO.uniqueFeatureLabels(annotD['rowLabels'],
                                                    annotD['dataMatrix'])
    print "     back from tsvIO.uniqueFeatureLabels "

    # quick sanity check that labels are still what I think they are ...
    for ii in range(len(newLabels)):
        if (not (newLabels[ii] == annotD['rowLabels'][ii])):
            print " "
            print " BAILING !!! ", newLabels[ii], annotD['rowLabels'][ii]
            print " "
            sys.exit(-1)

    # remove any 'extra' features that need removing ...
    if (len(rmList) > 0):
        print "     --> need to remove these rows ", rmList
        tmpD = tsvIO.filter_dataMatrix(annotD, rmList, [])
        annotD = tmpD

    # and write the matrix back out
    print " --> calling tsvIO.writeTSV_dataMatrix ... "
    tsvIO.writeTSV_dataMatrix(annotD, 0, 0, outFile)

    print " "
    print " DONE "
    print " "

# -#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#