def removeNonTumorSamples(dataD):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [0] * numCol
    tumorList = []
    numOutCol = 0

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]
        tumorCode = ''

        # if the barcode is not even long enough to specify the sample type,
        # we will just assume that we keep it ...
        if (len(aCode) < 15):
            tumorCode = aCode

        else:
            # if the barcode is at least 15 characters long, then we parse it
            # ...
            aCode = miscTCGA.fixTCGAbarcode(aCode)
            (site, patient, sample, vial, portion, analyte,
             plate, center) = miscTCGA.parseTCGAbarcode(aCode)
            try:
                iSample = int(sample)
            except:
                iSample = -1
                if (sample != aCode):
                    print " what is going on here ??? ", aCode
                    sys.exit(-1)
            if (iSample > 0 and iSample < 10):
                tumorCode = miscTCGA.sampleLevelCode(aCode)

        if (tumorCode != ''):
            if (tumorCode not in tumorList):
                tumorList += [tumorCode]
                keepCols[jj] = 1
                numOutCol += 1
            else:
                print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? "
                print aCode, tumorCode
                # print tumorList
                print "          --> keeping only the first one "
                # sys.exit(-1)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    # NOTE: this next bit may no longer be necessary ...
    # and also set the shortened TCGA barcodes as labels ...
    if (len(tumorList) != len(dataD['dataMatrix'][0])):
        print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! "
        print len(tumorList)
        tsvIO.lookAtDataD(dataD)
        sys.exit(-1)

    dataD['colLabels'] = tumorList
    print " now using shortened barcodes .. ", dataD['colLabels'][:5]

    return (dataD)
Exemple #2
0
def removeNonTumorSamples(dataD):

    ## miscTCGA.lookAtBarcodes ( dataD['colLabels'] )
    numCol = len(dataD['colLabels'])
    keepCols = [0] * numCol
    tumorList = []
    numOutCol = 0

    for jj in range(numCol):
        aCode = dataD['colLabels'][jj]
        tumorCode = ''

        # hack to not mess with ITMI samples ...
        if (aCode.startswith("ITMI-")):
            tumorCode = aCode

        # if the barcode is not even long enough to specify the sample type,
        # we will just assume that we keep it ...
        elif (len(aCode) < 16):
            tumorCode = aCode

        else:
            # if the barcode is at least 16 characters long, then we parse it
            # ...
            if (aCode.startswith("ITMI-")):
                doNothing = 1
            else:
                aCode = miscTCGA.fixTCGAbarcode(aCode)
                (site, patient, sample, vial, portion, analyte, plate,
                 center) = miscTCGA.parseTCGAbarcode(aCode)
                try:
                    iSample = int(sample)
                except:
                    iSample = -1
                    if (sample != aCode):
                        print " what is going on here ??? ", aCode
                        sys.exit(-1)
                if (iSample > 0 and iSample < 10):
                    tumorCode = miscTCGA.sampleLevelCode(aCode)

        if (tumorCode != ''):
            if (tumorCode not in tumorList):
                tumorList += [tumorCode]
                keepCols[jj] = 1
                numOutCol += 1
            else:
                print " WARNING: in removeNonTumorSamples ... multiple columns for the same tumor sample ??? "
                print aCode, tumorCode
                # print tumorList
                print "          --> keeping only the first one "
                # sys.exit(-1)

    rmColList = []
    for jj in range(numCol):
        if (keepCols[jj] == 0):
            rmColList += [jj]
            print "             will remove sample <%s> " % dataD['colLabels'][
                jj]

    # filter out the columns we don't want ...
    dataD = tsvIO.filter_dataMatrix(dataD, [], rmColList)
    print " back from filter_dataMatrix ... ", dataD['colLabels'][:5]

    # NOTE: this next bit may no longer be necessary ...
    # and also set the shortened TCGA barcodes as labels ...
    if (len(tumorList) != len(dataD['dataMatrix'][0])):
        print " ERROR !!! length of tumorList does not correspond to size of dataMatrix !!! "
        print len(tumorList)
        tsvIO.lookAtDataD(dataD)
        sys.exit(-1)

    dataD['colLabels'] = tumorList
    print " now using shortened barcodes .. ", dataD['colLabels'][:5]

    return (dataD)
    print " "
    print " in filterTSVbySampList.py ... "
    print "         input file  : ", inFile
    print "         output file : ", outFile
    print "         list info   : ", listInfo
    print " "

    # print " "
    # print " ***************************************************************** "
    # print " calling readTSV ... ", inFile
    dataD = tsvIO.readTSV(inFile)
    if (dataD == {}):
        sys.exit(-1)

    tsvIO.lookAtDataD(dataD)

    # print " "
    # print " reading sample list ... "
    numLists = len(listInfo)
    listDetails = [0] * numLists
    listBW = [0] * numLists
    listLS = [0] * numLists
    for iList in range(numLists):
        print " --> loading sample list #%d from <%s> " % ((iList + 1), listInfo[iList][0])
        listDetails[iList] = readSampleListFromFile(listInfo[iList][0])
        listBW[iList] = listInfo[iList][1]
        listLS[iList] = listInfo[iList][2]
        if (len(listDetails[iList]) > 0):
            print "     %4d samples found, eg <%s> " % (len(listDetails[iList]), listDetails[iList][0])
        else:
Exemple #4
0
        outFile = sys.argv[4]
        corrThresh = abs(float(sys.argv[5]))
        if (not othrType.startswith("N:")):
            othrType = "N:" + othrType

    ## corrThresh = 0.40
    ## corrThresh = 0.20

    print " "
    print " "
    print " ************* "
    print " in methCorr : "
    print " ***************************************************************** "
    print " calling readTSV ... ", methFile
    methD = tsvIO.readTSV(methFile)
    tsvIO.lookAtDataD(methD)

    print " calling readTSV ... ", othrFile
    othrD = tsvIO.readTSV(othrFile)
    tsvIO.lookAtDataD(othrD)

    try:
        methRowLabels = methD['rowLabels']
        methColLabels = methD['colLabels']
        methDataMatrix = methD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numMethRow = len(methRowLabels)
    numMethCol = len(methColLabels)
Exemple #5
0
        ## check to see if we actually have any data ...
        skipFile = 0
        try:
            if (len(testD) == 0): skipFile = 1
            if (len(testD['rowLabels']) == 0): skipFile = 1
            if (len(testD['colLabels']) == 0): skipFile = 1
        except:
            print " ERROR in looking at data from <%s> ??? " % (aFile)
            skipFile = 1

        if (skipFile):
            print " --> nothing found ??? continuing ... "
            continue

        print " first look ... "
        tsvIO.lookAtDataD(testD)

        if (1):

            # TCGA-CJ-4635-01A-02R
            # the first 12 characters identify the patient
            # the first 16 characters identify the sample

            # now we check for duplicates at the sample level ...
            print " "
            print " checking for duplicates ... "
            testD = removeDuplicateSamples(testD, 16, 0)
            print " second look ... "
            tsvIO.lookAtDataD(testD)

            if (1):
Exemple #6
0
    if (1):
        if (len(sys.argv) != 3):
            print " Usage : %s <input file> <output file> " % sys.argv[0]
            sys.exit(-1)
        inFile = sys.argv[1]
        outFile = sys.argv[2]

    print " "
    print " "
    print " **************** "
    print " in jitterTSV : "
    print " "
    print " ***************************************************************** "
    print " calling readTSV ... ", inFile
    inD = tsvIO.readTSV(inFile)
    tsvIO.lookAtDataD(inD)

    rowLabels = inD['rowLabels']
    colLabels = inD['colLabels']
    dataMatrix = inD['dataMatrix']

    numRow = len(rowLabels)
    numCol = len(colLabels)

    outD = {}
    outD['dataType'] = inD['dataType']
    outD['rowLabels'] = rowLabels
    outD['colLabels'] = colLabels

    numRow = len(rowLabels)
    numCol = len(colLabels)
Exemple #7
0
    ## colMaxNAfrac = 0.90
    ## rowMaxNAfrac = 0.90

    for aFile in inFileList:

        print " "
        print " ***************************************************************** "
        print " calling readTSV ... ", aFile
        testD = tsvIO.readTSV(aFile)

        if (len(testD) == 0):
            print " --> nothing found ??? continuing ... "
            continue

        tsvIO.lookAtDataD(testD)

        if (1):

            # TCGA-CJ-4635-01A-02R
            # the first 12 characters identify the patient
            # the first 15 characters identify the sample

            # now we check for duplicates at the sample level ...
            print " "
            print " checking for duplicates ... "
            testD = removeDuplicateSamples(testD, 15, 0)

            if (0):
                print " "
                print " calling removeNonTumorSamples ... "
Exemple #8
0
    print " "
    print " in filterTSVbySampList.py ... "
    print "         input file  : ", inFile
    print "         output file : ", outFile
    print "         list info   : ", listInfo
    print " "

    # print " "
    # print " ***************************************************************** "
    # print " calling readTSV ... ", inFile
    dataD = tsvIO.readTSV(inFile)
    if (dataD == {}):
        sys.exit(-1)

    tsvIO.lookAtDataD(dataD)

    # print " "
    # print " reading sample list ... "
    numLists = len(listInfo)
    listDetails = [0] * numLists
    listBW = [0] * numLists
    listLS = [0] * numLists
    for iList in range(numLists):
        print " --> loading sample list #%d from <%s> " % (
            (iList + 1), listInfo[iList][0])
        listDetails[iList] = readSampleListFromFile(listInfo[iList][0])
        listBW[iList] = listInfo[iList][1]
        listLS[iList] = listInfo[iList][2]
        if (len(listDetails[iList]) > 0):
            print "     %4d samples found, eg <%s> " % (len(
Exemple #9
0
        tsvFile = sys.argv[1]
        outFile = sys.argv[2]
        dThresh = int ( sys.argv[3] )
        minCount = int ( sys.argv[4] )
        corrThresh = float ( sys.argv[5] )

    print " "
    print " "
    print " ************** "
    print " in methCorr3 : ", dThresh, minCount, corrThresh
    print " ***************************************************************** "
    print ' (a) TIME ', time.asctime(time.localtime(time.time()))
    print " calling readTSV ... ", tsvFile
    tsvD = tsvIO.readTSV(tsvFile)
    tsvIO.lookAtDataD(tsvD)

    try:
        tsvRowLabels = tsvD['rowLabels']
        tsvColLabels = tsvD['colLabels']
        tsvDataMatrix = tsvD['dataMatrix']
    except:
        print " no valid METH feature matrix "
        sys.exit(-1)

    numRow = len(tsvRowLabels)
    numCol = len(tsvColLabels)

    keepMeth = {}
    keepGexp = {}
Exemple #10
0
        if (len(sys.argv) != 3):
            print " Usage : %s <input file> <output file> " % sys.argv[0]
            print " ERROR -- bad command line arguments "
            sys.exit(-1)
        inFile = sys.argv[1]
        outFile = sys.argv[2]

    print " "
    print " "
    print " **************** "
    print " in jitterTSV : "
    print " "
    print " ***************************************************************** "
    print " calling readTSV ... ", inFile
    inD = tsvIO.readTSV(inFile)
    tsvIO.lookAtDataD(inD)

    rowLabels = inD['rowLabels']
    colLabels = inD['colLabels']
    dataMatrix = inD['dataMatrix']

    numRow = len(rowLabels)
    numCol = len(colLabels)

    outD = {}
    outD['dataType'] = inD['dataType']
    outD['rowLabels'] = rowLabels
    outD['colLabels'] = colLabels

    numRow = len(rowLabels)
    numCol = len(colLabels)