def sanityCheckSDRF(sdrfFilename):
    """Check that a tab-delimited SDRF file does not look truncated.

    The first min(numLines // 2, 5) lines serve as a reference sample: the
    smallest and largest tab-separated token counts among them are recorded.
    Every remaining non-blank line must have at least the smallest reference
    count.

    If the file cannot be opened, an error is printed and the process exits
    with status -1.

    Returns:
        1 when every checked line passes.

    Raises:
        ValueError: if the file has fewer than two lines (no reference
            sample can be taken), or if a later line has fewer tokens than
            the reference minimum.
    """
    try:
        fh = open(sdrfFilename)
    except (IOError, OSError):
        print(" ERROR in sanityCheckSDRF ??? failed to open file ??? ")
        print(sdrfFilename)
        sys.exit(-1)

    try:
        # NOTE(review): assumes miscIO.num_lines leaves fh positioned at the
        # start of the file -- confirm against miscIO.
        nl = miscIO.num_lines(fh)
        nr = min(nl // 2, 5)
        if nr < 1:
            # Without a reference sample min()/max() below would fail with an
            # unhelpful message; keep the exception type, improve the text.
            raise ValueError(
                "ERROR in sanityCheckSDRF ??? too few lines to check ??? %i"
                % nl)

        nt = [len(fh.readline().strip().split('\t')) for _ in range(nr)]
        ntMin = min(nt)
        ntMax = max(nt)

        for _ in range(nl - nr):
            aLine = fh.readline().strip()
            # str.split() never returns an empty list, so blank lines must be
            # recognised from the text itself rather than from the token count.
            if not aLine:
                continue
            tokenList = aLine.split('\t')
            ntCur = len(tokenList)
            if ntCur < ntMin:
                mess = "ERROR in sanityCheckSDRF ??? file appears to have been truncated ??? %i %i %i %s" % (
                    ntCur, ntMin, ntMax, str(tokenList))
                raise ValueError(mess)
    finally:
        # Close the handle on every path, including the ValueError ones.
        fh.close()

    return 1
Exemple #2
0
def sanityCheckSDRF(sdrfFilename):
    """Check that a tab-delimited SDRF file does not look truncated.

    The first min(numLines // 2, 5) lines serve as a reference sample: the
    smallest and largest tab-separated token counts among them are recorded.
    Every remaining non-blank line must have at least the smallest reference
    count.

    If the file cannot be opened, an error is printed and the process exits
    with status -1.

    Returns:
        1 when every checked line passes.

    Raises:
        ValueError: if the file has fewer than two lines (no reference
            sample can be taken), or if a later line has fewer tokens than
            the reference minimum.
    """
    try:
        fh = open(sdrfFilename)
    except (IOError, OSError):
        print(" ERROR in sanityCheckSDRF ??? failed to open file ??? ")
        print(sdrfFilename)
        sys.exit(-1)

    try:
        # NOTE(review): assumes miscIO.num_lines leaves fh positioned at the
        # start of the file -- confirm against miscIO.
        nl = miscIO.num_lines(fh)
        nr = min(nl // 2, 5)
        if nr < 1:
            # Without a reference sample min()/max() below would fail with an
            # unhelpful message; keep the exception type, improve the text.
            raise ValueError(
                "ERROR in sanityCheckSDRF ??? too few lines to check ??? %i"
                % nl)

        nt = [len(fh.readline().strip().split('\t')) for _ in range(nr)]
        ntMin = min(nt)
        ntMax = max(nt)

        for _ in range(nl - nr):
            aLine = fh.readline().strip()
            # str.split() never returns an empty list, so blank lines must be
            # recognised from the text itself rather than from the token count.
            if not aLine:
                continue
            tokenList = aLine.split('\t')
            ntCur = len(tokenList)
            if ntCur < ntMin:
                mess = "ERROR in sanityCheckSDRF ??? file appears to have been truncated ??? %i %i %i %s" % (ntCur, ntMin, ntMax, str(tokenList))
                raise ValueError(mess)
    finally:
        # Close the handle on every path, including the ValueError ones.
        fh.close()

    return 1
Exemple #3
0
def getNumFeat(featureMatrixFile):
    """Return the line count of featureMatrixFile minus one.

    The line count comes from miscIO.num_lines; the subtracted line is
    presumably a header row -- confirm against the feature matrix format.
    """
    # The context manager closes the file even if num_lines raises.
    with open(featureMatrixFile) as fh:
        numLines = miscIO.num_lines(fh)

    return (numLines - 1)
Exemple #4
0
def getNumFeat(featureMatrixFile):
    """Return the line count of featureMatrixFile minus one.

    The line count comes from miscIO.num_lines; the subtracted line is
    presumably a header row -- confirm against the feature matrix format.
    """
    # The context manager closes the file even if num_lines raises.
    with open(featureMatrixFile) as fh:
        numLines = miscIO.num_lines(fh)

    return (numLines - 1)
Exemple #5
0
def statsByCol(matrixFilename):
    """Print min, max, mean and sigma for every column of a numeric matrix file.

    The number of columns is taken from the whitespace-split first line.
    Lines with a different token count are ignored.  Every token on an
    accepted line must parse as a float; otherwise the ValueError raised by
    float() propagates to the caller.

    Returns None in all cases; nothing is printed per column when the file
    could not be opened or no line was accepted.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)

    # if failed to open file, just return ...
    if (reZip < 0):
        return

    # NOTE(review): reZip is only used as a failure flag here; this function
    # never re-zips the input -- confirm that is intended.
    try:
        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh)
        numCols = len(firstLine.split())

        print(' <%s> : %d rows and %d columns ' %
              (matrixFilename, numLines, numCols))

        # Infinite sentinels stay correct for data of any magnitude.
        minVals = [float('inf')] * numCols
        maxVals = [float('-inf')] * numCols
        sum1 = [0.] * numCols
        sum2 = [0.] * numCols

        fh.seek(0)

        iRow = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) != numCols):
                continue
            for iCol, aToken in enumerate(tokenList):
                fVal = float(aToken)
                if (fVal < minVals[iCol]):
                    minVals[iCol] = fVal
                if (fVal > maxVals[iCol]):
                    maxVals[iCol] = fVal
                sum1[iCol] += fVal
                sum2[iCol] += (fVal * fVal)
            iRow += 1
    finally:
        fh.close()

    print(' --> got %d rows of data ... ' % iRow)

    # Nothing to average; avoids a division by zero below.
    if (iRow == 0):
        return

    for iCol in range(numCols):
        mean1 = sum1[iCol] / float(iRow)
        mean2 = sum2[iCol] / float(iRow)
        # Rounding can push the variance of a (near-)constant column slightly
        # below zero, which would make math.sqrt raise.
        sigma = math.sqrt(max(0., mean2 - (mean1 * mean1)))
        print('         column #%3d : min=%16.8f  max=%16.8f   mean=%16.8f   sigma=%16.8f ' %
              (iCol, minVals[iCol], maxVals[iCol], mean1, sigma))
Exemple #6
0
def statsByCol(matrixFilename):
    """Print min, max, mean and sigma for every column of a numeric matrix file.

    The number of columns is taken from the whitespace-split first line.
    Lines with a different token count are ignored.  Every token on an
    accepted line must parse as a float; otherwise the ValueError raised by
    float() propagates to the caller.

    Returns None in all cases; nothing is printed per column when the file
    could not be opened or no line was accepted.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)

    # if failed to open file, just return ...
    if (reZip < 0):
        return

    # NOTE(review): reZip is only used as a failure flag here; this function
    # never re-zips the input -- confirm that is intended.
    try:
        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh)
        numCols = len(firstLine.split())

        print(' <%s> : %d rows and %d columns ' %
              (matrixFilename, numLines, numCols))

        # Infinite sentinels stay correct for data of any magnitude.
        minVals = [float('inf')] * numCols
        maxVals = [float('-inf')] * numCols
        sum1 = [0.] * numCols
        sum2 = [0.] * numCols

        fh.seek(0)

        iRow = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) != numCols):
                continue
            for iCol, aToken in enumerate(tokenList):
                fVal = float(aToken)
                if (fVal < minVals[iCol]):
                    minVals[iCol] = fVal
                if (fVal > maxVals[iCol]):
                    maxVals[iCol] = fVal
                sum1[iCol] += fVal
                sum2[iCol] += (fVal * fVal)
            iRow += 1
    finally:
        fh.close()

    print(' --> got %d rows of data ... ' % iRow)

    # Nothing to average; avoids a division by zero below.
    if (iRow == 0):
        return

    for iCol in range(numCols):
        mean1 = sum1[iCol] / float(iRow)
        mean2 = sum2[iCol] / float(iRow)
        # Rounding can push the variance of a (near-)constant column slightly
        # below zero, which would make math.sqrt raise.
        sigma = math.sqrt(max(0., mean2 - (mean1 * mean1)))
        print('         column #%3d : min=%16.8f  max=%16.8f   mean=%16.8f   sigma=%16.8f ' %
              (iCol, minVals[iCol], maxVals[iCol], mean1, sigma))
Exemple #7
0
def getOneColumn(matrixFilename, iX, nSkip=0):
    """Read column iX of a whitespace-delimited matrix file into a numpy array.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iX: zero-based column index.
        nSkip: number of leading lines to ignore.

    Returns:
        A numpy array whose length is the file's line count minus nSkip.
        Values that cannot be parsed as floats are stored as 0.  Lines whose
        token count differs from the first non-skipped line are reported and
        skipped, so trailing entries stay 0 when that happens.
        Returns -99 if the file cannot be opened or iX is not a valid column.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    try:
        for iSkip in range(nSkip):
            fh.readline()

        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh) - nSkip
        numCols = len(firstLine.split())

        if (iX >= numCols):
            print(' --> ERROR : iX=%d ' % (iX))
            return (-99)

        xCol = numpy.zeros(numLines)

        fh.seek(0)
        for iSkip in range(nSkip):
            fh.readline()

        iRow = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) == numCols and iRow < numLines):
                try:
                    xCol[iRow] = float(tokenList[iX])
                except (ValueError, IndexError):
                    # unparsable value (or out-of-range negative iX)
                    xCol[iRow] = 0.
                iRow += 1
            else:
                print("%s %s %s" %
                      (' skipping ... ', len(tokenList), tokenList))
    finally:
        # Runs on the error return above as well, so the handle is always
        # released and the input is re-zipped whenever reZip asks for it.
        fh.close()
        if (reZip):
            zipFile(fhName)

    return (xCol)
Exemple #8
0
def getOneColumn(matrixFilename, iX, nSkip=0):
    """Read column iX of a whitespace-delimited matrix file into a numpy array.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iX: zero-based column index.
        nSkip: number of leading lines to ignore.

    Returns:
        A numpy array whose length is the file's line count minus nSkip.
        Values that cannot be parsed as floats are stored as 0.  Lines whose
        token count differs from the first non-skipped line are reported and
        skipped, so trailing entries stay 0 when that happens.
        Returns -99 if the file cannot be opened or iX is not a valid column.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    try:
        for iSkip in range(nSkip):
            fh.readline()

        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh) - nSkip
        numCols = len(firstLine.split())

        if (iX >= numCols):
            print(' --> ERROR : iX=%d ' % (iX))
            return (-99)

        xCol = numpy.zeros(numLines)

        fh.seek(0)
        for iSkip in range(nSkip):
            fh.readline()

        iRow = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) == numCols and iRow < numLines):
                try:
                    xCol[iRow] = float(tokenList[iX])
                except (ValueError, IndexError):
                    # unparsable value (or out-of-range negative iX)
                    xCol[iRow] = 0.
                iRow += 1
            else:
                print("%s %s %s" %
                      (' skipping ... ', len(tokenList), tokenList))
    finally:
        # Runs on the error return above as well, so the handle is always
        # released and the input is re-zipped whenever reZip asks for it.
        fh.close()
        if (reZip):
            zipFile(fhName)

    return (xCol)
Exemple #9
0
    def _internalProcessDirectories(self, topDirs, archiveList,
                                    filename2sampleInfo, numSamples):
        """Load the matrix file named by configuration['normMatFilename'].

        The first line supplies the sample names (all tab-separated fields
        after the first).  Each following line supplies a row label, which is
        split on '.' and turned into a feature name through
        self._makemiRNAFeatureName, followed by one value per sample.  Values
        that do not parse as floats are stored as -999999.  Reading stops at
        end of file or at the first line whose field count does not match the
        header.

        The four parameters are not used by this implementation.

        Returns:
            (dataMatrix, featureList, sampleList).  dataMatrix is a list of
            rows sized from the file's line count; rows that were not read
            stay zero-filled.

        Raises:
            ValueError: if the header's sample count disagrees with the
                column count reported by miscIO.num_cols.
        """
        print("%s %s" % (datetime.now(), "processing directories"))

        fh = open(self.configuration['normMatFilename'], 'r')
        try:
            # NOTE(review): assumes num_lines/num_cols leave fh positioned at
            # the start of the file -- confirm against miscIO.
            numRow = miscIO.num_lines(fh) - 1
            numCol = miscIO.num_cols(fh, '\t') - 1

            dataMatrix = [[0] * numCol for _ in range(numRow)]

            sampleList = fh.readline().strip().split('\t')[1:]
            if (len(sampleList) != numCol):
                raise ValueError(" ERROR #1 ??? ")
            featureList = []
            # we also need to read in the mapping file ...
            metaData = self._loadNameMap("%s" %
                                         self.configuration['matrix_mapping_file'])
            NA_VALUE = -999999
            iR = 0
            numNA = 0
            while True:
                aLine = fh.readline()
                if not aLine:
                    # explicit end-of-file test rather than relying on the
                    # token count of an empty string
                    break
                tokenList = aLine.strip().split('\t')
                if (len(tokenList) != (numCol + 1)):
                    break

                labelTokens = tokenList[0].split('.')
                featName = self._makemiRNAFeatureName(labelTokens[0],
                                                      labelTokens[1], metaData)
                featureList += [featName]

                for iC in range(numCol):
                    try:
                        dataMatrix[iR][iC] = float(tokenList[iC + 1])
                    except ValueError:
                        dataMatrix[iR][iC] = NA_VALUE
                        numNA += 1
                iR += 1
                print(" iR=%d    numNA=%d " % (iR, numNA))
        finally:
            # the handle used to be left open on every path
            fh.close()

        print("%s %s" % (datetime.now(), "finished processing directories"))
        return dataMatrix, featureList, sampleList
    def _internalProcessDirectories(self, topDirs, archiveList, filename2sampleInfo, numSamples):
        """Load the matrix file named by configuration['normMatFilename'].

        The first line supplies the sample names (all tab-separated fields
        after the first).  Each following line supplies a row label, which is
        split on '.' and turned into a feature name through
        self._makemiRNAFeatureName, followed by one value per sample.  Values
        that do not parse as floats are stored as -999999.  Reading stops at
        end of file or at the first line whose field count does not match the
        header.

        The four parameters are not used by this implementation.

        Returns:
            (dataMatrix, featureList, sampleList).  dataMatrix is a list of
            rows sized from the file's line count; rows that were not read
            stay zero-filled.

        Raises:
            ValueError: if the header's sample count disagrees with the
                column count reported by miscIO.num_cols.
        """
        print("%s %s" % (datetime.now(), "processing directories"))

        fh = open(self.configuration['normMatFilename'], 'r')
        try:
            # NOTE(review): assumes num_lines/num_cols leave fh positioned at
            # the start of the file -- confirm against miscIO.
            numRow = miscIO.num_lines(fh) - 1
            numCol = miscIO.num_cols(fh, '\t') - 1

            dataMatrix = [[0] * numCol for _ in range(numRow)]

            sampleList = fh.readline().strip().split('\t')[1:]
            if (len(sampleList) != numCol):
                raise ValueError(" ERROR #1 ??? ")
            featureList = []
            # we also need to read in the mapping file ...
            metaData = self._loadNameMap("%s" % self.configuration['matrix_mapping_file'])
            NA_VALUE = -999999
            iR = 0
            numNA = 0
            while True:
                aLine = fh.readline()
                if not aLine:
                    # explicit end-of-file test rather than relying on the
                    # token count of an empty string
                    break
                tokenList = aLine.strip().split('\t')
                if (len(tokenList) != (numCol + 1)):
                    break

                labelTokens = tokenList[0].split('.')
                featName = self._makemiRNAFeatureName(labelTokens[0], labelTokens[1], metaData)
                featureList += [featName]

                for iC in range(numCol):
                    try:
                        dataMatrix[iR][iC] = float(tokenList[iC + 1])
                    except ValueError:
                        dataMatrix[iR][iC] = NA_VALUE
                        numNA += 1
                iR += 1
                print(" iR=%d    numNA=%d " % (iR, numNA))
        finally:
            # the handle used to be left open on every path
            fh.close()

        print("%s %s" % (datetime.now(), "finished processing directories"))
        return dataMatrix, featureList, sampleList
Exemple #11
0
def testMappingFile():
    """Sanity-check the file named by the module-level uuidMappingFile.

    The file must have at least 100000 lines, and the first two tab-separated
    fields of its first line must satisfy looks_like_uuid and
    looks_like_barcode respectively.  On any failure a message is printed and
    the process exits with status -1.  Returns None on success.
    """
    global uuidMappingFile

    print(" checking UUID-to-Barcode mapping file <%s> " % uuidMappingFile)

    fh = open(uuidMappingFile, 'r')
    try:
        numLines = miscIO.num_lines(fh)
        if (numLines < 100000):
            print("%s %s" % (" ERROR ??? seems too small ??? ", numLines))
            sys.exit(-1)
        # NOTE(review): assumes num_lines leaves fh at the start of the file.
        aLine = fh.readline()
    finally:
        # also runs when sys.exit() is raised above
        fh.close()

    aTokens = aLine.split('\t')
    if (len(aTokens) < 2):
        # previously an uncaught IndexError on aTokens[1]
        print(" ERROR ??? expected at least two tab-separated columns ??? ")
        print(aLine)
        sys.exit(-1)
    if (not (looks_like_uuid(aTokens[0]))):
        print("%s %s" % (" ERROR ??? not a UUID ??? ", aTokens[0]))
        print(aLine)
        sys.exit(-1)
    if (not (looks_like_barcode(aTokens[1]))):
        print("%s %s" % (" ERROR ??? not a TCGA barcode ??? ", aTokens[1]))
        print(aLine)
        sys.exit(-1)
Exemple #12
0
def testMappingFile():
    """Sanity-check the file named by the module-level uuidMappingFile.

    The file must have at least 100000 lines, and the first two tab-separated
    fields of its first line must satisfy looks_like_uuid and
    looks_like_barcode respectively.  On any failure a message is printed and
    the process exits with status -1.  Returns None on success.
    """
    global uuidMappingFile

    print(" checking UUID-to-Barcode mapping file <%s> " % uuidMappingFile)

    fh = open(uuidMappingFile, 'r')
    try:
        numLines = miscIO.num_lines(fh)
        if (numLines < 100000):
            print("%s %s" % (" ERROR ??? seems too small ??? ", numLines))
            sys.exit(-1)
        # NOTE(review): assumes num_lines leaves fh at the start of the file.
        aLine = fh.readline()
    finally:
        # also runs when sys.exit() is raised above
        fh.close()

    aTokens = aLine.split('\t')
    if (len(aTokens) < 2):
        # previously an uncaught IndexError on aTokens[1]
        print(" ERROR ??? expected at least two tab-separated columns ??? ")
        print(aLine)
        sys.exit(-1)
    if (not (looks_like_uuid(aTokens[0]))):
        print("%s %s" % (" ERROR ??? not a UUID ??? ", aTokens[0]))
        print(aLine)
        sys.exit(-1)
    if (not (looks_like_barcode(aTokens[1]))):
        print("%s %s" % (" ERROR ??? not a TCGA barcode ??? ", aTokens[1]))
        print(aLine)
        sys.exit(-1)
Exemple #13
0
def getOneRow(matrixFilename, iY, nSkip=0):
    """Read line iY of a whitespace-delimited matrix file into a numpy array.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iY: zero-based line index.
        nSkip: number of leading tokens on that line to ignore.

    Returns:
        A numpy array holding, in order, every remaining token that parses as
        a float; tokens that do not parse are reported and left out.
        Returns -99 if the file cannot be opened, iY is outside the file, or
        nSkip leaves no tokens.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    try:
        fh.seek(0)
        numLines = miscIO.num_lines(fh)
        # A negative iY used to fall through to an undefined aLine.
        if (iY < 0 or iY >= numLines):
            print(' --> ERORR : iY=%d ' % (iY))
            return (-99)

        for ii in range(iY + 1):
            aLine = fh.readline()
    finally:
        # Runs on the early return too: the handle is always released and the
        # input re-zipped whenever reZip asks for it.
        fh.close()
        if (reZip):
            zipFile(fhName)

    tokenList = aLine.split()
    numCols = len(tokenList)
    if (nSkip >= numCols):
        print(' --> ERROR : numCols=%d  nSkip=%d \n' % (numCols, nSkip))
        return (-99)

    xCol = numpy.zeros(numCols - nSkip)
    nGet = 0
    for ii in range(numCols - nSkip):
        aToken = tokenList[nSkip + ii]
        try:
            # Store at nGet, not ii: writing at ii left a spurious 0 in place
            # of each rejected token and cut valid values off the end of the
            # [:nGet] slice.
            xCol[nGet] = float(aToken)
        except ValueError:
            print("%s %s" % (' skipping ... ', aToken))
            continue
        nGet += 1

    return (xCol[:nGet])
Exemple #14
0
def getOneRow(matrixFilename, iY, nSkip=0):
    """Read line iY of a whitespace-delimited matrix file into a numpy array.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iY: zero-based line index.
        nSkip: number of leading tokens on that line to ignore.

    Returns:
        A numpy array holding, in order, every remaining token that parses as
        a float; tokens that do not parse are reported and left out.
        Returns -99 if the file cannot be opened, iY is outside the file, or
        nSkip leaves no tokens.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    try:
        fh.seek(0)
        numLines = miscIO.num_lines(fh)
        # A negative iY used to fall through to an undefined aLine.
        if (iY < 0 or iY >= numLines):
            print(' --> ERORR : iY=%d ' % (iY))
            return (-99)

        for ii in range(iY + 1):
            aLine = fh.readline()
    finally:
        # Runs on the early return too: the handle is always released and the
        # input re-zipped whenever reZip asks for it.
        fh.close()
        if (reZip):
            zipFile(fhName)

    tokenList = aLine.split()
    numCols = len(tokenList)
    if (nSkip >= numCols):
        print(' --> ERROR : numCols=%d  nSkip=%d \n' % (numCols, nSkip))
        return (-99)

    xCol = numpy.zeros(numCols - nSkip)
    nGet = 0
    for ii in range(numCols - nSkip):
        aToken = tokenList[nSkip + ii]
        try:
            # Store at nGet, not ii: writing at ii left a spurious 0 in place
            # of each rejected token and cut valid values off the end of the
            # [:nGet] slice.
            xCol[nGet] = float(aToken)
        except ValueError:
            print("%s %s" % (' skipping ... ', aToken))
            continue
        nGet += 1

    return (xCol[:nGet])
Exemple #15
0
def twoDhist(matrixFilename,
             iX,
             iY,
             tdmFilename='',
             xMin='',
             xMax='',
             deltaX='',
             yMin='',
             yMax='',
             deltaY=''):
    """Build a 2-D histogram of columns iX and iY of a numeric matrix file.

    Lines whose whitespace-split token count differs from the first line are
    ignored.  Range and bin width are taken from the arguments when given;
    an empty string (the default) means "derive from the data" (min/max of
    the column, and getHistogramInfo for the binning).  Summary information
    (ranges, bin counts, the fullest bin, 50th/90th percentile bin counts)
    is printed.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iX, iY: zero-based column indices for the two axes.
        tdmFilename: if non-empty, the histogram is written there as a
            tab-delimited table with a 'CORNER' header row, one line per
            y bin from the highest bin index down to 0.
        xMin, xMax, deltaX, yMin, yMax, deltaY: optional range / bin width.

    Returns:
        -99 if the file cannot be opened or a column index is too large;
        otherwise None.
    """
    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    # NOTE(review): reZip is only used as a failure flag here; this function
    # never re-zips the input -- confirm that is intended.
    try:
        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh)
        numCols = len(firstLine.split())

        print(' <%s> : %d rows and %d columns ' %
              (matrixFilename, numLines, numCols))

        if (iX >= numCols or iY >= numCols):
            print(' --> ERROR : iX=%d  iY=%d ' % (iX, iY))
            return (-99)

        xCol = numpy.zeros(numLines)
        yCol = numpy.zeros(numLines)

        fh.seek(0)

        numRows = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) == numCols):
                xCol[numRows] = float(tokenList[iX])
                yCol[numRows] = float(tokenList[iY])
                numRows += 1
    finally:
        fh.close()

    print(' --> got %d rows of data ... ' % numRows)

    # Keep only the filled part: skipped lines would otherwise contribute
    # phantom zeros to the min/max/mean/std below.
    xCol = xCol[:numRows]
    yCol = yCol[:numRows]

    if (xMin == ''):
        xMin = xCol.min()
    if (xMax == ''):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    print("%s %s %s %s %s" % ('         ', xMin, xMax, xMean, xSigma))

    if (yMin == ''):
        yMin = yCol.min()
    if (yMax == ''):
        yMax = yCol.max()
    yMean = yCol.mean()
    ySigma = yCol.std()
    print("%s %s %s %s %s" % ('         ', yMin, yMax, yMean, ySigma))

    if (deltaX == ''):
        (xMinHist, xMaxHist, deltaX,
         numXbins) = getHistogramInfo(xMin, xMax, xSigma)
    else:
        numXbins = int((xMax - xMin) / deltaX + 2)
        xMinHist = xMin
        xMaxHist = xMax
    print("%s %s" % (numXbins, deltaX))

    if (deltaY == ''):
        (yMinHist, yMaxHist, deltaY,
         numYbins) = getHistogramInfo(yMin, yMax, ySigma)
    else:
        numYbins = int((yMax - yMin) / deltaY + 2)
        yMinHist = yMin
        yMaxHist = yMax
    print("%s %s" % (numYbins, deltaY))

    # xyHist[x bin][y bin] -> count
    xyHist = [[0] * numYbins for _ in range(numXbins)]

    for xVal, yVal in zip(xCol, yCol):
        iXbin = int((xVal - xMinHist) / deltaX + 0.0001)
        iYbin = int((yVal - yMinHist) / deltaY + 0.0001)
        # values outside the histogram range are silently dropped
        if (0 <= iXbin < numXbins and 0 <= iYbin < numYbins):
            xyHist[iXbin][iYbin] += 1

    # find the one bin with the most counts (first one wins on ties) ...
    modeVal = -1.
    for iXbin in range(numXbins):
        xVal = float(iXbin) * deltaX + xMinHist
        for iYbin in range(numYbins):
            if (xyHist[iXbin][iYbin] > modeVal):
                modeVal = xyHist[iXbin][iYbin]
                modeX = xVal
                modeY = float(iYbin) * deltaY + yMinHist
    print("%s %s %s %s" % (' mode at   ', modeX, modeY, modeVal))

    # what is the median count value ?
    allBins = numpy.array(
        [binCount for column in xyHist for binCount in column], dtype=float)
    allBins.sort()
    numBins = len(allBins)
    p50 = allBins[int(0.50 * numBins)]
    p90 = allBins[int(0.90 * numBins)]
    print("%s %s %s %s" % (allBins[0], allBins[-1], p50, p90))

    if (tdmFilename != ''):
        with open(tdmFilename, 'w') as fhOut:
            header = 'CORNER'
            header += ''.join('\t%d' % iXbin for iXbin in range(numXbins))
            fhOut.write(header + '\n')

            # highest y bin first
            for iYbin in range(numYbins - 1, -1, -1):
                outLine = '%d' % iYbin
                outLine += ''.join('\t%d' % xyHist[iXbin][iYbin]
                                   for iXbin in range(numXbins))
                fhOut.write(outLine + '\n')
Exemple #16
0
def _condHistSummary(xHist, numRows, iStartBin, iStopBin, deltaX, xMinHist):
    """Summarise one histogram over bins iStartBin..iStopBin (inclusive).

    Returns (meanX, sigmaX, medianBin, medianX, medianVal, modeBin, modeX,
    modeVal), where medianVal is the smallest |cumulative fraction - 0.5|
    seen and modeVal is the largest per-bin fraction.
    """
    # An empty class contributes only zero counts; dividing by 1 instead of 0
    # keeps every fraction at 0 rather than raising ZeroDivisionError.
    denom = float(numRows) if numRows > 0 else 1.

    cumCount = 0
    meanX = 0.
    meanSqX = 0.
    modeVal = -1.
    medianVal = 1.
    modeBin = modeX = medianBin = medianX = None

    for iBin in range(iStartBin, iStopBin + 1):
        xVal = float(iBin) * deltaX + xMinHist

        meanX += (float(xHist[iBin]) * xVal / denom)
        meanSqX += (float(xHist[iBin]) * xVal * xVal / denom)

        cumCount += xHist[iBin]
        cumFrac = float(cumCount) / denom
        curFrac = float(xHist[iBin]) / denom

        if (modeVal < curFrac):
            modeVal = curFrac
            modeBin = iBin
            modeX = xVal

        if (abs(cumFrac - 0.5) < medianVal):
            medianVal = abs(cumFrac - 0.5)
            medianBin = iBin
            medianX = xVal

    # rounding can make the variance marginally negative
    sigmaX = math.sqrt(max(0., meanSqX - (meanX * meanX)))

    return (meanX, sigmaX, medianBin, medianX, medianVal,
            modeBin, modeX, modeVal)


def oneDcondHist(matrixFilename,
                 iX,
                 bitVec,
                 deltaX=-1,
                 xMin=-1,
                 xMax=-1,
                 nSkip=0,
                 printFlag=1):
    """Histogram column iX of a matrix file, split by a per-row bit vector.

    Rows whose bitVec entry has magnitude below 0.00001 are counted in the
    "bit = 0" histogram, all others in the "bit = 1" histogram.  Mean, sigma,
    median bin and mode bin of each are printed.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iX: zero-based column index.
        bitVec: indexable sequence with one entry per data row; an
            IndexError propagates if it is shorter than the data.
        deltaX: bin width; negative means derive it via getHistogramInfo,
            otherwise getHistogramInfo2 is used.
        xMin, xMax: histogram range; -1 means use the data's min / max.
        nSkip: number of leading lines to ignore.
        printFlag: if true, the process exits with status -1 once the
            histograms are built, because the per-bin printout was disabled
            in the original code ("xMean and xSigma are not correct").

    Returns:
        (xMinHist, xMaxHist, deltaX, numBins, xHist0, xHist1), or a tuple of
        six -99 values if the file cannot be opened, iX is not a valid
        column, or no data row was read.
    """
    print("%s %s %s %s %s %s %s %s" % (
        ' in oneDcondHist ... ', matrixFilename, iX, deltaX, xMin, xMax,
        nSkip, printFlag))
    print("%s %s" % (' length of input bitVec : ', len(bitVec)))

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99, -99, -99, -99, -99, -99)

    fhName = fh.name

    try:
        for iSkip in range(nSkip):
            fh.readline()

        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh) - nSkip
        numCols = len(firstLine.split())

        print(' <%s> : %d rows and %d columns ' %
              (matrixFilename, numLines, numCols))

        if (iX >= numCols):
            print(' --> ERROR : iX=%d ' % (iX))
            return (-99, -99, -99, -99, -99, -99)

        xCol = numpy.zeros(numLines)

        fh.seek(0)
        for iSkip in range(nSkip):
            fh.readline()

        numRows = 0
        for inLine in fh:
            tokenList = inLine.split()
            if (len(tokenList) == numCols and numRows < numLines):
                try:
                    xCol[numRows] = float(tokenList[iX])
                except ValueError:
                    print("%s %s" % (' ERROR ??? ', iX))
                    print("%s" % (tokenList,))
                    print(tokenList[iX])
                    sys.exit(-1)
                numRows += 1
            else:
                print("%s %s %s" %
                      (' skipping ... ', len(tokenList), tokenList))
    finally:
        # Runs on early return and on sys.exit as well.
        fh.close()
        if (reZip):
            zipFile(fhName)

    print(' --> got %d rows of data ... ' % numRows)

    if (numRows == 0):
        print(' --> ERROR : no rows of data ')
        return (-99, -99, -99, -99, -99, -99)

    # Drop the unfilled tail so skipped lines do not add phantom zeros to the
    # statistics below.
    xCol = xCol[:numRows]

    # now that we have the vector of data we need to decide how to set the
    # range and bin size for the histogram ...
    if (xMin == -1):
        xMin = xCol.min()
    if (xMax == -1):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    print('         min=%16.8f, max=%16.8f, mean=%16.8f and sigma=%16.8f :' % (
        xCol.min(), xCol.max(), xMean, xSigma))

    if (deltaX < 0):
        print("%s %s %s %s" %
              (' calling getHistogramInfo ... ', xMin, xMax, xSigma))
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo(xMin, xMax, xSigma)
    else:
        print("%s %s %s %s" %
              (' calling getHistogramInfo2 ... ', xMin, xMax, deltaX))
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo2(xMin, xMax, deltaX)
    print("%s %s %s %s" % (xMinHist, xMaxHist, deltaX, numBins))

    # and now we can build the histogram of counts ...
    xHist0 = [0] * numBins
    xHist1 = [0] * numBins
    numRows0 = 0
    numRows1 = 0
    print("%s %s %s" % (numRows, len(bitVec), len(xCol)))
    for iRow in range(numRows):
        xVal = xCol[iRow]
        iBin = int((xVal - xMinHist) / deltaX + 0.0001)
        # Explicit range test: a negative index would otherwise wrap around
        # silently and be counted at the far end of the histogram.
        inRange = (0 <= iBin < numBins)
        if (abs(bitVec[iRow]) < 0.00001):
            numRows0 += 1
            if inRange:
                xHist0[iBin] += 1
            else:
                print("%s %s %s %s" % (' ERROR ??? : ', iRow, xVal, iBin))
        else:
            numRows1 += 1
            if inRange:
                xHist1[iBin] += 1
            else:
                print("%s %s %s %s" % (' ERROR ??? : ', iRow, xVal, iBin))

    # first we'd like to find the first and last non-zero bins ...
    iStartBin = 0
    while (iStartBin < numBins
           and xHist0[iStartBin] == 0 and xHist1[iStartBin] == 0):
        iStartBin += 1

    iStopBin = numBins - 1
    while (iStopBin >= 0
           and xHist0[iStopBin] == 0 and xHist1[iStopBin] == 0):
        iStopBin -= 1

    if (iStartBin > iStopBin):
        # every value fell outside the histogram; there is nothing to
        # summarise, but the (empty) histograms are still a valid result
        print(' --> ERROR : no values fall inside the histogram range ')
        return (xMinHist, xMaxHist, deltaX, numBins, xHist0, xHist1)

    if (printFlag):
        # The original aborted here on the first bin: its per-bin printout
        # used the unconditional xMean / xSigma for both classes.
        print(" ERROR ... this code needs to be fixed ... xMean and xSigma are not correct ... ")
        sys.exit(-1)

    (meanX0, sigmaX0, medianBin0, medianX0, medianVal0,
     modeBin0, modeX0, modeVal0) = _condHistSummary(
        xHist0, numRows0, iStartBin, iStopBin, deltaX, xMinHist)
    (meanX1, sigmaX1, medianBin1, medianX1, medianVal1,
     modeBin1, modeX1, modeVal1) = _condHistSummary(
        xHist1, numRows1, iStartBin, iStopBin, deltaX, xMinHist)

    print(' conditioned on bit = 0  %12d  %6.3f  %6.3f ' % (numRows0, meanX0,
                                                            sigmaX0))
    print("%s %s %s %s" %
          ('     median at ', medianBin0, medianX0, medianVal0))
    print("%s %s %s %s" % ('     mode at   ', modeBin0, modeX0, modeVal0))
    print(' conditioned on bit = 1  %12d  %6.3f  %6.3f ' % (numRows1, meanX1,
                                                            sigmaX1))
    print("%s %s %s %s" %
          ('     median at ', medianBin1, medianX1, medianVal1))
    print("%s %s %s %s" % ('     mode at   ', modeBin1, modeX1, modeVal1))

    return (xMinHist, xMaxHist, deltaX, numBins, xHist0, xHist1)
Exemple #17
0
def oneDhist(matrixFilename,
             iX,
             deltaX=-1,
             xMin=-1,
             xMax=-1,
             nSkip=0,
             printFlag=1):
    """Histogram column iX of a tab-delimited matrix file and print statistics.

    Rows with too few tab-separated fields, a literal "NaN", or a value that
    does not parse as a float are reported and skipped.  The function prints
    basic statistics, the populated bins (when printFlag is true) together
    with the normal-quantile value from miscMath.normInv, an integrated
    absolute QQ error, and a symmetric KL distance to a uniform distribution
    over the populated bin range.

    Args:
        matrixFilename: path handed to openInputFileForReading.
        iX: zero-based column index.
        deltaX: bin width; negative means derive it via getHistogramInfo,
            otherwise getHistogramInfo2 is used.
        xMin, xMax: histogram range; -1 means use the data's min / max.
        nSkip: number of leading lines to ignore.
        printFlag: if true, print one line per populated bin.

    Returns:
        (xMinHist, xMaxHist, deltaX, numBins, xHist), or -99 if the file
        cannot be opened, iX is not a valid column, or no value was read.
        The process exits with status -1 if an in-range value maps to a bin
        index outside the histogram.
    """
    print("%s %s %s %s %s %s %s %s" % (
        ' in oneDhist ... ', matrixFilename, iX, deltaX, xMin, xMax, nSkip,
        printFlag))

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    try:
        for iSkip in range(nSkip):
            fh.readline()

        firstLine = fh.readline()

        fh.seek(0)
        numLines = miscIO.num_lines(fh) - nSkip
        numCols = len(firstLine.split())

        print(' <%s> : %d rows and %d columns ' %
              (matrixFilename, numLines, numCols))

        if (iX >= numCols):
            print(' --> ERROR : iX=%d ' % (iX))
            return (-99)

        xCol = numpy.zeros(numLines)

        fh.seek(0)
        for iSkip in range(nSkip):
            fh.readline()

        numRows = 0
        numErrSkip = 0
        for inLine in fh:
            tokenList = inLine.split('\t')
            if (len(tokenList) <= iX):
                print("%s %s %s" % (" ERROR ??? too few tokens ??? ", iX,
                                    len(tokenList)))
                print("%s" % (tokenList,))
                numErrSkip += 1
            elif (numRows < numLines):
                # The last field still carries the line's newline, so strip
                # before comparing; otherwise "NaN" in the final column was
                # parsed by float() and stored as nan instead of skipped.
                if (tokenList[iX].strip() == "NaN"):
                    tokenList[iX] = "NA"
                try:
                    xCol[numRows] = float(tokenList[iX])
                    numRows += 1
                except ValueError:
                    print("%s %s %s %s" % (' invalid value ??? ', iX,
                                           tokenList, ' ... skipping ... '))
                    numErrSkip += 1
            else:
                print("%s %s %s" %
                      (' skipping ... ', len(tokenList), tokenList))
                numErrSkip += 1
    finally:
        # Runs on the early return as well.
        fh.close()
        if (reZip):
            zipFile(fhName)

    print(' --> got %d rows of data ... ' % numRows)
    print('     skipped %d rows of data ... ' % numErrSkip)

    if (numRows == 0):
        print(' --> ERROR : no rows of data ')
        return (-99)

    xCol = xCol[:numRows]

    # now that we have the vector of data we need to decide how to set the
    # range and bin size for the histogram ...
    if (xMin == -1):
        xMin = xCol.min()
    if (xMax == -1):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    xMagSq = getVectorNormSq(xCol)
    print('         N=%12d, min=%16.8f, max=%16.8f, mean=%16.8f and sigma=%16.8f :' % (
        len(xCol), xCol.min(), xCol.max(), xMean, xSigma))
    print('         norm squared = %16.8f ' % xMagSq)

    if (deltaX < 0):
        print("%s %s %s %s" %
              (' calling getHistogramInfo ... ', xMin, xMax, xSigma))
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo(xMin, xMax, xSigma)
    else:
        print("%s %s %s %s" %
              (' calling getHistogramInfo2 ... ', xMin, xMax, deltaX))
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo2(xMin, xMax, deltaX)
    print("%s %s %s %s" % (xMinHist, xMaxHist, deltaX, numBins))

    # and now we can build the histogram of counts ...
    numTooSmall = 0
    numTooBig = 0
    xHist = [0] * numBins
    for iRow in range(numRows):
        xVal = xCol[iRow]
        if (xVal < xMinHist):
            numTooSmall += 1
        elif (xVal > xMaxHist):
            numTooBig += 1
        else:
            iBin = int((xVal - xMinHist) / deltaX + 0.0001)
            if (0 <= iBin < numBins):
                xHist[iBin] += 1
            else:
                # The old handler printed xHist[iBin] here, which raised the
                # same IndexError again instead of exiting cleanly.
                print("%s %s %s %s" % (' ERROR ??? : ', iRow, xVal, iBin))
                sys.exit(-1)

    print(" ")
    print("%s %s" % (" numTooSmall : ", numTooSmall))
    print("%s %s" % (" numTooBig : ", numTooBig))
    print(" ")

    # first we'd like to find the first and last non-zero bins ...
    iStartBin = 0
    while (iStartBin < numBins and xHist[iStartBin] == 0):
        iStartBin += 1

    iStopBin = numBins - 1
    while (iStopBin >= 0 and xHist[iStopBin] == 0):
        iStopBin -= 1

    if (iStartBin > iStopBin):
        # every value fell outside the histogram range; nothing to summarise
        print(' --> ERROR : no values fall inside the histogram range ')
        return (xMinHist, xMaxHist, deltaX, numBins, xHist)

    # and now we can print it out as counts, a PDF, and a CDF ...
    cumCount = numTooSmall
    qqErr = 0.
    for iBin in range(iStartBin, iStopBin + 1):
        cumCount += xHist[iBin]
        cumFrac = float(cumCount) / float(numRows)
        curFrac = float(xHist[iBin]) / float(numRows)
        xVal = float(iBin) * deltaX + xMinHist

        if (cumFrac > 0. and cumFrac < 1.):
            xNormInv = miscMath.normInv(cumFrac, xMean, xSigma)
            # Accumulated regardless of printFlag, so the reported error does
            # not depend on whether the per-bin lines are shown.
            qqErr += (abs(xNormInv - xVal) * deltaX)
            if (printFlag and xHist[iBin] > 0):
                print(' %5d  %14.10f  %12d  %14.10f  %14.10f  %14.10f ' %
                      (iBin, xVal, xHist[iBin], curFrac, cumFrac, xNormInv))
        elif (printFlag and xHist[iBin] > 0):
            print(' %5d  %14.10f  %12d  %14.10f  %14.10f ' %
                  (iBin, xVal, xHist[iBin], curFrac, cumFrac))

    print("%s %s" % (' integrated abs QQ error : ', qqErr))

    # maybe it would also be interesting to compare this to a uniform
    # distribution ...
    xMinVal = float(iStartBin) * deltaX + xMinHist
    xMaxVal = float(iStopBin) * deltaX + xMinHist
    uHeight = 1. / float(iStopBin - iStartBin + 1)
    KLdist = 0.
    for iBin in range(iStartBin, iStopBin + 1):
        curFrac = float(xHist[iBin]) / float(numRows)
        # bins where either density is (numerically) zero contribute nothing
        if (curFrac >= 1.e-9 and uHeight >= 1.e-9):
            KLdist += curFrac * math.log(curFrac / uHeight) + \
                uHeight * math.log(uHeight / curFrac)
    print(' symmetric KL distance to uniform distribution between %f and %f : %f ' % (
        xMinVal, xMaxVal, KLdist))

    return (xMinHist, xMaxHist, deltaX, numBins, xHist)
Exemple #18
0
def oneDhist(matrixFilename, iX, deltaX=-1, xMin=-1, xMax=-1, nSkip=0, printFlag=1):

    print ' in oneDhist ... ', matrixFilename, iX, deltaX, xMin, xMax, nSkip, printFlag

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    fhName = fh.name

    for iSkip in range(nSkip):
        skipLine = fh.readline()

    firstLine = fh.readline()

    fh.seek(0)
    numLines = miscIO.num_lines(fh)
    numLines -= nSkip

    tokenList = firstLine.split()
    numCols = len(tokenList)

    print ' <%s> : %d rows and %d columns ' % \
        (matrixFilename, numLines, numCols)

    if (iX >= numCols):
        print ' --> ERROR : iX=%d ' % (iX)
        return (-99)

    xCol = numpy.zeros(numLines)

    fh.seek(0)

    for iSkip in range(nSkip):
        skipLine = fh.readline()

    iRow = 0
    numErrSkip = 0
    for inLine in fh:
        tokenList = inLine.split('\t')
        if (len(tokenList) <= iX):
            print " ERROR ??? too few tokens ??? ", iX, len(tokenList)
            print tokenList
            numErrSkip += 1
            # sys.exit(-1)
        elif (len(tokenList) > iX and iRow < numLines):
            if (tokenList[iX] == "NaN"):
                tokenList[iX] = "NA"
            try:
                xCol[iRow] = float(tokenList[iX])
                # print iRow, min(xCol), max(xCol)
                iRow += 1
            except:
                if (1):
                    print ' invalid value ??? ', iX, tokenList, ' ... skipping ... '
                    numErrSkip += 1
                else:
                    print ' ERROR ??? ', iX
                    print tokenList
                    print tokenList[iX]
                    sys.exit(-1)
        else:
            print ' skipping ... ', len(tokenList), tokenList
            numErrSkip += 1

    fh.close()

    if (reZip):
        zipFile(fhName)

    numRows = iRow
    print ' --> got %d rows of data ... ' % numRows
    print '     skipped %d rows of data ... ' % numErrSkip

    if (numErrSkip > 0):
        xCol = xCol[:numRows]

    # now that we have the vector of data we need to decide how to set the
    # range and bin size for the histogram ...
    if (xMin == -1):
        xMin = xCol.min()
    if (xMax == -1):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    xMagSq = getVectorNormSq(xCol)
    print '         N=%12d, min=%16.8f, max=%16.8f, mean=%16.8f and sigma=%16.8f :' % (len(xCol), xCol.min(), xCol.max(), xMean, xSigma)
    print '         norm squared = %16.8f ' % xMagSq

    if (deltaX < 0):
        print ' calling getHistogramInfo ... ', xMin, xMax, xSigma
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo(xMin, xMax, xSigma)
        print xMinHist, xMaxHist, deltaX, numBins
    else:
        print ' calling getHistogramInfo2 ... ', xMin, xMax, deltaX
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo2(xMin, xMax, deltaX)
        print xMinHist, xMaxHist, deltaX, numBins

    # and now we can build the histogram of counts ...
    numTooSmall = 0
    numTooBig = 0
    xHist = [0] * numBins
    for iRow in range(numRows):
        xVal = xCol[iRow]
        if (xVal < xMinHist):
            numTooSmall += 1
        elif (xVal > xMaxHist):
            numTooBig += 1
        else:
            iBin = int((xVal - xMinHist) / deltaX + 0.0001)
            try:
                xHist[iBin] += 1
            except:
                print ' ERROR ??? : ', iRow, xVal, iBin, xHist[iBin]
                sys.exit(-1)

    print " "
    print " numTooSmall : ", numTooSmall
    print " numTooBig : ", numTooBig
    print " "

    # and now we can print it out as counts, a PDF, and a CDF ...

    # first we'd like to find the first and last non-zero bins ...
    iStartBin = 0
    while (xHist[iStartBin] == 0):
        iStartBin += 1

    iStopBin = numBins - 1
    while (xHist[iStopBin] == 0):
        iStopBin -= 1

    # and now we can print it out ...
    cumCount = numTooSmall
    modeVal = -1.
    medianVal = 1.
    qqErr = 0.
    for iBin in range(iStartBin, iStopBin + 1):

        if (1):

            cumCount += xHist[iBin]
            cumFrac = float(cumCount) / float(numRows)
            curFrac = float(xHist[iBin]) / float(numRows)
            xVal = float(iBin) * deltaX + xMinHist

            if (cumFrac > 0. and cumFrac < 1.):
                xNormInv = miscMath.normInv(cumFrac, xMean, xSigma)
            else:
                xNormInv = -999999999.

            if (printFlag):
                if (xNormInv != -999999999.):
                    if (xHist[iBin] > 0):
                        print ' %5d  %14.10f  %12d  %14.10f  %14.10f  %14.10f ' % \
                            (iBin, xVal, xHist[iBin],
                             curFrac, cumFrac, xNormInv)
                    qqErr += (abs(xNormInv - xVal) * deltaX)
                else:
                    if (xHist[iBin] > 0):
                        print ' %5d  %14.10f  %12d  %14.10f  %14.10f ' % \
                            (iBin, xVal, xHist[iBin], curFrac, cumFrac)

            if (modeVal < curFrac):
                modeVal = curFrac
                modeBin = iBin
                modeX = xVal

            if (abs(cumFrac - 0.5) < medianVal):
                medianVal = abs(cumFrac - 0.5)
                medianBin = iBin
                medianX = xVal

    # print ' median at ', medianBin, medianX, medianVal
    # print ' mode at   ', modeBin, modeX, modeVal
    print ' integrated abs QQ error : ', qqErr

    # maybe it would also be interesting to compare this to a uniform
    # distribution ...
    xMinVal = float(iStartBin) * deltaX + xMinHist
    xMaxVal = float(iStopBin) * deltaX + xMinHist
    uHeight = 1. / float(iStopBin - iStartBin + 1)
    KLdist = 0.
    for iBin in range(iStartBin, iStopBin + 1):
        curFrac = float(xHist[iBin]) / float(numRows)
        if (uHeight < 1.e-9 and curFrac < 1.e-9):
            KLdist += 0.
        elif (curFrac < 1.e-9):
            # print " warning ... not sure what to do ... ", curFrac, uHeight
            doNothing = 1
        elif (uHeight < 1.e-9):
            # print " warning ... not sure what to do ... ", curFrac, uHeight
            doNothing = 1
        else:
            KLdist += curFrac * \
                math.log(curFrac / uHeight)  +  uHeight * \
                math.log(uHeight / curFrac)
    print ' symmetric KL distance to uniform distribution between %f and %f : %f ' % (xMinVal, xMaxVal, KLdist)

    return (xMinHist, xMaxHist, deltaX, numBins, xHist)
Exemple #19
0
def xyCorr(matrixFilename, iX, iY, jStart=-1, jStop=-1, flipFlag=0):

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    firstLine = fh.readline()

    fh.seek(0)
    numLines = miscIO.num_lines(fh)

    tokenList = firstLine.split()
    numCols = len(tokenList)

    print ' <%s> : %d rows and %d columns ' % \
        (matrixFilename, numLines, numCols)

    if (iX >= numCols or iY >= numCols):
        print ' --> ERROR : iX=%d iY=%d ' % (iX, iY)
        return (-99)

    xCol = numpy.zeros(numLines)
    yCol = numpy.zeros(numLines)

    fh.seek(0)

    iRow = 0
    for inLine in fh:
        tokenList = inLine.split()
        if (len(tokenList) == numCols):
            xCol[iRow] = float(tokenList[iX])
            yCol[iRow] = float(tokenList[iY])
            iRow += 1
        else:
            # print ' skipping ... ', len(tokenList), tokenList
            true = 1

    fh.close()

    print ' --> got %d rows of data ... ' % iRow

    if (jStart >= 0 and jStop >= 0):
        print ' --> trimming to [%d:%d] ' % (jStart, jStop)
        xCol = xCol[jStart:jStop + 1]
        yCol = yCol[jStart:jStop + 1]

    if (flipFlag):
        print ' --> reversing one vector '
        lenY = len(yCol)
        tmpY = numpy.zeros(lenY)
        for kk in range(lenY):
            tmpY[kk] = yCol[lenY - 1 - kk]
        yCol = tmpY

    rhoXY = miscMath.PearsonCorr(xCol, yCol)
    print ' Pearson correlation : ', rhoXY

    (a, b, r) = miscMath.linear_regression(xCol, yCol)
    print ' Linear regression :   r2 = %f ' % r
    print '                       slope = %f ' % b
    print '                       offset = %f ' % a
Exemple #20
0
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    fh = file(inFilename)
    numLines = 10
    for iLine in range(numLines):
        aLine = fh.readline()
        # look for carriage return / line-feed ?
        ## lookAtLine ( aLine )
        ## aLine = aLine.strip()
        aTokens = aLine.split('\t')
        if (len(aTokens) > 15):
            print len(aTokens), aTokens[:5], aTokens[-5:]
        else:
            print len(aTokens), aTokens

    numLines = miscIO.num_lines(fh)
    print "\n\n total # of lines in file : %d " % numLines
    fh.close()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    fh = file(inFilename)
    aLine = fh.readline()
    aTokens = aLine.split('\t')
    numA = len(aTokens)
    print " number of header tokens : ", numA
    lookAtHeaderTokens(aTokens)

    done = 0
    iLine = 0
    while not done:
        bLine = fh.readline()
Exemple #21
0
        sys.exit(-1)

    # if we get this far, we should make sure that the output directory we
    # want exists
    print " --> testing that we have an output directory ... <%s> " % outDir
    tsvIO.createDir(outDir)
    print "     output file name will be called <%s> " % outFilename

    # we also need to read in the mapping file ...
    metaData = loadNameMap(
        gidgetConfigVars['TCGAFMP_DCC_REPOSITORIES'] + "/mirna_bcgsc/mature.fa.flat.human.mirbase_v19.txt")

    if (1):

        fh = file(gotFiles[0], 'r')
        numRow = miscIO.num_lines(fh) - 1
        numCol = miscIO.num_cols(fh, '\t') - 1

        rowLabels = []
        dataMatrix = [0] * numRow
        for iR in range(numRow):
            dataMatrix[iR] = [0] * numCol

        hdrLine = fh.readline()
        hdrLine = hdrLine.strip()
        hdrTokens = hdrLine.split('\t')
        if (len(hdrTokens) != (numCol + 1)):
            print " ERROR #1 ??? "
            sys.exit(-1)

        done = 0
Exemple #22
0
def xyCorr(matrixFilename, iX, iY, jStart=-1, jStop=-1, flipFlag=0):

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    firstLine = fh.readline()

    fh.seek(0)
    numLines = miscIO.num_lines(fh)

    tokenList = firstLine.split()
    numCols = len(tokenList)

    print ' <%s> : %d rows and %d columns ' % \
        (matrixFilename, numLines, numCols)

    if (iX >= numCols or iY >= numCols):
        print ' --> ERROR : iX=%d iY=%d ' % (iX, iY)
        return (-99)

    xCol = numpy.zeros(numLines)
    yCol = numpy.zeros(numLines)

    fh.seek(0)

    iRow = 0
    for inLine in fh:
        tokenList = inLine.split()
        if (len(tokenList) == numCols):
            xCol[iRow] = float(tokenList[iX])
            yCol[iRow] = float(tokenList[iY])
            iRow += 1
        else:
            # print ' skipping ... ', len(tokenList), tokenList
            true = 1

    fh.close()

    print ' --> got %d rows of data ... ' % iRow

    if (jStart >= 0 and jStop >= 0):
        print ' --> trimming to [%d:%d] ' % (jStart, jStop)
        xCol = xCol[jStart:jStop + 1]
        yCol = yCol[jStart:jStop + 1]

    if (flipFlag):
        print ' --> reversing one vector '
        lenY = len(yCol)
        tmpY = numpy.zeros(lenY)
        for kk in range(lenY):
            tmpY[kk] = yCol[lenY - 1 - kk]
        yCol = tmpY

    rhoXY = miscMath.PearsonCorr(xCol, yCol)
    print ' Pearson correlation : ', rhoXY

    (a, b, r) = miscMath.linear_regression(xCol, yCol)
    print ' Linear regression :   r2 = %f ' % r
    print '                       slope = %f ' % b
    print '                       offset = %f ' % a
Exemple #23
0
def twoDhist(matrixFilename, iX, iY,
             tdmFilename='',
             xMin='', xMax='', deltaX='',
             yMin='', yMax='', deltaY=''):

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99)

    firstLine = fh.readline()

    fh.seek(0)
    numLines = miscIO.num_lines(fh)

    tokenList = firstLine.split()
    numCols = len(tokenList)

    print ' <%s> : %d rows and %d columns ' % \
        (matrixFilename, numLines, numCols)

    if (iX >= numCols or iY >= numCols):
        print ' --> ERROR : iX=%d  iY=%d ' % (iX, iY)
        return (-99)

    xCol = numpy.zeros(numLines)
    yCol = numpy.zeros(numLines)

    fh.seek(0)

    iRow = 0
    for inLine in fh:
        tokenList = inLine.split()
        if (len(tokenList) == numCols):
            xCol[iRow] = float(tokenList[iX])
            yCol[iRow] = float(tokenList[iY])
            iRow += 1
        else:
            # print ' skipping ... ', len(tokenList), tokenList
            true = 1

    fh.close()

    numRows = iRow
    print ' --> got %d rows of data ... ' % numRows

    if (xMin == ''):
        xMin = xCol.min()
    if (xMax == ''):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    print '         ', xMin, xMax, xMean, xSigma

    if (yMin == ''):
        yMin = yCol.min()
    if (yMax == ''):
        yMax = yCol.max()
    yMean = yCol.mean()
    ySigma = yCol.std()
    print '         ', yMin, yMax, yMean, ySigma

    if (deltaX == ''):
        (xMinHist, xMaxHist, deltaX,
         numXbins) = getHistogramInfo(xMin, xMax, xSigma)
    else:
        numXbins = int((xMax - xMin) / deltaX + 2)
        xMinHist = xMin
        xMaxHist = xMax
    print numXbins, deltaX

    if (deltaY == ''):
        (yMinHist, yMaxHist, deltaY,
         numYbins) = getHistogramInfo(yMin, yMax, ySigma)
    else:
        numYbins = int((yMax - yMin) / deltaY + 2)
        yMinHist = yMin
        yMaxHist = yMax
    print numYbins, deltaY

    # check to see if the bins should be square ... if so, we'll take the smaller
    # of the two dimensions ...
    if (0):
        if (min(deltaX, deltaY) / max(deltaX, deltaY) > 0.85):
            if (deltaX > deltaY):
                deltaX = deltaY
                numXbins = int(xMaxHist - xMinHist) / deltaX + 2
            elif (deltaX < deltaY):
                deltaY = deltaX
                numYbins = int(yMaxHist - yMinHist) / deltaY + 2
            print ' --> adjusted to square bins ... ', deltaX, numXbins, numYbins

    xyHist = [0] * numXbins
    for ii in range(numXbins):
        xyHist[ii] = [0] * numYbins

    for iRow in range(numRows):
        xVal = xCol[iRow]
        yVal = yCol[iRow]
        iX = int((xVal - xMinHist) / deltaX + 0.0001)
        iY = int((yVal - yMinHist) / deltaY + 0.0001)
        if (iX >= 0 and iY >= 0 and iX < numXbins and iY < numYbins):
            xyHist[iX][iY] += 1

    # find the one bin with the most counts ...
    modeVal = -1.
    for iX in range(numXbins):
        xVal = float(iX) * deltaX + xMinHist
        for iY in range(numYbins):
            yVal = float(iY) * deltaY + yMinHist
            if (xyHist[iX][iY] > modeVal):
                modeVal = xyHist[iX][iY]
                modeX = xVal
                modeY = yVal
    print ' mode at   ', modeX, modeY, modeVal

    # what is the median count value ?
    allBins = numpy.zeros(numXbins * numYbins)
    kk = 0
    for iX in range(numXbins):
        for iY in range(numYbins):
            allBins[kk] = xyHist[iX][iY]
            kk += 1
    allBins.sort()
    numBins = len(allBins)
    p50 = allBins[int(0.50 * numBins)]
    p90 = allBins[int(0.90 * numBins)]
    print allBins[0], allBins[-1], p50, p90

    if (tdmFilename != ''):
        fhPng = file(tdmFilename, 'w')

        outLine = 'CORNER'
        for iX in range(numXbins):
            outLine += '\t%d' % iX
        outLine += '\n'
        fhPng.write(outLine)

        for iY in range(numYbins - 1, -1, -1):
            outLine = '%d' % iY
            for iX in range(numXbins):
                outLine += '\t%d' % xyHist[iX][iY]
            outLine += '\n'
            fhPng.write(outLine)

        fhPng.close()
Exemple #24
0
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    fh = file(inFilename)
    numLines = 10
    for iLine in range(numLines):
        aLine = fh.readline()
        # look for carriage return / line-feed ?
        ## lookAtLine ( aLine )
        ## aLine = aLine.strip()
        aTokens = aLine.split('\t')
        if (len(aTokens) > 15):
            print len(aTokens), aTokens[:5], aTokens[-5:]
        else:
            print len(aTokens), aTokens

    numLines = miscIO.num_lines(fh)
    print "\n\n total # of lines in file : %d " % numLines
    fh.close()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    fh = file(inFilename)
    aLine = fh.readline()
    aTokens = aLine.split('\t')
    numA = len(aTokens)
    print " number of header tokens : ", numA
    lookAtHeaderTokens(aTokens)

    done = 0
    iLine = 0
    while not done:
        bLine = fh.readline()
Exemple #25
0
def oneDcondHist(matrixFilename, iX, bitVec, deltaX=-1, xMin=-1, xMax=-1, nSkip=0, printFlag=1):

    print ' in oneDcondHist ... ', matrixFilename, iX, deltaX, xMin, xMax, nSkip, printFlag
    print ' length of input bitVec : ', len(bitVec)

    (fh, reZip) = openInputFileForReading(matrixFilename)
    if (reZip < 0):
        return (-99, -99, -99, -99, -99, -99)

    fhName = fh.name

    for iSkip in range(nSkip):
        skipLine = fh.readline()

    firstLine = fh.readline()

    fh.seek(0)
    numLines = miscIO.num_lines(fh)
    numLines -= nSkip

    tokenList = firstLine.split()
    numCols = len(tokenList)

    print ' <%s> : %d rows and %d columns ' % \
        (matrixFilename, numLines, numCols)

    if (iX >= numCols):
        print ' --> ERROR : iX=%d ' % (iX)
        return (-99, -99, -99, -99, -99, -99)

    xCol = numpy.zeros(numLines)

    fh.seek(0)

    for iSkip in range(nSkip):
        skipLine = fh.readline()

    iRow = 0
    for inLine in fh:
        tokenList = inLine.split()
        if (len(tokenList) == numCols and iRow < numLines):
            try:
                xCol[iRow] = float(tokenList[iX])
            except:
                print ' ERROR ??? ', iX
                print tokenList
                print tokenList[iX]
                sys.exit(-1)
            iRow += 1
        else:
            print ' skipping ... ', len(tokenList), tokenList
            true = 1

    fh.close()

    if (reZip):
        zipFile(fhName)

    numRows = iRow
    print ' --> got %d rows of data ... ' % numRows

    # now that we have the vector of data we need to decide how to set the
    # range and bin size for the histogram ...
    if (xMin == -1):
        xMin = xCol.min()
    if (xMax == -1):
        xMax = xCol.max()
    xMean = xCol.mean()
    xSigma = xCol.std()
    print '         min=%16.8f, max=%16.8f, mean=%16.8f and sigma=%16.8f :' % (xCol.min(), xCol.max(), xMean, xSigma)

    if (deltaX < 0):
        print ' calling getHistogramInfo ... ', xMin, xMax, xSigma
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo(xMin, xMax, xSigma)
        print xMinHist, xMaxHist, deltaX, numBins
    else:
        print ' calling getHistogramInfo2 ... ', xMin, xMax, deltaX
        (xMinHist, xMaxHist, deltaX,
         numBins) = getHistogramInfo2(xMin, xMax, deltaX)
        print xMinHist, xMaxHist, deltaX, numBins

    # and now we can build the histogram of counts ...
    xHist0 = [0] * numBins
    xHist1 = [0] * numBins
    numRows0 = 0
    numRows1 = 0
    print numRows, len(bitVec), len(xCol)
    for iRow in range(numRows):
        # if ( iRow%1000 == 0 ): print iRow, xCol[iRow], bitVec[iRow]
        xVal = xCol[iRow]
        iBin = int((xVal - xMinHist) / deltaX + 0.0001)
        if (abs(bitVec[iRow]) < 0.00001):
            numRows0 += 1
            try:
                xHist0[iBin] += 1
            except:
                print ' ERROR ??? : ', iRow, xVal, iBin, xHist0[iBin]
        else:
            numRows1 += 1
            try:
                xHist1[iBin] += 1
            except:
                print ' ERROR ??? : ', iRow, xVal, iBin, xHist1[iBin]

    # and now we can print it out as counts, a PDF, and a CDF ...

    # first we'd like to find the first and last non-zero bins ...
    iStartBin = 0
    while (xHist0[iStartBin] == 0 and xHist1[iStartBin] == 0):
        iStartBin += 1

    iStopBin = numBins - 1
    while (xHist0[iStopBin] == 0 and xHist1[iStopBin] == 0):
        iStopBin -= 1

    # and now we can print it out ...
    cumCount0 = 0
    modeVal0 = -1.
    medianVal0 = 1.
    qqErr0 = 0.

    meanX0 = 0.
    meanSqX0 = 0.

    cumCount1 = 0
    modeVal1 = -1.
    medianVal1 = 1.
    qqErr1 = 0.

    meanX1 = 0.
    meanSqX1 = 0.

    for iBin in range(iStartBin, iStopBin + 1):

        if (1):

            xVal = float(iBin) * deltaX + xMinHist

            meanX0 += (float(xHist0[iBin]) * xVal / float(numRows0))
            meanX1 += (float(xHist1[iBin]) * xVal / float(numRows1))

            meanSqX0 += (float(xHist0[iBin]) * xVal * xVal / float(numRows0))
            meanSqX1 += (float(xHist1[iBin]) * xVal * xVal / float(numRows1))

            cumCount0 += xHist0[iBin]
            cumFrac0 = float(cumCount0) / float(numRows0)
            curFrac0 = float(xHist0[iBin]) / float(numRows0)

            cumCount1 += xHist1[iBin]
            cumFrac1 = float(cumCount1) / float(numRows1)
            curFrac1 = float(xHist1[iBin]) / float(numRows1)

            if (printFlag):

                print " ERROR ... this code needs to be fixed ... xMean and xSigma are not correct ... "
                sys.exit(-1)

                if (cumFrac0 > 0. and cumFrac0 < 1.):
                    xNormInv0 = miscMath.normInv(cumFrac0, xMean, xSigma)
                else:
                    xNormInv0 = -999999999.

                if (cumFrac1 > 0. and cumFrac1 < 1.):
                    xNormInv1 = miscMath.normInv(cumFrac1, xMean, xSigma)
                else:
                    xNormInv1 = -999999999.

                outLine = ' %5d  %14.8f ' % (iBin, xVal)
                outLine += ' %12d  %14.8f  %14.8f ' % (xHist0[iBin],
                                                       curFrac0, cumFrac0)
                outLine += ' %12d  %14.8f  %14.8f ' % (xHist1[iBin],
                                                       curFrac1, cumFrac1)
                if (xNormInv0 != -999999999.):
                    outLine += ' %14.8f ' % xNormInv0
                    qqErr0 += (abs(xNormInv0 - xVal) * deltaX)
                else:
                    outLine += '                '
                if (xNormInv1 != -999999999.):
                    outLine += ' %14.8f ' % xNormInv1
                    qqErr1 += (abs(xNormInv1 - xVal) * deltaX)
                else:
                    outLine += '                '
                print outLine

            if (modeVal0 < curFrac0):
                modeVal0 = curFrac0
                modeBin0 = iBin
                modeX0 = xVal

            if (abs(cumFrac0 - 0.5) < medianVal0):
                medianVal0 = abs(cumFrac0 - 0.5)
                medianBin0 = iBin
                medianX0 = xVal

            if (modeVal1 < curFrac1):
                modeVal1 = curFrac1
                modeBin1 = iBin
                modeX1 = xVal

            if (abs(cumFrac1 - 0.5) < medianVal1):
                medianVal1 = abs(cumFrac1 - 0.5)
                medianBin1 = iBin
                medianX1 = xVal

    sigmaX0 = math.sqrt(meanSqX0 - (meanX0 * meanX0))
    sigmaX1 = math.sqrt(meanSqX1 - (meanX1 * meanX1))

    print ' conditioned on bit = 0  %12d  %6.3f  %6.3f ' % (numRows0, meanX0, sigmaX0)
    print '     median at ', medianBin0, medianX0, medianVal0
    print '     mode at   ', modeBin0, modeX0, modeVal0
    print ' conditioned on bit = 1  %12d  %6.3f  %6.3f ' % (numRows1, meanX1, sigmaX1)
    print '     median at ', medianBin1, medianX1, medianVal1
    print '     mode at   ', modeBin1, modeX1, modeVal1
    # print ' integrated abs QQ error : ', qqErr0, qqErr1

    return (xMinHist, xMaxHist, deltaX, numBins, xHist0, xHist1)
Exemple #26
0
    testD = tsvIO.readTSV_dataMatrix ( tsvFilename )
    try:
        print " got feature matrix ", testD['dataType']
    except:
        print " ERROR ... failed to read input feature matrix ??? "
        print tsvFilename
        sys.exit(-1)

    ## next, make sure that the pathways file is accessible
    ## and looks good ...
    pathwaysFilename = gidgetConfigVars['TCGAFMP_BIOINFORMATICS_REFERENCES'] \
        + "/nci_pid/only_NCI_Nature_ver5.tab"
    try:
        fh = file ( pathwaysFilename, 'r' )
        print " opening file <%s> " % pathwaysFilename
        numP = miscIO.num_lines ( fh )
        print "     --> numP = ", numP
        fh.close()
    except:
        numP = 0

    if ( numP < 10 ):
        print " ERROR ... pathways file looks bad ??? "
        print pathwaysFilename
        sys.exit(-1)         

    ## and now start looping over the features ...
    fh = file ( featListFile, 'r' )
    for aLine in fh:
        aLine = aLine.strip()
        featString = aLine