예제 #1
0
    def removeNaN(inputFilename,
                  outputFilename,
                  removeNaN=True,
                  delimiter="\t",
                  header=True,
                  annotationColumns=5):
        """
        Copy a delimited matrix file, dropping rows whose data cells are all NaN.

        Each line is stripped of surrounding whitespace and split on
        ``delimiter``; the cells after the first ``annotationColumns`` columns
        are passed through ``Filter.convert_NAtoNaN``, cast to float and
        tested with ``Filter.test_allNaN``.  Rows for which that test is falsy
        are written (stripped, newline-terminated) to ``outputFilename``.

        Parameters:
            inputFilename: path of the matrix file to read.
            outputFilename: path of the file to write (overwritten).
            removeNaN: only echoed in the start-up message.
                NOTE(review): the flag is never consulted when filtering --
                all-NaN rows are dropped regardless; confirm whether callers
                expect ``False`` to disable the filter.
            delimiter: column separator.
            header: when truthy, the first line is copied through unfiltered.
            annotationColumns: number of leading non-data columns to skip
                (defaults to 5, the value that was previously hard-coded).

        Returns:
            ``outputFilename``.
        """
        from Analysis.Filter import Filter
        import numpy as np

        print("Commencing annotation concatenation.  Will remove NA values: %s" % removeNaN)
        # The input is opened first so that a missing input file does not
        # leave behind a freshly truncated output file; both handles are
        # closed by the context managers even if a row fails to parse.
        with open(inputFilename, "r") as inputFile:
            with open(outputFilename, "w") as outputFile:
                for i, line in enumerate(inputFile):
                    stripped = line.strip()
                    if header and i == 0:
                        outputFile.write(stripped + "\n")
                        continue
                    cells = stripped.split(delimiter)[annotationColumns:]
                    # Builtin float replaces the removed np.float alias.
                    dataPoints = np.array(
                        Filter.convert_NAtoNaN(cells)).astype(float)
                    if not Filter.test_allNaN(dataPoints):
                        outputFile.write(stripped + "\n")
        print("Completed. Flushing resources.")
        return outputFilename
예제 #2
0
 def foldChange_fromMedian(array, arbitraryAddition = False):
     """
     Return the log2 fold change of every value in a row relative to the row median.

     The row is passed through ``Filter.convert_NAtoNaN`` and cast to float.
     If ``Filter.test_containsNaN`` reports NaNs, the row is replaced by the
     result of ``Filter.filterNonNumericCells_toZero``.  When
     ``arbitraryAddition`` is supplied, the row is additionally passed through
     ``RNASeqV2.addAbitraryAddition_nonZero`` before the median is taken.

     Parameters:
         array: sequence of cell values for one row.
         arbitraryAddition: ``False`` to skip the addition step, otherwise a
             value convertible to a non-zero float.

     Returns:
         An array of ``log2(x) - log2(median)`` for each element, or ``None``
         when ``Filter.test_allNaN`` reports the row as all NaN.

     Raises:
         AssertionError: if ``arbitraryAddition`` converts to 0.0.
         ValueError / TypeError: if ``arbitraryAddition`` is not convertible
             to float.
     """
     import numpy as np
     from Analysis.Filter import Filter
     array = Filter.convert_NAtoNaN(array)
     # Builtin float replaces the removed np.float alias.
     dataRow = np.array(array).astype(float)
     if Filter.test_allNaN(dataRow):
         return None
     if Filter.test_containsNaN(dataRow):
         dataRow = Filter.filterNonNumericCells_toZero(dataRow)
     if arbitraryAddition != False:
         assert float(arbitraryAddition)
         dataRow = RNASeqV2.addAbitraryAddition_nonZero(dataRow, arbitraryAddition)
     median = np.median(dataRow)
     # Vectorised form of the former element-by-element nditer loop.
     # NOTE(review): zeros in the row (or a zero median) yield -inf/nan here,
     # exactly as the per-element version did.
     return np.log2(dataRow) - np.log2(median)
예제 #3
0
    def filterMethData_standardDev(fileName,
                                   annotationRows,
                                   stdDev_cutoff=0.2,
                                   headers=True,
                                   delimiter="\t",
                                   includeAllSiteAnnotation=False):
        """
        Write the rows of a methylation matrix whose standard deviation exceeds a cutoff.

        Each data row is split on ``delimiter``; the first ``annotationRows``
        cells are treated as annotation and the rest are handed to
        ``Filter.filterMatrix_standardDev``, which returns the cleaned data
        array and its standard deviation.  Rows whose standard deviation is
        truthy and greater than ``stdDev_cutoff`` are written to
        ``<fileName without extension>.Filtered-StdDev<cutoff>.tsv``.

        Parameters:
            fileName: path of the input matrix.
            annotationRows: number of leading annotation columns per row.
            stdDev_cutoff: rows must have a standard deviation above this.
            headers: when truthy, the first line is treated as a header.
            delimiter: column separator.
            includeAllSiteAnnotation: when truthy, all annotation columns are
                written (and the header is copied verbatim); otherwise only
                the annotation column at index 3 is kept.

        Returns:
            The path of the output file.

        Raises:
            AssertionError: if ``fileName`` is empty, or ``annotationRows`` /
                ``stdDev_cutoff`` convert to zero.
        """
        import os

        from Analysis.Filter import Filter

        assert fileName
        assert int(annotationRows)
        assert float(stdDev_cutoff)
        print("Creating filtered methylation file. Cut-off Std Deviation: %s" % (
            str(stdDev_cutoff)))

        # splitext keeps the base name intact when the file has no extension
        # or a parent directory contains a dot.
        outputFileName = os.path.splitext(
            fileName)[0] + ".Filtered-StdDev%s.tsv" % str(stdDev_cutoff)

        # Context managers close both handles even if a row fails to parse.
        with open(fileName, 'r') as matrixFile:
            with open(outputFileName, 'w') as output:
                for i, row in enumerate(matrixFile):
                    cells = row.strip().split(delimiter)

                    if headers and i == 0:
                        if includeAllSiteAnnotation:
                            output.write(row)
                        else:
                            sampleIDs = cells[annotationRows:]
                            headerCells = cells[:annotationRows]
                            output.write(
                                headerCells[3].strip() + delimiter +
                                delimiter.join(sampleIDs).strip(delimiter) +
                                "\n")
                        continue

                    dataArray = cells[annotationRows:]
                    annotators = cells[:annotationRows]
                    # An empty last annotation cell is written as "None".
                    if annotators[annotationRows - 1] in (delimiter, ""):
                        annotators[annotationRows - 1] = "None"
                    dataArray, stdDev = Filter.filterMatrix_standardDev(
                        dataArray)
                    if stdDev and stdDev > stdDev_cutoff:
                        values = delimiter.join(
                            map(str, dataArray.tolist())).strip(delimiter)
                        if includeAllSiteAnnotation:
                            prefix = delimiter.join(annotators).strip(
                                delimiter)
                        else:
                            prefix = annotators[3].strip()
                        output.write(prefix + delimiter + values + "\n")
        print("Completed filtering methylation data.")
        return outputFileName
예제 #4
0
    def filterRNASeqData_standardDev(fileName, stdDev_cutoff = 2.0, headers = True, delimiter = "\t", includeAllSiteAnnotation = False):
        """
        Write the rows of an RNASeq matrix whose standard deviation exceeds a cutoff.

        The first cell of each row is treated as its annotation and the
        remaining cells are handed to ``Filter.filterMatrix_standardDev``,
        which returns the cleaned data array and its standard deviation.
        Rows whose standard deviation is greater than ``stdDev_cutoff`` are
        written to ``<fileName without extension>.filtered_stdDev<cutoff>.tsv``.
        Rows with a falsy standard deviation are skipped and not counted in
        either printed tally.

        Parameters:
            fileName: path of the input matrix.
            stdDev_cutoff: rows must have a standard deviation above this.
            headers: when truthy, the first line is treated as a header.
            delimiter: column separator.
            includeAllSiteAnnotation: when truthy, the header line is copied
                verbatim instead of being rebuilt from its cells.

        Returns:
            The path of the output file.

        Raises:
            AssertionError: if ``fileName`` is empty or ``stdDev_cutoff``
                converts to zero.
        """
        import os

        from Analysis.Filter import Filter

        assert fileName
        assert float(stdDev_cutoff)
        # Message previously said "methylation" (copy-paste from the
        # methylation filter); this function handles RNASeq data.
        print("Creating filtered RNASeq file. Cut-off Std Deviation: %s" % (str(stdDev_cutoff)))

        # splitext keeps the base name intact when the file has no extension
        # or a parent directory contains a dot.
        outputFileName = os.path.splitext(fileName)[0] + ".filtered_stdDev%s.tsv" % str(stdDev_cutoff)

        # The minimum starts unset: seeding it with 0 meant no observed
        # standard deviation could ever replace it.
        minStdDev = None
        maxStdDev = 0
        filteredCount = 0
        inclusionCount = 0

        # Context managers close both handles even if a row fails to parse.
        with open(fileName, 'r') as matrixFile:
            with open(outputFileName, 'w') as output:
                for i, row in enumerate(matrixFile):
                    cells = row.strip().split(delimiter)

                    if headers and i == 0:
                        if includeAllSiteAnnotation:
                            output.write(row)
                        else:
                            output.write(cells[0].strip() + delimiter + delimiter.join(cells[1:]).strip(delimiter) + "\n")
                        continue

                    annotators = cells[0]
                    dataArray, stdDev = Filter.filterMatrix_standardDev(cells[1:])
                    if not stdDev:
                        continue

                    if stdDev > maxStdDev:
                        maxStdDev = stdDev
                    if minStdDev is None or stdDev < minStdDev:
                        minStdDev = stdDev

                    if stdDev > stdDev_cutoff:
                        inclusionCount += 1
                        values = delimiter.join(map(str, dataArray.tolist())).strip(delimiter)
                        # The two modes differ only in which characters are
                        # trimmed from the annotation cell.
                        if includeAllSiteAnnotation:
                            prefix = annotators.strip(delimiter)
                        else:
                            prefix = annotators.strip()
                        output.write(prefix + delimiter + values + "\n")
                    else:
                        filteredCount += 1

        if minStdDev is None:
            # No row produced a usable standard deviation; report 0 as before.
            minStdDev = 0
        print("Completed filtering RNASeq data. IncludedLines: %s Filtered Lines: %s" % (str(inclusionCount), str(filteredCount)))
        print("Max StdDev: %s  Min StdDev: %s" % (maxStdDev, minStdDev))
        return outputFileName
예제 #5
0
    def calculateMethylationStats(inputFilename,
                                  statsOutputFilename,
                                  sortedOutputFilename=None,
                                  annotationColumns=1,
                                  headerLine=True,
                                  delimiter="\t"):
        """
        Compute per-sample statistics for a methylation matrix and optionally
        write the matrix with its sample columns sorted by total.

        The file is parsed with ``Utility.getColumns``, which returns the
        column data keyed by header and a mapping from column index to header.
        For every column from index ``annotationColumns`` onwards the standard
        deviation, mean, median and sum are written to
        ``statsOutputFilename``, one statistic per row in that order.  When
        ``sortedOutputFilename`` is given, samples are ordered by descending
        total (ties keep their input order) in both outputs and the reordered
        matrix is written with the first input column as the ``CpG-ID``
        column.

        Parameters:
            inputFilename: path of the matrix file to read.
            statsOutputFilename: path of the statistics table to write.
            sortedOutputFilename: optional path for the column-sorted matrix.
            annotationColumns: number of leading non-sample columns.
            headerLine: not used by this function.
            delimiter: separator used when writing the outputs.
                NOTE(review): it is not forwarded to ``Utility.getColumns``;
                confirm the parser's default matches.

        Returns:
            ``sortedOutputFilename`` (possibly ``None``), or ``None`` when the
            file has no sample columns beyond the annotation columns.
        """
        import numpy as np
        from Utility.Utility import Utility
        from Analysis.Filter import Filter

        with open(inputFilename, "r") as inputFile:
            print("Opening file %s to parse per sample methylation data" % (
                inputFilename))
            columns, columnHeaderDict = Utility.getColumns(inputFile)

        if len(columnHeaderDict) <= annotationColumns:
            return None

        ### CALCULATE STATS ###
        sampleIDs = [columnHeaderDict[i]
                     for i in range(annotationColumns, len(columnHeaderDict))]
        statsDict = {}
        totalDict = {}
        for sampleID in sampleIDs:
            methArray = Filter.convert_NAtoNaN(columns[sampleID])
            # Builtin float replaces the removed np.float alias.
            dataPoints = np.array(methArray).astype(float)
            # NOTE(review): these plain reductions return NaN for any sample
            # that contains a NaN cell; confirm whether the nan-aware
            # variants are intended.
            total = np.sum(dataPoints)
            statsDict[sampleID] = (np.std(dataPoints), np.mean(dataPoints),
                                   np.median(dataPoints), total)
            totalDict[sampleID] = total

        if sortedOutputFilename:
            # Sorting the IDs by their total (instead of looking IDs up from
            # a total -> ID map) keeps samples that share the same total.
            outputSampleOrder = sorted(sampleIDs,
                                       key=totalDict.get,
                                       reverse=True)
        else:
            outputSampleOrder = list(sampleIDs)

        # A tuple of pairs fixes the row order of the statistics table.
        usedStats = (("STD_DEV", 0), ("MEAN", 1), ("MEDIAN", 2), ("TOTAL", 3))
        with open(statsOutputFilename, "w") as statsOutput:
            statsOutput.write("Statistic" + delimiter +
                              delimiter.join(outputSampleOrder).strip() +
                              "\n")
            for statType, statIndex in usedStats:
                cells = [statType] + [str(statsDict[s_id][statIndex])
                                      for s_id in outputSampleOrder]
                statsOutput.write(delimiter.join(cells).strip(delimiter) +
                                  "\n")

        if sortedOutputFilename:
            idColumn = columns[columnHeaderDict[0]]
            with open(sortedOutputFilename, "w") as outputFile:
                outputFile.write(
                    "CpG-ID" + delimiter +
                    delimiter.join(outputSampleOrder).strip() + "\n")
                for j, cpgID in enumerate(idColumn):
                    cells = [cpgID] + [columns[orderedID][j]
                                       for orderedID in outputSampleOrder]
                    outputFile.write(delimiter.join(cells).strip(delimiter) +
                                     "\n")
            print("Completed sorting and writing methylation matrix!")
        return sortedOutputFilename