def removeNaN(inputFilename, outputFilename, removeNaN=True, delimiter="\t", header=True):
    """Copy a delimited matrix file, leaving out rows whose data cells are all NaN.

    For every non-header line the cells from column index 5 onwards are
    passed through ``Filter.convert_NAtoNaN``, cast to float and checked with
    ``Filter.test_allNaN``.  The line is written (whitespace-stripped, with a
    trailing newline) only when that check compares equal to ``False``.

    Args:
        inputFilename: Path of the file to read.
        outputFilename: Path of the file to write; it is overwritten.
        removeNaN: Only echoed in the start-up message.  All-NaN rows are
            left out regardless of this value.
        delimiter: Column separator used to split each line.
        header: When equal to ``True`` the first line is copied without
            being checked.

    Returns:
        ``outputFilename``.

    Raises:
        ValueError: If a data cell cannot be converted to float.
    """
    from Analysis.Filter import Filter
    import numpy as np

    with open(outputFilename, "w") as outputFile:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Commencing annotation concatenation. Will remove NA values: %s" % removeNaN)
        # The context manager closes the input even if a row fails to parse.
        with open(inputFilename, "r") as inputFile:
            for i, line in enumerate(inputFile):
                stripped = line.strip()
                if header == True and i == 0:
                    outputFile.write(stripped + "\n")
                    continue
                # Columns 0-4 are skipped; only the remaining cells are tested.
                cells = stripped.split(delimiter)[5:]
                # Builtin float replaces the np.float alias removed from NumPy.
                dataPoints = np.array(Filter.convert_NAtoNaN(cells)).astype(float)
                if Filter.test_allNaN(dataPoints) == False:
                    outputFile.write(stripped + "\n")
            print("Completed. Flushing resources.")
    return outputFilename
def foldChange_fromMedian(array, arbitraryAddition = False):
    """Return each value's log2 difference from the row median.

    The input is passed through ``Filter.convert_NAtoNaN`` and cast to float.
    If ``Filter.test_containsNaN`` reports NaN cells, the row is replaced by
    the result of ``Filter.filterNonNumericCells_toZero``.  When
    ``arbitraryAddition`` is given, the row is additionally passed through
    ``RNASeqV2.addAbitraryAddition_nonZero``.  The result is
    ``log2(x) - log2(median)`` for every element ``x``.

    Args:
        array: Sequence of cell values for one row.
        arbitraryAddition: ``False`` (default) to skip the addition step,
            otherwise a non-zero number forwarded to
            ``RNASeqV2.addAbitraryAddition_nonZero``.

    Returns:
        A float array of log2 differences, or ``None`` when
        ``Filter.test_allNaN`` does not report ``False`` for the row.

    Raises:
        AssertionError: If ``arbitraryAddition`` converts to 0.0.
        ValueError, TypeError: If a cell or ``arbitraryAddition`` cannot be
            converted to float.
    """
    import numpy as np
    from Analysis.Filter import Filter

    array = Filter.convert_NAtoNaN(array)
    # Builtin float replaces the np.float alias removed from NumPy.
    dataRow = np.array(array).astype(float)

    if Filter.test_allNaN(dataRow) != False:
        return None

    if Filter.test_containsNaN(dataRow) == True:
        dataRow = Filter.filterNonNumericCells_toZero(dataRow)

    if arbitraryAddition != False:
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is unchanged.
        if not float(arbitraryAddition):
            raise AssertionError("arbitraryAddition must be a non-zero number")
        dataRow = RNASeqV2.addAbitraryAddition_nonZero(dataRow, arbitraryAddition)

    median = np.median(dataRow)
    # Vectorized form of the former per-element nditer loop.
    return np.log2(dataRow) - np.log2(median)
def filterMethData_standardDev(fileName, annotationRows, stdDev_cutoff=0.2, headers=True, delimiter="\t", includeAllSiteAnnotation=False):
    """Write the rows of a methylation matrix whose standard deviation exceeds a cutoff.

    Each data row is split into its first ``annotationRows`` annotation
    cells and the remaining data cells.  The data cells are passed to
    ``Filter.filterMatrix_standardDev``, which returns the processed values
    and their standard deviation.  A row is written only when that standard
    deviation is truthy and greater than ``stdDev_cutoff``.

    The output path is the input path with its final extension replaced by
    ``.Filtered-StdDev<cutoff>.tsv``.

    Args:
        fileName: Path of the matrix file to read.
        annotationRows: Number of leading annotation columns in each row.
            Must be at least 4 when ``includeAllSiteAnnotation`` is False,
            because the annotation cell at index 3 is used as the row key.
        stdDev_cutoff: Rows are kept only when their standard deviation is
            strictly greater than this value.  A cutoff of 0 is accepted.
        headers: When equal to ``True`` the first line is treated as a header.
        delimiter: Column separator.
        includeAllSiteAnnotation: When False only the annotation cell at
            index 3 precedes the data; otherwise all annotation cells are
            written (and the header line is copied unchanged).

    Returns:
        The path of the file that was written.

    Raises:
        AssertionError: If ``fileName`` is empty or ``annotationRows`` is 0.
        ValueError, TypeError: If ``annotationRows`` or ``stdDev_cutoff`` is
            not numeric.
    """
    import os
    from Analysis.Filter import Filter

    # Explicit raises instead of `assert` so the checks survive `python -O`;
    # the exception type is unchanged.
    if not fileName:
        raise AssertionError("fileName must be a non-empty path")
    if not int(annotationRows):
        raise AssertionError("annotationRows must be a non-zero integer")
    # Only require the cutoff to be numeric; 0 is a legitimate threshold.
    cutoff = float(stdDev_cutoff)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Creating filtered methylation file. Cut-off Std Deviation: %s" % (
        str(stdDev_cutoff)))
    # splitext only strips a real trailing extension, so names without one
    # (or with dots in a directory component) keep their full base path.
    outputFileName = os.path.splitext(fileName)[0] + ".Filtered-StdDev%s.tsv" % str(stdDev_cutoff)

    hasHeader = headers == True
    keyOnly = includeAllSiteAnnotation == False

    # Context managers close both files even if a row fails to parse.
    with open(fileName, 'r') as matrixFile, open(outputFileName, 'w') as output:
        for i, row in enumerate(matrixFile):
            fields = row.strip().split(delimiter)
            annotators = fields[:annotationRows]
            dataArray = fields[annotationRows:]

            if hasHeader and i == 0:
                if keyOnly:
                    output.write(annotators[3].strip() + delimiter +
                                 delimiter.join(dataArray).strip(delimiter) + "\n")
                else:
                    output.write(row)
                continue

            # An empty final annotation cell is written as the text "None".
            if annotators[annotationRows - 1] in (delimiter, ""):
                annotators[annotationRows - 1] = "None"

            dataArray, stdDev = Filter.filterMatrix_standardDev(dataArray)
            if stdDev and stdDev > cutoff:
                values = delimiter.join(map(str, dataArray.tolist())).strip(delimiter)
                if keyOnly:
                    prefix = annotators[3].strip()
                else:
                    prefix = delimiter.join(annotators).strip(delimiter)
                output.write(prefix + delimiter + values + "\n")

        print("Completed filtering methylation data.")
    return outputFileName
def filterRNASeqData_standardDev(fileName, stdDev_cutoff = 2.0, headers = True, delimiter = "\t", includeAllSiteAnnotation = False):
    """Write the rows of an RNASeq matrix whose standard deviation exceeds a cutoff.

    The first cell of each row is its identifier and the remaining cells are
    data.  The data cells are passed to ``Filter.filterMatrix_standardDev``,
    which returns the processed values and their standard deviation.  A row
    is written only when that standard deviation is truthy and greater than
    ``stdDev_cutoff``; every other data row is counted as filtered.  The
    smallest and largest truthy standard deviations seen are reported when
    the run completes.

    The output path is the input path with its final extension replaced by
    ``.filtered_stdDev<cutoff>.tsv``.

    Args:
        fileName: Path of the matrix file to read.
        stdDev_cutoff: Rows are kept only when their standard deviation is
            strictly greater than this value.  A cutoff of 0 is accepted.
        headers: When equal to ``True`` the first line is treated as a header.
        delimiter: Column separator.
        includeAllSiteAnnotation: When False the identifier is
            whitespace-stripped before writing; otherwise the header line is
            copied unchanged and identifiers are written as read.

    Returns:
        The path of the file that was written.

    Raises:
        AssertionError: If ``fileName`` is empty.
        ValueError, TypeError: If ``stdDev_cutoff`` is not numeric.
    """
    import os
    from Analysis.Filter import Filter

    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type is unchanged.
    if not fileName:
        raise AssertionError("fileName must be a non-empty path")
    # Only require the cutoff to be numeric; 0 is a legitimate threshold.
    cutoff = float(stdDev_cutoff)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Creating filtered RNASeq file. Cut-off Std Deviation: %s" % (str(stdDev_cutoff)))
    # splitext only strips a real trailing extension, so names without one
    # (or with dots in a directory component) keep their full base path.
    outputFileName = os.path.splitext(fileName)[0] + ".filtered_stdDev%s.tsv" % str(stdDev_cutoff)

    hasHeader = headers == True
    keyOnly = includeAllSiteAnnotation == False

    # Start the minimum unset: seeding it with 0 meant no positive standard
    # deviation could ever replace it.
    minStdDev = None
    maxStdDev = 0
    filteredCount = 0
    inclusionCount = 0

    # Context managers close both files even if a row fails to parse.
    with open(fileName, 'r') as matrixFile, open(outputFileName, 'w') as output:
        for i, row in enumerate(matrixFile):
            fields = row.strip().split(delimiter)
            rowID = fields[0]
            dataArray = fields[1:]

            if hasHeader and i == 0:
                if keyOnly:
                    output.write(rowID.strip() + delimiter +
                                 delimiter.join(dataArray).strip(delimiter) + "\n")
                else:
                    output.write(row)
                continue

            dataArray, stdDev = Filter.filterMatrix_standardDev(dataArray)

            if stdDev:
                if stdDev > maxStdDev:
                    maxStdDev = stdDev
                if minStdDev is None or stdDev < minStdDev:
                    minStdDev = stdDev

            if stdDev and stdDev > cutoff:
                inclusionCount += 1
                values = delimiter.join(map(str, dataArray.tolist())).strip(delimiter)
                prefix = rowID.strip() if keyOnly else rowID.strip(delimiter)
                output.write(prefix + delimiter + values + "\n")
            else:
                filteredCount += 1

        # Report 0 when no row produced a usable standard deviation.
        if minStdDev is None:
            minStdDev = 0

        print("Completed filtering RNASeq data. IncludedLines: %s Filtered Lines: %s" % (str(inclusionCount), str(filteredCount)))
        print("Max StdDev: %s Min StdDev: %s" % (maxStdDev, minStdDev))
    return outputFileName
def calculateMethylationStats(inputFilename, statsOutputFilename, sortedOutputFilename=None, annotationColumns=1, headerLine=True, delimiter="\t"):
    """Write per-sample summary statistics and, optionally, a matrix sorted by sample total.

    The input is loaded column-wise with ``Utility.getColumns``.  From how
    its results are indexed here, it presumably returns a mapping of column
    header to list of cell strings and a mapping of column index to header
    (confirm against ``Utility``).  For every column from index
    ``annotationColumns`` onwards the cells are passed through
    ``Filter.convert_NAtoNaN``, cast to float, and summarised with
    ``np.std``, ``np.mean``, ``np.median`` and ``np.sum``.  These functions
    are not NaN-aware, so a NaN cell propagates into the statistics.

    The statistics file has one header row followed by one row per statistic
    (``STD_DEV``, ``MEAN``, ``MEDIAN``, ``TOTAL``).  When
    ``sortedOutputFilename`` is given, samples are ordered by descending
    total in both output files, and the matrix is rewritten in that order
    with column 0 as the row identifier under the heading ``CpG-ID``.

    Nothing is written when the input has no columns beyond the annotation
    columns.

    Args:
        inputFilename: Path of the matrix file to read.
        statsOutputFilename: Path of the statistics file to write.
        sortedOutputFilename: Optional path for the re-ordered matrix.
        annotationColumns: Number of leading non-sample columns.
        headerLine: Not used by this function.
        delimiter: Column separator for the output files.  It is not
            forwarded to ``Utility.getColumns``.

    Returns:
        ``sortedOutputFilename`` (``None`` when no sorted file was requested).
    """
    import numpy as np
    from Utility.Utility import Utility
    from Analysis.Filter import Filter

    with open(inputFilename, "r") as inputFile:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Opening file %s to parse per sample methylation data" % (
            inputFilename))
        columns, columnHeaderDict = Utility.getColumns(inputFile)

    # No sample columns: nothing to compute or write.
    if len(columnHeaderDict) <= annotationColumns:
        return sortedOutputFilename

    ### CALCULATE STATS ###
    sampleIDs = [columnHeaderDict[i]
                 for i in range(annotationColumns, len(columnHeaderDict))]
    statsDict = {}
    totalDict = {}
    for sampleID in sampleIDs:
        methArray = Filter.convert_NAtoNaN(columns[sampleID])
        # Builtin float replaces the np.float alias removed from NumPy.
        dataPoints = np.array(methArray).astype(float)
        total = np.sum(dataPoints)
        statsDict[sampleID] = (np.std(dataPoints), np.mean(dataPoints),
                               np.median(dataPoints), total)
        totalDict[sampleID] = total

    if sortedOutputFilename:
        # Sort the sample IDs themselves.  A reverse lookup keyed by total
        # would drop one sample and repeat another whenever two totals tie.
        outputSampleOrder = sorted(totalDict, key=totalDict.get, reverse=True)
    else:
        outputSampleOrder = sampleIDs

    # Maps each statistic label to its position in the statsDict tuples.
    usedStats = {"STD_DEV": 0, "MEAN": 1, "MEDIAN": 2, "TOTAL": 3}
    with open(statsOutputFilename, "w") as statsOutput:
        statsOutput.write("Statistic" + delimiter +
                          delimiter.join(outputSampleOrder).strip() + "\n")
        for statType in usedStats.keys():
            statIndex = usedStats[statType]
            statLine = statType + delimiter + delimiter.join(
                str(statsDict[s_id][statIndex]) for s_id in outputSampleOrder)
            statsOutput.write(statLine.strip(delimiter) + "\n")

    if sortedOutputFilename:
        idColumn = columns[columnHeaderDict[0]]
        with open(sortedOutputFilename, "w") as outputFile:
            outputFile.write("CpG-ID" + delimiter +
                             delimiter.join(outputSampleOrder).strip() + "\n")
            for j in range(len(idColumn)):
                cells = [idColumn[j]]
                cells.extend(columns[orderedID][j]
                             for orderedID in outputSampleOrder)
                outputFile.write(delimiter.join(cells).strip(delimiter) + "\n")
        print("Completed sorting and writing methylation matrix!")

    return sortedOutputFilename