def readDiffRegs(self, args):
    """Check that every ``args.diffreg`` file exists, then parse each of them.

    All paths are checked before the first file is parsed. The parsed data
    frames are neither stored nor returned.

    :param args: namespace whose ``diffreg`` attribute is an iterable of paths
    :raises PSToolException: for the first path ``fileExists`` rejects
    """
    diffRegFiles = args.diffreg

    # Phase 1: abort on the first missing file before doing any parsing.
    for candidate in diffRegFiles:
        if fileExists(candidate):
            continue
        raise PSToolException("Diffreg file does not exist: " + str(candidate))

    # Phase 2: parse every file; the result is discarded.
    for diffRegPath in diffRegFiles:
        EnrichmentDF.parseFromFile(diffRegPath)
def makeResults(self, parallelResult, oEnvironment, args):
    """Build one symmetric matrix per method and hand it to the heat-map plot.

    For every file in ``args.diffreg`` and every method in ``args.methods``
    the mean absolute value of the ``<method>_log2FC`` column is computed
    (entries that are ``None`` or the string ``'None'`` are ignored). The
    value is stored in both ``[a, b]`` and ``[b, a]`` of a matrix indexed by
    the sorted condition names, which is then passed to
    ``PorePlot.heat_map_cluster``.

    A file/method combination without any usable fold change is skipped;
    its matrix cells keep the initial value 0 instead of the run aborting
    with a ``ZeroDivisionError``.

    :param parallelResult: not used by this method
    :param oEnvironment: not used by this method
    :param args: namespace; reads ``diffreg``, ``methods`` and ``pltcfg``
    """
    allDiffRegSims = defaultdict(dict)
    conditions = set()

    for diffFile in args.diffreg:
        thisData = EnrichmentDF(DataFrame.parseFromFile(diffFile))

        condPair = tuple(thisData.getConditions())
        conditions.update(condPair)

        for method in args.methods:
            methodFCs = [
                abs(float(x))
                for x in thisData.getColumn(method + "_log2FC")
                if x is not None and x != 'None'
            ]

            # No usable value: a mean is undefined, so leave the cell at 0.
            if not methodFCs:
                continue

            allDiffRegSims[method][condPair] = sum(methodFCs) / len(methodFCs)

    allConditions = sorted(conditions)
    # Name -> matrix position, computed once instead of list.index per cell.
    condIndex = {cond: idx for idx, cond in enumerate(allConditions)}

    for method, pairSims in allDiffRegSims.items():
        sims = np.zeros((len(allConditions), len(allConditions)))

        for condPair, value in pairSims.items():
            first = condIndex[condPair[0]]
            second = condIndex[condPair[1]]

            # The measure is unordered, so mirror it across the diagonal.
            sims[first, second] = value
            sims[second, first] = value

        PorePlot.heat_map_cluster(sims, allConditions, allConditions,
                                  "Similarity: " + str(method), "",
                                  pltcfg=args.pltcfg)
def __init__(self, args):
    """Initialise the parent interface and reset the per-run state.

    :param args: forwarded unchanged to the parent class initialiser
    """
    parent = super(FoldChangeAnalysis, self)
    parent.__init__(args)

    # Start with an empty condition table and no counts.
    self.condData = EnrichmentDF()
    self.counts = None
class FoldChangeAnalysis(ParallelPSTInterface):
    """Differential-expression analysis on per-gene count tables.

    The parallel hooks (``prepareInputs``, ``execParallel``,
    ``joinParallel``) are no-ops; all work happens in ``makeResults``.
    """

    def __init__(self, args):
        """Initialise the parent interface with *args* and reset the state."""
        super(FoldChangeAnalysis, self).__init__(args)

        self.counts = None
        self.condData = EnrichmentDF()

    def _makePropDict(self):
        """Return ``None``; this analysis defines no property dictionary."""
        return None

    def readCounts(self, args):
        """Parse all count files, grouped by the name of each group's first file.

        :param args: namespace whose ``counts`` attribute is an iterable of
            groups, each an iterable of objects with a ``name`` attribute
        :return: dict mapping group name to the list of parsed data frames
        """
        counts = {}

        for countFiles in args.counts:
            groupName = None

            for countFile in countFiles:
                # The first file of a group names the whole group.
                if groupName is None:
                    groupName = countFile.name
                    counts[groupName] = []

                df = DataFrame.parseFromFile(countFile.name, [
                    'gene', 'coverage', 'coverage_rank', 'read_counts',
                    'read_counts_rank', 'read_counts_sec',
                    'read_counts_sec_rank'
                ])
                df.setFilepath(os.path.abspath(countFile.name))

                counts[groupName].append(df)

        return counts

    def readDiffRegs(self, args):
        """Parse every file in ``args.diffreg``; the results are discarded."""
        # TODO what did I want to do with this argument?
        for diffFile in args.diffreg:
            EnrichmentDF.parseFromFile(diffFile)

    def prepareInputs(self, args):
        """Return an empty work list."""
        return []

    def execParallel(self, data, environment):
        """Return ``None``; there is no per-chunk work."""
        return None

    def joinParallel(self, existResult, newResult, oEnvironment):
        """Return ``None``; there are no partial results to merge."""
        return None

    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the analysis on count files and/or report existing results.

        Two independent modes, selected by the arguments:

        * ``args.counts`` is set: the count tables are read and, for each
          value source (``coverage`` and ``read_counts``), ``self.condData``
          is rebuilt from all samples and ``runDEanalysis`` is called. All
          comparisons are then handed to ``prepareHTMLOut``.
        * ``args.diffreg`` is set: the given files are parsed, grouped by
          the value source reported by ``getValueSource`` and handed to
          ``prepareHTMLOut`` together with the set of their conditions.

        :param parallelResult: not used by this method
        :param oEnvironment: not used by this method
        :param args: namespace; reads ``counts``, ``diffreg``, ``output``,
            ``rscript``, ``methods`` and ``noanalysis``
        """
        if args.counts is not None:
            # One list of sample data frames per condition name.
            counts = self.readCounts(args)
            vConds = sorted(counts)

            createdComparisons = defaultdict(list)

            for valueSource in ['coverage', 'read_counts']:
                self.condData = EnrichmentDF()
                replicates = {}

                for condition in vConds:
                    condReplicates = []

                    for condDataSample in counts[condition]:
                        geneNames = condDataSample.getColumnIndex('gene')
                        geneCounts = condDataSample.getColumnIndex(valueSource)
                        condRow = condDataSample.toDataRow(geneNames, geneCounts)

                        # The sample's file path doubles as its column name.
                        sampleName = condDataSample.filepath
                        condReplicates.append(sampleName)

                        self.condData.addCondition(condRow, sampleName)

                    replicates[condition] = condReplicates

                print("Running for conditions: " + str(vConds))
                createdComparisons[valueSource] += self.condData.runDEanalysis(
                    args.output,
                    prefix=valueSource,
                    rscriptPath=args.rscript.name,
                    methods=args.methods,
                    replicates=replicates,
                    noDErun=args.noanalysis)

            self.prepareHTMLOut(createdComparisons, replicates, args)

        if args.diffreg is not None:
            createdComparisons = defaultdict(list)
            conditions = set()

            for diffFile in args.diffreg:
                df = EnrichmentDF.parseFromFile(diffFile)
                valueSource = self.getValueSource(df)

                # set has no "+=": update() merges the conditions instead.
                conditions.update(df.getConditions())
                createdComparisons[valueSource].append(diffFile)

            # NOTE(review): prepareHTMLOut indexes every comparison as
            # x[0], x[1], x[2]; here the entries are the raw diffreg files,
            # and the set of conditions is passed as ``replicates`` -
            # confirm that this is the intended input.
            self.prepareHTMLOut(createdComparisons, conditions, args)

    def getValueSource(self, df):
        """Return the second field of the first data row of *df*."""
        return df.data[0][1]

    def prepareHTMLOut(self, createdComparisons, replicates, args):
        """Print the comparisons per value source and call ``printResult``.

        :param createdComparisons: mapping of value source to a list of
            comparisons; each is indexed as ``(x[0], x[1]) -> x[2]``
        :param replicates: passed through to ``self.condData.printResult``
        :param args: namespace; reads ``output``
        """
        for valueSource in createdComparisons:
            allComparisons = createdComparisons[valueSource]

            condPair2File = {}
            for x in allComparisons:
                condPair2File[(x[0], x[1])] = x[2]

            print("Comparisons")
            for x in condPair2File:
                print(x, condPair2File[x])

            self.condData.printResult(
                args.output,
                prefix=valueSource,
                conditionPair2File=condPair2File,
                replicates=replicates)
def makeResults(self, parallelResult, oEnvironment, args):
    """Run the analysis on count files and/or report existing results.

    Two independent modes, selected by the arguments:

    * ``args.counts`` is set: the count tables are read and, for each value
      source (``coverage`` and ``read_counts``), ``self.condData`` is
      rebuilt from all samples and ``runDEanalysis`` is called. All
      comparisons are then handed to ``prepareHTMLOut``.
    * ``args.diffreg`` is set: the given files are parsed, grouped by the
      value source reported by ``getValueSource`` and handed to
      ``prepareHTMLOut`` together with the set of their conditions.

    :param parallelResult: not used by this method
    :param oEnvironment: not used by this method
    :param args: namespace; reads ``counts``, ``diffreg``, ``output``,
        ``rscript``, ``methods`` and ``noanalysis``
    """
    if args.counts is not None:
        # One list of sample data frames per condition name.
        counts = self.readCounts(args)
        vConds = sorted(counts)

        createdComparisons = defaultdict(list)

        for valueSource in ['coverage', 'read_counts']:
            self.condData = EnrichmentDF()
            replicates = {}

            for condition in vConds:
                condReplicates = []

                for condDataSample in counts[condition]:
                    geneNames = condDataSample.getColumnIndex('gene')
                    geneCounts = condDataSample.getColumnIndex(valueSource)
                    condRow = condDataSample.toDataRow(geneNames, geneCounts)

                    # The sample's file path doubles as its column name.
                    sampleName = condDataSample.filepath
                    condReplicates.append(sampleName)

                    self.condData.addCondition(condRow, sampleName)

                replicates[condition] = condReplicates

            print("Running for conditions: " + str(vConds))
            createdComparisons[valueSource] += self.condData.runDEanalysis(
                args.output,
                prefix=valueSource,
                rscriptPath=args.rscript.name,
                methods=args.methods,
                replicates=replicates,
                noDErun=args.noanalysis)

        self.prepareHTMLOut(createdComparisons, replicates, args)

    if args.diffreg is not None:
        createdComparisons = defaultdict(list)
        conditions = set()

        for diffFile in args.diffreg:
            df = EnrichmentDF.parseFromFile(diffFile)
            valueSource = self.getValueSource(df)

            # set has no "+=": update() merges the conditions instead.
            conditions.update(df.getConditions())
            createdComparisons[valueSource].append(diffFile)

        # NOTE(review): the comparisons collected here are the raw diffreg
        # files and the set of conditions is passed in the ``replicates``
        # position - confirm that prepareHTMLOut expects this.
        self.prepareHTMLOut(createdComparisons, conditions, args)
def readDiffRegs(self, args):
    """Parse each file listed in ``args.diffreg``.

    The parsed data frames are neither stored nor returned.

    :param args: namespace whose ``diffreg`` attribute is an iterable of files
    """
    # TODO what did I want to do with this argument?
    for diffRegPath in args.diffreg:
        EnrichmentDF.parseFromFile(diffRegPath)
def makeResults(self, parallelResult, oEnvironment, args):
    """Count how often each gene is among the top hits and export the table.

    For every file in ``args.diffreg`` and every method in ``args.methods``
    the genes are ordered by their ``<method>_RAW.PVA`` value (ascending)
    and the first ``args.top`` of them each receive one vote. The vote
    counts are exported as an HTML table to ``args.output``.

    If fewer than ``args.top`` genes carry a p-value, all of them are
    counted instead of the run aborting with an ``IndexError``.

    :param parallelResult: not used by this method
    :param oEnvironment: not used by this method
    :param args: namespace; reads ``diffreg``, ``methods``, ``top`` and
        ``output``
    """

    def parseNones(row):
        # The parsed files carry the literal string 'None' for missing values.
        return [None if value == 'None' else value for value in row]

    topGenes = Counter()
    # Guard against a negative value, which would slice from the end.
    topCount = max(args.top, 0)

    for diffFile in args.diffreg:
        thisData = EnrichmentDF(DataFrame.parseFromFile(diffFile))
        thisData.applyToRow(parseNones)

        for method in args.methods:
            pvals = thisData.toDataRow(
                thisData.getColumnIndex('id'),
                thisData.getColumnIndex(method + "_RAW.PVA"))

            genepval = sorted(
                ((x[0], float(x[1])) for x in pvals.to_pairs()
                 if x[1] is not None),
                key=lambda x: x[1])

            # Slicing never runs past the end of the list.
            for genePvalPair in genepval[:topCount]:
                topGenes[genePvalPair[0]] += 1

    outputDF = DataFrame()
    outputDF.addColumn('gene_id')
    outputDF.addColumn('count')
    outputDF.addColumn('link')

    for gene, count in topGenes.most_common():
        geneRow = DataRow.fromDict({
            'gene_id': gene,
            'count': count,
            'link': "<a href='http://www.uniprot.org/uniprot/?query=" + gene
                    + "&sort=score' target='_blank'>UniProt</a>",
        })
        outputDF.addRow(geneRow)

    outputDF.export(args.output, ExportTYPE.HTML)
def makeResults(self, parallelResult, oEnvironment, args):
    """Run the differential-expression analysis on the loaded count tables.

    Does nothing when ``args.counts`` is ``None``. Otherwise the enhancement
    and gene-length files are loaded, the counts are read, and every sample
    is added to a fresh ``self.condData`` as a list of row dictionaries
    (``id``, the sample's count and, depending on the flags, its ``.LS``,
    ``.FPKM`` and ``.TPM`` values). ``runDEanalysis`` is then called and its
    comparisons are handed to ``prepareHTMLOut``.

    A sample without any rows is still added; only the debug output changes
    (``None`` is printed as its first row).

    :param parallelResult: not used by this method
    :param oEnvironment: not used by this method
    :param args: namespace; reads ``counts``, ``enhanced``, ``lengths``,
        ``libsize``, ``fpkm``, ``tpm``, ``output``, ``rscript``, ``methods``
        and ``noanalysis``
    """
    if args.counts is None:
        return

    geneEnhancement = self.loadEnhancement(args.enhanced)
    geneLengths = self.loadGeneLengths(args.lengths)

    # One list of sample data frames per condition name.
    counts, _ = self.readCounts(args,
                                biotypes=geneEnhancement,
                                gene2length=geneLengths)

    # Keep the order in which the conditions were read (no sorting).
    vConds = list(counts)
    createdComparisons = defaultdict(list)

    for valueSource in ['count']:
        self.condData = EnrichmentDF()
        replicates = OrderedDict()

        for condition in vConds:
            condReplicates = []

            for condDataSample in counts[condition]:
                sampleName = condDataSample.filepath
                print(sampleName, len(condDataSample))

                rowUpdates = []
                for row in condDataSample:
                    rowData = {
                        "id": row["gene"],
                        sampleName: row[valueSource]
                    }

                    if args.libsize:
                        rowData[sampleName + ".LS"] = row["LS"]
                    if args.fpkm:
                        rowData[sampleName + ".FPKM"] = row["FPKM"]
                    if args.tpm:
                        rowData[sampleName + ".TPM"] = row["TPM"]

                    rowUpdates.append(rowData)

                condReplicates.append(sampleName)

                # An empty sample has no first row to show.
                print("Add Condition", sampleName,
                      rowUpdates[0] if rowUpdates else None)
                self.condData.addConditions(rowUpdates, sampleName)

            replicates[condition] = condReplicates

        print("Running for conditions: " + str(vConds))
        createdComparisons[valueSource] += self.condData.runDEanalysis(
            args.output,
            prefix=valueSource,
            rscriptPath=args.rscript.name,
            methods=args.methods,
            replicates=replicates,
            noDErun=args.noanalysis,
            enhanceSymbol=geneEnhancement,
            geneLengths=geneLengths)

    self.prepareHTMLOut(createdComparisons, replicates, args)
class FoldChangeFeatureCountsAnalysis(ParallelPSTInterface):
    """Differential-expression analysis on multi-sample count tables.

    The input tables carry the annotation columns ``Geneid``, ``Chr``,
    ``Start``, ``End``, ``Strand`` and ``Length`` plus one count column per
    sample. The parallel hooks (``prepareInputs``, ``execParallel``,
    ``joinParallel``) are no-ops; all work happens in ``makeResults``.
    """

    def __init__(self, args):
        """Initialise the parent interface with *args* and reset the state."""
        super(FoldChangeFeatureCountsAnalysis, self).__init__(args)

        self.counts = None
        self.condData = EnrichmentDF()

    def _makePropDict(self):
        """Return ``None``; this analysis defines no property dictionary."""
        return None

    def readCounts(self, args, biotypes=None, gene2length=None):
        """Read the count tables and build one data frame per sample column.

        For every count file and every element of every condition group a
        two-column frame (``gene``, ``count``) is selected, optionally
        filtered by biotype and extended by ``LS``, ``FPKM`` and ``TPM``
        columns.

        :param args: namespace; reads ``counts``, ``prefixes``,
            ``conditions``, ``allow_nonexistant_cond``, ``removestable``,
            ``norrna``, ``removemtrna``, ``only_protein_coding``,
            ``libsize``, ``fpkm`` and ``tpm``
        :param biotypes: mapping gene id -> ``(symbol, biotype)``; needed by
            the three biotype filters. ``None`` and an empty mapping both
            count as "not provided" (``loadEnhancement`` returns ``{}`` when
            no file was given).
        :param gene2length: mapping gene id -> length; needed for FPKM/TPM
        :return: tuple ``(counts, condition2samples)``; ``counts`` maps the
            first element of each condition group to its list of frames,
            ``condition2samples`` maps each sample header to its prefixed
            names
        """
        requirements = (
            (args.norrna, not biotypes,
             "removal of rRNA requires --enhanced!"),
            (args.removemtrna, not biotypes,
             "removal of mtRNA requires --enhanced!"),
            (args.only_protein_coding, not biotypes,
             "--only-protein-coding requires --enhanced!"),
            (args.fpkm, gene2length is None,
             "calculation of FPKM requires --lengths!"),
            (args.tpm, gene2length is None,
             "calculation of TPM requires --lengths!"),
        )
        for enabled, dependencyMissing, message in requirements:
            if enabled and dependencyMissing:
                # ArgumentParser.error() prints the message and exits.
                argparse.ArgumentParser().error(message)

        featureCountsColumns = [
            "Geneid", "Chr", "Start", "End", "Strand", "Length"
        ]

        counts = defaultdict(list)
        condition2samples = defaultdict(list)

        for idx, countFile in enumerate(args.counts):
            print("Loading File", idx, ":", countFile.name)
            countFilePrefix = args.prefixes[idx]

            df = DataFrame.parseFromFile(countFile.name, skipChar='#')
            allheaders = df.getHeader()

            # Every header that is not an annotation column is a sample.
            sampleHeaders = [
                x for x in allheaders if x not in featureCountsColumns
            ]
            for sample in sampleHeaders:
                condition2samples[sample].append(countFilePrefix + sample)

            for condGroup in args.conditions:
                condName = condGroup[0]

                for condElement in condGroup:
                    print(condName, condElement, condElement in allheaders)

                    if (args.allow_nonexistant_cond
                            and condElement not in allheaders):
                        continue

                    subDf = df.selectColumns({
                        "Geneid": "gene",
                        condElement: "count"
                    })

                    if args.removestable:
                        geneColIdx = subDf.getColumnIndex("gene")
                        # Keep only the part of the id before the first ".".
                        subDf.applyByRow(
                            "gene", lambda x: x[geneColIdx].split(".")[0])

                    if biotypes is not None:
                        if args.norrna:
                            self._filterByBiotype(
                                subDf, biotypes,
                                lambda biotype: "rRNA" not in biotype)
                        if args.removemtrna:
                            self._filterByBiotype(
                                subDf, biotypes,
                                lambda biotype: "Mt_" not in biotype)
                        if args.only_protein_coding:
                            self._filterByBiotype(
                                subDf, biotypes,
                                lambda biotype: "protein_coding" in biotype)

                    if (os.path.isdir(condElement)
                            or os.path.isfile(condElement)):
                        subDf.setFilepath(os.path.abspath(condElement))
                    else:
                        subDf.setFilepath(condElement)

                    if args.libsize:
                        self._addLibSize(subDf)
                    if args.fpkm:
                        self._addFPKM(subDf, gene2length)
                    if args.tpm:
                        self._addTPM(subDf, gene2length)

                    counts[condName].append(subDf)

        return counts, condition2samples

    @staticmethod
    def _filterByBiotype(subDf, biotypes, keepBiotype):
        """Keep rows whose gene is in *biotypes* and passes *keepBiotype*."""
        geneColIdx = subDf.getColumnIndex("gene")
        subDf.filterRows(
            lambda x: x[geneColIdx] in biotypes
            and keepBiotype(biotypes[x[geneColIdx]][1]))

    @staticmethod
    def _addLibSize(subDf):
        """Add column ``LS``: count / total counts * 10000 (0 if total is 0)."""
        countCol = subDf.getColumnIndex("count")
        totalCounts = sum(x[countCol] for x in subDf.data)
        libSizeIdx = subDf.addColumn("LS", 0)

        def addLibSize(x):
            if totalCounts == 0:
                x[libSizeIdx] = 0
            else:
                x[libSizeIdx] = (x[countCol] / totalCounts) * 10000
            return tuple(x)

        subDf.applyToRow(addLibSize)

    @staticmethod
    def _addFPKM(subDf, gene2length):
        """Add column ``FPKM``: count / (total counts * length) * 10^9.

        Genes without a known length, and all genes when the total count is
        0, get the value 0.
        """
        countCol = subDf.getColumnIndex("count")
        geneCol = subDf.getColumnIndex("gene")
        totalCounts = sum(x[countCol] for x in subDf.data)
        fpkmIdx = subDf.addColumn("FPKM", 0)

        def addFPKM(x):
            geneLength = gene2length.get(x[geneCol], 0)

            if geneLength == 0 or totalCounts == 0:
                x[fpkmIdx] = 0
            else:
                x[fpkmIdx] = x[countCol] / (
                    totalCounts * geneLength) * pow(10, 9)
            return tuple(x)

        subDf.applyToRow(addFPKM)

    @staticmethod
    def _addTPM(subDf, gene2length):
        """Add column ``TPM``: count / (length * sum(count/length)) * 10^6.

        Genes without a known length, and all genes when the ratio sum is 0,
        get the value 0.
        """
        countCol = subDf.getColumnIndex("count")
        geneCol = subDf.getColumnIndex("gene")

        # Sum of count/length over all genes with a known length.
        totalRatio = 0
        for row in subDf:
            geneLength = gene2length.get(row["gene"], 0)
            if geneLength != 0:
                totalRatio += row["count"] / geneLength

        tpmIdx = subDf.addColumn("TPM", 0)

        def addTPM(x):
            geneLength = gene2length.get(x[geneCol], 0)

            if geneLength == 0 or totalRatio == 0:
                # Write to the TPM column (not the FPKM column).
                x[tpmIdx] = 0
            else:
                x[tpmIdx] = x[countCol] / (
                    geneLength * totalRatio) * pow(10, 6)
            return tuple(x)

        subDf.applyToRow(addTPM)

    def readDiffRegs(self, args):
        """Parse every file in ``args.diffreg``; the results are discarded."""
        # TODO what did I want to do with this argument?
        for diffFile in args.diffreg:
            EnrichmentDF.parseFromFile(diffFile)

    def prepareInputs(self, args):
        """Return an empty work list."""
        return []

    def execParallel(self, data, environment):
        """Return ``None``; there is no per-chunk work."""
        return None

    def joinParallel(self, existResult, newResult, oEnvironment):
        """Return ``None``; there are no partial results to merge."""
        return None

    def loadEnhancement(self, fileE):
        """Load the gene id -> ``(symbol, biotype)`` table.

        The first line is skipped as a header, as are lines starting with
        ``#`` and blank lines. Lines are tab-separated: id, symbol, biotype.
        Missing trailing fields are read as empty strings.

        :param fileE: open text file with a ``name`` attribute, or ``None``
        :return: dict of id -> ``(symbol, biotype)``; empty if *fileE* is
            ``None``
        """
        if fileE is None:
            print("Not loading gene name enhancements")
            return {}

        print("Loading gene name enhancements", fileE.name)

        ens2sym = {}
        for lidx, line in enumerate(fileE):
            fields = line.strip().split("\t")

            if lidx == 0 or fields[0].startswith("#"):
                continue
            # A blank line has no id to store anything under.
            if not fields[0]:
                continue

            # strip() also removes trailing tabs, so empty last fields vanish.
            fields += [""] * (3 - len(fields))
            ens2sym[fields[0]] = (fields[1], fields[2])

        return ens2sym

    def loadGeneLengths(self, fileE):
        """Load the gene id -> length table.

        Lines are tab-separated; the first field is the id and the second
        field is converted to ``int`` as the length. The first line is
        skipped when its second field is not an integer (header). Lines with
        fewer than two fields or an empty id/length are skipped.

        Example of an input noted in the original code::

            Ensembl_gene_identifier GeneID  length
            ENSMUSG00000000001  14679   3262
            ENSMUSG00000000003  54192   902
            ENSMUSG00000000028  12544   2252

        NOTE(review): in that example the length is the third column, while
        this method reads the second one - confirm the expected layout.

        :param fileE: open text file with a ``name`` attribute, or ``None``
        :return: dict of id -> length, or ``None`` if *fileE* is ``None``
        :raises ValueError: if a length after the first line is not an integer
        """
        if fileE is None:
            print("Not loading gene lengths")
            return None

        print("Loading gene lengths", fileE.name)

        ens2gl = {}
        for lidx, line in enumerate(fileE):
            fields = line.strip().split("\t")

            if len(fields) < 2:
                continue

            if lidx == 0:
                try:
                    int(fields[1])
                except ValueError:
                    # Non-numeric second field: this is the header line.
                    continue

            ensemblID = fields[0]
            geneLength = fields[1]

            if len(ensemblID) == 0 or len(geneLength) == 0:
                continue

            ens2gl[ensemblID] = int(geneLength)

        return ens2gl

    def makeResults(self, parallelResult, oEnvironment, args):
        """Run the differential-expression analysis on the count tables.

        Does nothing when ``args.counts`` is ``None``. Otherwise the
        enhancement and gene-length files are loaded, the counts are read,
        and every sample is added to a fresh ``self.condData`` as a list of
        row dictionaries (``id``, the sample's count and, depending on the
        flags, its ``.LS``, ``.FPKM`` and ``.TPM`` values).
        ``runDEanalysis`` is then called and its comparisons are handed to
        ``prepareHTMLOut``.

        :param parallelResult: not used by this method
        :param oEnvironment: not used by this method
        :param args: namespace; reads ``counts``, ``enhanced``, ``lengths``,
            ``libsize``, ``fpkm``, ``tpm``, ``output``, ``rscript``,
            ``methods`` and ``noanalysis``
        """
        if args.counts is None:
            return

        geneEnhancement = self.loadEnhancement(args.enhanced)
        geneLengths = self.loadGeneLengths(args.lengths)

        # One list of sample data frames per condition name.
        counts, _ = self.readCounts(args,
                                    biotypes=geneEnhancement,
                                    gene2length=geneLengths)

        # Keep the order in which the conditions were read (no sorting).
        vConds = list(counts)
        createdComparisons = defaultdict(list)

        for valueSource in ['count']:
            self.condData = EnrichmentDF()
            replicates = OrderedDict()

            for condition in vConds:
                condReplicates = []

                for condDataSample in counts[condition]:
                    sampleName = condDataSample.filepath
                    print(sampleName, len(condDataSample))

                    rowUpdates = []
                    for row in condDataSample:
                        rowData = {
                            "id": row["gene"],
                            sampleName: row[valueSource]
                        }

                        if args.libsize:
                            rowData[sampleName + ".LS"] = row["LS"]
                        if args.fpkm:
                            rowData[sampleName + ".FPKM"] = row["FPKM"]
                        if args.tpm:
                            rowData[sampleName + ".TPM"] = row["TPM"]

                        rowUpdates.append(rowData)

                    condReplicates.append(sampleName)

                    # An empty sample has no first row to show.
                    print("Add Condition", sampleName,
                          rowUpdates[0] if rowUpdates else None)
                    self.condData.addConditions(rowUpdates, sampleName)

                replicates[condition] = condReplicates

            print("Running for conditions: " + str(vConds))
            createdComparisons[valueSource] += self.condData.runDEanalysis(
                args.output,
                prefix=valueSource,
                rscriptPath=args.rscript.name,
                methods=args.methods,
                replicates=replicates,
                noDErun=args.noanalysis,
                enhanceSymbol=geneEnhancement,
                geneLengths=geneLengths)

        self.prepareHTMLOut(createdComparisons, replicates, args)

    def getValueSource(self, df):
        """Return the second field of the first data row of *df*."""
        return df.data[0][1]

    def prepareHTMLOut(self, createdComparisons, replicates, args):
        """Print the comparisons per value source and call ``printResult``.

        :param createdComparisons: mapping of value source to a list of
            comparisons; each is indexed as ``(x[0], x[1]) -> x[2]``
        :param replicates: passed through to ``self.condData.printResult``
        :param args: namespace; reads ``output``
        """
        for valueSource in createdComparisons:
            allComparisons = createdComparisons[valueSource]

            condPair2File = {}
            for x in allComparisons:
                condPair2File[(x[0], x[1])] = x[2]

            print("Comparisons")
            for x in condPair2File:
                print(x, condPair2File[x])

            self.condData.printResult(
                args.output,
                prefix=valueSource,
                conditionPair2File=condPair2File,
                replicates=replicates)
def __init__(self, args):
    """Initialise the parent interface and reset the per-run state.

    :param args: forwarded unchanged to the parent class initialiser
    """
    parent = super(FoldChangeSimilarity, self)
    parent.__init__(args)

    # Start with an empty condition table and no counts.
    self.condData = EnrichmentDF()
    self.counts = None