def execute(cls, choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' if choices[2]=='Transfac TF ids': mappingFn = 'pwm2TFids.shelf' mapping = safeshelve.open(Tool1.MAPPING_SHELVES_PATH + os.sep + mappingFn ) elif choices[2]== 'Transfac TF readable names': mappingFn = 'pwm2TFnamesNew.shelf' mapping = safeshelve.open(Tool1.MAPPING_SHELVES_PATH + os.sep + mappingFn ) elif choices[2]== 'HGNC gene symbols': mappingFn = 'PWM_to_HGNC.txt' mapping = dict([line.strip().split() for line in open(Tool1.MAPPING_SHELVES_PATH + os.sep + mappingFn).readlines()]) else: raise Exception(choices[2]) if galaxyFn==None: for key in sorted(mapping.keys()): print key + ':' + ','.join(mapping[key]) + os.linesep, else: mappingStaticFile = GalaxyRunSpecificFile(['mapping.txt'], galaxyFn) f = mappingStaticFile.getFile() for key in sorted(mapping.keys()): if type(mapping[key]) in (list,tuple): mapping[key] = ','.join(mapping[key]) f.write( key + ':' + mapping[key] + os.linesep ) f.close() print mappingStaticFile.getLink('View/download mapping')
def execute(cls, choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' from time import time startTime = time() from quick.application.ExternalTrackManager import ExternalTrackManager from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile motifFn = ExternalTrackManager.extractFnFromGalaxyTN( choices[0].split(':')) observedFasta = ExternalTrackManager.extractFnFromGalaxyTN( choices[1].split(':')) randomGalaxyTN = choices[2].split(':') randomName = ExternalTrackManager.extractNameFromHistoryTN( randomGalaxyTN) randomGalaxyFn = ExternalTrackManager.extractFnFromGalaxyTN( randomGalaxyTN) randomStatic = GalaxyRunSpecificFile( ['random'], randomGalaxyFn ) #finds path to static file created for a previous history element (randomFn), and directs to a folder containing several files.. #print os.listdir(randomStatic.getDiskPath()) randomFastaPath = randomStatic.getDiskPath() #motifFn, observedFasta, randomFastaPath = '/Users/sandve/egne_dokumenter/_faglig/NullModels/DnaSeqExample/liver.pwm', 'liver.fa', 'randomFastas' testStatistic = choices[3] if testStatistic == 'Average of max score per sequence': scoreFunc = scoreMotifOnFastaAsAvgOfBestScores elif testStatistic == 'Sum of scores across all positions of all sequences': scoreFunc = scoreMotifOnFastaAsSumOfAllScores elif testStatistic == 'Score of Frith et al. 
(2004)': scoreFunc = lr4 elif testStatistic == 'Product of max per sequence': scoreFunc = scoreMotifOnFastaAsProductOfBestScores else: raise pvals = mcPvalFromMotifAndFastas(motifFn, observedFasta, randomFastaPath, scoreFunc) print 'Pvals for motifs (%s) against observed (%s) vs random (%s - %s) sequences.' % ( motifFn, observedFasta, randomName, randomFastaPath) for motif, pval in sorted(pvals.items()): print motif + '\t' + ('%.4f' % pval) from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile from proto.RSetup import robjects histStaticFile = GalaxyRunSpecificFile(['pvalHist.png'], galaxyFn) #histStaticFile.openRFigure() histStaticFile.plotRHist(pvals.values(), [x / 40.0 for x in range(41)], 'Histogram of p-values', xlim=robjects.FloatVector([0.0, 1.0])) #r.hist(robjects.FloatVector(pvals.values()), breaks=robjects.FloatVector([x/40.0 for x in range(41)]), xlim=robjects.FloatVector([0.0, 1.0]), main='Histogram of p-values' ) #histStaticFile.closeRFigure() print histStaticFile.getLink('Histogram') print 'Time (s):', time() - startTime
def collectParamsIntoFile(self): parameters = GalaxyRunSpecificFile(['run_parameters.html'],self.jobFile) #just collect the parametes used into a file p_path = parameters.getDiskPath(True) p_output = open(p_path,'w') print>>p_output, '<html><body>' print>>p_output, '<ol>' for key in self.params.keys(): print>>p_output, '<li>%s:%s </li>'%(key,self.params[key]) print>>p_output, '</body></html>' p_output.close() print parameters.getLink('Parameters of this run')
def findGeneTargets(genome, regionsTn, upFlankSize, downFlankSize, galaxyFn): assert genome in ['hg18','hg19','mm9'] #tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) #tfTrackName = tfTrackNameMappings[tfSource] + [selectedTF] geneIntersection = GeneIntersection(genome, 'Ensembl', regionsTn, galaxyFn) geneIntersection.expandReferenceTrack(upFlankSize, downFlankSize) expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else '' #print '<p>There are %i Ensemble genes being targets of your selected TF (%s), based on intersecting TF target positions with%s %sgene regions.</p>' % (geneIntersection.getNumberOfIntersectedBins(), selectedTF, expansionStr, 'Ensembl') print '<p>There are %i Ensemble genes being targets of your selected regions, based on intersecting your supplied regions with%s %sgene regions.</p>' % (geneIntersection.getNumberOfIntersectedBins(), expansionStr, 'Ensembl') idFileNamer = geneIntersection.getGeneIdStaticFileWithContent() print '<p>', idFileNamer.getLink('Download list'), ' of all Ensemble IDs with 1 or more hits.</p>' regFileNamer = geneIntersection.getIntersectedRegionsStaticFileWithContent() print '<p>', regFileNamer.getLink('Download bed-file'), ' of all Ensembl gene regions with 1 or more hits.</p>' targetBins = geneIntersection.getIntersectedReferenceBins() res = geneIntersection.getIntersectionResult() resDictKey = geneIntersection.getUniqueResDictKey() setOfNumOccurrences = set([res[bin][resDictKey] for bin in targetBins]) byNumOccurrencesStaticFile = GalaxyRunSpecificFile(['genes_by_num_occurrences.html'], galaxyFn) f = byNumOccurrencesStaticFile.getFile() for numOccurrences in reversed(sorted(setOfNumOccurrences)): f.write('Gene regions having %i occurrences:<br>' % numOccurrences + '<br>' + os.linesep) f.write(', '.join([ '<a href=http://www.ensembl.org/Homo_sapiens/Gene/Summary?g='+str(bin.val).split('|')[0]+'>'+str(bin.val).split('|')[0]+'</a>' for bin in targetBins if res[bin][resDictKey]==numOccurrences]) 
+ '<br><br>' + os.linesep) f.close() print '</p>Inspect list of all intersected genes (by ID), ', byNumOccurrencesStaticFile.getLink('ordered by number of occurrences') + ' inside, and with links to gene descriptions.<br>'
def analyzeNumRejectedDistribution(maxNumSamples, h, fdrThreshold, totalNumTests, totalNumH1Tests, numReplications, a, b, galaxyFn=None): numRej = [] texts = [] #estimate time use: print '(estimating run time..)' prevTime = time.time() #Experiment._analyzeNumRejectedDistribution(maxNumSamples, None, None, fdrThreshold, totalNumTests, totalNumH1Tests, 1,a,b, galaxyFn) Experiment._analyzeNumRejectedDistribution(maxNumSamples, None, None, fdrThreshold, 1, 1, 1, a, b, galaxyFn) baseMeasure = time.time() - prevTime withOnlyMaxNumEstimate = baseMeasure * totalNumTests * numReplications #print 'Estimated running time: between %i and %i seconds.' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate*3) print 'Estimated running time: around %i seconds. (%.1f hours)' % ( withOnlyMaxNumEstimate, withOnlyMaxNumEstimate / 3600.0) for x, y, z, simult, text in [ [maxNumSamples, None, None, True, 'Basic'], [maxNumSamples, h, None, True, 'Sequential'], [maxNumSamples, h, fdrThreshold, True, 'McFdr Simultanous'], [maxNumSamples, h, fdrThreshold, False, 'McFdr Individual'] ]: print text, ':' MultipleTestCollection.SIMULTANOUS_FDR_STOPPING_CRITERION = simult numRej.append( Experiment._analyzeNumRejectedDistribution( x, y, z, fdrThreshold, totalNumTests, totalNumH1Tests, numReplications, a, b, galaxyFn)) texts.append(text) plotStaticFile = GalaxyRunSpecificFile(['numRej.png'], galaxyFn) plotStaticFile.plotRLines(range(len(numRej[0])), numRej, xlab='Sorted simulations', ylab='num Rejected', legend=texts) print plotStaticFile.getLink('Cumulative distribution')
def singleSimulation(self, numH0, numH1, replicateIndex, verbose=False):
    '''Run one simulated multiple-testing experiment until all tests are determined.

    Samples are added in chunks until every test is determined; afterwards the
    post-sampling FDR threshold is applied. If a Galaxy file name is set, raw
    p/q-values and two figures are stored as run-specific files and links to
    them are appended to a per-numH1 catalog page.

    Returns (total number of samples, total number rejected,
    classification summaries).
    '''
    tests = MultipleTestCollection(numH0, numH1, self._maxNumSamples, self._h,
                                   self._fdrThreshold, self._a, self._b)

    tests.addSamples(self.NUM_SAMPLES_INITIALLY)
    while not tests.allTestsAreDetermined():
        tests.addSamples(self.NUM_SAMPLES_PER_CHUNK)

    # As sampling is now over, switch to the threshold used after the
    # computations are finished (affects final rejection/acceptance, but not
    # the stopping of sampling).
    tests.setFdrThresholdAtAllCounters(self._postFdrThreshold)

    if self._galaxyFn is not None:
        scheme = 'McFdr'
        if self._h is None:
            scheme = 'Basic'
        elif self._fdrThreshold is None:
            scheme = 'Sequential'

        idPrefix = [scheme, str(numH1), str(replicateIndex)]

        rawFile = GalaxyRunSpecificFile(idPrefix + ['PandQvals.txt'], self._galaxyFn)
        tests.writeAllPandQVals(rawFile.getFile())
        rawLink = rawFile.getLink('Raw p and q-vals') + \
            ' under %s scheme with %i true H1, (replication %i)' % (scheme, numH1, replicateIndex)

        pqFigure = GalaxyRunSpecificFile(idPrefix + ['PandQvals.png'], self._galaxyFn)
        pqFigure.openRFigure()
        tests.makeAllPandQValsFigure()
        pqFigure.closeRFigure()
        pqFigureLink = pqFigure.getLink(' (p/q-figure) ') + '<br>'

        samplesFigure = GalaxyRunSpecificFile(idPrefix + ['NumSamples.png'], self._galaxyFn)
        samplesFigure.openRFigure()
        tests.makeNumSamplesFigure()
        samplesFigure.closeRFigure()
        samplesFigureLink = samplesFigure.getLink(' (numSamples-figure) ') + '<br>'

        # One catalog page per numH1; every replication appends its links.
        catalog = GalaxyRunSpecificFile([str(numH1), 'cat.html'], self._galaxyFn)
        catalog.writeTextToFile(rawLink + pqFigureLink + samplesFigureLink, mode='a')

    return (tests.getTotalNumSamples(),
            tests.getTotalNumRejected(),
            tests.getClassificationSummaries())
def makeHtmlStr(self):
    '''Write an HTML summary page for this TF and return a link to it.

    The page lists the TF, ChIP-seq peaks, PWM, the intersection/PWM
    difference statistics and links to the fasta and PWM score files.
    Statistics that have not been computed (None or not yet set) are shown
    as 'N/A' instead of raising on string concatenation.
    '''
    def orNA(value):
        return 'N/A' if value is None else str(value)

    htmlPage = GalaxyRunSpecificFile(
        ['html', '_'.join(self.track), 'page.html'], self.galaxyFn)

    entries = [
        'TF: ' + str(self.tf),
        'Chip-seq peaks: ' + str(self.chipSeqPeaks),
        'PWM: ' + str(self.pwm),
        # intersectingPoints only exists after the fasta files have been made.
        'Number of SNV-intersected binding regions: ' + orNA(getattr(self, 'intersectingPoints', None)),
        'Highest binding difference: ' + orNA(self.maxPwmDiff),
        'Avg binding difference: ' + orNA(self.avgPwmDiff),
        self.regularFasta.getLink('Original Fasta'),
        self.mutatedFasta.getLink('Mutated Fasta'),
        self.pwmDiffScore.getLink('PWM score for each region'),
        self.gtrackDiffScore.getLink('Gtrack of PWM score for each region'),
    ]
    htmlPage.writeTextToFile('<br/>\n'.join(entries))
    return htmlPage.getLink(self.tf + ': ' + self.track[-1])
def getLinkToSingleLocalHtmlResultsTable(self, linkText, disease, resDictKey, galaxyFn):
    '''Store the local results table for resDictKey as an HTML page and link to it.

    The page is written to the run-specific file
    LocalResultTables/<resDictKey>/<disease>.html; the returned value is the
    link to that file, labelled with linkText.
    '''
    page = HtmlCore()
    page.begin()
    tableHtml = self.getHtmlLocalResultsTable(resDictKey, fillInNoneValues=True)
    page.paragraph(tableHtml)
    page.end()

    fileId = ['LocalResultTables', resDictKey, disease + '.html']
    tableFile = GalaxyRunSpecificFile(fileId, galaxyFn)
    tableFile.writeTextToFile(str(page))
    return tableFile.getLink(linkText)
def execute(cls, choices, galaxyFn=None, username=''): file = choices.file import quick.extra.stefania.Functions_defineDistance_CreateDistanceMatrix_ClusterAnalysis as cdm if choices.mothers == "": numMothers = None else: numMothers = int(choices.mothers) # inputFile = open(ExternalTrackManager.extractFnFromGalaxyTN(file.split(':')), 'r') # with inputFile as f: # data = [x.strip('\n') for x in f.readlines()] # f.closed from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile sf = GalaxyRunSpecificFile(["matrix.pickle"],galaxyFn) sfPng = GalaxyRunSpecificFile(["matrix.png"], galaxyFn) sfPng1 = GalaxyRunSpecificFile(["Flat1.csv"], galaxyFn) sfPng2 = GalaxyRunSpecificFile(["Flat2.csv"], galaxyFn) filename = ExternalTrackManager.extractFnFromGalaxyTN(file.split(':')) outFn = sf.getDiskPath(ensurePath=True) outDendrogram = sfPng.getDiskPath(ensurePath=True) clustersFileOutput1 = sfPng1.getDiskPath(ensurePath=True) clustersFileOutput2 = sfPng2.getDiskPath(ensurePath=True) cdm.createDistanceMatrix(filename, outFn, outFileType='pkl', womanIDcolPosition = 0, numRows=numMothers) cdm.clusteringFunction(outFn, outDendrogram, clustersFileOutput1, clustersFileOutput2, type='hierarchical', method1= 'centroid', method2='complete') print "Result: ", sf.getLink("pickle-file") htmlCore = HtmlCore() htmlCore.begin() htmlCore.divBegin('plot1') htmlCore.link('Download plot', sfPng.getURL()) htmlCore.image(sfPng.getURL()) htmlCore.divEnd() htmlCore.divBegin('plot1') htmlCore.link('Download file1', sfPng1.getURL()) htmlCore.divEnd() htmlCore.divBegin('plot1') htmlCore.link('Download file2', sfPng2.getURL()) htmlCore.divEnd() htmlCore.end() print htmlCore
def getReference(self, resDictKey):
    '''Write the Venn diagram HTML for the global result and return a link to it.

    The HTML is produced by CreateBpsVennDIagram.getHtmlString from the
    'catInfo', 'stateBPCounter' and 'genome' entries of the global result and
    stored in the run-specific file 'test.html' under self._baseDir.

    resDictKey is accepted for interface compatibility but not used.

    Returns the string 'None' when there is no global result (None or {}),
    otherwise the link to the written page.
    '''
    globalRes = self._results.getGlobalResult()
    if globalRes in [None, {}]:
        return 'None'

    result = globalRes['result']
    # Build the text first so that a failure does not leave an empty file behind.
    htmlText = CreateBpsVennDIagram.getHtmlString(
        result['catInfo'], result['stateBPCounter'], result['genome'])

    htmlObj = GalaxyRunSpecificFile(['test.html'], self._baseDir)
    with open(htmlObj.getDiskPath(ensurePath=True), 'w') as fileObj:
        fileObj.write(htmlText)
    return htmlObj.getLink('link to results')
def getResultTableLink(self, refSubType, linkText):
    '''Render the result for refSubType as an HTML page and return a link to it.

    The page is stored as ResultTables/<gwasId>/<refSubType>.html; files
    produced by the results viewer go below
    ResultTableDetails/<gwasId>/<refSubType>. If the result carries a
    batchText attribute, the batch command line is shown first.

    Requires self._galaxyFn and self._gwasId to be set (asserted).
    '''
    assert self._galaxyFn is not None
    assert self._gwasId is not None

    result = self.getResult(refSubType)

    detailsDirFile = GalaxyRunSpecificFile(
        ['ResultTableDetails', self._gwasId, refSubType], self._galaxyFn)
    detailsDir = detailsDirFile.getDiskPath(ensurePath=True)
    tablePage = GalaxyRunSpecificFile(
        ['ResultTables', self._gwasId, refSubType + '.html'], self._galaxyFn)

    html = HtmlCore()
    html.begin()
    if hasattr(result, 'batchText'):
        html.paragraph('<pre> Corresponding batch command line:\n ' +
                       result.batchText + '</pre>')
    viewerHtml = str(ResultsViewer(result, detailsDir))
    html.paragraph(viewerHtml)
    html.end()

    tablePage.writeTextToFile(str(html))
    return tablePage.getLink(linkText)
def getLinkToLocalResultsHeatmap(self, linkText, disease, resDictKey, galaxyFn):
    '''Draw a heatmap of the local results and return a link to the image.

    Rows are local regions and columns are reference sub types. Returns an
    explanatory string instead of a link when the values contain None/NaN or
    when the row sums or the column sums show no variation.
    '''
    localResults = self.getAllLocalResults(resDictKey, fillInNoneValues=True)
    regions = self.getLocalRegions()
    subTypes = self.getRefSubTypes()
    rowCount = len(regions)

    # Flattened region by region, sub types varying fastest.
    cellValues = [localResults[subType][region]
                  for region in regions
                  for subType in subTypes]

    if None in cellValues:
        return 'Not generated, due to missing values'
    for cell in cellValues:
        if numpy.isnan(cell):
            return 'Not generated, due to missing values'

    largest = max(cellValues)

    from proto.RSetup import r, robjects
    r('library(gplots)')
    matrix = r.matrix(robjects.FloatVector(cellValues), nrow=rowCount)

    distinctColSums = r.length(r.unique(r.colSums(matrix)))
    if distinctColSums <= 1:
        return 'Not generated, due to lacking variation'
    distinctRowSums = r.length(r.unique(r.rowSums(matrix)))
    if distinctRowSums <= 1:
        return 'Not generated, due to lacking variation'

    setRowNames = r('function(data,names){rownames(data)=names; data}')
    matrix = setRowNames(matrix, [str(x) for x in regions])
    setColNames = r('function(data,names){colnames(data)=names; data}')
    matrix = setColNames(matrix, subTypes)

    heatmapFile = GalaxyRunSpecificFile(
        ['LocalResultTables', resDictKey, disease + '_heatmap.png'], galaxyFn)
    heatmapFile.openRFigure(h=4000, w=4000)
    drawHeatmap = r("function(data,maxVal){heatmap.2(data,col =c('#99FFFF',colorRampPalette(c('cyan','blue', 'black', 'red', 'yellow'))(161),'#FFFF66'), breaks = seq(0,maxVal,length=164),trace='none',margins=c(15,15))}")
    drawHeatmap(matrix, largest)
    heatmapFile.closeRFigure()
    return heatmapFile.getLink(linkText)
class ExactlySpecifiedTF(object):
    '''A transcription factor with its ChIP-seq peaks, PWM and mutation track.

    Holds the run-specific output files (fasta files, PWM difference scores
    as bed/html/gtrack) and the summary statistics computed by
    getFastaFiles() and getPwmScores().
    '''

    def __init__(self, tf, chipSeqPeaks, pwm, tracks, galaxyFn):
        '''tracks must hold exactly two tracks: [region track, mutation track].'''
        self.tf = tf
        self.chipSeqPeaks = chipSeqPeaks
        self.pwm = pwm
        assert len(tracks) == 2
        self.track = tracks[0]
        self.mutationTrack = tracks[1]
        self.galaxyFn = galaxyFn

        trackId = '_'.join(self.track)
        self.bedPwmDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', self.pwm + trackId, 'pwmDiff.bed'], self.galaxyFn)
        self.pwmDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', self.pwm + trackId, 'pwmDiff.html'], self.galaxyFn)
        self.gtrackDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', self.pwm + trackId, 'pwmDiff.gtrack'], self.galaxyFn)
        self.mutatedFasta = GalaxyRunSpecificFile(
            ['fastaFiles', trackId, 'mutatedFastseq.fasta'], self.galaxyFn)
        self.regularFasta = GalaxyRunSpecificFile(
            ['fastaFiles', trackId, 'regularFastseq.fasta'], self.galaxyFn)

        # Set by getFastaFiles(); initialised so makeHtmlStr() can always run.
        self.intersectingPoints = None
        # Set by getPwmScores() when at least one region was scored.
        self.maxPwmDiff = None
        self.avgPwmDiff = None
        self.numPwmDiff = 0

    def getFastaFiles(self, genome):
        '''Write the reference and the mutated fasta file for the intersected regions.'''
        assert self.track
        assert self.mutationTrack
        regionDict, pointDict = self.IntersectData(
            genome, [self.track, self.mutationTrack])
        self.intersectingPoints = str(
            sum([len(v) for v in regionDict.values()]))

        mutatedfastaDict = self.getMutatedSequence(genome, regionDict, pointDict)
        regularFastaDict = self.getMutatedSequence(genome, regionDict)
        self.mutatedFasta.writeTextToFile('\n'.join([
            '\n'.join(mutatedfastaDict[chrom])
            for chrom in sorted(mutatedfastaDict.keys())
        ]))
        self.regularFasta.writeTextToFile('\n'.join([
            '\n'.join(regularFastaDict[chrom])
            for chrom in sorted(regularFastaDict.keys())
        ]))

    @classmethod
    def getMutatedSequence(cls, genome, regionDict, pointDict=None):
        '''Return fasta entries per chromosome for the regions in regionDict.

        regionDict maps chrom -> list of (start, end). If pointDict is given,
        it maps chrom -> list of [regionStart, offset, value]; every entry
        whose regionStart equals a region's start replaces the base at
        `offset` in that region. A value containing '>' contributes only its
        last character.
        '''
        resultDict = defaultdict(list)
        fastaTrack = PlainTrack(['Sequence', 'DNA'])
        for chrom in regionDict.keys():
            for start, end in regionDict[chrom]:
                seqTv = fastaTrack.getTrackView(
                    GenomeRegion(genome, chrom, start, end))
                valList = list(seqTv.valsAsNumpyArray())
                if pointDict:
                    mutatedPoints = [
                        v[1:] for v in pointDict[chrom] if v[0] == start
                    ]
                    # NOTE(review): offset is start2 - start1 from
                    # IntersectData; it looks like it could be negative for
                    # elements starting before the region - confirm inputs
                    # are points.
                    for index, val in mutatedPoints:
                        val = val[-1] if val.find('>') >= 0 else val
                        valList[index] = val
                resultDict[chrom].append(
                    '>%s %i-%i\n%s' % (chrom, start + 1, end, ''.join(valList)))
        return resultDict

    @classmethod
    def IntersectData(cls, genome, tracks):
        '''Intersect the elements of two tracks.

        Returns (resultDict, pointDict), both keyed by chromosome:
        resultDict holds one [start1, end1] entry and pointDict one
        [start1, start2 - start1, str(val)] entry for every element of the
        second track found overlapping a region of the first track.
        '''
        from quick.util.CommonFunctions import getGeSource
        geSources = []
        for track in tracks:
            geSources.append(getGeSource(track, genome))

        resultDict, pointDict = defaultdict(list), defaultdict(list)
        gs1, gs2 = geSources
        track1Dict, track2Dict = defaultdict(list), defaultdict(list)
        for ge in gs1:
            track1Dict[ge.chr].append((ge.start, ge.end))
        for ge in gs2:
            track2Dict[ge.chr].append((ge.start, ge.end, ge.val))

        for chrom in track1Dict.keys():
            # Single forward sweep: counter is shared by all regions of the
            # chromosome and never moves backwards.
            counter = 0
            track2List = sorted(track2Dict[chrom])
            for start1, end1 in sorted(track1Dict[chrom]):
                while len(track2List) > counter:
                    start2, end2, val = track2List[counter]
                    endsInside = start1 < end2 <= end1
                    startsInside = start1 <= start2 < end1
                    coversRegion = start2 < start1 and end2 > end1
                    if endsInside or startsInside or coversRegion:
                        resultDict[chrom].append([start1, end1])
                        pointDict[chrom].append(
                            [start1, start2 - start1, str(val)])
                    elif start2 >= end1:
                        # Element lies after this region; keep it for the next one.
                        break
                    counter += 1
        return resultDict, pointDict

    def getPwmScores(self, motifId, moticScanObj):
        '''Scan the reference and mutated fasta with the motif and store the differences.

        Writes the per-region results to the html, bed and gtrack score files
        and sets maxPwmDiff, avgPwmDiff and numPwmDiff.
        '''
        pwmRegDict = moticScanObj.scanMotifInTwoSequences(
            motifId, self.regularFasta.getDiskPath(),
            self.mutatedFasta.getDiskPath())

        diffResDict = defaultdict(list)
        lineTab = []
        for region in sorted(pwmRegDict):
            # region looks like '<chrom> <start>-<end>'
            chrom, start = region.split()
            end = region.replace('-', ' ').split()[-1]
            start = int(start.split('-')[0])
            regular, mutated = pwmRegDict[region]
            difference = abs(regular[0] - mutated[0])
            reg, regMut, mut, mutReg = regular[:2] + mutated[:2]
            regSeq, regMutSeq, regPos = regular[2:]
            mutSeq, mutRegSeq, mutPos = mutated[2:]

            refHit = '%s:%i-%i' % (chrom, start + regPos[0], start + regPos[1])
            mutHit = '%s:%i-%i' % (chrom, start + mutPos[0], start + mutPos[1])

            string = '%s\t%f\t[%f -> %f]\t[%f -> %f]\t' % (
                region.replace('-', ' ').replace(' ', '\t'), difference,
                reg, regMut, mut, mutReg)
            string += '%s\t%s\t%s\t' % (refHit, regSeq, regMutSeq)
            string += '%s\t%s\t%s' % (mutHit, mutSeq, mutRegSeq)
            diffResDict[difference].append(string)
            lineTab.append([
                chrom,
                str(start),
                str(end),
                str(difference),
                '[%f -> %f]' % (reg, regMut),
                '[%f -> %f]' % (mut, mutReg),
                refHit,
                regSeq,
                regMutSeq,
                mutHit,
                mutSeq,
                mutRegSeq
            ])

        # NOTE(review): statistics are taken over the distinct difference
        # values (dict keys), not over all regions - confirm this is intended.
        diffList = diffResDict.keys()
        if len(diffList) > 0:
            self.maxPwmDiff = str(max(diffList))
            self.avgPwmDiff = str(sum(diffList) / len(diffList))
            self.numPwmDiff = len(diffList)

        # NOTE(review): the column line below names 11 columns while each
        # data line has 12 fields - verify against the GTrack specification.
        header = (
            '# GTrack file\n'
            '#The columns in this dataset are:\n'
            '#\t(ChIP-seq_peak)chr\n'
            '#\tstart\n'
            '#\tend\n'
            '#\tmax(difference in column 5, difference in column 6)\n'
            '#\t[best_reference_sequence_PWM_hit_score -> corresponding_mutated_sequence_score]\n'
            '#\t[best_mutated_sequence_PWM_hit_score -> corresponding_reference_sequence_score]\n'
            '#\tchr:start-end(best_reference_sequence_PWM_hit_motif)\n'
            '#\tbest_reference_sequence_PWM_hit_motif\n'
            '#\tcorresponding_mutated_sequence_motif\n'
            '#\tchr:start-stop(best_mutated_sequence_PWM_hit_motif)\n'
            '#\tbest_mutated_sequence_PWM_hit_motif\n'
            '#\tcorresponding_reference_sequence_motif)\n'
            '##track type: valued segments\n'
            '##value column: val\n'
            '###seqid\tstart\tend\tval\treference_sequence_PWM\tmutated_sequence_PWM_hit_score\tbest_reference_sequence_PWM_hit_motif\tcorresponding_mutated_sequence_motif\tchr:start-stop(best_mutated_sequence_PWM_hit_motif)\tbest_mutated_sequence_PWM_hit_motif\tcorresponding_reference_sequence_motif\n'
        )

        # Data lines ordered by decreasing difference. Written in one call so
        # that lines of different groups are always newline-separated
        # (appending each group separately could fuse the last line of one
        # group with the first line of the next).
        gtrackLines = []
        for k in sorted(diffResDict.keys(), reverse=True):
            gtrackLines.extend(diffResDict[k])
        self.gtrackDiffScore.writeTextToFile(header + '\n'.join(gtrackLines))

        self.pwmDiffScore.writeTextToFile(self.getHtmlPwmTable(lineTab))
        self.bedPwmDiffScore.writeTextToFile('\n'.join(
            ['\t'.join(v[:4]) for v in lineTab]))

    def getHtmlPwmTable(self, lineTab):
        '''Return a sortable HTML table with one row per entry of lineTab.'''
        headerTab = [
            'chrom', 'start', 'end', 'max PWM difference',
            'best reference seq_PWM score -> corresponding mut seq score',
            'best mut seq PWM score -> corresponding_ref seq score',
            'ref region', 'ref seq', 'corresponding mut seq', 'mut region',
            'mut seq', 'corresponding ref seq'
        ]
        core = HtmlCore()
        core.begin()
        core.tableHeader(headerTab, sortable=True)
        for row in lineTab:
            core.tableLine(row)
        core.tableFooter()
        core.end()
        return str(core)

    def makeHtmlStr(self):
        '''Write an HTML summary page for this TF and return a link to it.

        Statistics that have not been computed yet (None) are shown as 'N/A'
        instead of raising on string concatenation.
        '''
        def orNA(value):
            return 'N/A' if value is None else str(value)

        htmlPage = GalaxyRunSpecificFile(
            ['html', '_'.join(self.track), 'page.html'], self.galaxyFn)

        entries = [
            'TF: ' + str(self.tf),
            'Chip-seq peaks: ' + str(self.chipSeqPeaks),
            'PWM: ' + str(self.pwm),
            'Number of SNV-intersected binding regions: ' + orNA(self.intersectingPoints),
            'Highest binding difference: ' + orNA(self.maxPwmDiff),
            'Avg binding difference: ' + orNA(self.avgPwmDiff),
            self.regularFasta.getLink('Original Fasta'),
            self.mutatedFasta.getLink('Mutated Fasta'),
            self.pwmDiffScore.getLink('PWM score for each region'),
            self.gtrackDiffScore.getLink('Gtrack of PWM score for each region'),
        ]
        htmlPage.writeTextToFile('<br/>\n'.join(entries))
        return htmlPage.getLink(self.tf + ': ' + self.track[-1])
def execute(cls, choices, galaxyFn=None, username=''): ''' Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' genome = choices[0] regSpec = '__chrs__' binSpec = '*' if choices[6] == 'Chromosome arms': regSpec = '__chrArms__' elif choices[6] == 'Track from history...': #put in history bins support here #print choices[4:] regSpec = ExternalTrackManager.extractFileSuffixFromGalaxyTN(choices[7].split(':')) binSpec = ExternalTrackManager.extractFnFromGalaxyTN(choices[7].split(':')) #print 'regSpec, binSpec,', regSpec, binSpec lineList, counter, tooManyBins = [], 0, False for line in open(binSpec): if line.strip() !='': if counter == cls.MAX_NUM_ROWS: tooManyBins = True break lineList.append(line) counter+= 1 if line.strip()[0] !='#' else 0 if tooManyBins: newHist = GalaxyRunSpecificFile(['newHistFile.%s' % regSpec], galaxyFn) binSpec = newHist.getDiskPath(ensurePath=True) open(binSpec, 'w').write(''.join(lineList)) print GalaxyInterface.getHtmlBeginForRuns(galaxyFn) print GalaxyInterface.getHtmlForToggles(withRunDescription=False) core = HtmlCore() core.styleInfoBegin(styleClass='debug') figImage = GalaxyRunSpecificFile(['VizTrackOnGenome.png'], galaxyFn) #StaticImage(['VizTrackOnGenome.png']) analysisDef = ' [normalizeRows=%s] [centerRows=%s] -> RawVisualizationDataStat' % \ (choices[4] == 'Scale to same size', choices[5] == 'Center') if choices[1] == 'HyperBrowser repository': trackName = choices[2].split(':') else: trackName = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, choices[3].split(':')) res = GalaxyInterface.runManual([trackName], analysisDef, 
regSpec, binSpec, genome, username=username, printResults=False, printHtmlWarningMsgs=False) core.styleInfoEnd() core.line('') core.tableHeader(None) #visPresenter = RawVisualizationPresenter(res, galaxyFn,'')#os.path.split()[0] #htmlStreng = visPresenter.getReference('Result', fullImage=True) rScript = cls.customRExecution(res, figImage.getDiskPath(ensurePath=True), '') figUrl = figImage.getURL() figLinkText ='<img src="%s" alt="Figure" height="%i" width="800"/>' % (figUrl, 20 *min(cls.MAX_NUM_ROWS, len(res))) core.tableLine([figImage.getLink(figLinkText)]) rScriptGalaxyFile = GalaxyRunSpecificFile(['RScript.R'], galaxyFn) with open(rScriptGalaxyFile.getDiskPath(ensurePath=True), 'w') as rScriptFile: rScriptFile.write(rScript) core.tableLine([rScriptGalaxyFile.getLink('R script')]) core.tableFooter() print core print GalaxyInterface.getHtmlEndForRuns()
def executeSelfFeature(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec):
    '''Cluster tracks by the similarity of their positional distribution.

    Builds a feature matrix (tracks x bins) with cls.construct_feature_matrix,
    transfers it to R and, depending on clusterMethod, performs:

      'Hierarchical clustering' - extra_option is passed on to
          cls._clusterAndPlotDendrogram; writes the distance matrix, an R
          dump of the feature matrix, the dendrogram and a heatmap.
      'K-means clustering' - extra_option is the number of clusters and
          kmeans_alg the algorithm; writes a text file listing the clusters.

    The HTML result page is written to galaxyFn, which is closed on return
    (also when an error occurs).
    '''
    from proto.RSetup import r

    silenceRWarnings()

    with open(galaxyFn, 'w') as jobFile:
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        print >> jobFile, '<h3>Results for the "similarity of positional distribution along the genome" way of clustering</h3><br/><br/>'
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            print >> batchFile, '$clusterBySelfFeature', (
                genome, '$'.join([':'.join(t) for t in tracks]),
                ':'.join(track_names), clusterMethod, extra_option, feature,
                distanceType, kmeans_alg, regSpec, binSpec)
        print >> jobFile, batchRun.getLink(
            'View batch script line for this analysis<br/>')

        # Shortened names, used as row labels in the clustering figure.
        prettyTrackNames = [
            v[-1].replace('RoadMap_', '').replace('.H3K4me1', '')
            for v in tracks
        ]
        f_matrix = cls.construct_feature_matrix(genome, tracks, feature,
                                                regSpec, binSpec)

        userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome)
        binNames = [str(region) for region in sorted(userBinSource)]
        if len(binNames) != f_matrix.shape[1]:
            # The matrix has a different number of columns than there are
            # user bins; fall back to generic column names.
            binNames = ['Microbin' + str(i) for i in range(f_matrix.shape[1])]

        r.assign('bin_names', binNames)
        r.assign('track_names', prettyTrackNames)
        r.assign('f_matrix', f_matrix)
        r.assign('distanceType', distanceType)
        r('row.names(f_matrix) <- track_names')
        r('colnames(f_matrix) <- bin_names')

        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            figure = GalaxyRunSpecificFile(
                ['cluster_tracks_result_figure.pdf'], galaxyFn)
            figurepath = figure.getDiskPath(ensurePath=True)

            r('d <- dist(f_matrix, method=distanceType)')
            distTable = r('d')
            distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.txt'],
                                               galaxyFn)
            with open(distMatrix.getDiskPath(True), 'w') as distFile:
                distFile.write(str(distTable))
            print >> jobFile, distMatrix.getLink(
                'View the distance matrix for this analysis <br>')

            r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn)
            r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
            r('dput(f_matrix, f_matrix_fn)')
            print >> jobFile, r_f_matrixFile.getLink(
                'Access the R-representation of the Feature_matrix (text-file)'
            ), '<br/>'

            cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd',
                                          'f_matrix', prettyTrackNames)
            print >> jobFile, figure.getLink(
                'View the clustering tree (dendrogram) for this analysis<br>')

            resDict = Results([], [], '')
            resDict.setGlobalResult({
                'result': {
                    'Matrix': f_matrix,
                    'Rows': np.array(track_names),
                    'Cols': np.array(binNames),
                    'Significance': None,
                    # presumably assigned in R by _clusterAndPlotDendrogram - verify
                    'RowClust': r('hr'),
                    'ColClust': None
                }
            })
            header = 'View the resulting heatmap plot <br>'
            baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
            heatPresenter = HeatmapFromNumpyPresenter(
                resDict, baseDir, header, printDimensions=False)
            print >> jobFile, heatPresenter.getReference('result')

        elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
            textFile = GalaxyRunSpecificFile(
                ['result_of_kmeans_clustering.txt'], galaxyFn)
            textFilePath = textFile.getDiskPath(True)

            # extra_option is the number of clusters.
            extra_option = int(extra_option)
            r.assign('kmeans_alg', kmeans_alg)
            r.assign('extra_option', extra_option)
            r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')
            # NOTE(review): kept from the original; confirm that the kmeans
            # result actually has a 'height' component.
            r('hr$height <- hr$height/max(hr$height)*10')

            clusterSizes = r('hr$size')  # size of every cluster
            withinSS = r('hr$withinss')
            clusters = r('hr$cluster')
            with open(textFilePath, 'w') as kmeans_output:
                for index1 in range(extra_option):
                    # 1-based positions of the tracks assigned to cluster index1 + 1.
                    trackInCluster = [
                        k + 1 for k, val in enumerate(clusters)
                        if val == index1 + 1
                    ]
                    print >> kmeans_output, 'Cluster %i(%s objects) : ' % (
                        index1 + 1, str(clusterSizes[index1]))
                    for name in trackInCluster:
                        print >> kmeans_output, name, '(This result may be a bit shaky afters some changes in rpy access)'
                    print >> kmeans_output, 'Sum of square error for this cluster is : ' + str(
                        withinSS[index1]) + '\n'
            print >> jobFile, textFile.getLink(
                'Detailed result of kmeans clustering <br>')
def execute(cls, choices, galaxyFn=None, username=''): '''Is called when execute-button is pushed by web-user. Should print output as HTML to standard out, which will be directed to a results page in Galaxy history. If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn. If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files). choices is a list of selections made by web-user in each options box. ''' import subprocess import os from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile from config.Config import HB_SOURCE_CODE_BASE_DIR from quick.application.ExternalTrackManager import ExternalTrackManager tempInStaticFile = GalaxyRunSpecificFile(['tempIn.txt'], galaxyFn) outStaticFile = GalaxyRunSpecificFile(['tempOut.fasta'], galaxyFn) #print os.getcwd() inFn = ExternalTrackManager.extractFnFromGalaxyTN( choices[0].split(':')) #print inFn tempOutFn = outStaticFile.getDiskPath(True) #print tempOutFn os.chdir(HB_SOURCE_CODE_BASE_DIR + '/third_party/nonpython') #print outStaticFile.getLink('output') markovOrder = int(choices[1]) seqs = [] for line in open(inFn): if line.startswith('>'): seqs.append([line[1:].strip(), []]) else: seqs[-1][1].append(line.strip()) for seq in seqs: seq[1] = ''.join(seq[1]) pureSequence = ''.join([seq[1] for seq in seqs]) totalSeqLen = len(pureSequence) #pureSequence = ''.join([line.replace('\n','') for line in open(inFn) if not line.startswith('>')]) tempInStaticFile.writeTextToFile(pureSequence) numSamples = int(choices[2]) if numSamples > 1: zipOutStatic = GalaxyRunSpecificFile(['randomFastas.zip'], galaxyFn) zipOut = zipfile.ZipFile(zipOutStatic.getDiskPath(True), 'w') for iteration in range(numSamples): if numSamples > 1: fastaOutStatic = GalaxyRunSpecificFile( ['random', 's%s.fa' % iteration], galaxyFn) fastaOutFn = fastaOutStatic.getDiskPath(True) else: fastaOutFn = galaxyFn #fastaOutStatic = 
GalaxyRunSpecificFile(['random%s'%iteration], galaxyFn) #subprocess.call('javac',shell=True) #subprocess.call('javac',shell=False) #subprocess.call('javac MarkovModel.java',shell=True) subprocess.call('java MarkovModel %s %s %s >%s' % (tempInStaticFile.getDiskPath(), markovOrder, totalSeqLen, tempOutFn), shell=True) #subprocess.call('javac third_party/nonpython/MarkovModel.java') #subprocess.call('java third_party/nonpython/MarkovModel.java') pureMarkovSequence = open(tempOutFn).readline().strip() pmsIndex = 0 fastaOutF = open(fastaOutFn, 'w') for seq in seqs: fastaOutF.write('>' + seq[0] + os.linesep) nextPmsIndex = pmsIndex + len(seq[1]) #seq.append(pureMarkovSequence[pmsIndex:nextPmsIndex]) fastaOutF.write(pureMarkovSequence[pmsIndex:nextPmsIndex] + os.linesep) pmsIndex = nextPmsIndex fastaOutF.close() assert pmsIndex == totalSeqLen == len(pureMarkovSequence), ( pmsIndex, totalSeqLen, len(pureMarkovSequence)) if numSamples > 1: #print 'Adding %s to archive' % fastaOutFn.split('/')[-1] zipOut.write(fastaOutFn, fastaOutFn.split('/')[-1]) if numSamples > 1: zipOut.close() print zipOutStatic.getLink('Zipped random sequences')
def executePairDistance(cls, genome, tracks, track_names, clusterMethod,
                        extra_option, feature, extra_feature, galaxyFn,
                        regSpec, binSpec):
    '''Cluster tracks on a pairwise distance matrix and write a result page.

    For every pair of tracks ClusteringExecution.computeDistance is called
    once, and the value is stored symmetrically in a square matrix, after
    being transformed according to extra_feature:
      "1 minus the ratio": 1 - value
      "1 over the ratio":  1 / value
      anything else:       value unchanged

    The matrix is written to a run-specific html file and assigned to the R
    session as 'd_matrix' (and 'd' as a dist object). If clusterMethod is
    'Hierarchical clustering' and extra_option is not "--select--",
    cls._clusterAndPlotDendrogram is called with the figure path. The result
    page written to galaxyFn links to a batch script line, the dendrogram
    figure and the distance matrix.
    '''
    from proto.RSetup import r
    silenceRWarnings()

    # NOTE(review): when feature is None no matrix is built and the code
    # further down fails on the undefined d_matrix - confirm whether callers
    # can actually pass None.
    if feature is not None:
        numTracks = len(tracks)
        d_matrix = np.zeros((numTracks, numTracks))
        for i in range(numTracks):
            for j in range(i + 1, numTracks):
                ratio = ClusteringExecution.computeDistance(
                    genome, tracks[i], tracks[j], feature, regSpec, binSpec,
                    galaxyFn)
                if extra_feature == "1 minus the ratio":
                    distance = 1 - ratio
                elif extra_feature == "1 over the ratio":
                    # 1.0 so that an integer ratio is not floor-divided
                    distance = 1.0 / ratio
                else:
                    distance = ratio
                d_matrix[i, j] = distance
                d_matrix[j, i] = distance

    with open(galaxyFn, 'w') as jobFile:
        print >> jobFile, '<h3>Results for the "direct sequence-level similarity" way of clustering</h3><br/><br/>'

        # Both files are run-specific and are put in the directory of the run
        figure = GalaxyRunSpecificFile(
            ['cluster_tracks_result_figure.pdf'], galaxyFn)
        distMatrix = GalaxyRunSpecificFile(
            ['distance_matrix_result.html'], galaxyFn)

        with open(distMatrix.getDiskPath(True), 'w') as distObj:
            core = HtmlCore()
            core.tableHeader([''] + track_names, firstRow=True)
            for index, row in enumerate(d_matrix.tolist()):
                core.tableLine([track_names[index]] + [str(v) for v in row])
            core.tableFooter()
            print >> distObj, str(core)

        figurepath = figure.getDiskPath(True)
        r.assign('track_names', track_names)
        r.assign('d_matrix', d_matrix)
        r('row.names(d_matrix) <- track_names')
        r('d <- as.dist(d_matrix)')

        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd',
                                          'd_matrix', track_names)

        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            print >> batchFile, '$clusterByPairDistance', (
                genome, '$'.join([':'.join(t) for t in tracks]),
                ':'.join(track_names), clusterMethod, extra_option, feature,
                extra_feature, regSpec, binSpec)

        print >> jobFile, batchRun.getLink(
            'View batch script line for this analysis <br/>')
        print >> jobFile, figure.getLink(
            'View the clustering tree (dendrogram) for this analysis <br>')
        print >> jobFile, distMatrix.getLink(
            'View the distance matrix for this analysis <br>')
def executeReferenceTrack(cls, genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None): from proto.RSetup import r silenceRWarnings() jobFile = open(galaxyFn, 'w') print >> jobFile, '<h3>Results for the "similarity of relations to other sets of genomic features" way of clustering<h3/><br/><br/>' # print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec]])), '<br><br>' batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn) with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile: print >> batchFile, '$clusterByReference', (genome, '$'.join([ ':'.join(t) for t in tracks ]), ':'.join(track_names), clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank) print >> jobFile, batchRun.getLink( 'View batch script line for this analysis<br/>') #print>>jobFile, 'Batch script syntax for this analysis:<br>', '$clusterByReference', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names) , clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank), '<br><br>' #print>>jobFile, 'signature of method clusterByReference:<br>', 'clusterByReference(genome, tracksStr, track_namesStr, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None)<br><br><br>' prettyTrackNames = [ v[-1].replace("RoadMap_", "").replace('.H3K4me1', '') for v in tracks ] 
#prettyTrackNames = [prettyPrintTrackName(v) for v in tracks] #paramNames = ['numreferencetracks', 'refTracks', 'refFeatures', 'yesNo', 'howMany', 'upFlank', 'downFlank'] #for index, value in enumerate([numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank]): # if value != None: # print paramNames[index]+'='+ str(value), #print '' reftrack_names = [ ] #for use in creating the heatmap (as the column names) options = [ ] #for the case using refTracks, options contains feature for every refTrack, chosen by user. if numreferencetracks: for i in range(int(numreferencetracks)): ref_i = refTracks[i].split( ":" ) #name of refTrack is being used to construct the name of expanded refTrack #refTracks.append(ref_i) #put the refTrack into refTracks list reftrack_names.append(ref_i[-1]) temp_opt1 = 'ref' + str(i) + 'feature' options += [] if refFeatures[i] is None else [refFeatures[i]] if yesNo and yesNo[ i] == "Yes" and howMany and howMany[i] != '--select--': for expan in range(int(howMany[i])): reftrack_names.append(ref_i[-1] + '_' + upFlank[i][expan]) upFlank = int(upFlank[i][expan]) downFlank = int(downFlank[i][expan]) withinRunId = str(i + 1) + ' expansion ' + str(expan + 1) outTrackName = GalaxyInterface.expandBedSegmentsFromTrackNameUsingGalaxyFn( ref_i, genome, upFlank, downFlank, galaxyFn, withinRunId) #outTrackName is unique for run refTracks.append( outTrackName ) #put the expanded track into refTracks list options.append( options[-1] ) # use chosen feature for refTack as valid feature for the expanded for index, track in enumerate(refTracks): #print track, '<br>' if isinstance(track, basestring): track = track.split(":") refTracks[index] = track[:-1] if track[ -1] == "-- All subtypes --" else track if len(refTracks) > 0: trackFormats = [ TrackInfo(genome, track).trackFormatName for track in tracks ] trackLen = len(tracks) refLen = len(refTracks) f_matrix = np.zeros((trackLen, refLen)) for i in range(trackLen): for j in range(refLen): #print 
'len(options), refLen, len(tracks), trackLen, len(trackFormats):', len(options), refLen, len(tracks), trackLen, len(trackFormats) f_matrix[i, j] = cls.extract_feature(genome, tracks[i], refTracks[j], options[j], regSpec, binSpec, trackFormats[i]) r.assign('track_names', prettyTrackNames ) #use as track names, will be shown in clustering figure r.assign('reftrack_names', reftrack_names) r.assign('f_matrix', f_matrix) r.assign('distanceType', distanceType) r('row.names(f_matrix) <- track_names') r('colnames(f_matrix) <- reftrack_names') if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--": figure = GalaxyRunSpecificFile( ['cluster_tracks_result_figure.pdf'], galaxyFn) figurepath = figure.getDiskPath(True) #r.pdf(figurepath, 8,8) r('d <- dist(f_matrix, method=distanceType)') distTable = r('d') distMatrix = GalaxyRunSpecificFile( ['distance_matrix_result.txt'], galaxyFn) distMatrixPath = distMatrix.getDiskPath(True) open(distMatrixPath, 'w').write(str(distTable)) print >> jobFile, distMatrix.getLink( 'View the distance matrix for this analysis <br>') #with open(distMatrixPath,'w') as distObj: # #distTable = d_matrix.tolist() # core = HtmlCore() # core.tableHeader(['']+track_names,firstRow=True) # rowSize = len(track_names) # index=0 # while index<len(distTable): # core.tableLine([track_names[index % rowSize]]+[str(v) for v in distTable[index:index+rowSize]]) # core.tableFooter() # print>>distObj, str(core) #print>>jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>') #print r.f_matrix #print r.d r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn) r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True)) r('dput(f_matrix, f_matrix_fn)') print >> jobFile, r_f_matrixFile.getLink( 'Access the R-representation of the Feature_matrix (text-file) <br>' ), #r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.txt'], galaxyFn) #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d)) #print>>jobFile, 
r_f_matrixFile.getLink('r.f_matrix & r.d <br>') cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'f_matrix', prettyTrackNames) #r.assign('extra_option',extra_option) #r('hr <- hclust(d, method=extra_option, members=NULL)') #r('hr$height <- hr$height/max(hr$height)*10') #r('plot(hr, ylab="Distance", hang=-1)') # #r('dev.off()') print >> jobFile, figure.getLink( 'View the clustering tree (dendrogram) for this analysis<br>' ) elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--": textFile = GalaxyRunSpecificFile( ['result_of_kmeans_clustering.txt'], galaxyFn) textFilePath = textFile.getDiskPath(True) extra_option = int(extra_option) r.assign('extra_option', extra_option) r.assign('kmeans_alg', kmeans_alg) r( 'hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)' ) #the number of cluster is gotten from clusterMethod+ tag, instead of 3 used here r('hr$height <- hr$height/max(hr$height)*10') kmeans_output = open(textFilePath, 'w') clusterSizes = r('hr$size') #size of every cluster withinSS = r('hr$withinss') clusters = np.array( r('hr$cluster') ) #convert to array in order to handle the index more easily track_names = np.array(track_names) for index1 in range( extra_option ): #extra_option actually the number of clusters trackInCluster = [ k for k, val in clusters.items() if val == index1 ] print >> kmeans_output, 'Cluster %i(%s objects) : ' % ( index1 + 1, str(clusterSizes[index1])) for name in trackInCluster: print >> kmeans_output, name print >> kmeans_output, 'Sum of square error for this cluster is : ' + str( withinSS[index1]) + '\n' kmeans_output.close() print >> jobFile, textFile.getLink( 'Detailed result of kmeans clustering <br>') #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn) #baseDir = os.path.dirname(heatmap.getDiskPath(True)) ##r.png(heatmap_path, width=800, height=700) resDict = Results([], [], 'ClusTrack') resDict.setGlobalResult({ 'result': { 'Matrix': f_matrix, 
'Rows': np.array(track_names), 'Cols': np.array(reftrack_names), 'Significance': None, 'RowClust': r('hr'), 'ColClust': None } }) header = 'Heatmap of Feature matrix for "similarity of positional distribution along the genome" ' baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath() heatPresenter = HeatmapFromNumpyPresenter(resDict, baseDir, header, printDimensions=False) print >> jobFile, heatPresenter.getReference('result') #r.pdf(heatmap_path) #r.library("gplots") #r('heatmap(f_matrix, col=redgreen(75), Colv=NA, scale="none", xlab="", ylab="", margins=c(10,10))')#Features cluster tracks #r('dev.off()') #print>>jobFile, heatmap.getLink('View the resulting heatmap plot <br>') #cls.print_data(f_matrix, jobFile) else: print 'Have to specify a set of refTracks'
def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList, upFlankSize, downFlankSize, geneSource, galaxyFn): #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat' #print 'overriding galaxyFN!: ', galaxyFn uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath() assert genome in [ 'mm9', 'hg18', 'hg19' ] #other genomes not supported. TF id links do not specify genome for pre-selection of analysis #if tfSource == 'UCSC tfbs conserved': # tfTrackName = ['Gene regulation','TFBS','UCSC prediction track'] #else: # raise tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) tfTrackName = tfTrackNameMappings[tfSource] #Get gene track #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed' #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome) #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed') #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn ) if not (upFlankSize == downFlankSize == 0): unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed' #flankedGeneRegsTempFn = uniqueWebPath + os.sep + 'flankedGeneRegs.bed' flankedGeneRegsTempStaticFile = GalaxyRunSpecificFile( ['flankedGeneRegs.bed'], galaxyFn) flankedGeneRegsTempFn = flankedGeneRegsTempStaticFile.getDiskPath() geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome) #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed') GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, unflankedGeneRegsTempFn) GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn, flankedGeneRegsTempFn, genome, upFlankSize, downFlankSize, suffix='category.bed') #flankedGeneRegsExternalTN = ['external'] +galaxyId + [flankedGeneRegsTempFn] regSpec, binSpec = 'category.bed', flankedGeneRegsTempFn else: regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList) res = cls._runCategoryPointCount(genome, regSpec, binSpec, 
tfTrackName) #trackName1 = tfTrackName # #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\ # '[tf1:=SegmentToStartPointFormatConverter:]'+\ # '-> FreqByCatStat' ##assert len(ensembleGeneIdList)==1 ##geneId = ensembleGeneIdList[0] # #print '<div class="debug">' #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome) #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run() # #print res ##GalaxyInterface._viewResults([res], galaxyFn) #print '</div>' tfs = res.getResDictKeys() genesPlural = 's' if len(ensembleGeneIdList) > 1 else '' tfsPlural = 's' if len(tfs) != 1 else '' print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % ( len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList), tfSource) if not (upFlankSize == downFlankSize == 0): print '(using ', flankedGeneRegsTempStaticFile.getLink( 'these genomic regions'), ' for genes)' expansionStr = ' flanked' if not ( upFlankSize == downFlankSize == 0) else '' idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn) idHtmlFileNamer.writeTextToFile('<br>'.join([ '<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>' % (URL_PREFIX, genome, quote(':'.join(tfTrackName + [tf])), tf) for tf in tfs ])) #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs])) print '<p>', idHtmlFileNamer.getLink( 'Inspect html file' ), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % ( expansionStr, genesPlural) idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn) idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep) print '<p>', idFileNamer.getLink( 'Inspect text file' ), ' listing all TF IDs occurring 1 or more times 
within your%s gene region%s of interest.</p>' % ( expansionStr, genesPlural) extractedTfbsFileNamer = GalaxyRunSpecificFile( ['tfbsInGeneRegions.bed'], galaxyFn) GalaxyInterface.extractTrackManyBins( genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath()) print '<p>', extractedTfbsFileNamer.getLink( 'Inspect bed-file' ), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % ( expansionStr, genesPlural)
def execute_batch(cls, choices, galaxyFn=None, username=''): print GalaxyInterface.getHtmlBeginForRuns(galaxyFn) html = HtmlCore() html.header('Batch run results') refSnps = cls.get_ref_snp(choices) #print refSnps batchMal = "$Tool[hb_variant_melting_profiles](" + '|'.join( ["'%s'"] * len(choices)) + ")" cmdList = [] for rs in refSnps: #if len(rs[4]) > 1: # rs = list(rs) # rs[4] = list(rs[4])[0] # rs = tuple(rs) fakeChoices = (choices.genome, 'Single', '__batch__') + rs + choices[8:] #print rs cmdList.append(batchMal % fakeChoices) #print cmdList GalaxyInterface.runBatchLines(cmdList, galaxyFn, username=username, printResults=False, printProgress=True) #print HtmlCore().styleInfoEnd() results_tsv = GalaxyRunSpecificFile(['results.tsv'], galaxyFn) results = results_tsv.getFile() dir = os.path.dirname(results_tsv.getDiskPath()) for i in range(0, len(cmdList)): header = True ri = 0 for resultline in open(os.path.join(dir, str(i), 'results.tsv')): if header: header = False if i == 0: headertxt = '#run\t' + resultline results.write(headertxt) html.tableHeader(headertxt.split('\t')) else: results.write(str(i) + '\t' + resultline) if resultline.count('?') == 0: link = '<a href="%d/html/chart-%d.html">%d (graph)</a>' % ( i, ri, i) else: link = str(i) html.tableLine([link] + resultline.split('\t')) ri += 1 results.close() html.tableFooter() # XXX: temp fix for HB/stable bug if URL_PREFIX == '/hb': print '</div>' print '<p><b>' + results_tsv.getLink('Download results') + '</b></p>' print html print GalaxyInterface.getHtmlEndForRuns()
def findTFsOccurringInRegions(cls, genome, tfSource, regionsBedFn, upFlankSize, downFlankSize, galaxyFn): uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath() #assert genome == 'hg18' #other genomes not supported. TF id links do not specify genome for pre-selection of analysis tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) assert tfTrackNameMappings != {}, 'No TF info for genome: %s' % genome tfTrackName = tfTrackNameMappings[tfSource] if (upFlankSize == downFlankSize == 0): flankedRegionsFn = regionsBedFn else: flankedRegionsFn = uniqueWebPath + os.sep + 'flankedRegs.bed' GalaxyInterface.expandBedSegments(regionsBedFn, flankedRegionsFn, genome, upFlankSize, downFlankSize) regSpec, binSpec = 'bed', flankedRegionsFn res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName) tfNames = res.getResDictKeys() #print 'RES: ', res.getGlobalResult()[tfNames[0]], type(res.getGlobalResult()[tfNames[0]]) pwm2tfids = safeshelve.open( os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'pwm2TFids.shelf']), 'r') tf2class = safeshelve.open( os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'TfId2Class.shelf']), 'r') pwmName2id = safeshelve.open( os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'pwmName2id.shelf']), 'r') #print tfNames[0],tfNames[1], ' VS ', pwm2tfids.keys()[0], len(pwm2tfids) #tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits (class %s))'%(tf, res.getGlobalResult()[tf]), '/'.join([tf2class[x] for x in pwm2tfids[tf]]) ) for tf in tfNames]))) #num hits, tfName, tfTextInclHits tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits )'%(tf, res.getGlobalResult()[tf]) + \ (' (class: %s)'%'/'.join(set([str(tf2class.get(x)) for x in pwm2tfids[pwmName2id[tf]] if x in tf2class]))\ if (tf in pwmName2id and pwmName2id[tf] in pwm2tfids and any([x in tf2class for x in pwm2tfids[pwmName2id[tf]]]))\ else '') ) \ for tf in tfNames])) ) #num hits, tfName, tfTextInclHits tfsPlural = 's' if len(tfs) != 1 else '' print 
'<p>There are %i TF%s targeting your regions of interest, using "%s" as source of TF occurrences.</p>' % ( len(tfs), tfsPlural, tfSource) expansionStr = ' flanked' if not ( upFlankSize == downFlankSize == 0) else '' idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn) idHtmlFileNamer.writeTextToFile('<br>'.join([ '<a href=/hbdev/hyper?track1=%s&track2=>%s</a>' % (quote(':'.join(tfTrackName + [tf[1]])), tf[2]) for tf in tfs ])) print '<p>', idHtmlFileNamer.getLink( 'Inspect html file' ), ' of all TF IDs occurring 1 or more times within your%s regions of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % ( expansionStr) idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn) idFileNamer.writeTextToFile( os.linesep.join([tf[2] for tf in tfs]) + os.linesep) print '<p>', idFileNamer.getLink( 'Inspect text file' ), ' listing all TF IDs occurring 1 or more times within your%s regions of interest.</p>' % ( expansionStr) extractedTfbsFileNamer = GalaxyRunSpecificFile( ['tfbsInGeneRegions.bed'], galaxyFn) GalaxyInterface.extractTrackManyBins( genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath(), True) print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink( 'Inspect bed-file' ), 'of all TF binding sites occurring within your%s regions of interest.</p>' % ( expansionStr) for dummy, tf, dummy2 in tfs: extractedTfbsFileNamer = GalaxyRunSpecificFile( [tf + '_tfbsInGeneRegions.bed'], galaxyFn) GalaxyInterface.extractTrackManyBins( genome, tfTrackName + [tf], regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath()) print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink( 'Binding sites of the TF %s' % tf, 'bed' ), 'occurring within your%s regions of interest (bed-file).</p>' % ( expansionStr)
def compareCutoffSchemes(maxNumSamples, h, fdrThreshold, totalNumTests, stepSize, numReplications, a, b, galaxyFn=None): print '<PRE>' print 'Comparing cutoff schemes with parameters: maxNumSamples=%i, h=%i, fdrThreshold=%.2f, totalNumTests=%i, numReplications=%i' % ( maxNumSamples, h, fdrThreshold, totalNumTests, numReplications) print 'stepSize: ', stepSize print 'H1 p-values drawn from beta with a=%.3f and b=%.3f' % (a, b) print 'Minimum achieveable p-value is %.5f, which gives minimum Bonferroni-corrected p-value of %.5f (compares to a fdr threshold of %.2f)' % ( 1.0 / maxNumSamples, (1.0 / maxNumSamples) * totalNumTests, fdrThreshold) #estimate time use: prevTime = time.time() Simulator(maxNumSamples, None, None, a, b, fdrThreshold).numSamplesAsFunctionOfNumH1(1, 1, 1) baseMeasure = time.time() - prevTime if type(stepSize) == int: numSteps = len(range(0, totalNumTests + 1, stepSize)) elif type(stepSize) == list: numSteps = len(stepSize) withOnlyMaxNumEstimate = baseMeasure * totalNumTests * numSteps * numReplications #print 'Estimated running time: between %i and %i seconds.' % (withOnlyMaxNumEstimate, withOnlyMaxNumEstimate*3) print 'Estimated running time: around %i seconds. 
(%.1f hours)' % ( withOnlyMaxNumEstimate, withOnlyMaxNumEstimate / 3600.0) sortedKeys, onlyMaxCutoff, onlyMaxNumRejected, onlyMaxType1Errors, onlyMaxType2Errors = Simulator( maxNumSamples, None, None, a, b, fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications) sortedKeys, seqMcCutoff, seqMcNumRejected, seqMcType1Errors, seqMcType2Errors = Simulator( maxNumSamples, h, None, a, b, fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications) sortedKeys, mcFdrCutoff, mcFdrNumRejected, mcFdrType1Errors, mcFdrType2Errors = Simulator( None, h, fdrThreshold, a, b, fdrThreshold, galaxyFn).numSamplesAsFunctionOfNumH1(totalNumTests, stepSize, numReplications) maxY = max(max(s) for s in [onlyMaxCutoff, seqMcCutoff, mcFdrCutoff]) #minY = min( min(s) for s in [onlyMaxCutoff, seqMcCutoff, McFdrCutoff]) minY = 0 print 'Time spent: ', time.time() - prevTime, ' secs' print '</PRE>' #plotStaticFile.getDiskPath(True) if galaxyFn is not None: #print 'Generating aggregate McFdr simulation figures' plotStaticFile = GalaxyRunSpecificFile(['mainPlot.png'], galaxyFn) if type(stepSize) is int: allNumH1s = range(0, totalNumTests + 1, stepSize) elif type(stepSize) is list: allNumH1s = stepSize for numH1 in allNumH1s: catalogStaticFile = GalaxyRunSpecificFile( [str(numH1), 'cat.html'], galaxyFn) print catalogStaticFile.getLink('Tests with #True H1s=%i' % numH1), '<br>' #plotStaticFile.openRFigure() #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72) #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxCutoff), ylim=r.unlist([minY,maxY]), type='l', xlab='Number of true H1s', ylab='Total MC samples' , col='black') #r.lines(r.unlist(sortedKeys), r.unlist(seqMcCutoff), col='red' ) #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrCutoff), col='green' ) #r.legend('topleft',['BasicMc','SeqMc','McFdr'],col=['black','red','green'],lty=1) plotStaticFile.plotRLines( sortedKeys, [onlyMaxCutoff, seqMcCutoff, 
mcFdrCutoff], xlab='Number of true H1s', ylab='Total MC samples', legend=['BasicMc', 'SeqMc', 'McFdr']) #r('dev.off()') #plotStaticFile.closeRFigure() print plotStaticFile.getLink( 'View main plot' ) + ' of sumSamples as function of #H1s.', '<br>' numRejectedPlotStaticFile = GalaxyRunSpecificFile( ['secondaryPlot.png'], galaxyFn) numRejectedPlotStaticFile.plotRLines( sortedKeys, [onlyMaxNumRejected, seqMcNumRejected, mcFdrNumRejected], xlab='Number of true H1s', ylab='Num rejected tests', legend=['BasicMc', 'SeqMc', 'McFdr']) #numRejectedPlotStaticFile.openRFigure() #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72) #r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxNumRejected), ylim=r.unlist([0,totalNumTests]), type='l', xlab='Number of true H1s', ylab='Num rejected tests',col='black' ) #r.lines(r.unlist(sortedKeys), r.unlist(seqMcNumRejected), col='red' ) #r.lines(r.unlist(sortedKeys), r.unlist(mcFdrNumRejected), col='green' ) #r.lines(r.unlist(sortedKeys), r.unlist(sortedKeys), col='black', lty='dotted' ) #As this corresponds to perfect estimation.. 
#r.legend('topleft',['BasicMc','SeqMc','McFdr','NumFromH1'],col=['black','red','green','black'],lty=[1,1,1,2]) #r('dev.off()') #numRejectedPlotStaticFile.closeRFigure() print numRejectedPlotStaticFile.getLink( 'View secondary plot' ) + ' of #true H1s vs #tests rejected.', '<br>' #Classification errors classificationErrorPlotStaticFile = GalaxyRunSpecificFile( ['errors.png'], galaxyFn) classificationErrorPlotStaticFile.openRFigure() yMax = max( max(x) for x in [ mcFdrType2Errors, mcFdrType1Errors, seqMcType2Errors, seqMcType1Errors, onlyMaxType2Errors, onlyMaxType1Errors ]) #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72) r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxType1Errors), ylim=r.unlist([0, yMax]), type='l', xlab='Number of true H1s', ylab='Type 1/2 errors', col='black') r.lines(r.unlist(sortedKeys), r.unlist(onlyMaxType2Errors), col='black', lty='dotted') r.lines(r.unlist(sortedKeys), r.unlist(seqMcType1Errors), col='red') r.lines(r.unlist(sortedKeys), r.unlist(seqMcType2Errors), col='red', lty='dotted') r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType1Errors), col='green') r.lines(r.unlist(sortedKeys), r.unlist(mcFdrType2Errors), col='green', lty='dotted') rpy1.legend('topleft', [ 'BasicMcType1', 'SeqMcType1', 'McFdrType1', 'BasicMcType2', 'SeqMcType2', 'McFdrType2' ], col=['black', 'red', 'green', 'black', 'red', 'green'], lty=[1, 1, 1, 2, 2, 2]) #r('dev.off()') classificationErrorPlotStaticFile.closeRFigure() print classificationErrorPlotStaticFile.getLink( 'View Type 1/2 error plot' ) + ' as function of number of true H1.', '<br>' #Classification errors onlyMaxAccuracy = [ sum(errors) * 1.0 / totalNumTests for errors in zip(onlyMaxType1Errors, onlyMaxType2Errors) ] seqMcAccuracy = [ sum(errors) * 1.0 / totalNumTests for errors in zip(seqMcType1Errors, seqMcType2Errors) ] mcFdrAccuracy = [ sum(errors) * 1.0 / totalNumTests for errors in zip(mcFdrType1Errors, mcFdrType2Errors) ] accuracyPlotStaticFile = 
GalaxyRunSpecificFile(['accuracy.png'], galaxyFn) accuracyPlotStaticFile.openRFigure() yMax = 0.2 #just set ad hoc here.. #r.png(filename=plotFn, height=600, width=800, units='px', pointsize=12, res=72) r.plot(r.unlist(sortedKeys), r.unlist(onlyMaxAccuracy), ylim=r.unlist([0, yMax]), type='l', xlab='Number of true H1s', ylab='Accuracy', col='black') r.lines(r.unlist(sortedKeys), r.unlist(seqMcAccuracy), col='red') r.lines(r.unlist(sortedKeys), r.unlist(mcFdrAccuracy), col='green') rpy1.legend('topleft', ['BasicMc', 'SeqMc', 'McFdr', 'NumFromH1'], col=['black', 'red', 'green'], lty=[1, 1, 1]) #r('dev.off()') accuracyPlotStaticFile.closeRFigure() print accuracyPlotStaticFile.getLink( 'View accuracy plot' ) + ' as function of number of true H1.', '<br>' #False positive rates onlyMaxFpr = [ float(fp) / pos if pos != 0 else 0 for fp, pos in zip(onlyMaxType1Errors, onlyMaxNumRejected) ] seqMcFpr = [ float(fp) / pos if pos != 0 else 0 for fp, pos in zip(seqMcType1Errors, seqMcNumRejected) ] mcFdrFpr = [ float(fp) / pos if pos != 0 else 0 for fp, pos in zip(mcFdrType1Errors, mcFdrNumRejected) ] fprPlotStaticFile = GalaxyRunSpecificFile(['fpr.png'], galaxyFn) fprPlotStaticFile.plotRLines(sortedKeys, [onlyMaxFpr, seqMcFpr, mcFdrFpr], legend=['BasicMc', 'SeqMc', 'McFdr']) print fprPlotStaticFile.getLink( 'View FPR plot') + ' as function of number of true H1.', '<br>'
def execute(cls, choices, galaxyFn=None, username=''):
    '''Run the tool when the execute-button is pushed by the web-user.

    Output is printed as HTML to standard out, which is directed to a
    results page in the Galaxy history. If getOutputFormat is anything
    else than HTML, the output should be written to the file with path
    galaxyFn. If needed, StaticFile can be used to get a path where
    additional files can be put (e.g. generated image files).

    choices is a list of selections made by the web-user in each options
    box. In the (currently disabled) main body, all entries but the last
    are converted to float and read as pVal, minNumSamples, maxNumSamples,
    chunkSize and numTests; the last entry selects the p-value estimator
    ('Davison' or 'ML').

    NOTE: the tool is temporarily overridden. It only prints the output of
    analyzeSampleNumAccuracy for 100, 1000 and 10000 samples (three runs
    each) and then returns; everything after the bare return statement is
    unreachable as the code stands.
    '''
    # --- Temporary override: run the McFdr2 sandbox simulation and stop. ---
    print 'temporarily overriding tool, running McFdr2 simulation..'
    from test.sandbox.extra.McFdr2 import analyzeSampleNumAccuracy
    for numSamples in [100, 1000, 10000]:
        print ''
        # Trailing commas keep the three repeated results on the same line.
        print 'numSamples %s: ' % numSamples,
        for i in range(3):
            print analyzeSampleNumAccuracy(numSamples),
    return

    # --- Original tool body; unreachable while the override above is in place. ---
    from proto.RSetup import r
    from numpy import array, minimum
    pVal, minNumSamples, maxNumSamples, chunkSize, numTests = [
        float(x) for x in choices[:-1]
    ]
    print 'pVal:%.2f, minNumSamples:%i, maxNumSamples:%i, chunkSize:%i, numTests:%i' % (
        pVal, minNumSamples, maxNumSamples, chunkSize, numTests)
    # The sampling loop below advances in steps of chunkSize and must land
    # exactly on maxNumSamples.
    assert (maxNumSamples - minNumSamples) % chunkSize == 0
    assert numTests == 1 #More not yet supported. Should in McFdr be something like the min-max, i.e. the minimum across iterations of the maximum p-value across tests..
    pValEstimation = choices[-1]
    assert pValEstimation in ['Davison', 'ML']
    # pFunc maps (number of extreme samples k, number of samples n) to a
    # p-value estimate: (k+1)/(n+1) for 'Davison', k/n for 'ML'.
    if pValEstimation == 'Davison':
        pFunc = lambda k, n: 1.0 * (k + 1) / (n + 1)
    else:
        pFunc = lambda k, n: 1.0 * (k) / n
    # Number of simulated replicates.
    numRepl = 10**4
    # Estimated p-values when always using the minimum / maximum number of
    # samples; the extreme counts are drawn with R's rbinom.
    stdAtMin = [
        pFunc(k, minNumSamples)
        for k in r.rbinom(numRepl, minNumSamples, pVal)
    ]
    stdAtMax = [
        pFunc(k, maxNumSamples)
        for k in r.rbinom(numRepl, maxNumSamples, pVal)
    ]
    # Sequential scheme: start at minNumSamples, add chunkSize samples per
    # round, and keep for each replicate the smallest p-value estimate seen.
    mcFdrBestPVals = array([1.0] * numRepl)
    mcFdrSamples = minNumSamples #array([minNumSamples]*numRepl)
    mcFdrExtremes = array(r.rbinom(numRepl, minNumSamples, pVal))
    while mcFdrSamples < maxNumSamples:
        tempMcFdrPVals = pFunc(mcFdrExtremes, mcFdrSamples)
        mcFdrBestPVals = minimum(mcFdrBestPVals, tempMcFdrPVals)
        mcFdrSamples += chunkSize
        mcFdrExtremes += array(r.rbinom(numRepl, chunkSize, pVal))
    # Final evaluation at the last sample count reached by the loop.
    tempMcFdrPVals = pFunc(mcFdrExtremes, mcFdrSamples)
    mcFdrBestPVals = minimum(mcFdrBestPVals, tempMcFdrPVals)
    assert mcFdrSamples == maxNumSamples
    print 'Mean values<br>'
    print 'AtMin:%.7f, AtMax:%.7f, McFdr:%.7f' % tuple(
        [array(x).mean() for x in [stdAtMin, stdAtMax, mcFdrBestPVals]])
    # 100 equal-width histogram bins covering [0, 2*pVal], plus one final
    # bin reaching up to 1.0.
    breaks = [pVal * 2 * x / 100.0 for x in range(0, 101)] + [1.0]
    histRes = r.hist(stdAtMin, breaks=breaks, plot=False)
    xVals = histRes['mids']
    yValsStdAtMin = histRes['density']
    histRes = r.hist(stdAtMax, breaks=breaks, plot=False)
    # All three histograms share the same breaks, so the mids must agree.
    assert xVals == histRes['mids']
    yValsStdAtMax = histRes['density']
    histRes = r.hist(mcFdrBestPVals, breaks=breaks, plot=False)
    assert xVals == histRes['mids']
    yValsMcFdr = histRes['density']
    # NOTE(review): GalaxyRunSpecificFile is not imported inside this
    # function; presumably it is imported at module level - confirm.
    staticFile = GalaxyRunSpecificFile(['pDistr.png'], galaxyFn)
    staticFile.openRFigure()
    staticFile.plotRLines(xVals, [yValsStdAtMin, yValsStdAtMax, yValsMcFdr],
                          alsoOpenAndClose=False,
                          xlab='p-value',
                          ylab='density',
                          xlim=[0, 2 * pVal])
    # Dotted vertical line at the p-value used for the binomial draws.
    r.abline(v=pVal, lty='dotted', col='yellow')
    staticFile.closeRFigure()
    print staticFile.getLink('View estimated pval distribution')
def execute(cls, choices, galaxyFn=None, username=''):
    '''Compute and visualise the overlap between target regions and a GSuite.

    For every reference track of the selected GSuite, ProportionCountStat is
    run in each region/bin of the analysis. The resulting matrix (one row
    per reference track, one column per region) is
      * written to a new tabular history element titled 'result',
      * rendered to static heatmap png/pdf files, and
      * shown on the results page, either as the static image (10000 cells
        or more) or as an interactive heatmap with accompanying
        segment-length / gap-length charts.

    choices holds the selections made by the web-user; the attributes read
    here are gsuite, targetTrack, isBasic, selectColumns and
    colorMapSelectList (plus whatever UserBinMixin.getRegsAndBinsSpec
    reads). galaxyFn is the path of the Galaxy output file, used to derive
    run-specific static files. username is passed on to the analysis run.

    The finished HTML page is printed to standard out.
    '''
    from quick.multitrack.MultiTrackCommon import getGSuiteDataFromGalaxyTN
    trackTitles, refTrackNameList, genome = getGSuiteDataFromGalaxyTN(choices.gsuite)

    queryTrackName = ExternalTrackManager.extractFnFromGalaxyTN(choices.targetTrack)
    if choices.isBasic:
        # Basic mode: the target track file itself defines the analysis bins.
        suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(choices.targetTrack, False)
        regSpec = suffix
        binSpec = queryTrackName
    else:
        regSpec, binSpec = UserBinMixin.getRegsAndBinsSpec(choices)

    from gold.gsuite.GSuiteConstants import TITLE_COL
    from gold.gsuite.GSuite import GSuite
    from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
    from gold.gsuite.GSuiteEditor import selectColumnsFromGSuite

    # --- Run the statistic once per reference track. ---
    analysisDef = '-> ProportionCountStat'
    staticFile = []
    results = []
    for refTrack in refTrackNameList:
        res = GalaxyInterface.runManual([refTrack], analysisDef, regSpec, binSpec, genome,
                                        username=username, galaxyFn=galaxyFn,
                                        printRunDescription=False, printResults=False,
                                        printProgress=False)
        segCoverageProp = [res[seg]['Result'] for seg in res.getAllRegionKeys()]
        results.append(segCoverageProp)

        regFileNamer = GalaxyRunSpecificFile(refTrack, galaxyFn)
        staticFile.append([regFileNamer.getLink('Download bed-file'),
                           regFileNamer.getLoadToHistoryLink('Download bed-file to History')])

    # --- Row labels: track title, optionally followed by a metadata value. ---
    refGSuite = getGSuiteFromGalaxyTN(choices.gsuite)
    useTitles = (TITLE_COL == choices.selectColumns)
    if useTitles:
        selected = trackTitles
    else:
        selected = refGSuite.getAttributeValueList(choices.selectColumns)

    yAxisNameOverMouse = []
    metadataAll = []
    for idx, value in enumerate(selected):
        if value is None:
            yAxisNameOverMouse.append(str(trackTitles[idx]) + ' --- ' + 'None')
        elif useTitles:
            yAxisNameOverMouse.append(value.replace('\'', '').replace('"', ''))
        else:
            # Quotes are stripped because the labels end up inside generated
            # JavaScript.
            metadata = str(value.replace('\'', '').replace('"', ''))
            yAxisNameOverMouse.append(str(trackTitles[idx]) + ' --- ' + metadata)
            metadataAll.append(metadata)

    # One colour per distinct metadata value.
    colorListForYAxisNameOverMouse = []
    if len(metadataAll) > 0:
        import quick.webtools.restricted.visualization.visualizationGraphs as vg
        cList = vg.colorList().fullColorList()
        colorIndex = dict((m, pos) for pos, m in enumerate(list(set(metadataAll))))
        colorListForYAxisNameOverMouse = [cList[colorIndex[m]] for m in metadataAll]

    # --- Region labels, segment lengths and gaps between consecutive regions.
    # The regions are taken from the result of the last reference track.
    # NOTE(review): the first and last regions contribute no entry to
    # startEnd, so it is shorter than startEndInterval - confirm intended.
    regionKeys = res.getAllRegionKeys()
    lastIdx = len(regionKeys) - 1
    startEndInterval = []
    startEnd = []
    extraX = []
    rowLabel = []
    for regionIdx, ch in enumerate(regionKeys):
        rowLabel.append(str(ch.chr) + ":" + str(ch.start) + "-" + str(ch.end) + str(' (Pos)' if ch.strand else ' (Neg)'))
        if regionIdx != 0 and regionIdx != lastIdx:
            gap = ch.start - end
            if gap > 0:
                startEnd.append(gap)
            else:
                # No positive gap to the previous region: plot nothing and
                # mark the boundary with a vertical line instead.
                startEnd.append('null')
                extraX.append("""{ color: 'orange', width: 5, value: '""" + str(regionIdx-0.5) + """' }""")
        startEndInterval.append(ch.end - ch.start)
        end = ch.end

    extraXAxis = 'plotLines: [ ' + ",".join(extraX) + """ ], """

    # --- Build the results page. ---
    import quick.webtools.restricted.visualization.visualizationPlots as vp

    htmlCore = HtmlCore()
    htmlCore.begin()
    htmlCore.divBegin(divId='results-page')
    htmlCore.divBegin(divClass='results-section')
    htmlCore.divBegin('plotDiv')
    htmlCore.line(vp.addJSlibs())
    htmlCore.line(vp.useThemePlot())
    htmlCore.line(vp.addJSlibsExport())
    htmlCore.line(vp.axaddJSlibsOverMouseAxisisPopup())
    htmlCore.line(vp._addGuidelineV1())
    htmlCore.line(vp.addJSlibsHeatmap())

    from config.Config import DATA_FILES_PATH
    # This import rebinds GalaxyRunSpecificFile; the heatmap files below are
    # created through proto.StaticFile, exactly as before.
    from proto.StaticFile import StaticFile, GalaxyRunSpecificFile

    htmlCore.divBegin()
    htmlCore.divEnd()

    # Tabular dump of the result matrix as a separate history element. The
    # file is closed deterministically so the element is complete on disk
    # before the page is printed.
    with open(cls.makeHistElement(galaxyExt='tabular', title='result'), 'w') as writeFile:
        writeFile.write('Track' + '\t' + '\t'.join(rowLabel) + '\n')
        for rowIdx, rList in enumerate(results):
            writeFile.write(str(yAxisNameOverMouse[rowIdx]) + '\t' + '\t'.join([str(val) for val in rList]) + '\n')

    fileOutput = GalaxyRunSpecificFile(['heatmap.png'], galaxyFn)
    ensurePathExists(fileOutput.getDiskPath())

    fileOutputPdf = GalaxyRunSpecificFile(['heatmap.pdf'], galaxyFn)
    ensurePathExists(fileOutputPdf.getDiskPath())

    cls.generateStaticRPlot(results, colorListForYAxisNameOverMouse, rowLabel, yAxisNameOverMouse,
                            colorMaps[choices.colorMapSelectList],
                            fileOutput.getDiskPath(), fileOutputPdf.getDiskPath())

    htmlCore.divBegin(divId='heatmap', style="padding: 10px 0 px 10 px 0px;margin: 10px 0 px 10 px 0px")
    htmlCore.link('Download heatmap image', fileOutputPdf.getURL())
    htmlCore.divEnd()

    # Total number of heatmap cells. Summing the row lengths (rather than
    # indexing results[1]) also works when there is only one reference track.
    numCells = sum(len(rList) for rList in results)
    if numCells >= 10000:
        # Too many cells for the interactive plot; show the static image.
        htmlCore.image(fileOutput.getURL())
    else:
        # Min-max normalisation of all values to [0, 1].
        minVal = 1000000000
        maxVal = -1000000000
        for rList in results:
            for val in rList:
                if minVal > val:
                    minVal = val
                if maxVal < val:
                    maxVal = val

        if maxVal - minVal != 0:
            resultNormalised = [[(val - minVal) / (maxVal - minVal) for val in rList]
                                for rList in results]
            addText = '(normalised to [0, 1])'
        else:
            # All values identical: nothing to normalise.
            resultNormalised = results
            addText = ''

        hm, heatmapPlotNumber, heatmapPlot = vp.drawHeatMap(
            resultNormalised,
            colorMaps[choices.colorMapSelectList],
            label='this.series.xAxis.categories[this.point.x] + ' + "'<br >'" + ' + yAxisNameOverMouse[this.point.y] + ' + "'<br>Overlap proportion" + str(addText) + ": <b>'" + ' + this.point.value + ' + "'</b>'",
            yAxisTitle= 'Reference tracks',
            categories=rowLabel,
            tickInterval=1,
            plotNumber=3,
            interaction=True,
            otherPlotNumber=1,
            titleText='Overlap with reference tracks for each local region',
            otherPlotData=[startEnd, startEndInterval],
            overMouseAxisX=True,
            overMouseAxisY=True,
            yAxisNameOverMouse=yAxisNameOverMouse,
            overMouseLabelY=" + 'Track: '" + ' + this.value + ' + "' '" + ' + yAxisNameOverMouse[this.value] + ',
            overMouseLabelX = ' + this.value.substring(0, 20) +',
            extrOp = staticFile
        )
        htmlCore.line(hm)
        htmlCore.line(vp.drawChartInteractionWithHeatmap(
            [startEndInterval, startEnd],
            tickInterval=1,
            type='line',
            categories=[rowLabel, rowLabel],
            seriesType=['line', 'column'],
            minWidth=300,
            height=500,
            lineWidth=3,
            titleText=['Lengths of segments (local regions)','Gaps between consecutive segments'],
            label=['<b>Length: </b>{point.y}<br/>', '<b>Gap length: </b>{point.y}<br/>'],
            subtitleText=['',''],
            yAxisTitle=['Lengths','Gap lengths'],
            seriesName=['Lengths','Gap lengths'],
            xAxisRotation=90,
            legend=False,
            extraXAxis=extraXAxis,
            heatmapPlot=heatmapPlot,
            heatmapPlotNumber=heatmapPlotNumber,
            overMouseAxisX=True,
            overMouseLabelX = ' + this.value.substring(0, 20) +'
        ))

    htmlCore.divEnd()
    htmlCore.divEnd()
    htmlCore.divEnd()
    htmlCore.end()

    htmlCore.hideToggle(styleClass='debug')

    print(htmlCore)
def execute(cls, choices, galaxyFn=None, username=''):
    '''Count occurrences of a k-mer along the genome and report the result.

    Called when the execute-button is pushed by the web-user. The output is
    printed as HTML to standard out, which is directed to a results page in
    the Galaxy history.

    choices is a list of the selections made by the web-user:
    choices[0] is the genome, choices[1] the k-mer (lower-cased before use)
    and choices[2] the analysis region specification.
    '''
    from quick.application.UserBinSource import parseRegSpec
    from quick.application.GalaxyInterface import GalaxyInterface
    from quick.util.GenomeInfo import GenomeInfo
    from proto.hyperbrowser.HtmlCore import HtmlCore

    genome, nmer, regSpec = choices[0], choices[1].lower(), choices[2]

    # Derive the bin size from the k-mer and the analysis regions; a bin
    # size of None is expressed as '*' in the bin specification.
    analysisRegions = parseRegSpec(regSpec, genome)
    binSize = cls._calcBinSize(nmer, analysisRegions)
    if binSize is None:
        binSpec = '*'
    else:
        binSpec = str(binSize)
    numBins = len(AutoBinner(analysisRegions, binSize))

    # Track name of the k-mer: <nmer property track> / '<k>-mers' / <k-mer>.
    nmerBaseTrackName = GenomeInfo.getPropertyTrackName(genome, 'nmer')
    kmerTrackName = nmerBaseTrackName + [str(len(nmer)) + '-mers', nmer]

    # Run the counting analysis; its own output is wrapped in a 'debug'
    # style section.
    print(str(HtmlCore().styleInfoBegin(styleClass='debug')))
    GalaxyInterface.run(kmerTrackName,
                        [''],
                        'Counts: The number of track1-points -> CountPointStat',
                        regSpec,
                        binSpec,
                        genome,
                        galaxyFn)
    print(str(HtmlCore().styleInfoEnd()))

    # Links to the plot and the bedgraph file of the run.
    plotFileNamer = GalaxyRunSpecificFile(
        ['0', 'CountPointStat_Result_gwplot.pdf'], galaxyFn)
    textualDataFileNamer = GalaxyRunSpecificFile(
        ['0', 'CountPointStat_Result.bedgraph'], galaxyFn)
    plotLink = plotFileNamer.getLink('plot')
    textLink = textualDataFileNamer.getLink('textual data')

    page = HtmlCore()
    # NOTE(review): the text ends in a literal '</p>'; paragraph() presumably
    # adds its own closing tag - confirm whether this is intended.
    page.paragraph(
        'Inspect k-mer frequency variation as a %s or as underlying %s.</p>'
        % (plotLink, textLink))
    page.divider()
    page.paragraph(
        'The occurrence frequency of your specified k-mer ("%s") has been computed along the genome, within your specified analysis region ("%s").'
        % (nmer, regSpec))
    page.paragraph(
        'The analysis region was divided into %i bins, based on calculations trying to find appropriate bin size (get enough data per bin and restrict maximum number of bins).'
        % numBins)

    # Same track name without the '<k>-mers' level, used for the link to a
    # manually configurable analysis.
    linkTrackName = kmerTrackName[:-2] + kmerTrackName[-1:]
    preSelectedAnalysisUrl = createHyperBrowserURL(genome,
                                                   linkTrackName,
                                                   [''],
                                                   analysis='Counts',
                                                   method='__custom__',
                                                   region=regSpec,
                                                   binsize=binSpec)
    newAnalysisLink = str(HtmlCore().link('new analysis', preSelectedAnalysisUrl))

    page.divider()
    page.paragraph(
        'If you do not find the inferred bin size to be appropriate, you can set this manually in a '
        + newAnalysisLink + '.')
    print(str(page))