Example #1
 def _constructBins(regSpec, binSpec, genome, trackNames):
     # Construct and check bins
     try:
         from quick.application.GalaxyInterface import GalaxyInterface
         userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome, trackNames)
         return [None, userBinSource]
     except Exception, e:
         results = Results([], [], '')
         results.addError(InvalidRunSpecException('Error in specification of analysis region or binsize: ' + str(e)))
         logMessage('Error in specification of analysis region (' + regSpec +') or binsize: (' + binSpec + ')')
         if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
             raise
         return [results, None]
Example #2
    def _constructBins(regSpec, binSpec, genome, trackName1, trackName2):
        #Construct and check bins
        try:
            #userBinSource= UserBinSource(regSpec, binSpec)
            from quick.application.GalaxyInterface import GalaxyInterface
#            from config.Config import DEFAULT_GENOME
            userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome, trackName1, trackName2)
            return [None, userBinSource]
        except Exception, e:
            #results = Results(trackName1, trackName2, statClassName)
            results = Results([],[],'')
            results.addError(InvalidRunSpecException('Error in specification of analysis region or binsize: ' + str(e)))
            logMessage('Error in specification of analysis region (' + regSpec +') or binsize: (' + binSpec + ')')
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            return [results, None]
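Both variants of _constructBins above return a two-element list: [None, userBinSource] on success, or [results, None] when an InvalidRunSpecException is recorded. A minimal, self-contained sketch (with a stand-in function, not the project code) of how a caller typically unpacks that convention:

    # Stand-in with the same [error, value] return convention as _constructBins above.
    def _constructBinsStub(regSpec, binSpec):
        if not binSpec:
            return ['Error in specification of analysis region or binsize', None]
        return [None, (regSpec, binSpec)]  # success: the second element is the bin source

    errorResult, userBinSource = _constructBinsStub('chr1', '10m')
    if errorResult is not None:
        print(errorResult)        # report the error and stop
    else:
        print(userBinSource)      # proceed with the bin source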
Example #3
    def runJob(batchLine, genome, fullAccess, galaxyFn=None, printProgress=True):
        bc = BatchRunner.parseBatchLine(batchLine, genome, fullAccess)
        if bc.errorResult is not None:
            return bc.errorResult
        
        #Try a full run, and return either results or an exception
        try:
            #track = Track(trackName1)
            #track2 = Track(trackName2)
            #if 'tf1' in paramDict:
            #    track.setFormatConverter(formatConverter)
            
            #results = StatRunner.run(userBinSource , Track(trackName1), Track(trackName2), \
            #                         wrapClass(STAT_CLASS_DICT[statClassName], keywords=paramDict) )
            #results = StatRunner.run(userBinSource , track, track2, \
            #                         wrapClass(STAT_CLASS_DICT[statClassName], keywords=paramDict) )
            fullRunParams = {}
            
            if USE_PARALLEL:
                #if galaxyFn == None: #then this is a test
                uniqueId = time.time()
                #else:
                    #uniqueId = extractIdFromGalaxyFn(galaxyFn)[1]
                    
                fullRunParams["uniqueId"] = uniqueId
            
            if bc.cleanedTrackNameIntensity is not None:
                fullRunParams['trackNameIntensity'] = '|'.join(tuple(bc.cleanedTrackNameIntensity))
            
            analysisDefParams = [ '[' + key + '=' + value + ']' for key,value in bc.paramDict.items()]
            analysisDef = ''.join(analysisDefParams) + '->' + bc.statClassName

            from quick.application.GalaxyInterface import GalaxyInterface
            
            GalaxyInterface._tempAnalysisDefHacks(analysisDef)
            
            if printProgress:
                print 'Corresponding batch command line:<br>' + \
                    GalaxyInterface._revEngBatchLine(bc.trackName1, bc.trackName2, bc.trackNameIntensity, analysisDef, bc.regSpec, bc.binSpec, genome) + '<br><br>'
            
            results = AnalysisDefJob(analysisDef, bc.cleanedTrackName1, bc.cleanedTrackName2, bc.userBinSource, galaxyFn=galaxyFn, **fullRunParams).run(printProgress)
            presCollectionType = results.getPresCollectionType()

            if len(results.getResDictKeys()) > 0 and GalaxyInterface.APPEND_ASSEMBLY_GAPS and presCollectionType=='standard':
                if USE_PARALLEL:
                    gapRes = AssemblyGapJob(bc.userBinSource, genome, uniqueId=uniqueId).run(printProgress)
                else:
                    gapRes = AssemblyGapJob(bc.userBinSource, genome).run(printProgress)
                results.includeAdditionalResults(gapRes, ensureAnalysisConsistency=False)

        except Exception, e:
            #print 'NOWAG BExc'
            results = Results(bc.cleanedTrackName1, bc.cleanedTrackName2, bc.statClassName)
            results.addError(e)
            logException(e,message='Error in batch run')
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            return results
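The analysisDef string that runJob assembles from bc.paramDict and bc.statClassName (the analysisDefParams/analysisDef lines above) is simply a run of bracketed key=value pairs followed by '->' and the statistic class name. A small stand-alone sketch of that format, with made-up parameter and class names:

    # Made-up values; only the string construction mirrors runJob above.
    paramDict = {'tf1': 'SegmentToStartPointFormatConverter'}   # hypothetical parameter
    statClassName = 'CountPointStat'                            # hypothetical statistic class
    analysisDefParams = ['[' + key + '=' + value + ']' for key, value in paramDict.items()]
    analysisDef = ''.join(analysisDefParams) + '->' + statClassName
    print(analysisDef)   # [tf1=SegmentToStartPointFormatConverter]->CountPointStat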
Example #4
    def parseBatchLine(batchLine, genome, fullAccess):
        if batchLine[0] == '#' or batchLine.strip() == '':
            return

        from urllib import unquote

        #Split and check number of columns
        cols = [x for x in batchLine.strip().split(BATCH_COL_SEPARATOR)]
        if len(cols) != 6:
            results = Results(['N/A'], ['N/A'], 'N/A')
            #results.addResultComponent( 'Invalid',InvalidRunResultComponent('Error in batch specification. 6 columns are required, while '\
            #                            + str(len(cols)) + ' are given.'))
            results.addError(InvalidRunSpecException('Error in batch specification. 6 columns are required, while '\
                                        + str(len(cols)) + ' are given: ' + batchLine))
            return results, None, None, None, None

        bc = BatchContents()

        bc.regSpec = cols[1]
        bc.binSpec = unquote(cols[2])
        from quick.application.ExternalTrackManager import ExternalTrackManager
        if ExternalTrackManager.isGalaxyTrack(bc.binSpec.split(':')):
            bc.binSpec = ExternalTrackManager.extractFnFromGalaxyTN(
                bc.binSpec.split(':'))

        try:
            from quick.application.GalaxyInterface import GalaxyInterface
            bc.trackName1 = [unquote(x) for x in cols[3].split(':')]
            bc.trackName2 = [unquote(x) for x in cols[4].split(':')]
            bc.cleanedTrackName1, bc.cleanedTrackName2 = GalaxyInterface._cleanUpTracks(
                [bc.trackName1, bc.trackName2], genome, realPreProc=True)

            bc.cleanedTrackName1 = BatchRunner._inferTrackName(
                bc.cleanedTrackName1, genome, fullAccess)
            bc.cleanedTrackName2 = BatchRunner._inferTrackName(
                bc.cleanedTrackName2, genome, fullAccess)

        except (InvalidRunSpecException, IdenticalTrackNamesError), e:
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            bc.errorResult = Results(['N/A'], ['N/A'], 'N/A')
            bc.errorResult.addError(e)
            return bc
Example #5
    def parseBatchLine(batchLine, genome, fullAccess):
        if batchLine[0] == '#' or batchLine.strip() == '':
            return
            
        from urllib import unquote
        
        # Split and check number of columns
        cols = [x for x in batchLine.strip().split(BATCH_COL_SEPARATOR)]
        if len(cols) != 6:
            results = Results(['N/A'], ['N/A'], 'N/A')
            results.addError(InvalidRunSpecException('Error in batch specification. 6 columns are required, while '\
                                        + str(len(cols)) + ' are given: ' + batchLine))
            return results, None, None, None, None 

        bc = BatchContents()
        
        bc.regSpec = cols[1]
        bc.binSpec = unquote(cols[2])

        from quick.application.ExternalTrackManager import ExternalTrackManager
        if ExternalTrackManager.isGalaxyTrack(bc.binSpec.split(':')):
            bc.binSpec = ExternalTrackManager.extractFnFromGalaxyTN(bc.binSpec.split(':'))

        bc.statClassName, bc.paramDict = BatchRunner._parseClassAndParams(cols[5])

        bc.trackNames = [[unquote(x) for x in cols[i].split(':')] for i in [3, 4]]
        if 'trackNameIntensity' in bc.paramDict:
            bc.trackNames.append(convertTNstrToTNListFormat(bc.paramDict['trackNameIntensity'], doUnquoting=True))

        from quick.application.GalaxyInterface import GalaxyInterface

        partlyCleanedTrackNames = GalaxyInterface._cleanUpTracks(bc.trackNames, genome, realPreProc=True)

        try:
            bc.cleanedTrackNames = BatchRunner._inferTrackNames(partlyCleanedTrackNames, genome, fullAccess)

        except (InvalidRunSpecException,IdenticalTrackNamesError), e:
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            bc.errorResult = Results(['N/A'],['N/A'],'N/A')
            bc.errorResult.addError(e)
            return bc
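parseBatchLine expects exactly six BATCH_COL_SEPARATOR-separated columns, with URL-quoted, ':'-separated track names in columns 3 and 4. A minimal sketch of that per-column decoding; the separator value and the track names are illustrative assumptions, not taken from the source:

    from urllib import unquote   # Python 2, as in the examples above (urllib.parse.unquote in Python 3)

    BATCH_COL_SEPARATOR = '\t'   # assumed separator, for illustration only

    batchCols = 'run1\tchr1\t10m\tGenes%20and%20gene%20subsets:Genes:Refseq\tdummy%20track\tCountPointStat'.split(BATCH_COL_SEPARATOR)
    assert len(batchCols) == 6   # parseBatchLine returns an error Results object otherwise

    trackName1 = [unquote(x) for x in batchCols[3].split(':')]
    print(trackName1)            # ['Genes and gene subsets', 'Genes', 'Refseq']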
Example #6
    def runJob(batchLine, genome, fullAccess):
        if batchLine[0] == '#' or batchLine.strip()=='':
            return
            
        from urllib import unquote
        
        #Split and check number of columns
        cols = [x for x in batchLine.strip().split(BATCH_COL_SEPARATOR)]
        if len(cols) != 6:
            results = Results(['N/A'],['N/A'],'N/A')
            #results.addResultComponent( 'Invalid',InvalidRunResultComponent('Error in batch specification. 6 columns are required, while '\
            #                            + str(len(cols)) + ' are given.'))
            results.addError(InvalidRunSpecException('Error in batch specification. 6 columns are required, while '\
                                        + str(len(cols)) + ' are given: ' + batchLine))
            return results

        #print 'COL2: ',cols[2]
        cols[2] = unquote(cols[2])
        #print 'COL2: ',cols[2]
        from quick.application.ExternalTrackManager import ExternalTrackManager
        if ExternalTrackManager.isGalaxyTrack(cols[2].split(':')):
            cols[2] = ExternalTrackManager.extractFnFromGalaxyTN(cols[2].split(':'))
            #print 'COL2: ',cols[2]
        
        try:
            from quick.application.GalaxyInterface import GalaxyInterface
            trackName1 = [unquote(x) for x in cols[3].split(':')]
            trackName2 = [unquote(x) for x in cols[4].split(':')]
            cleanedTrackName1, cleanedTrackName2 = GalaxyInterface._cleanUpTracks([trackName1, trackName2], genome, realPreProc=True)

            cleanedTrackName1 = BatchRunner._inferTrackName(':'.join(cleanedTrackName1), genome, fullAccess)
            cleanedTrackName2 = BatchRunner._inferTrackName(':'.join(cleanedTrackName2), genome, fullAccess)
            
        except (InvalidRunSpecException,IdenticalTrackNamesError), e:
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            results = Results(['N/A'],['N/A'],'N/A')
            results.addError(e)
            return results
Example #7
    def runJob(batchLine, genome, fullAccess, galaxyFn=None, printProgress=True):
        bc = BatchRunner.parseBatchLine(batchLine, genome, fullAccess)
        if bc.errorResult is not None:
            return bc.errorResult
        
        #Try a full run, and return either results or an exception
        try:
            #track = Track(trackName1)
            #track2 = Track(trackName2)
            #if 'tf1' in paramDict:
            #    track.setFormatConverter(formatConverter)
            
            #results = StatRunner.run(userBinSource , Track(trackName1), Track(trackName2), \
            #                         wrapClass(STAT_CLASS_DICT[statClassName], keywords=paramDict) )
            #results = StatRunner.run(userBinSource , track, track2, \
            #                         wrapClass(STAT_CLASS_DICT[statClassName], keywords=paramDict) )
            fullRunParams = {}
            
            if USE_PARALLEL:
                # TODO: Requirements for parallel runs should not be added in places like these. Parallelization
                #  should be a feature of the job runner somehow

                #if galaxyFn == None: #then this is a test
                uniqueId = time.time()
                #else:
                    #uniqueId = extractIdFromGalaxyFn(galaxyFn)[1]
                    
                fullRunParams["uniqueId"] = uniqueId

            from quick.application.GalaxyInterface import GalaxyInterface

            analysisDefParams = [ '[' + key + '=' + value + ']' for key,value in bc.paramDict.items()]
            analysisDef = ''.join(analysisDefParams) + '->' + bc.statClassName

            # TODO: Keeping the ugly accesses to private methods in GalaxyInterface for now. To be refactored.
            trackNames, analysisDef = GalaxyInterface._cleanUpAnalysisDef(bc.cleanedTrackNames, analysisDef)

            if printProgress:
                revEngBatchLine = RunDescription.getRevEngBatchLine(
                    analysisDef, bc.trackNames, bc.cleanedTrackNames, bc.regSpec, bc.binSpec, genome
                )

                print 'Corresponding batch command line:<br>{}<br><br>'.format(revEngBatchLine)

            results = AnalysisDefJob(analysisDef, bc.cleanedTrackNames[0], bc.cleanedTrackNames[1], bc.userBinSource, galaxyFn=galaxyFn, **fullRunParams).run(printProgress)
            presCollectionType = results.getPresCollectionType()

            if len(results.getResDictKeys()) > 0 and GalaxyInterface.APPEND_ASSEMBLY_GAPS and presCollectionType=='standard':
                if USE_PARALLEL:
                    gapRes = AssemblyGapJob(bc.userBinSource, genome, uniqueId=uniqueId).run(printProgress)
                else:
                    gapRes = AssemblyGapJob(bc.userBinSource, genome).run(printProgress)
                results.includeAdditionalResults(gapRes, ensureAnalysisConsistency=False)

        except Exception, e:
            #print 'NOWAG BExc'
            results = Results(bc.cleanedTrackNames[0], bc.cleanedTrackNames[1], bc.statClassName)
            results.addError(e)
            logException(e,message='Error in batch run')
            if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
                raise
            return results
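Examples #3 and #7 follow the same shape: parse the batch line, short-circuit on bc.errorResult, then build analysisDef and run an AnalysisDefJob. A hypothetical driver loop over several batch lines might look like the sketch below; the BatchRunner import path is an assumption, and comment/blank lines are skipped up front, mirroring the check in parseBatchLine:

    from quick.application.BatchRunner import BatchRunner   # assumed import path, not from the source

    def runBatchLines(batchLines, genome, fullAccess, galaxyFn=None):
        allResults = []
        for batchLine in batchLines:
            # Skip comments and blank lines, as parseBatchLine does before building a BatchContents.
            if batchLine.startswith('#') or batchLine.strip() == '':
                continue
            results = BatchRunner.runJob(batchLine, genome, fullAccess, galaxyFn=galaxyFn)
            allResults.append(results)
        return allResults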
    def executeSelfFeature(cls, genome, tracks, track_names, clusterMethod,
                           extra_option, feature, distanceType, kmeans_alg,
                           galaxyFn, regSpec, binSpec):

        from proto.RSetup import r
        #regSpec, binSpec = 'bed', '/usit/invitro/data/galaxy/galaxy-dist-hg-dev/./database/files/017/dataset_17084.dat'
        silenceRWarnings()

        jobFile = open(galaxyFn, 'w')
        #         print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg,regSpec, binSpec]])), '<br><br>'
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        print >> jobFile, '<h3>Results for the "similarity of positional distribution along the genome" way of clustering<h3/><br/><br/>'
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            print >> batchFile, '$clusterBySelfFeature', (
                genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names),
                clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec)
        print >> jobFile, batchRun.getLink(
            'View batch script line for this analysis<br/>')
        #print>>jobFile, 'Batch script syntax for this analysis:<br>$clusterBySelfFeature', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names)  , clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec), '<br><br>'
        #print>>jobFile, 'signature of method clusterBySelfFeature:<br>', 'clusterBySelfFeature(genome, tracksStr, track_namesStr, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec):<br><br><br>'
        prettyTrackNames = [
            v[-1].replace('RoadMap_', '').replace('.H3K4me1', '')
            for v in tracks
        ]
        #prettyTrackNames = [prettyPrintTrackName(v, shortVersion=True) for v in tracks]
        f_matrix = cls.construct_feature_matrix(genome, tracks, feature,
                                                regSpec, binSpec)
        #print>>jobFile, 'dir f_matrix: ', dir(f_matrix), regSpec, binSpec
        userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome)
        binNames = [str(bin) for bin in sorted(list(userBinSource))]
        if len(binNames) != f_matrix.shape[1]:
            binNames = ['Microbin' + str(i) for i in range(f_matrix.shape[1])]
        r.assign('bin_names', binNames)
        r.assign('track_names', prettyTrackNames)  # used as track names, will be shown in the clustering figure
        r.assign('f_matrix', f_matrix)
        r.assign('distanceType', distanceType)
        r('row.names(f_matrix) <- track_names')
        r('colnames(f_matrix) <- bin_names')

        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            #print 'galaxyFn: ', galaxyFn
            figure = GalaxyRunSpecificFile(
                ['cluster_tracks_result_figure.pdf'], galaxyFn)
            figurepath = figure.getDiskPath(ensurePath=True)
            r('d <- dist(f_matrix, method=distanceType)')
            distTable = r('d')
            distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.txt'],
                                               galaxyFn)
            distMatrixPath = distMatrix.getDiskPath(True)
            open(distMatrixPath, 'w').write(str(distTable))
            print >> jobFile, distMatrix.getLink(
                'View the distance matrix for this analysis <br>')
            #with open(distMatrixPath,'w') as distObj:
            #    #distTable = d_matrix.tolist()
            #    core = HtmlCore()
            #    core.tableHeader(['']+track_names,firstRow=True)
            #    rowSize = len(track_names)
            #    index=0
            #    while index<len(distTable):
            #        core.tableLine([track_names[index % rowSize]]+[str(v) for v in distTable[index:index+rowSize]])
            #    #for index, row in enumerate(distTable):
            #    #    core.tableLine([track_names[index]]+[str(v) for v in row])
            #    core.tableFooter()
            #    print>>distObj, str(core)
            #print>>jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>')

            if True:  #f_matrix.shape[1] <= 100:
                r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'],
                                                       galaxyFn)
                #', '.join([str(v) for v in row])
                r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
                r('dput(f_matrix, f_matrix_fn)')
                #r_f_matrixFile.writeTextToFile(', '.join(cls.getFlattenedMatrix(f_matrix)) + '\n\nTrack names: '+', '.join(prettyTrackNames)+'\n\nNumber of tracks: '+str(len(prettyTrackNames))+'\n\nbins: +)
                #r_f_matrixFile.writeTextToFile()
                #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d))
                print >> jobFile, r_f_matrixFile.getLink(
                    'Access the R-representation of the Feature_matrix (text-file)'
                ), '<br/>'

            cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd',
                                          'f_matrix', prettyTrackNames)
            print >> jobFile, figure.getLink(
                'View the clustering tree (dendrogram) for this analysis<br>')

            if True:  #f_matrix.shape[1] <= 100:
                #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn)
                #baseDir = os.path.dirname(heatmap.getDiskPath(True))

                resDict = Results([], [], '')
                resDict.setGlobalResult({
                    'result': {
                        'Matrix': f_matrix,
                        'Rows': np.array(track_names),
                        'Cols': np.array(binNames),
                        'Significance': None,
                        'RowClust': r('hr'),
                        'ColClust': None
                    }
                })
                header = 'View the resulting heatmap plot <br>'

                baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
                heatPresenter = HeatmapFromNumpyPresenter(
                    resDict, baseDir, header, printDimensions=False)
                print >> jobFile, heatPresenter.getReference('result')

                #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn)
                #heatmap_path = heatmap.getDiskPath(True)
                #r.pdf(heatmap_path)
                ##cm.colors(256)
                #r.library("gplots")
                #r('heatmap(f_matrix, col=redgreen(75), distfun=function(c) dist(c, method=distanceType), hclustfun=function(c) hclust(c, method=extra_option, members=NULL),Colv=NA, scale="none", xlab="", ylab="", cexRow=0.5, cexCol=0.5, margin=c(8,10))')#Features cluster tracks
                #r('dev.off()')
                ##print>>jobFile, r('dimnames(f_matrix)')
                #print>>jobFile, heatmap.getLink('View the resulting heatmap plot <br>')
            else:
                print >> jobFile, 'Heatmap not generated due to large size ', f_matrix.shape
        elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
            textFile = GalaxyRunSpecificFile(
                ['result_of_kmeans_clustering.txt'], galaxyFn)
            textFilePath = textFile.getDiskPath(True)
            extra_option = int(extra_option)
            r.assign('kmeans_alg', kmeans_alg)
            r.assign('extra_option', extra_option)

            # The number of clusters is taken from the clusterMethod tag, instead of the 3 used here
            r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')
            r('hr$height <- hr$height/max(hr$height)*10')
            kmeans_output = open(textFilePath, 'w')
            clusterSizes = r('hr$size')  #size of every cluster
            withinSS = r('hr$withinss')
            clusters = r('hr$cluster')
            for index1 in range(extra_option):  # extra_option is actually the number of clusters
                #trackInCluster = [k for k,val in clusters.items() if val == index1]
                # R's kmeans labels clusters 1..K, hence the +1 offsets below.
                trackInCluster = [k + 1 for k, val in enumerate(clusters)
                                  if val == index1 + 1]  #IS THIS CORRECT, I.E. SAME AS ABOVE??

                print >> kmeans_output, 'Cluster %i(%s objects) : ' % (index1 + 1, str(clusterSizes[index1]))
                for name in trackInCluster:
                    print >> kmeans_output, name, '(This result may be a bit shaky after some changes in rpy access)'

                print >> kmeans_output, 'Sum of square error for this cluster is : ' + str(withinSS[index1]) + '\n'

            kmeans_output.close()
            print >> jobFile, textFile.getLink(
                'Detailed result of kmeans clustering <br>')

        #cls.print_data(f_matrix, jobFile)
        '''
    def executeReferenceTrack(cls,
                              genome,
                              tracks,
                              track_names,
                              clusterMethod,
                              extra_option,
                              distanceType,
                              kmeans_alg,
                              galaxyFn,
                              regSpec,
                              binSpec,
                              numreferencetracks=None,
                              refTracks=None,
                              refFeatures=None,
                              yesNo=None,
                              howMany=None,
                              upFlank=None,
                              downFlank=None):
        from proto.RSetup import r
        silenceRWarnings()
        jobFile = open(galaxyFn, 'w')
        print >> jobFile, '<h3>Results for the "similarity of relations to other sets of genomic features" way of clustering<h3/><br/><br/>'
        #         print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec]])), '<br><br>'
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            print >> batchFile, '$clusterByReference', (
                genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names),
                clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,
                numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank)
        print >> jobFile, batchRun.getLink(
            'View batch script line for this analysis<br/>')

        #print>>jobFile, 'Batch script syntax for this analysis:<br>', '$clusterByReference', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names)  , clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank), '<br><br>'
        #print>>jobFile, 'signature of method clusterByReference:<br>', 'clusterByReference(genome, tracksStr, track_namesStr, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None)<br><br><br>'
        prettyTrackNames = [
            v[-1].replace("RoadMap_", "").replace('.H3K4me1', '')
            for v in tracks
        ]

        #prettyTrackNames = [prettyPrintTrackName(v) for v in tracks]
        #paramNames = ['numreferencetracks', 'refTracks', 'refFeatures', 'yesNo', 'howMany', 'upFlank', 'downFlank']
        #for index, value in enumerate([numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank]):
        #    if value != None:
        #        print paramNames[index]+'='+ str(value),
        #print ''

        reftrack_names = []  # for use in creating the heatmap (as the column names)

        options = []  # for the case using refTracks, options contains the feature for every refTrack, chosen by the user

        if numreferencetracks:
            for i in range(int(numreferencetracks)):
                # The name of the refTrack is used to construct the name of the expanded refTrack
                ref_i = refTracks[i].split(":")
                #refTracks.append(ref_i) #put the refTrack into refTracks list
                reftrack_names.append(ref_i[-1])
                temp_opt1 = 'ref' + str(i) + 'feature'
                options += [] if refFeatures[i] is None else [refFeatures[i]]
                if yesNo and yesNo[i] == "Yes" and howMany and howMany[i] != '--select--':
                    for expan in range(int(howMany[i])):
                        reftrack_names.append(ref_i[-1] + '_' + upFlank[i][expan])
                        # Use local names so the upFlank/downFlank parameter lists are not overwritten
                        upFlankSize = int(upFlank[i][expan])
                        downFlankSize = int(downFlank[i][expan])
                        withinRunId = str(i + 1) + ' expansion ' + str(expan + 1)
                        # outTrackName is unique for each run
                        outTrackName = GalaxyInterface.expandBedSegmentsFromTrackNameUsingGalaxyFn(
                            ref_i, genome, upFlankSize, downFlankSize, galaxyFn, withinRunId)
                        refTracks.append(outTrackName)  # put the expanded track into the refTracks list
                        options.append(options[-1])  # use the feature chosen for the refTrack as the feature for the expanded track

            for index, track in enumerate(refTracks):
                #print track, '<br>'
                if isinstance(track, basestring):
                    track = track.split(":")
                refTracks[index] = track[:-1] if track[-1] == "-- All subtypes --" else track

        if len(refTracks) > 0:

            trackFormats = [
                TrackInfo(genome, track).trackFormatName for track in tracks
            ]

            trackLen = len(tracks)
            refLen = len(refTracks)
            f_matrix = np.zeros((trackLen, refLen))
            for i in range(trackLen):
                for j in range(refLen):
                    #print 'len(options), refLen, len(tracks), trackLen, len(trackFormats):', len(options), refLen, len(tracks), trackLen, len(trackFormats)
                    f_matrix[i, j] = cls.extract_feature(genome, tracks[i], refTracks[j], options[j],
                                                         regSpec, binSpec, trackFormats[i])
            r.assign('track_names', prettyTrackNames)  # used as track names, will be shown in the clustering figure
            r.assign('reftrack_names', reftrack_names)
            r.assign('f_matrix', f_matrix)
            r.assign('distanceType', distanceType)
            r('row.names(f_matrix) <- track_names')
            r('colnames(f_matrix) <- reftrack_names')

            if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
                figure = GalaxyRunSpecificFile(
                    ['cluster_tracks_result_figure.pdf'], galaxyFn)
                figurepath = figure.getDiskPath(True)
                #r.pdf(figurepath, 8,8)
                r('d <- dist(f_matrix, method=distanceType)')
                distTable = r('d')
                distMatrix = GalaxyRunSpecificFile(
                    ['distance_matrix_result.txt'], galaxyFn)
                distMatrixPath = distMatrix.getDiskPath(True)
                open(distMatrixPath, 'w').write(str(distTable))
                print >> jobFile, distMatrix.getLink(
                    'View the distance matrix for this analysis <br>')

                #with open(distMatrixPath,'w') as distObj:
                #    #distTable = d_matrix.tolist()
                #    core = HtmlCore()
                #    core.tableHeader(['']+track_names,firstRow=True)
                #    rowSize = len(track_names)
                #    index=0
                #    while index<len(distTable):
                #        core.tableLine([track_names[index % rowSize]]+[str(v) for v in distTable[index:index+rowSize]])
                #    core.tableFooter()
                #    print>>distObj, str(core)
                #print>>jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>')
                #print r.f_matrix
                #print r.d

                r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'],
                                                       galaxyFn)
                r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
                r('dput(f_matrix, f_matrix_fn)')
                print >> jobFile, r_f_matrixFile.getLink(
                    'Access the R-representation of the Feature_matrix (text-file) <br>'
                ),

                #r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.txt'], galaxyFn)
                #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d))
                #print>>jobFile, r_f_matrixFile.getLink('r.f_matrix & r.d <br>')

                cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd',
                                              'f_matrix', prettyTrackNames)
                #r.assign('extra_option',extra_option)
                #r('hr <- hclust(d, method=extra_option, members=NULL)')
                #r('hr$height <- hr$height/max(hr$height)*10')
                #r('plot(hr, ylab="Distance", hang=-1)')
                #
                #r('dev.off()')
                print >> jobFile, figure.getLink(
                    'View the clustering tree (dendrogram) for this analysis<br>'
                )
            elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
                textFile = GalaxyRunSpecificFile(
                    ['result_of_kmeans_clustering.txt'], galaxyFn)
                textFilePath = textFile.getDiskPath(True)
                extra_option = int(extra_option)
                r.assign('extra_option', extra_option)
                r.assign('kmeans_alg', kmeans_alg)
                # The number of clusters is taken from the clusterMethod tag, instead of the 3 used here
                r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')
                r('hr$height <- hr$height/max(hr$height)*10')
                kmeans_output = open(textFilePath, 'w')
                clusterSizes = r('hr$size')  #size of every cluster

                withinSS = r('hr$withinss')
                clusters = np.array(r('hr$cluster'))  # convert to an array in order to handle the index more easily
                track_names = np.array(track_names)
                for index1 in range(extra_option):  # extra_option is actually the number of clusters
                    # Boolean indexing replaces the original clusters.items() access, which fails on a
                    # numpy array; R's kmeans labels clusters 1..K, hence the +1 offset.
                    trackInCluster = track_names[clusters == index1 + 1]

                    print >> kmeans_output, 'Cluster %i(%s objects) : ' % (index1 + 1, str(clusterSizes[index1]))
                    for name in trackInCluster:
                        print >> kmeans_output, name

                    print >> kmeans_output, 'Sum of square error for this cluster is : ' + str(withinSS[index1]) + '\n'
                kmeans_output.close()
                print >> jobFile, textFile.getLink(
                    'Detailed result of kmeans clustering <br>')

            #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn)
            #baseDir = os.path.dirname(heatmap.getDiskPath(True))
            ##r.png(heatmap_path, width=800, height=700)

            resDict = Results([], [], 'ClusTrack')
            resDict.setGlobalResult({
                'result': {
                    'Matrix': f_matrix,
                    'Rows': np.array(track_names),
                    'Cols': np.array(reftrack_names),
                    'Significance': None,
                    'RowClust': r('hr'),
                    'ColClust': None
                }
            })
            header = 'Heatmap of Feature matrix for "similarity of positional distribution along the genome" '

            baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
            heatPresenter = HeatmapFromNumpyPresenter(resDict,
                                                      baseDir,
                                                      header,
                                                      printDimensions=False)

            print >> jobFile, heatPresenter.getReference('result')
            #r.pdf(heatmap_path)
            #r.library("gplots")
            #r('heatmap(f_matrix, col=redgreen(75), Colv=NA, scale="none", xlab="", ylab="", margins=c(10,10))')#Features cluster tracks
            #r('dev.off()')

            #print>>jobFile, heatmap.getLink('View the resulting heatmap plot <br>')
            #cls.print_data(f_matrix, jobFile)

        else:
            print 'Have to specify a set of refTracks'
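In both K-means branches above, hr$cluster coming back from R assigns each track a 1-based cluster label (1..K), which is why the membership tests compare against index1 + 1. A small, self-contained numpy sketch (made-up labels and names, not from the source) of extracting the member names per cluster:

    import numpy as np

    track_names = np.array(['t1', 't2', 't3', 't4', 't5'])   # hypothetical track names
    clusters = np.array([1, 3, 2, 1, 3])                     # hypothetical 1-based labels from kmeans
    for index1 in range(3):                                  # three clusters
        members = track_names[clusters == index1 + 1]        # +1 because R labels clusters 1..K
        print('Cluster %i: %s' % (index1 + 1, ', '.join(members)))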
Example #10
 def _emptyResults(self):
     return Results(["Track 1"], ["Track 2"], self._statClass.__name__)
Example #11
 def _emptyResults(self):
     return Results(self._track.trackName,
                    self._track2.trackName if self._track2 is not None else [],
                    self._statClass.__name__)
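Examples #10 and #11 differ only in how the empty Results object is labelled: fixed 'Track 1'/'Track 2' placeholders versus the actual track names, with an empty list when there is no second track. A stand-alone sketch with a stand-in Results class (not the project's); 'CountStat' and the track names are made up:

    class ResultsStub(object):
        """Stand-in that only records its labels, mirroring the Results(...) calls above."""
        def __init__(self, trackName1, trackName2, statClassName):
            self.labels = (trackName1, trackName2, statClassName)

    # Example #10 style: fixed placeholder labels.
    placeholder = ResultsStub(['Track 1'], ['Track 2'], 'CountStat')

    # Example #11 style: the real track names, with [] if there is no second track.
    trackName, trackName2 = ['Genes', 'Refseq'], None
    named = ResultsStub(trackName, trackName2 if trackName2 is not None else [], 'CountStat')

    print(placeholder.labels)   # (['Track 1'], ['Track 2'], 'CountStat')
    print(named.labels)         # (['Genes', 'Refseq'], [], 'CountStat')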