Example 1

import time
from queue import PriorityQueue

#tsp, tst, ktsp, and dirac are project-local modules, and Wilcoxon and
#ResourceEstimate project-local classes; all are assumed to be importable
#from the surrounding package.

class LearnerQueue:
    dirac = 0
    tsp = 1
    tst = 2
    ktsp = 3
    def __init__(self, data_package, wilc_data_type='probe', weight=None, scale=None, minWeight=.5 ):
        """
        This class generates a sequence of learner objects.
        Inputs:
            req:
                data_package: provide an already set up packager.DataPackage object
            optional:
                wilc_data_type: whether to use probes or genes when simulating
                    the Wilcoxon process in the TSx learners
                weight: A list of weights that estimated times are divided by
                scale: A list of scaling factors for the complexity estimates.
                minWeight: the threshold below which the weights are renormalized.
        """
        if weight is None:
            self.weight = [1.0,1.0,1.0,1.0]
        else:
            self.weight = weight
        if scale is None:
            self.scale = [None,None,None,None]
        else:
            self.scale = scale
        self.data_package = data_package
        #estimates are based on wilc_data_type
        data, numGenes = self.data_package.getDataVector(wilc_data_type)
        _, gene_net_size = self.data_package.getGeneNetVector(0) 
        cs = self.data_package.getClassVector()
        class1size = cs[0]
        class2size = cs[1]
        self._est = ResourceEstimate( data, class1size, class2size, numGenes, gene_net_size)
        self.wilc = Wilcoxon(data, numGenes, class1size, class2size) 
        self._genQ()
        self.minWeight = minWeight 

    def __iter__(self):
        return self.next()

    def next(self):
        """
        Generator that yields (complexity, settings) tuples until every
        queue is empty.
        """
        #the generator simply returns when exhausted; raising StopIteration
        #explicitly here would be an error under PEP 479 (Python 3.7+)
        n = self._getNext()
        while n is not None:
            yield n
            n = self._getNext()

    def _genQ(self):
        """
        Create one priority queue per algorithm (dirac, tsp, tst, ktsp).
        """
        self._queue = [PriorityQueue() for _ in range(4)]

    def feedback(self, learner_id, apparent_accuracy):
        """
        Adjusts the weight of a learner.
        learner_id - one of the class-level learner ids (dirac, tsp, tst, ktsp)
        apparent_accuracy - the learner's apparent accuracy, as a fraction
        """
        self._adjWeight(learner_id, apparent_accuracy)

    def genDirac(self, min_network_size, numTopNetworks, data_type='gene'):
        """
        generates the pq for dirac.
        inputs: 
            min_network_size: a tuple with (start, end, increment)
            numTopNetworks: a tuple with (start, end, increment)
        """
        self.dirac_param = (min_network_size, numTopNetworks, data_type)
        for netsize in range(*min_network_size):
            for numTop in range(*numTopNetworks):
                self._addDirac(netsize, numTop, data_type)

    def _addDirac(self, min_network_size, numTopNetworks, data_type):
        """
        Given these values, add settings and running time estimate to the
        Dirac queue.
        """
        settings = {}
        settings['learner'] = LearnerQueue.dirac
        settings['min_network_size'] = min_network_size
        settings['numTopNetworks'] = numTopNetworks
        settings['data_type'] = data_type
        data, nGenes =  self.data_package.getDataVector(data_type)
        settings['data'] = data
        settings['numGenes'] = nGenes
        self._queue[LearnerQueue.dirac].put((self._est.Diractime(min_network_size), settings))
    
    def genTSP(self, r1, r2, equijoin=False, data_type='probe'):
        """
        generates the pq for tsp.
        inputs: 
            r1 - tuple describing the range for filter 1 (from, to, increment)
            r2 - tuple describing the range for filter 2 (from, to, increment)
            equijoin - boolean, should we restrict filters to [10,10] [20,20] etc.
            
        """
        self.tsp_param = ( r1, r2, equijoin, data_type)
        _, nGenes = self.data_package.getDataVector(data_type)
    
        rest_check = {}
        for x in range(*r1):
            if x >= nGenes:
                break
            for y in range(*r2):
                if y >= nGenes:
                    break
                if not equijoin or x == y:
                    #sort so (x, y) and (y, x) collapse to one entry
                    x_adj, y_adj = sorted(self.wilc.filterAdjust(a) for a in [x, y])
                    if (x_adj, y_adj) not in rest_check:
                        rest_check[(x_adj, y_adj)] = 1#keep unique after adjust
                        rVec = tsp.IntVector()
                        rVec.push_back(x_adj)
                        rVec.push_back(y_adj)
                        self._addTSP(rVec, data_type)

    def _addTSP(self, restrictions, data_type):
        """
        Given these values, add settings and running time estimate to the
        TSP queue.
        restrictions is a vector of integers that contains the filter values
        data_type is probe/gene
        """
        settings = {}
        settings['learner'] = LearnerQueue.tsp
        settings['restrictions'] = restrictions
        settings['data_type'] = data_type
        data, nGenes = self.data_package.getDataVector(data_type)
        settings['data'] = data
        settings['numGenes'] = nGenes
        self._queue[LearnerQueue.tsp].put((self._est.TSPtime(restrictions), settings))

    def genTST(self, r1, r2, r3, equijoin=False, data_type='probe'):
        """
        generates the pq for tst
        inputs: 
            r1 - tuple describing the range for filter 1 (from, to, increment)
            r2 - tuple describing the range for filter 2 (from, to, increment)
            r3 - tuple describing the range for filter 3 (from, to, increment)
            equijoin - boolean, should we restrict filters to [10,10] [20,20] etc.
            
        """
        self.tst_param = ( r1, r2, r3, equijoin, data_type)
        _, nGenes = self.data_package.getDataVector(data_type)
        rest_check = {}
        for x in range(*r1):
            if x >= nGenes:
                break #GIGO - bad range
            for y in range(*r2):
                if y >= nGenes:
                    break
                for z in range(*r3):
                    if z >= nGenes: 
                        break
                    if not equijoin or x == y == z:
                        adj = sorted([self.wilc.filterAdjust(a) for a in [x,y,z]])
                        x_adj, y_adj, z_adj = tuple(adj)
                        if (x_adj, y_adj, z_adj) not in rest_check:
                            #rest_check (don't want duplicates)
                            rest_check[(x_adj, y_adj, z_adj)] = 1
                            rVec = tsp.IntVector()
                            rVec.push_back(x_adj)
                            rVec.push_back(y_adj)
                            rVec.push_back(z_adj)
                            self._addTST(rVec, data_type)


    def _addTST(self, restrictions, data_type):
        """
        Given these values, add settings and running time estimate to the
        TST queue.
        restrictions is a vector of integers that contains the filter values
        data_type is probe/gene
        """
        settings = {}
        settings['learner'] = LearnerQueue.tst
        settings['restrictions'] = restrictions
        settings['data_type'] = data_type
        data, nGenes = self.data_package.getDataVector(data_type)
        settings['data'] = data
        settings['numGenes'] = nGenes
        self._queue[LearnerQueue.tst].put((self._est.TSTtime(restrictions), settings))

    def genKTSP(self, maxK, ncv, nlo, r1, r2, equijoin=False, data_type='probe'):
        """
        generates the pq for tst
        inputs:
            maxK - tuple describing the range for the maximum k value
            ncv - tuple describing the range for number of cross validations
            nlo - tuple describing the range for number of elements to leave out of internal crossvalidation 
            r1 - tuple describing the range for filter 1 (from, to, increment)
            r2 - tuple describing the range for filter 2 (from, to, increment)
            equijoin - boolean, should we restrict filters to [10,10] [20,20] etc.
            
        """
        self.ktsp_param =  ( maxK, ncv, nlo, r1, r2, equijoin, data_type)
        _, nGenes = self.data_package.getDataVector(data_type)
        rest_check = {}
        for x in range(*r1):
            if x >= nGenes:
                break
            for y in range(*r2):
                if y >= nGenes:
                    break
                if not equijoin or x == y:
                    #sort so (x, y) and (y, x) collapse to one entry,
                    #matching the deduplication in genTSP
                    x_adj, y_adj = sorted(self.wilc.filterAdjust(a) for a in [x, y])
                    if (x_adj, y_adj) not in rest_check:
                        rest_check[(x_adj, y_adj)] = 1#keep unique
                        rVec = tsp.IntVector()
                        rVec.push_back(x_adj)
                        rVec.push_back(y_adj)
                        for k in range(*maxK):
                            for cv in range(*ncv):
                                for n in range(*nlo):
                                    self._addKTSP(k, cv, n, rVec, data_type)


    def _addKTSP(self, maxk, num_cross_validate, num_leave_out, restrictions, data_type='probe'):
        """
        Given these values, add settings and running time estimate to the
        kTSP queue.
        restrictions is a vector of integers that contains the filter values
        data_type is probe/gene
        
        """
        settings = {}
        settings['learner'] = LearnerQueue.ktsp
        settings['restrictions'] = restrictions
        settings['maxk'] = maxk
        settings['num_leave_out'] = num_leave_out
        settings['num_cross_validate'] = num_cross_validate
        settings['data_type'] = data_type
        data, nGenes = self.data_package.getDataVector(data_type)
        settings['data'] = data
        settings['numGenes'] = nGenes
        self._queue[LearnerQueue.ktsp].put((self._est.kTSPtime(maxk, num_cross_validate, restrictions), settings))


    def _calcScale(self, real_time, complexity):
        """
        Takes the quotient of the complexity over time in order to
        generate a scaling factor for the running time.
        """
        return complexity/real_time
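        #Example: a run with complexity 500 that finishes in 2.0s yields a
        #scale of 250; getEstimatedTime then maps complexity c to c/250.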

    def _adjWeight(self, learner, score):
        """
        Adjusts the given learners weight by the product of the given score
        """
        self.weight[learner] = score*self.weight[learner]
        if self.weight[learner] < self.minWeight:
            self._normalizeWeight()

    def _adjScale(self, learner, scale):
        self.scale[learner] = scale

    def _normalizeWeight(self):
        """
        Divide all weights by the max weight to rescale the weights.
        """
        wMax = max(self.weight)
        self.weight = [x/wMax for x in self.weight]
        wMin = min(self.weight)
        self.minWeight = wMin/2.0
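        #Example: weights [2.0, 0.4, 1.6, 0.8] rescale to [1.0, 0.2, 0.8, 0.4]
        #and minWeight becomes 0.2/2 = 0.1.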

    def _getNext(self):
        """
        Returns the next algorithms settings
        (the complexity, settings dict)
        Returns None when all of the queues are empty
        """
        poss = []
        def_min = float('inf')
        curr = None
        min_complexity = None 
        min_settings = None 
        for x in range(4):
            #cycle through the queues looking for the best score
            if not self._queue[x].empty():
                next = self._queue[x].get()
                complexity = next[0]
                next_settings = next[1]
                est_time = self.getEstimatedTime(x, complexity)
                
                weighted_time = est_time/self.weight[x]
                if  weighted_time < def_min:
                    def_min = weighted_time
                    min_complexity = complexity
                    min_settings = next_settings
                    if curr is not None:
                        #put previous best back
                        self._queue[curr[1]['learner']].put(curr)
                    curr = next 
                else:
                    self._queue[x].put(next)
        if min_complexity is None:
            return None
        else:
            return (min_complexity, min_settings) 
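        #Example: if the cheapest dirac entry has weighted time 2.0 and the
        #cheapest tsp entry 0.5, the tsp entry is returned and the dirac
        #entry is pushed back onto its queue.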

    def trainLearner(self, settings, complexity):
        """
        Returns a learner that has been trained according to the given settings.
        """
        start_time = time.perf_counter()#time.clock() was removed in Python 3.8
        l = self.getLearner(settings)
        l.train()
        real_time = time.perf_counter() - start_time

        if real_time > 0.0:
            newScale = self._calcScale(real_time, complexity)
        else:
            #should not happen, but low-resolution timers may report zero
            newScale = None

        self._adjScale(settings['learner'], newScale)
        return l

    def getEstimatedTime(self, learner_id, complexity):
        """
        Converts a complexity estimate into an estimated running time using
        the learner's observed scaling factor.
        """
        if self.scale[learner_id] is None:
            #no timing data yet; a tiny constant lets untried learners run early
            return 0.00001
        else:
            return float(complexity)/float(self.scale[learner_id])
 
    def getLearner(self, settings):
        """
        Returns a learner object corresponding to the provided settings dict.
        """
        if settings['learner'] == LearnerQueue.tsp:
            data = settings['data']
            numGenes = settings['numGenes']
            classSizes = self.data_package.getClassVector()
            filt = settings['restrictions']
            return tsp.TSP(data, numGenes, classSizes, filt)

        if settings['learner'] == LearnerQueue.tst:
            data = settings['data']
            numGenes = settings['numGenes']
            classSizes = self.data_package.getClassVector()
            filt = settings['restrictions']
            return tst.TST(data, numGenes, classSizes, filt)
  
        if settings['learner'] == LearnerQueue.ktsp:
            data = settings['data']
            numGenes = settings['numGenes']
            classSizes = self.data_package.getClassVector()
            filt = settings['restrictions']
            maximumK = settings['maxk']
            nleaveout = settings['num_leave_out']
            nValidationRuns = settings['num_cross_validate'] 
            return ktsp.KTSP(data, numGenes, classSizes, filt, maximumK, nleaveout, nValidationRuns) 
 
        if settings['learner'] == LearnerQueue.dirac:
            data = settings['data']
            numGenes = settings['numGenes']
            classSizes = self.data_package.getClassVector()
            #clear out the old gene net and create a new one
            self.data_package.createGeneNetVector(settings['min_network_size'])
            geneNet, geneNetSize = self.data_package.getGeneNetVector(settings['min_network_size'])
            numnet = settings['numTopNetworks']
            geneNetMap = self.data_package.gene_net_map
            return dirac.Dirac(data, numGenes, classSizes, geneNet, geneNetSize, numnet, geneNetMap)

        raise Exception("Unrecognized learner")