def random_forest(data, classify, nrOfTrees=100, attributes=None):
    '''
    make a random forest using orange
    
    For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param nrOfTrees: number of trees in the forest (default: 100).
    :param attributes: number of attributes used in a randomly drawn subset 
                       when searching for the best attribute to split the node 
                       on during tree growing (default: None, in which case 
                       the square root of the number of attributes in the 
                       example set is used).
    :rtype: an orngEnsemble.MeasureAttribute_randomForests instance containing 
            the random forest.
    
    '''
    data = build_orange_data(data, classify)
    
    #do the random forest
    #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest")
    measure = orngEnsemble.MeasureAttribute_randomForests(trees=nrOfTrees, attributes=attributes)
    
    return measure
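# A hypothetical usage sketch (not part of the original module): it assumes
# `data` is the (experiments, results) tuple returned by perform_experiments
# and `classify` is a user-supplied classification function. The returned
# measure exposes the same importances() call used elsewhere in this module.
def _random_forest_importance_example(data, classify):
    orange_data = build_orange_data(data, classify)
    measure = random_forest(data, classify, nrOfTrees=200)
    importances = measure.importances(orange_data)
    names = [attribute.name for attribute in orange_data.domain.attributes]
    return sorted(zip(importances, names), reverse=True)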
def random_forest_measure_attributes(data, classify):
    '''
    performs feature selection using random forests in orange.
    
    For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :rtype: sorted list of tuples with uncertainty names and importance values.
    
    The forest used for the importance calculation contains 100 trees, and the 
    number of attributes considered at each split defaults to the square root 
    of the number of attributes in the example set.
    
    '''
    data = build_orange_data(data, classify)
    
    #do the random forest
    #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details
    info("executing random forest for attribute selection")
    measure = orngEnsemble.MeasureAttribute_randomForests(trees=100)
    
    #calculate importance
    imps = measure.importances(data)
    
    #sort importance, using schwartzian transform
    results = [] 
    for i,imp in enumerate(imps): 
        results.append((imp, data.domain.attributes[i].name))
    results.sort(reverse=True)
    
    results = [(entry[1], entry[0]) for entry in results]
    return results
def construct_features(data, 
                       trendThold, 
                       crisisThold):
    info("calculating features")
    
    # build a feature vector for each data series in the data array
    features = np.zeros(shape=(data.shape[0], 3))
    for i in range(data.shape[0]):
        features[i,:] = construct_feature_vector(data[i, :], trendThold, crisisThold)
    return features
def build_orange_data(data,classify):
    '''
    
    helper function for turning the data from :meth:`perform_experiments` into 
    a data object that can be used by the various orange functions. 
    
    For more details see `orange domain <http://orange.biolab.si/doc/reference/Domain.htm>`_  
    
    :param data: return from :meth:`perform_experiments`.
    :param classify: function to be used for determining the class for each 
                     run.
    
    '''
    info("building orange data")
    
    experiments, results = data

    #build domain
    dtypes =  []
    for entry in experiments.dtype.descr:
        dtypes.append((entry[0], experiments.dtype.fields.get(entry[0])))
    
    attributes = []
    for entry in dtypes:
        name, dtype = entry
        dtype = dtype[0].name
        #numpy dtype names for integer fields are e.g. 'int32'; integer and 
        #object fields are treated as categorical
        if dtype.startswith('int') or dtype == 'object':
            attribute = ENUM(name)
            [attribute.addValue(str(value)) for value in set(experiments[name].tolist())]
        else:
            attribute = FLOAT(name, startValue = np.min(experiments[name]), 
                              endValue = np.max(experiments[name]))
        attributes.append(attribute)

    data = np.array(experiments.tolist())
        
    #determine classes
    classes = classify(results)
    classVar = ENUM('class')
    #these numbers are merely referring to the possible classes
    [classVar.addValue(str(i)) for i in set(classes.tolist())] 
    #by default the last entry in the list should be the class variable
    attributes.append(classVar) 
    domain = orange.Domain(attributes)
    
    data = np.hstack((data, classes[:, np.newaxis]))
    data = data.tolist()
    data = orange.ExampleTable(domain, data)

    return data
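# A minimal sketch of a classify function as expected by build_orange_data and
# the other helpers in this module: it receives the results dict from
# perform_experiments and returns one class label per run. The outcome name
# and threshold used here are purely illustrative.
def _classify_example(results, outcome='some outcome', threshold=1000):
    #use the end state of the chosen time series and label each run as
    #1 (above the threshold) or 0 (at or below the threshold)
    end_states = results[outcome][:, -1]
    return (end_states > threshold).astype(int)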
def perform_prim(x,
                 y,
                 box_init = None,
                 peel_alpha = 0.05,
                 paste_alpha = 0.01,
                 mass_min = 0.05,
                 threshold = None,
                 pasting = False,
                 threshold_type = 1):
    if threshold is None:
        threshold = np.mean(y)
   
    k_max = np.ceil(1/mass_min)
    k_max = int(k_max)
    info("k max: %s" %(k_max))
    
    if box_init is None:
        #if no initial box, make initial box
        box_init = np.array([np.min(x, axis=0),np.max(x, axis=0)])
        box_diff = box_init[1, :] - box_init[0, :]
        box_init[0,:] = box_init[0, :] - 10*paste_alpha*box_diff
        box_init[1,:] = box_init[1, :] + 10*paste_alpha*box_diff
    else:
        #else, identify all points in initial box, rest is discarded
        logical =  in_box(x, box_init)
        x = x[logical]
        y = y[logical]


    n = y.shape[0]
    y = y * threshold_type
    boxes = find_boxes(x, y, box_init, 
                       peel_alpha, paste_alpha, mass_min, 
                       np.min(y)-0.1*np.abs(np.min(y)), 
                       pasting, 0, k_max, n)
    
    # adjust for negative hdr  
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean

    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type)
    
    return boxes
def perform_prim(x,
                 y,
                 box_init = None,
                 peel_alpha = 0.05,
                 paste_alpha = 0.05,
                 mass_min = 0.05,
                 threshold = None,
                 pasting = False,
                 threshold_type = 1,
                 cases_of_interest = None,
                 obj_func = None):
    if threshold is None:
        threshold = np.mean(y)
   
    k_max = np.ceil(1/mass_min)
    k_max = int(k_max)
    info("max number of boxes: %s" %(k_max))
    
    if box_init is None:
        box_init = make_box(x)
    else:
        #else, identify all points in initial box, rest is discarded
        logical =  in_box(x, box_init)
        x = x[logical]
        y = y[logical]

    n = y.shape[0]
    y = y * threshold_type
    
    boxes = find_boxes(x, y, box_init, 
                       peel_alpha, paste_alpha, mass_min, 
                       np.min(y)-0.1*np.abs(np.min(y)), 
                       pasting, 0, k_max, n, cases_of_interest, obj_func)
    
    # adjust for negative hdr  
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean

    # the list of found boxes has the dump box as first element
    # we need to reverse the ordering to get the correct order in which
    # the boxes have been found
    boxes.reverse()
    boxes = prim_hdr(boxes, threshold, threshold_type)
    
    return boxes
def feature_selection(data, classify, k=5, m=100):
    '''
    
    perform feature selection using orange
    
    For more details see `orange feature selection <http://orange.biolab.si/doc/modules/orngFSS.htm>`_ and
    `orange measure attribute <http://orange.biolab.si/doc/reference/MeasureAttribute.htm>`_
    
    The default measure is ReliefF (MeasureAttribute_relief in Orange).
    
    :param data: data from :meth:`perform_experiments`.
    :param classify: function for classifying runs.
    :param k: the number of neighbors for each example (default 5).
    :param m: number of examples to use; set to -1 to use all (default 100).
    :rtype: sorted list of tuples with uncertainty names and reliefF attribute 
            scores.
    
    Orange provides other metrics for feature selection
    
    * Information Gain
    * Gain ratio 
    * Gini index 
    * Relevance of attributes 
    * Costs
    
    If you want to use any of these instead of ReliefF, use the code
    supplied here as a template, but modify the measure. That is, replace::
    
        measure = orange.MeasureAttribute_relief(k=k, m=m)
        
    with the measure of choice. See the above provided links for more details,
    and the sketch following this function for an example substitution.
    
    '''
    data = build_orange_data(data, classify)

    info("executing feature selection")
    measure = orange.MeasureAttribute_relief(k=k, m=m)
    ma = orngFSS.attMeasure(data, measure)
    
    results = [] 
    for name, score in ma:
        results.append((score, name))
    results.sort(reverse=True)
    
    results = [(entry[1], entry[0]) for entry in results]
    return results
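# A sketch of the substitution described in the docstring above: the same
# selection loop, but scoring attributes with information gain
# (orange.MeasureAttribute_info) instead of ReliefF. Treat it as a template,
# not as part of the original module.
def feature_selection_info_gain(data, classify):
    data = build_orange_data(data, classify)

    info("executing feature selection with information gain")
    measure = orange.MeasureAttribute_info()
    ma = orngFSS.attMeasure(data, measure)

    results = [(score, name) for name, score in ma]
    results.sort(reverse=True)
    return [(name, score) for score, name in results]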
def prim_hdr(prims,
             threshold,
             threshold_type):
    '''
    Highest density region for PRIM boxes
    
    :param prims: list of prim objects
    :param threshold: the threshold that the box mean must meet
    :param threshold_type: 1 if box means should be above the threshold, 
                           -1 if they should be below it
    
    '''
    
    n = 0
    for entry in prims:
        n += entry.y.shape[0]
    info("number of items in boxes: %s" %n)
  
    boxes = [(entry.y_mean, entry) for entry in prims]
    
    final_list = []
    dump_entries = []
    for entry in boxes:
        if entry[0]*threshold_type >= threshold*threshold_type:
            final_list.append(entry[1])
        else:
            dump_entries.append(entry[1])

    x_temp = None
    for entry in dump_entries: 
        if x_temp is None:
            x_temp = entry.x
            y_temp = entry.y
        else:
            x_temp = np.append(x_temp, entry.x, axis=0) 
            y_temp = np.append(y_temp, entry.y, axis=0)

    dump_box = Prim(x_temp, y_temp, prims[-1].box, 
                        y_temp.shape[0]/n)
        
    final_list.append(dump_box)

    return final_list
def __filter(boxes, uncertainties=[]):
    dump_box=boxes[-1]
    boxes=boxes[0:-1]
    
    #iterate over uncertainties
    names = []

    if uncertainties:
        uv = uncertainties
    else:
        uv = [entry[0] for entry in dump_box.dtype.descr]

    for name in uv:
        
        #determine whether to show
        for box in boxes:
            minimum = box[name][0]
            maximum = box[name][1]
            value = box.dtype.fields.get(name)[0]
            if value == 'object':
                a = dump_box[name][0]
                
                if len(a) != len(minimum):
                    ans = False
                else:
                    ans = np.all(np.equal(a, minimum))
                if not ans:
                    names.append(name)
                    break
            elif (minimum > dump_box[name][0]) or\
                 (maximum < dump_box[name][1]):
                names.append(name)
                break
    a = set(uv) - set(names)

    string_list = ", ".join(a)

    info(string_list + " are not visualized because they are not restricted")
    
    uv = names
    return uv
def construct_features(data, filterSlope, tHoldSlope, filterCurvature, 
                       tHoldCurvature, addMidExtension, addEndExtension):
    '''
    Constructs a feature vector for each of the data-series contained in the 
    data. 
    
    '''
    info("calculating features")
    
    # TODO: the casting of each feature to a list of tuples might be 
    # removed at some stage; this will lead to a speed-up, because the 
    # calculations that use the feature vector can then be vectorized
    features = []
    for i in range(data.shape[0]):
        feature = construct_feature_vector(data[i, :], filterSlope, tHoldSlope, 
                                     filterCurvature, tHoldCurvature, 
                                     addMidExtension, addEndExtension)
#        feature =  [tuple(feature[0,:]),tuple(feature[1,:])]
        features.append(feature)
    return features
def distance_triangle(data):
    """
    The triangle distance is calculated as follows;
        Let ds1(.) and ds2(.) be two data series of length N. Then;
        A equals to the summation of ds1(i).ds2(i) from i=1 to N
        B equals to the square-root of the (summation ds1(i)^2 from i=1 to N)
        C equals to the square-root of the (summation ds1(i)^2 from i=1 to N)
        
        distance_triangle = A/(B.C)
     
     The triangle distance works only with data series of the same length
     
     In the literature, it is claimed that the triangle distance can deal with noise and amplitude scaling very well, and may yield poor
     results in cases of offset translation and linear drift.   
    """

    runLogs = []
    # Generates the feature vectors for all the time series that are contained in numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])),))
    index = -1
    for i in range(data.shape[0]):

        # For each run, a log is created
        # Log includes a description dictionary that has key information
        # for post-clustering analysis, and the data series itself. These
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc["Index"] = str(i)

        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)

        for j in range(i + 1, data.shape[0]):
            index += 1
            distance = trdist(data[i], data[j])
            dRow[index] = distance
    return dRow, runLogs
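# trdist is defined elsewhere; the sketch below is an assumption of what it
# computes, following the A/(B.C) formula in the docstring above (effectively
# the cosine similarity between the two series).
def _trdist_sketch(ds1, ds2):
    a = np.sum(ds1 * ds2)              # A: sum of ds1(i)*ds2(i)
    b = np.sqrt(np.sum(ds1 ** 2))      # B: norm of ds1
    c = np.sqrt(np.sum(ds2 ** 2))      # C: norm of ds2
    return a / (b * c)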
def distance_sse(data):
    
    '''
    The SSE (sum of squared errors) distance between two data series is the 
    sum of the squared errors between their corresponding data points.
    Let the data series be of length N; then the SSE distance between ds1 and 
    ds2 equals the sum from i=1 to N of the square of error_term(i), 
    where error_term(i) equals ds1(i)-ds2(i).
    
    Since the SSE calculation is based on a pairwise comparison of individual 
    data points, the data series should be of equal length.
    
    The SSE distance equals the square of the Euclidean distance, which is a 
    commonly used distance metric in time series comparisons.
    '''
    runLogs = []
    #Generates the feature vectors for all the time series that are contained in numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = ssedist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
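# ssedist is defined elsewhere; per the docstring above, a minimal sketch
# would be the sum of squared errors between corresponding data points.
def _ssedist_sketch(ds1, ds2):
    return np.sum((ds1 - ds2) ** 2)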
def distance_mse(data):
    '''
    The MSE (mean squared error) distance is equal to the SSE distance divided 
    by the number of data points in the data series.
    
    The SSE distance between two data series is the sum of the squared errors 
    between their corresponding data points. Let the data series be of length 
    N; then the SSE distance between ds1 and ds2 equals the sum from i=1 to N 
    of the square of error_term(i), where error_term(i) equals ds1(i)-ds2(i).
    
    Given the SSE as defined above, the MSE equals the SSE divided by N.
    
    Like the SSE distance, the MSE distance works only with data series of 
    equal length.
    '''
    
    runLogs = []
    #Generates the feature vectors for all the time series that are contained in numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = msedist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
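# msedist is defined elsewhere; per the docstring above, a minimal sketch is
# the SSE divided by the number of data points N.
def _msedist_sketch(ds1, ds2):
    return np.sum((ds1 - ds2) ** 2) / float(ds1.shape[0])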
def distance_euclidian(data):
    
    '''
    The Euclidean distance is the square root of the sum of squared 
    differences between corresponding dimensions of two N-dimensional vectors 
    (i.e. two data series of length N).
    Let the data series be of length N; then the Euclidean distance between 
    ds1 and ds2 equals sqrt(the sum from i=1 to N of the square of 
    error_term(i)), where error_term(i) equals ds1(i)-ds2(i).
    '''
    runLogs = []
    #Generates the feature vectors for all the time series that are contained in numpy array data
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = -1
    for i in range(data.shape[0]):
            
        # For each run, a log is created
        # Log includes a description dictionary that has key information 
        # for post-clustering analysis, and the data series itself. These 
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
    
        for j in range(i+1, data.shape[0]):
            index += 1
            distance = eucldist(data[i],data[j]) 
            dRow[index] = distance
    return dRow, runLogs
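# eucldist is defined elsewhere; per the docstring above, a minimal sketch is
# the square root of the sum of squared differences.
def _eucldist_sketch(ds1, ds2):
    return np.sqrt(np.sum((ds1 - ds2) ** 2))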
 def _generate_cases(self, nrOfCases):
     '''
     number of cases specifies the number of cases to generate in case
     of Monte Carlo and Latin Hypercube sampling.
     
     In case of full factorial sampling it specifies the resolution on
     non categorical uncertainties.
     
     In case of multiple model structures, the uncertainties over
     which to explore is the intersection of the sets of uncertainties of
     the model interface instances.
     
     :param nrOfCases: In case of Latin Hypercube sampling and Monte Carlo 
                       sampling, nrOfCases specifies the number of cases to
                       generate. In case of Full Factorial sampling,
                       nrOfCases specifies the resolution to use for sampling
                       continuous uncertainties.
     
     '''
     
     #get the intersection of the uncertainties of the different models
     if len(self._modelStructures)  >1:
         uncertainties = [msi.uncertainties for msi in self._modelStructures]
         uncertainties = set(uncertainties[0]).intersection(*uncertainties[1:])
         info("intersection contains %s uncertainties" %len(uncertainties))
     else:
         uncertainties = set(self._modelStructures[0].uncertainties)
      
     info("generating cases")
     
     designs = self.sampler.generate_design(uncertainties, nrOfCases)
     information = designs[1]
     designs = designs[0]
     cases = []
     for design in designs:
         case = {}
         for i, name in enumerate(information):
             case[name] = design[i]
         cases.append(case)
     
     info(str(len(cases)) + " cases generated")
     
     return cases, uncertainties
def find_boxes(x_remaining,
               y_remaining,
               box_init,
               peel_alpha,
               paste_alpha,
               mass_min,
               threshold,
               pasting,
               k, 
               k_max,
               n):
    '''    
     Recursively finds boxes
    
     Parameters
     x_remaining - matrix of explanatory variables
     y_remaining - vector of the response variable
     box_init - initial box (should cover the range of x)
     peel_alpha - peeling parameter
     paste_alpha - pasting parameter
     mass_min - minimum box mass
     threshold - minimum box mean
     pasting - True: include the pasting step (after peeling)
             - False: don't include pasting
     k - index of the current box
     k_max - maximum number of boxes
     n - total number of data points
    
     Returns
     a list of Prim objects, each holding the data inside the box (x, y),
     the mean of y, the box limits, and the box mass
    '''
    k+=1
    
    info("%s points remaining" % (y_remaining.shape[0]))
    
    new_box = peel(x_remaining, y_remaining, box_init, peel_alpha, 
                   mass_min, threshold, n)

    info("peeling completed")

    if pasting:
        logical = in_box(x_remaining, new_box, bool=True)
        x_inside = x_remaining[logical]
        y_inside = y_remaining[logical]

        new_box = paste(x_inside, y_inside, x_remaining, y_remaining, 
                           new_box, paste_alpha, mass_min, 
                           threshold, n)
        info("pasting completed")

    
    logical = in_box(x_remaining, new_box, bool=True)
    x_inside = x_remaining[logical]
    y_inside = y_remaining[logical]
    box_mass = y_inside.shape[0]/n

    # update data in light of found box
    x_remaining_temp = x_remaining[logical==False]
    y_remaining_temp = y_remaining[logical==False]

    if (y_remaining_temp.shape[0] != 0) &\
       (k < k_max) &\
       (compare(box_init, new_box)==False):

        # make a primObject
        prim_object = Prim(x_inside, y_inside, new_box, box_mass)
        info("Found box %s: y_mean=%s, mass=%s" % (k, 
                                                   prim_object.y_mean, 
                                                   prim_object.box_mass))
        info("%s points in new box" % (y_inside.shape[0]))
        
        boxes = find_boxes(x_remaining_temp, y_remaining_temp, 
                           box_init, peel_alpha, paste_alpha, mass_min, 
                           threshold, 
                           pasting, k, k_max, n)
        boxes.append(prim_object)
        return boxes
    else:
        info("Bump "+str(k)+" includes all remaining data")
        #make dump box
        box_mass = y_remaining.shape[0]/n
        dump_box = Prim(x_remaining, y_remaining, box_init, box_mass)
        return [dump_box]
def find_box(x,
             y,
             box,
             peel_alpha,
             paste_alpha,
             mass_min,
             threshold,
             d,
             n,
             pasting):
    '''    
     Finds a single box
    
     Parameters
     x - matrix of explanatory variables
     y - vector of the response variable
     box - initial box (should cover the range of x)
     peel_alpha - peeling parameter
     paste_alpha - pasting parameter
     mass_min - minimum box mass
     threshold - minimum box mean
     d - number of dimensions of x
     n - total number of data points
     pasting - True: include the pasting step (after peeling)
             - False: don't include pasting
    
     Returns
     the found box, holding the data still inside it after peeling and pasting
     (x, y), the box limits, and related statistics, or None if no box
     satisfying the mass and threshold constraints could be found
    '''
    
    y_mean =  np.mean(y)
    mass = y.shape[0]/n 

    if (y_mean >= threshold) & (mass >= mass_min):
        boxk_peel = peel_one(x, y, box, peel_alpha, mass_min, threshold, d, n)
    else:
        boxk_peel = None
         
    boxk_temp = None
        
    while boxk_peel:
        boxk_temp = copy.deepcopy(boxk_peel)
        boxk_peel = peel_one(boxk_temp.x, boxk_temp.y, boxk_temp.box, 
                             peel_alpha, mass_min, threshold, d, n)
    

    info("peeling completed")

    if pasting:
        boxk_paste = boxk_temp
        
        while boxk_paste:
            boxk_temp = boxk_paste
            boxk_paste = paste_one(boxk_temp.x, boxk_temp.y, x, y, 
                                   boxk_temp.box, paste_alpha, mass_min, 
                                   threshold, d, n)
        info("pasting completed")
            
    boxk = boxk_temp
    return boxk
def prim_hdr(prim,
             threshold,
             threshold_type):
    '''
    Highest density region for PRIM boxes
    
    :param prim: list of prim objects
    :param threshold: the threshold that the box mean must meet
    :param threshold_type: 1 if box means should be above the threshold, 
                           -1 if they should be below it
    
    '''
    
    n = 0
    for entry in prim:
        n += entry.y.shape[0]
    info("number of items in boxes: %s" %n)

    y_means = np.asarray([entry.y_mean for entry in prim])
    hdr_ind =  np.where(y_means * threshold_type >= threshold*threshold_type)[0]
    
    if hdr_ind.shape[0] > 0:
        hdr_ind = np.max(hdr_ind)
    else:
        if threshold_type ==1:
            raise Warning("No prim box found with mean >= "+str(threshold))
        elif threshold_type ==-1:
            raise Warning("No prim box found with mean <= "+str(threshold))
        return None

    #highest density region  
    x_prim_hdr = []
    
    for k in range(hdr_ind+1):
        hdr = prim[k]
        x_prim_hdr.append(hdr)

    #combine non-hdr into a `dump' box
    if hdr_ind < len(prim)-1:
        info("making a dumpbox")
        x_temp = None
        for k in range(hdr_ind+1,len(prim)): #this could be done much faster with a slice
            if x_temp is None:
                x_temp = prim[k].x
                y_temp = prim[k].y
            else:
                x_temp = np.append(x_temp, prim[k].x, axis=0) 
                y_temp = np.append(y_temp, prim[k].y, axis=0)

        dump_box = Prim(x_temp, y_temp, np.mean(y_temp), prim[-1].box, 
                        y_temp.shape[0]/n)
        
        x_prim_hdr.append(dump_box)

    #this does not work because hdr is a list
    x_prim_hdr_num_class = len(x_prim_hdr)
    x_prim_hdr_num_hdr_class = hdr_ind+1
    x_prim_hdr_threshold = threshold
    
    x_prim_hdr_ind = np.zeros((x_prim_hdr_num_class)) 
    x_prim_hdr_ind[:] = threshold_type

    return x_prim_hdr, x_prim_hdr_num_hdr_class
def prim_one(x,
             y,
             box_init = None,
             peel_alpha = 0.05,
             paste_alpha = 0.01,
             mass_min = 0.05,
             threshold = None,
             pasting = False,
             threshold_type = 1):

    d = x.shape[1]
    n = x.shape[0]
    
    k_max = np.ceil(1/mass_min)
    info("k max: %s" %(k_max))
    num_boxes = int(k_max)
    
    y_mean =  np.mean(y)
    mass_init =  y.shape[0]/n #should default to 1 if I read the code correctly
    
    if box_init is None:
        box_init = np.array([np.min(x, axis=0),np.max(x, axis=0)])
        box_diff = box_init[1, :] - box_init[0, :]
        box_init[0,:] = box_init[0, :] - 10*paste_alpha*box_diff
        box_init[1,:] = box_init[1, :] + 10*paste_alpha*box_diff
  
    # find first box
    k = 1

    a = x.shape[0]
    debug("remaing items: %s" % (a))

    boxk = find_box(x=x, y=y, box=box_init, peel_alpha=peel_alpha,
                   paste_alpha=paste_alpha, mass_min=mass_min,
                   threshold=np.min(y)-0.1*np.abs(np.min(y)), d=d, n=n, 
                   pasting=pasting)

    if boxk is None:
        info("unable to find box 1")  
        x_prim = Prim(x, threshold_type*y, y_mean=threshold_type*y_mean, 
                      box=box_init, box_mass=mass_init, threshold=np.mean(y)) 
        return x_prim
    else:
        #only access the found box after checking that one was found
        b = boxk.x.shape[0]
        debug("removed items: %s" %b)

        info("Found box %s: y_mean=%s, mass=%s" % (k, threshold_type*boxk.y_mean, boxk.box_mass))
        boxes = []
        boxes.append(boxk)
    
    # find subsequent boxes
    if num_boxes > 1:

        #  data still under consideration
        x_out_ind_mat = np.empty(x.shape)
    
        for j in range(d):
            x_out_ind_mat[:, j] = (x[:, j] < boxk.box[0,j]) | (x[:,j] >boxk.box[1,j]) 
        
        x_out_ind = np.any(x_out_ind_mat,axis=1)
        
        x_out =  x[x_out_ind,:]
        y_out =  y[x_out_ind]
     
        a = x_out.shape[0]
        debug("remaing items: %s" % (a))
   
        while (y_out.shape[0] > 0) & (k < num_boxes):
            k = k+1
            
            boxk = find_box(x=x_out, y=y_out, box=box_init,
                           peel_alpha=peel_alpha, paste_alpha=paste_alpha,
                           mass_min=mass_min, 
                           threshold=np.min(y)-0.1*np.abs(np.min(y)), d=d, n=n,
                           pasting=pasting) 
            if boxk is None:
                info("Bump "+str(k)+" includes all remaining data")
                boxk = Prim(x_out, y_out, np.mean(y_out), box_init, 
                            y_out.shape[0]/n)
                b += boxk.x.shape[0]
                debug("removed items: %s" %b)
                
                boxes.append(boxk)
                break
            else:
                b += boxk.x.shape[0]
                debug("removed items: %s" %b)
                
                # update x and y
                debug("Found box %s: y_mean=%s, mass=%s" % (k, threshold_type*boxk.y_mean, boxk.box_mass))
        
                #  data still under consideration
                x_out_ind_mat = np.empty(x.shape)
            
                for j in range(d):
                    x_out_ind_mat[:, j] = (x[:, j] < boxk.box[0,j]) | (x[:,j] >boxk.box[1,j]) 
                
                x_out_ind_mat = np.any(x_out_ind_mat,axis=1)
                x_out_ind = x_out_ind & x_out_ind_mat
                
                x_out =  x[x_out_ind, :]
                y_out =  y[x_out_ind]
                
                a = x_out.shape[0]
                debug("remaing items: %s" %(a))
    
    
                boxes.append(boxk)

    # adjust for negative hdr  
    for box in boxes:
        box.y = threshold_type*box.y
        box.y_mean = threshold_type*box.y_mean

    prim_res, num_hdr=  prim_hdr(boxes, threshold, threshold_type)
    
    return prim_res, num_hdr
def cluster(data, outcome, distanceSetup={}, clusterSetup={}):
    '''
    
    Method that clusters time-series data from the specified cpickle file 
    according to a selected distance measure
    
    :param data: return from :meth:`perform_experiments`
    :param outcome: name of the outcome/variable whose behavior is being 
                    analyzed
    :param distanceSetup: dictionary that specifies the distance to be used in 
                          clustering, and the configuration of the distance 
                          (optional)  
    :param clusterSetup: dictionary that specifies the configuration of the 
                         hierarchical clustering algorithm (optional)
    :rtype: a list of integers, which specify the clusters to which runs' 
            results are allocated
    
    The keys that can be specified in the distanceSetup are as follows;
    
    * 'distance': String that specifies the distance to be used. 
                  Options: bmd (default), mse, sse
    * 'filter?': Boolean that specifies whether the data series will be 
                 filtered (for bmd distance)
    * 'slope filter': A float that specifies the filtering threshold for the 
                      slope: for every data point, if change_in_the_outcome / 
                      average_value_of_the_outcome < threshold, the slope is 
                      considered 0 (for bmd distance)
    * 'curvature filter': A float that specifies the filtering threshold for 
                          the curvature: for every data point, if 
                          change_in_the_slope / average_value_of_the_slope < 
                          threshold, the curvature is considered 0 (for bmd 
                          distance)
    * 'no of sisters': 50 (for bmd distance)
    
    The keys that can be specified in the clusterSetup are as follows:
    
    * 'plotClusters?': True
    * 'Plot type': 'single-window' #other option is 'multi-window'
    * 'plotDendrogram?': True
    * 'inter-cluster distance': 'complete' # Other options are 'single' and 
                                'average'
    * 'cutoff criteria': 'inconsistent'   # Other options are 'distance' and 
                         'maxclust' 
    * 'cutoff criteria value': 0.5
    
    '''
    global varName 
    varName = outcome

    dataSeries = data[1][outcome]
    
    # Construct a n-by-n matrix of distances between behaviors of the selected 
    # outcome. It is a symmetrical matrix, with 0's along the 
    # northwest-southeast diagonal. distanceSetup is a dictionary that specifies 
    # (and customizes) the distance function to be used.
    distMatrix = constructDMatrix(dataSeries, distanceSetup)
    info('done with distances')
    
    pyplot.ion()
    # Allocate individual runs into clusters using hierarchical agglomerative 
    # clustering. clusterSetup is a dictionary that customizes the clustering 
    # algorithm to be used.
    clusters = flatcluster(distMatrix, clusterSetup)
    
    if 'Plot type' in clusterSetup.keys():
        if clusterSetup['Plot type'] == 'multi-window':
            groupPlot = False
        elif clusterSetup['Plot type'] == 'single-window':
            groupPlot = True
        else:
            groupPlot = False
    else:
        groupPlot = False
    
    # Plots the clusters, unless it is specified not to be done in the setup
    if 'plotClusters?' in clusterSetup.keys():
        if clusterSetup['plotClusters?']:
            plotClusters(groupPlot)
        else:
            pass
    else:
        plotClusters(groupPlot)
    
    return clusters
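# A hypothetical call illustrating the distanceSetup and clusterSetup
# dictionaries documented above. The key names follow the docstring; the
# outcome name and the values are only examples.
def _cluster_usage_example(results):
    distanceSetup = {'distance': 'bmd',
                     'filter?': True,
                     'slope filter': 0.001,
                     'curvature filter': 0.005,
                     'no of sisters': 50}
    clusterSetup = {'plotClusters?': False,
                    'Plot type': 'multi-window',
                    'plotDendrogram?': False,
                    'inter-cluster distance': 'complete',
                    'cutoff criteria': 'inconsistent',
                    'cutoff criteria value': 0.5}
    return cluster(results, 'some outcome', distanceSetup, clusterSetup)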
def envelopes3d_group_by(results, 
                         outcome,
                         groupBy = 'policy', 
                         discretesize = None,
                         logSpace=False,
                         ymin = None,
                         ymax = None):
    '''
    
    Function for making 3d envelopes. In contrast to the envelopes in 
    :mod:`graphs`, this version shows the density for every time step, instead 
    of only for the end state. Note that this function makes an envelope for 
    only 1 outcome. The function groups the results based on the
    specified uncertainty. The user can supply a discretesize function
    to control the grouping in case of parameterUncertainties. This function
    will make a separate envelope for each group.
    
    :param results: The return from :meth:`run experiments`.
    :param outcome: name of the outcome of interest for which to make the 3d 
                    envelopes.
    :param groupBy: The uncertainty to group by. (default=policy)
    :param discretesize: a discretesize function to control the grouping in case of parameterUncertainties
    :param logSpace: Boolean, if true, the log of the input data is used
    :param ymin: If provided, lower bound for the KDE, if not, ymin = np.min(results.get(outcome))
    :param ymax: If provided, upper bound for the KDE, if not, ymax = np.max(results.get(outcome))
    
    '''
    def f(x, y, results):
        """
        function that performs the kde for each timestep
        """
        
        x1 = x[:,0]
        y1 = y[0,:]
        results = np.asarray(results)
        
        z = []
        for i in range(len(list(x1))):
            data = results[:, i]
            try:
                z1 = kde.gaussian_kde(data)
                z1 = z1.evaluate(y1)
            except:
                z1 = np.zeros(shape=y1.shape)
            z.append(z1)
        z = np.asarray(z)
        z = np.log(z+1)
    
        return z
    
    #prepare the data
    experiments, results = results

    #get time axis
    try:
        time = results.pop('TIME')[0, :]
    except:
        time =  np.arange(0, results.values()[0].shape[1])
    
    
    def make_logical(cases, criterion, interval=False):
        if interval:
            
            return (cases[groupBy] >= criterion[0]) & (cases[groupBy] < criterion[1]) 
        else:
            return cases[groupBy]== criterion
    
    
    #get the results for the specific outcome of interest
    results = results.get(outcome)
    
    #log results
    if logSpace:
        results = np.log(results+1)
   
    #generate the grid
    if ymin is None:
        ymin = np.min(results)
        info("ymin: %s" % ymin)
    if ymax is None:
        ymax = np.max(results)
        info("ymax: %s" % ymax)

    length = min(100, results.shape[1])
    y = np.arange(ymin, ymax, (ymax-ymin)/length)
    X, Y = np.meshgrid(time, y)

    z = []

    #do the preparation for grouping by
    interval=False
    if (experiments[groupBy].dtype == np.float32) |\
       (experiments[groupBy].dtype == np.float64) |\
       ((experiments[groupBy].dtype == np.int) & (len(set(experiments[groupBy])) > 5)):
        interval=True
        if discretesize:
            categories = discretesize(experiments[groupBy])
        else:
            categories = __discretesize(experiments[groupBy])
    else:
        categories = set(experiments[groupBy])
        
    
    for category in categories:
        if interval:
            info("calculating kde for (%s, %s)" % (category))
        else:
            info("calculating kde for %s" % (category))
        logical = make_logical(experiments, category, interval)
        
        Z = f(X.T,Y.T, results=results[logical])
        z.append(Z)

    #visualize results
    fig = mlab.figure(1, bgcolor=(1, 1, 1), fgcolor=(0, 0, 0))
    
    fig.scene.disable_render = True
    for i, category in enumerate(categories):        
        if interval:
            info("plotting (%s, %s)" % (category))
        else:
            info("plotting %s" % (category))
        
        Z = z[i]
        extent = (-14+i*10,-6+i*10, 0,10, 0,5)
        s = mlab.mesh(X,Y, Z.T, extent=extent)
        mlab.outline(s, color=(.7, .7, .7), extent=extent)
        if i==0:
            mlab.axes(s,
                      extent=extent,
                      xlabel = '',
                      ylabel = '',
                      zlabel = 'density',
                      x_axis_visibility=False,
                      y_axis_visibility=False, 
                      z_axis_visibility=False) 
        
        category_name = repr(category)
            
        mlab.text(-16+10*i, i+10, category_name, z=-2, width=0.14)
    fig.scene.disable_render = False
    mlab.title(outcome, line_width=0.5)
    mlab.show()
def tree(data, 
         classify,
         sameMajorityPruning=False,
         mForPruning=0,
         maxMajority= 1,
         minSubset = 0,
         minExamples = 0):
    '''
    make a classification tree using orange
    
    For more details see `orange tree <http://orange.biolab.si/doc/modules/orngTree.htm>`_
    
    :param data: data from :meth:`perform_experiments`
    :param classify: function for classifying runs
    :param sameMajorityPruning: If true, invokes a bottom-up post-pruning by 
                                removing the subtrees of which all leaves 
                                classify to the same class (default: False).
    :param mForPruning: If non-zero, invokes an error-based bottom-up 
                        post-pruning, where m-estimate is used to estimate 
                        class probabilities (default: 0).
    :param maxMajority: Induction stops when the proportion of majority class 
                        in the node exceeds the value set by this parameter
                        (default: 1.0). 
    :param minSubset: Minimal number of examples in non-null leaves 
                      (default: 0).
    :param minExamples: Data subsets with less than minExamples examples are 
                        not split any further, that is, all leaves in the tree 
                        will contain at least that many of examples 
                        (default: 0).
    :rtype: a classification tree
    
    In order to print the resulting tree one can for example use `graphviz <http://www.graphviz.org/>`_.
    
    >>> import orngTree
    >>> tree = tree(input, classify)
    >>> orngTree.printDot(tree, r'..\..\models\\tree.dot', 
                      leafStr="%V (%M out of %N)") 
    
    This generates a .dot file that can be opened and displayed using graphviz.
    The leafStr keyword argument specifies the format of the string for
    each leaf. See also the more detailed discussion on the orange 
    web site.
    
    At some future stage, a convenience function might be added for turning
    a tree into a `networkx graph <http://networkx.lanl.gov/>`_.
    
    '''

    data = build_orange_data(data, classify)

    #make the actually tree, for details on the meaning of the parameters, see 
    #the orange webpage
    info("executing tree learner")
    tree = orngTree.TreeLearner(data,
                                sameMajorityPruning=sameMajorityPruning,
                                mForPruning=mForPruning,
                                maxMajority=maxMajority,
                                minSubset = minSubset ,
                                minExamples = minExamples)
    info("tree contains %s leaves" % orngTree.countLeaves(tree))
    
    return tree
def distance_gonenc(
    data,
    sisterCount=50,
    wSlopeError=1,
    wCurvatureError=1,
    filterSlope=True,
    tHoldSlope=0.1,
    filterCurvature=True,
    tHoldCurvature=0.1,
    addMidExtension=True,
    addEndExtension=True,
):

    """
    The distance measures the proximity of data series in terms of their 
    qualitative pattern features. In other words, it quantifies the proximity 
    between two different dynamic behaviour modes.
    
    It is designed to work mainly on non-stationary data. Its current version 
    does not perform well in catching the proximity of two cyclic/repetitive 
    patterns with a different number of cycles (e.g. an oscillation with 4 
    cycles versus an oscillation with 6 cycles).
    
    :param data: the data series for which to calculate the distances.
    :param sisterCount: Number of long versions that will be created for the 
                        shorter vector while comparing two data series with 
                        unequal feature vector lengths. 
    :param wSlopeError: Weight of the error between the 1st dimensions of the 
                        two feature vectors (i.e. Slope). (default=1)
    :param wCurvatureError: Weight of the error between the 2nd dimensions of 
                            the two feature vectors (i.e. Curvature). 
                            (default=1)
    :param filterSlope: Boolean, indicating whether the slope vectors should 
                        be filtered for minor fluctuations, or not. 
                        (default=True)
    :param tHoldSlope: The threshold value to be used in filtering out 
                       fluctuations in the slope. (default=0.1)
    :param filterCurvature: Boolean, indicating whether the curvature vectors 
                            should be filtered for minor fluctuations, or not.
                            (default=True)
    :param tHoldCurvature: The threshold value to be used in filtering out 
                           fluctuations in the curvature. (default=0.1)
    :param addMidExtension: Boolean, indicating whether the feature vectors 
                            should be extended by introducing transition 
                            sections along the vector.
                            (default=True)
    :param addEndExtension: Boolean, indicating whether the feature vectors 
                            should be extended by introducing startup/closing 
                            sections at the beginning/end of the vector.
                            (default=True)
    """

    runLogs = []
    # Generates the feature vectors for all the time series that are contained
    # in numpy array data
    features = construct_features(
        data, filterSlope, tHoldSlope, filterCurvature, tHoldCurvature, addMidExtension, addEndExtension
    )
    info("calculating distances")
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])),))
    index = -1
    for i in range(data.shape[0]):
        feature_i = features[i]

        # For each run, a log is created
        # Log includes a description dictionary that has key information
        # for post-clustering analysis, and the data series itself. These
        # logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc["Index"] = str(i)

        # this may not work due to data type mismatch
        featVector = feature_i

        behaviorDesc["Feature vector"] = str(featVector)
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)

        for j in range(i + 1, data.shape[0]):
            index += 1
            feature_j = features[j]
            if feature_i.shape[1] == feature_j.shape[1]:
                distance = distance_same_length(feature_i, feature_j, wSlopeError, wCurvatureError)

            else:
                distance = distance_different_lenght(feature_i, feature_j, wSlopeError, wCurvatureError, sisterCount)
            dRow[index] = distance
    return dRow, runLogs
def cluster(data, 
            outcome,
            distance='gonenc',
            interClusterDistance='complete',
            cMethod='inconsistent',
            cValue=2.5,
            plotDendrogram=True,
            plotClusters=True,
            groupPlot=False,
            **kwargs):
    '''
    
    Method that clusters time-series data from the specified cpickle file 
    according to a selected distance measure.
    
    :param data: return from meth:`perform_experiments`.
    :param outcome: Name of outcome/variable whose behavior is being analyzed
    :param distance: The distance metric to be used.
    :param interClusterDistance: How to calculate inter cluster distance.
                                 see `linkage <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_ 
                                 for details.
    :param cMethod: Cutoff method, 
                    see `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_ 
                    for details.
    :param cValue: Cutoff value, see 
                   `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_ 
                   for details.
    :param plotDendrogram: Boolean, if true, plot dendrogram.
    :param plotClusters: Boolean, true if you want to plot clusters.
    :param groupPlot: Boolean, if true plot clusters in a single window, 
                      else the clusters are plotted in separate windows.
    :rtype: A tuple containing the list of distances, the cluster allocation, 
            a list of logs for each time series, and the linkage matrix.     
    
    The remainder of the arguments are passed on to the specified distance 
    function. See the distance functions for details on these parameters.
    
    '''
    
    global varName 
    varName = outcome
    
    data = data[1][outcome]
    
    # Construct a list with distances. This list is the upper triangle
    # of the distance matrix
    dRow, runLogs = construct_distances(data, distance, **kwargs)
    info('finished distances')
    

    # Allocate individual runs into clusters using hierarchical agglomerative 
    # clustering. clusterSetup is a dictionary that customizes the clustering 
    # algorithm to be used.
    z, clusters, runLogs = flatcluster(dRow, 
                                    runLogs, 
                                    plotDendrogram=plotDendrogram,
                                    interClusterDistance=interClusterDistance,
                                    cMethod=cMethod,
                                    cValue=cValue)
    
    
    sample_indices = pick_csamples(clusters, dRow)
    
#    if 'Plot type' in clusterSetup.keys():
#        if clusterSetup['Plot type'] == 'multi-window':
#            groupPlot = False
#        elif clusterSetup['Plot type'] == 'single-window':
#            groupPlot = True
#        else:
#            groupPlot = False
#    else:
#        groupPlot = False
    
    # Plots the clusters, unless it is specified not to be done in the setup
#    if 'plotClusters?' in clusterSetup.keys():
#        if clusterSetup['plotClusters?']:
#            plotClusters(groupPlot, runLogs)
#        else:
#            pass
    if plotClusters:
        plot_clusters(groupPlot, runLogs)
    
    return dRow, clusters, runLogs, z
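# A hypothetical call of this cluster interface: extra keyword arguments (here
# two of the gonenc distance parameters) are passed straight through to the
# distance function; the outcome name is illustrative only.
def _cluster_gonenc_example(results):
    return cluster(results,
                   'some outcome',
                   distance='gonenc',
                   cMethod='maxclust',
                   cValue=5,
                   plotDendrogram=False,
                   plotClusters=False,
                   tHoldSlope=0.1,
                   tHoldCurvature=0.1)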
def distance_willem(data, 
                    trendThold=0.001, 
                    crisisThold=0.02,
                    wIfCrisis=1,
                    wNoOfCrises=1,
                    wTrend=1,
                    wBandwith=1,
                    wSevCrises=1,
                    wTriDist=0.5):
    '''
    
    
    :param data: the time series for which to calculate the distances
    :param trendThold: threshold for trend
    :param crisisThold: threshold for crisis
    :param wIfCrisis: weight of crisis
    :param wNoOfCrises: weight of the number of crises
    :param wTrend: weight of trend
    :param wSevCrises: weight of the severity of crises
    :param wTriDist: weight of the triangle distance between the two series
        
    '''
    
    
    runLogs = []
    features = construct_features(data, trendThold, crisisThold)
    
    #normalize the features to the unit interval
    norm_features = features.copy()
    norm_features[:, 1] = np.log(norm_features[:, 1]+1)
    minimum = np.min(norm_features, axis=0) 
    maximum = np.max(norm_features, axis=0)
    a = 1/(maximum-minimum)
    b = -minimum/(maximum-minimum)
    norm_features = a*norm_features+b
    
    
    info('calculating distances')
    dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), ))
    index = 0
    
    weights = np.array([wIfCrisis,
                       wNoOfCrises,
                       wSevCrises])
    max_distance = 0
    for i in range(data.shape[0]):
        feature_i = norm_features[i]
        # For each run, a log is created
        # Log includes a description dictionary that has key information for post-clustering analysis, and the data series itself
        # These logs are stored in a global array named runLogs
        behaviorDesc = {}
        behaviorDesc['Index'] = str(i)
        featVector = features[i] #this may not work due to data type mismatch
        featVector = tuple(featVector)
        behaviorDesc['Feature vector'] = "%d, %d, %s" % featVector
        behavior = data[i]
        localLog = (behaviorDesc, behavior)
        runLogs.append(localLog)
        
        for j in range(i+1, data.shape[0]):
            distance_tri = trdist(data[i],data[j])
            
            max_distance = max((max_distance, distance_tri))
             
            feature_j = norm_features[j]
            distance = np.abs(feature_i -feature_j)
            distance = weights*distance
            distance = np.sum(distance)+(distance_tri*wTriDist)
            dRow[index] = distance
            index += 1
        
#        distance = np.abs(feature_i - norm_features[i+1::])
#        distance = weights*distance
#        distance = np.sum(distance, axis=1)
#        dRow[index:index+distance.shape[0]] = distance
#        index += distance.shape[0]
    print max_distance
    info('distances determined')
    return dRow, runLogs
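# A quick check of the min-max normalization intended above: with
# a = 1/(max-min) and b = -min/(max-min), a*x+b maps the minimum to 0 and the
# maximum to 1. This helper is only illustrative.
def _minmax_normalization_check():
    x = np.array([2.0, 5.0, 11.0])
    a = 1 / (x.max() - x.min())
    b = -x.min() / (x.max() - x.min())
    return a * x + b  # array([ 0.        ,  0.33333333,  1.        ])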
def perform_prim(results, 
                 classify, 
                 peel_alpha = 0.05, 
                 paste_alpha = 0.05,
                 mass_min = 0.05, 
                 threshold = None, 
                 pasting=True, 
                 threshold_type=1,
                 obj_func=def_obj_func):
    r'''
    
    perform Patient Rule Induction Method (PRIM). This function performs 
    the PRIM algorithm on the data. It uses a Python implementation of PRIM
    inspired by the `R <http://www.oga-lab.net/RGM2/func.php?rd_id=prim:prim-package>`_ 
    algorithm. Compared to the R version, the Python version is data type aware. 
    That is, real valued, ordinal, and categorical data are treated 
    differently. Moreover, the pasting phase of PRIM in the R algorithm is
    not consistent with the literature. The Python version is. 
    
    the PRIM algorithm tries to find subspaces of the input space that share
    some characteristic in the output space. The characteristic that the 
    current implementation looks at is the mean of the results. Thus, the 
    output space is 1-D, and the characteristic is assumed to be continuous.
    
    As a workaround, to deal with classes, the user can supply a classify 
    function. This function should return a binary classification 
    (i.e. 1 or 0). Then, the mean of the box is indicative of the concentration 
    of cases of class 1. That is, if the specified threshold is say 0.8 and the 
    threshold_type is 1, PRIM looks for subspaces of the input space that 
    contain at least 80\% cases of class 1.   
    
    :param results: the return from :meth:`perform_experiments`.
    :param classify: either a string denoting the outcome of interest to use
                     or a function. In case of a string and time series data, 
                     the end state is used.
    :param peel_alpha: parameter controlling the peeling stage (default = 0.05). 
    :param paste_alpha: parameter controlling the pasting stage (default = 0.05).
    :param mass_min: minimum mass of a box (default = 0.05). 
    :param threshold: the threshold of the output space that boxes should meet. 
    :param pasting: perform pasting stage (default=True) 
    :param threshold_type: If 1, the boxes should go above the threshold, if -1
                           the boxes should go below the threshold, if 0, the 
                           algorithm looks for both +1 and -1.
    :param obj_func: The objective function to use. Default is 
                     :func:`def_obj_func`
    :return: a list of PRIM objects.
    
    for each box, the scenario discovery metrics *coverage* and *density* 
    are also calculated:
    
    
    .. math::
 
        coverage=\frac
                    {{\displaystyle\sum_{y_{i}\in{B}}y_{i}{'}}}
                    {{\displaystyle\sum_{y_{i}\in{B^I}}y_{i}{'}}}
    
    
    where :math:`y_{i}{'}=1` if :math:`x_{i}\in{B}` and :math:`y_{i}{'}=0`
    otherwise.
    
    
    .. math::
 
        density=\frac
                    {{\displaystyle\sum_{y_{i}\in{B}}y_{i}{'}}}
                    {{\displaystyle\left|\{y_{i}\in{B}\}\right|}}
    
    where :math:`y_{i}{'}=1` if :math:`x_{i}\in{B}` and :math:`y_{i}{'}=0`
    otherwise, and :math:`\left|\{y_{i}\in{B}\}\right|` is the
    cardinality of the set of cases in the box.
    
    
    Density is the ratio of the cases of interest in a box to the 
    total number of cases in that box. *density* is identical to the mean
    in case of a binary classification.  For more detail on these metrics see 
    `Bryant and Lempert (2010) <http://www.sciencedirect.com/science/article/pii/S004016250900105X>`_
    
    .. rubric:: references to relevant papers 
        
    * `original PRIM paper <http://www.springerlink.com/content/x3gpv05t34620878/>`_
    * `paper on ordinal data and PRIM <http://www.sciencedirect.com/science/article/pii/S095741740700231X>`_
        
    **ema application** 
        
    * `Lempert et al. (2006) <http://mansci.journal.informs.org/content/52/4/514>`_
    * `Groves and Lempert (2007) <http://www.sciencedirect.com/science/article/pii/S0959378006000896#ref_bib19>`_
    * `Bryant and Lempert (2010) <http://www.sciencedirect.com/science/article/pii/S004016250900105X>`_
    
    '''
    experiments, results = results
    
    #make y
    if type(classify) == StringType:
        results = results.get(classify)
        if len(results.shape) == 2:
            y = results[:, -1]
        else:
            y = results
            
        count = np.zeros(y.shape)
        count[y*threshold_type > threshold] = 1
        cases_of_interest = np.sum(count)
        info("number of cases of interest is %d" % (np.sum(count)))
    elif callable(classify):
        y = classify(results)
        cases_of_interest = np.sum(y)
        info("number of cases of interest is %d" % (np.sum(y)))
    else:
        raise EMAError("incorrect specification of classify, this should be a function or a string")
   

    x = experiments
    
    #perform prim
    boxes = recursivePrim.perform_prim(x, y, box_init=None, peel_alpha=peel_alpha, 
                                            paste_alpha=paste_alpha, mass_min=mass_min, 
                                            threshold=threshold, pasting=pasting, 
                                            threshold_type=threshold_type,obj_func=obj_func,
                                            cases_of_interest=cases_of_interest)
    
    #calculate scenario discovery metrics and add these to boxes
    boxes = calculate_sd_metrics(boxes, y, threshold, threshold_type)
    
    #return prim
    return boxes
    def perform_experiments(self, 
                           cases,
                           callback = util.DefaultCallback,
                           kwargs = None):
        """
        Method responsible for running the experiments on a structure. In case 
        of multiple model structures, the outcomes are set to the intersection 
        of the sets of outcomes of the various models.         
        
        :param cases: In case of Latin Hypercube sampling and Monte Carlo 
                      sampling, cases specifies the number of cases to
                      generate. In case of Full Factorial sampling,
                      cases specifies the resolution to use for sampling
                      continuous uncertainties. Alternatively, one can supply
                      a list of dicts, where each dicts contains a case.
                      That is, an uncertainty name as key, and its value. 
        :param callback: class used for collecting the results of the 
                         individual experiments; its callback method is called 
                         after each experiment finishes 
                         (default: :class:`util.DefaultCallback`).
        :param kwargs: generic keyword arguments to pass on to model_init.
        :returns: a `structured numpy array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_ 
                  containing the experiments, and a dict with the names of the 
                  outcomes as keys and a numpy array as value.
                
        .. rubric:: suggested use
        
        In general, analysis scripts require both the structured array of the 
        experiments and the dictionary of arrays containing the results. The 
        recommended use is the following::
        
            >>> results = ensemble.perform_experiments(10000) #recommended use
            >>> experiments, output = ensemble.perform_experiments(10000) #will work fine
        
        The latter option will work fine, but most analysis scripts require you 
        to wrap the two elements back into a tuple::
        
            >>> data = (experiments, output)
        
        Another reason for the recommended use is that you can save this tuple
        directly::
        
            >>> import expWorkbench.util as util
            >>> util.save_results(results, file)
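        
        Cases can also be specified explicitly as a list of dicts; the 
        uncertainty names and values below are purely illustrative::
        
            >>> cases = [{'x1': 0.1, 'x2': 4}, {'x1': 0.3, 'x2': 7}]
            >>> results = ensemble.perform_experiments(cases)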
          
        
        
        """
        if type(cases) == types.IntType:
            cases, uncertainties = self._generate_cases(cases)
        elif type(cases) == types.ListType:
            
            #get the intersection of uncertainties
            if len(self._modelStructures) > 1:
                uncertainties = [msi.uncertainties for msi in self._modelStructures]
                uncertainties = set(uncertainties[0]).intersection(*uncertainties[1:])
                info("intersection contains %s uncertainties" % len(uncertainties))
            else:
                uncertainties = self._modelStructures[0].uncertainties
            
            #keep only the uncertainties that are present in the keys of the cases
            uncertaintyNames = cases[0].keys()
            uncertainties = [uncertainty for uncertainty in uncertainties 
                             if uncertainty.name in uncertaintyNames]
        
        if not self._policies:
            self._policies.append({"name": "None"})

        nrOfExperiments = len(cases)*len(self._policies)*len(self._modelStructures)
        info("%s experiments will be executed" % nrOfExperiments)

        
        #set outcomes to the intersection of the outcomes across models
        outcomes = [msi.outcomes for msi in self._modelStructures]
        outcomes = set(outcomes[0]).intersection(*outcomes[1:])
        for msi in self._modelStructures:
            msi.outcomes = list(outcomes)
                
        #initialize the callback object
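        # the callback class is instantiated once with the uncertainties, the
        # outcomes, and the total number of experiments; its bound callback
        # method is then used to collect the result of each individual run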
        callback = callback(uncertainties, outcomes, nrOfExperiments).callback
                
        if self.parallel:
            info("starting to perform experiments in parallel")
            pool = CalculatorPool(self._modelStructures, 
                                  processes=self.processes,
                                  callback=callback, 
                                  kwargs=kwargs)
            results = pool.runExperiments(cases, self._policies)
            
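            # calling get() on each entry re-raises any exception that was
            # raised inside the worker processes, so failed runs do not pass
            # silently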
            for entry in results:
                try:
                    result = entry.get()
                except EMAParallelError as e:
                    exception(e)
                except Exception as e:
                    raise
            results = results[-1].get()
            del pool
        else:
            info("starting to perform experiments sequentially")

            def cleanup(modelInterfaces):
                for msi in modelInterfaces:
                    msi.cleanup()
                    del msi

            for policy in self._policies:
                for msi in self._modelStructures:
                    try:
                        msi.model_init(policy, kwargs)
                    except (EMAError, NotImplementedError) as inst:
                        exception(inst)
                        cleanup(self._modelStructures)
                        raise
    
                    for case in cases:
                        caseToRun = copy.deepcopy(case)
                        try:
                            msi.run_model(caseToRun)
                        except CaseError as e:
                            warning(str(e))
                        result = msi.retrieve_output()
                        msi.reset_model()
                        results = callback(
                                           caseToRun, policy, msi.name, 
                                           result
                                           )
            cleanup(self._modelStructures)
        info("experiments finished")
        
        return results
def find_boxes(x_remaining,
               y_remaining,
               box_init,
               peel_alpha,
               paste_alpha,
               mass_min,
               threshold,
               pasting,
               k, 
               k_max,
               n,
               cases_of_interest,
               obj_func):
    '''
    Recursively find boxes using peeling and, optionally, pasting.
    
    :param x_remaining: matrix of explanatory variables not yet covered by a 
                        previously found box.
    :param y_remaining: vector of the corresponding response values.
    :param box_init: initial box (should cover the range of x).
    :param peel_alpha: parameter controlling the peeling stage.
    :param paste_alpha: parameter controlling the pasting stage.
    :param mass_min: minimum mass of a box.
    :param threshold: minimum mean of a box.
    :param pasting: if True, include the pasting step after peeling.
    :param k: index of the current box.
    :param k_max: maximum number of boxes to find.
    :param n: total number of data points.
    :param cases_of_interest: total number of cases of interest.
    :param obj_func: objective function used in peeling and pasting.
    :return: a list of PRIM objects; the data that is not covered by any box 
             is returned as a final 'dump' box.
    '''
    k+=1
    
    info("%s points remaining" % (y_remaining.shape[0]))
    
    new_box = peel(x_remaining, y_remaining, box_init, peel_alpha, 
                   mass_min, threshold, n, obj_func)

    info("peeling completed")

    if pasting:
        logical = in_box(x_remaining, new_box)
        x_inside = x_remaining[logical]
        y_inside = y_remaining[logical]

        new_box = paste(x_inside, y_inside, x_remaining, y_remaining, 
                           box_init, new_box, paste_alpha, mass_min, 
                           threshold, n, obj_func)
        info("pasting completed")

    
    logical = in_box(x_remaining, new_box)
    x_inside = x_remaining[logical]
    y_inside = y_remaining[logical]
    box_mass = y_inside.shape[0]/float(n) #float() guards against integer division

    # update data in light of found box
    x_remaining_temp = x_remaining[logical==False]
    y_remaining_temp = y_remaining[logical==False]

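    # keep looking for more boxes as long as data remains, the maximum number
    # of boxes (k_max) has not been reached, and the newest box actually
    # restricts the space compared to the initial box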
    if (y_remaining_temp.shape[0] != 0) &\
       (k < k_max) &\
       (equal(box_init, new_box)==False):


        # make a primObject
        prim_object = Prim(x_inside, y_inside, new_box, box_mass)
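        # n * y_mean * box_mass equals the number of cases of interest inside
        # this box, so dividing by the total number of cases of interest gives
        # the coverage of the box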
        coverage = (n * prim_object.y_mean * prim_object.box_mass)/cases_of_interest
        info("Found box %s: y_mean=%s, mass=%s, coverage=%s" % (k, 
                                                                prim_object.y_mean, 
                                                                prim_object.box_mass,
                                                                coverage))
        info("%s points in new box" % (y_inside.shape[0]))
        box_init = make_box(x_remaining)
        boxes = find_boxes(x_remaining_temp, y_remaining_temp, 
                           box_init, peel_alpha, paste_alpha, mass_min, 
                           threshold, 
                           pasting, k, k_max, n, cases_of_interest, obj_func)
        boxes.append(prim_object)
        return boxes
    else:
        info("Bump "+str(k)+" includes all remaining data")
        #make dump box
        box_mass = y_remaining.shape[0]/float(n) #float() guards against integer division
        dump_box = Prim(x_remaining, y_remaining, box_init, box_mass)
        return [dump_box]