def random_forest(data, classify, nrOfTrees=100, attributes=None): ''' make a random forest using orange For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_ :param data: data from :meth:`perform_experiments`. :param classify: function for classifying runs. :param nrOfTrees: number of trees in the forest (default: 100). :param attributes: number of attributes used in a randomly drawn subset when searching for the best attribute to split the node on in tree growing (default: None; if left at None, the square root of the number of attributes in the example set is used). :rtype: an orange MeasureAttribute_randomForests instance, i.e. a random forest wrapped as an attribute measure. ''' data = build_orange_data(data, classify) #do the random forest #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details info("executing random forest") measure = orngEnsemble.MeasureAttribute_randomForests(trees=nrOfTrees, attributes=attributes) return measure
def random_forest_measure_attributes(data, classify): ''' performs feature selection using random forests in orange. The forest uses 100 trees; the number of attributes drawn per split is left at orange's default. For more details see `orange ensemble <http://orange.biolab.si/doc/modules/orngEnsemble.htm>`_ :param data: data from :meth:`perform_experiments`. :param classify: function for classifying runs. :rtype: sorted list of tuples with uncertainty names and importance values. ''' data = build_orange_data(data, classify) #do the random forest #see http://orange.biolab.si/doc/modules/orngEnsemble.htm for details info("executing random forest for attribute selection") measure = orngEnsemble.MeasureAttribute_randomForests(trees=100) #calculate importance imps = measure.importances(data) #sort importance, using schwartzian transform results = [] for i,imp in enumerate(imps): results.append((imp, data.domain.attributes[i].name)) results.sort(reverse=True) results = [(entry[1], entry[0]) for entry in results] return results
def construct_features(data, trendThold, crisisThold): info("calculating features") # construct a three entry feature vector for each data series in data features = np.zeros(shape=(data.shape[0], 3)) for i in range(data.shape[0]): features[i,:] = construct_feature_vector(data[i, :], trendThold, crisisThold) return features
def build_orange_data(data,classify): ''' helper function for turning the data from :meth:`perform_experiments` into a data object that can be used by the various orange functions. For more details see `orange domain <http://orange.biolab.si/doc/reference/Domain.htm>`_ :param data: return from :meth:`perform_experiments`. :param classify: function to be used for determining the class for each run. ''' info("building orange data") experiments, results = data #build domain dtypes = [] for entry in experiments.dtype.descr: dtypes.append((entry[0], experiments.dtype.fields.get(entry[0]))) attributes = [] for entry in dtypes: name, dtype = entry dtype = dtype[0].name if dtype == 'int' or dtype =='object': attribute = ENUM(name) [attribute.addValue(str(value)) for value in set(experiments[name].tolist())] else: attribute = FLOAT(name, startValue = np.min(experiments[name]), endValue = np.max(experiments[name])) attributes.append(attribute) data = np.array(experiments.tolist()) #determine classes classes = classify(results) classVar = ENUM('class') #these numbers are merely referring to the possible classes [classVar.addValue(str(i)) for i in set(classes.tolist())] #by default the last entry in the list should be the class variable attributes.append(classVar) domain = orange.Domain(attributes) data = np.hstack((data, classes[:, np.newaxis])) data = data.tolist() data = orange.ExampleTable(domain, data) return data
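# A minimal sketch of the kind of `classify` callable that build_orange_data
# (and the orange based functions above) expects: it maps the results dict
# from :meth:`perform_experiments` to one class label per run. The outcome
# name and the threshold used here are illustrative assumptions, not part of
# the workbench.
import numpy as np

def classify_by_end_state(results, outcome='total population', threshold=1e9):
    # results[outcome] is assumed to be a 2-d array with one run per row
    end_states = results[outcome][:, -1]
    # class 1 if the end state exceeds the threshold, class 0 otherwise
    return (end_states > threshold).astype(int)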
def perform_prim(x, y, box_init = None, peel_alpha = 0.05, paste_alpha = 0.01, mass_min = 0.05, threshold = None, pasting = False, threshold_type = 1): if threshold==None: threshold = np.mean(y) k_max = np.ceil(1/mass_min) k_max = int(k_max) info("k max: %s" %(k_max)) if box_init == None: #if no initial box, make initial box box_init = np.array([np.min(x, axis=0),np.max(x, axis=0)]) box_diff = box_init[1, :] - box_init[0, :] box_init[0,:] = box_init[0, :] - 10*paste_alpha*box_diff box_init[1,:] = box_init[1, :] + 10*paste_alpha*box_diff else: #else, identify all points in initial box, rest is discarded logical = in_box(x, box_init) x = x[logical] y = y[logical] n = y.shape[0] y = y * threshold_type boxes = find_boxes(x, y, box_init, peel_alpha, paste_alpha, mass_min, np.min(y)-0.1*np.abs(np.min(y)), pasting, 0, k_max, n) # adjust for negative hdr for box in boxes: box.y = threshold_type*box.y box.y_mean = threshold_type*box.y_mean # the list of found boxes has the dump box as first element # we need to reverse the ordering to get the correct order in which # the boxes have been found boxes.reverse() boxes = prim_hdr(boxes, threshold, threshold_type) return boxes
def perform_prim(x, y, box_init = None, peel_alpha = 0.05, paste_alpha = 0.05, mass_min = 0.05, threshold = None, pasting = False, threshold_type = 1, cases_of_interest = None, obj_func = None): if threshold==None: threshold = np.mean(y) k_max = np.ceil(1/mass_min) k_max = int(k_max) info("max number of boxes: %s" %(k_max)) if box_init == None: box_init = make_box(x) else: #else, identify all points in initial box, rest is discarded logical = in_box(x, box_init) x = x[logical] y = y[logical] n = y.shape[0] y = y * threshold_type boxes = find_boxes(x, y, box_init, peel_alpha, paste_alpha, mass_min, np.min(y)-0.1*np.abs(np.min(y)), pasting, 0, k_max, n, cases_of_interest, obj_func) # adjust for negative hdr for box in boxes: box.y = threshold_type*box.y box.y_mean = threshold_type*box.y_mean # the list of found boxes has the dump box as first element # we need to reverse the ordering to get the correct order in which # the boxes have been found boxes.reverse() boxes = prim_hdr(boxes, threshold, threshold_type) return boxes
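# Sketch (an assumption for illustration, not the workbench code) of the two
# box helpers used by perform_prim above. A box is taken to be a structured
# array with two records per field: record 0 holds the lower limit and
# record 1 the upper limit. Only real valued fields are handled here; the
# actual implementation also deals with categorical (object) fields, as the
# __filter helper below illustrates.
import numpy as np

def make_box_sketch(x):
    # initial box spanning the full range of every column of x
    box = np.zeros((2,), dtype=x.dtype)
    for name in x.dtype.names:
        box[name][0] = np.min(x[name])
        box[name][1] = np.max(x[name])
    return box

def in_box_sketch(x, box):
    # boolean index marking the rows of x that fall inside the box limits
    logical = np.ones((x.shape[0],), dtype=bool)
    for name in x.dtype.names:
        logical = logical & (x[name] >= box[name][0]) & (x[name] <= box[name][1])
    return logical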
def feature_selection(data, classify, k=5, m=100): ''' perform feature selection using orange For more details see `orange feature selection <http://orange.biolab.si/doc/modules/orngFSS.htm>`_ and `orange measure attribute <http://orange.biolab.si/doc/reference/MeasureAttribute.htm>`_ the default measure is ReliefF (MeasureAttribute_relief in Orange). :param data: data from :meth:`perform_experiments`. :param classify: function for classifying runs. :param k: the number of neighbors for each example (default 5). :param m: number of examples to use; set to -1 to use all (default 100). :rtype: sorted list of tuples with uncertainty names and reliefF attribute scores. Orange provides other metrics for feature selection * Information Gain * Gain ratio * Gini index * Relevance of attributes * Costs If you want to use any of these instead of ReliefF, use the code supplied here as a template, but modify the measure. That is, replace:: measure = orange.MeasureAttribute_relief(k=k, m=m) with the measure of choice. See the links provided above for more details. ''' data = build_orange_data(data, classify) info("executing feature selection") measure = orange.MeasureAttribute_relief(k=k, m=m) ma = orngFSS.attMeasure(data, measure) results = [] for m in ma: results.append((m[1], m[0])) results.sort(reverse=True) results = [(entry[1], entry[0]) for entry in results] return results
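# Following the template described in the docstring above, a variant that
# scores the uncertainties with gain ratio instead of ReliefF. It assumes the
# orange.MeasureAttribute_gainRatio measure listed in the measure attribute
# documentation linked above; apart from the measure, the body mirrors
# feature_selection.
def feature_selection_gain_ratio(data, classify):
    data = build_orange_data(data, classify)
    info("executing feature selection (gain ratio)")
    measure = orange.MeasureAttribute_gainRatio()
    ma = orngFSS.attMeasure(data, measure)
    results = [(score, name) for name, score in ma]
    results.sort(reverse=True)
    return [(name, score) for score, name in results]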
def prim_hdr(prims, threshold, threshold_type): ''' Highest density region for PRIM boxes prim list of prim objects threshold threshold_type ''' n = 0 for entry in prims: n += entry.y.shape[0] info("number of items in boxes: %s" %n) boxes = [(entry.y_mean, entry) for entry in prims] final_list = [] dump_entries = [] for entry in boxes: if entry[0]*threshold_type >= threshold*threshold_type: final_list.append(entry[1]) else: dump_entries.append(entry[1]) x_temp = None for entry in dump_entries: if x_temp == None: x_temp = entry.x y_temp = entry.y else: x_temp = np.append(x_temp, entry.x, axis=0) y_temp = np.append(y_temp, entry.y, axis=0) dump_box = Prim(x_temp, y_temp, prims[-1].box, y_temp.shape[0]/n) final_list.append(dump_box) return final_list
def __filter(boxes, uncertainties=[]): dump_box=boxes[-1] boxes=boxes[0:-1] uv=uncertainties #iterate over uncertainties names = [] if uncertainties: uv=uncertainties else: uv = [entry[0] for entry in dump_box.dtype.descr] for name in uv: #determine whether to show for box in boxes: minimum = box[name][0] maximum = box[name][1] value = box.dtype.fields.get(name)[0] if value == 'object': a = dump_box[name][0] if len(a) != len(minimum): ans = False else: ans = np.all(np.equal(a, minimum)) if not ans: names.append(name) break elif (minimum > dump_box[name][0]) or\ (maximum < dump_box[name][1]): names.append(name) break a = set(uv) -set(names) string_list = ", ".join(a) info(string_list + " are not visualized because they are not restricted") uv = names return uv
def construct_features(data, filterSlope, tHoldSlope, filterCurvature, tHoldCurvature, addMidExtension, addEndExtension): ''' Constructs a feature vector for each of the data-series contained in the data. ''' info("calculating features") # TODO, the casting of each feature to a list of tuples might be # removed at some stage, it will lead to a speed up, for you # can vectorize the calculations that use the feature vector features = [] for i in range(data.shape[0]): feature = construct_feature_vector(data[i, :], filterSlope, tHoldSlope, filterCurvature, tHoldCurvature, addMidExtension, addEndExtension) # feature = [tuple(feature[0,:]),tuple(feature[1,:])] features.append(feature) return features
def distance_triangle(data): """ The triangle distance is calculated as follows; Let ds1(.) and ds2(.) be two data series of length N. Then; A equals the summation of ds1(i)*ds2(i) from i=1 to N B equals the square-root of the (summation of ds1(i)^2 from i=1 to N) C equals the square-root of the (summation of ds2(i)^2 from i=1 to N) distance_triangle = A/(B*C) The triangle distance works only with data series of the same length In the literature, it is claimed that the triangle distance can deal with noise and amplitude scaling very well, and may yield poor results in cases of offset translation and linear drift. """ runLogs = [] # Generates the feature vectors for all the time series that are contained in numpy array data info("calculating distances") dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])),)) index = -1 for i in range(data.shape[0]): # For each run, a log is created # Log includes a description dictionary that has key information # for post-clustering analysis, and the data series itself. These # logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc["Index"] = str(i) behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i + 1, data.shape[0]): index += 1 distance = trdist(data[i], data[j]) dRow[index] = distance return dRow, runLogs
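# A sketch of the formula spelled out in the docstring above; the trdist
# helper used by distance_triangle is defined elsewhere in the package, so
# this is an illustration rather than that implementation.
import numpy as np

def trdist_sketch(ds1, ds2):
    a = np.sum(ds1 * ds2)
    b = np.sqrt(np.sum(ds1 ** 2))
    c = np.sqrt(np.sum(ds2 ** 2))
    return a / (b * c)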
def distance_sse(data): ''' The SSE (sum of squared-errors) distance between two data series is equal to the sum of squared-errors between corresponding data points of these two data series. Let the data series be of length N; Then SSE distance between ds1 and ds2 equals to the sum of the square of error terms from 1 to N, where error_term(i) equals to ds1(i)-ds2(i) Since SSE calculation is based on pairwise comparison of individual data points, the data series should be of equal length. SSE distance equals to the square of Euclidian distance, which is a commonly used distance metric in time series comparisons. ''' runLogs = [] #Generates the feature vectors for all the time series that are contained in numpy array data info("calculating distances") dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), )) index = -1 for i in range(data.shape[0]): # For each run, a log is created # Log includes a description dictionary that has key information # for post-clustering analysis, and the data series itself. These # logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc['Index'] = str(i) behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i+1, data.shape[0]): index += 1 distance = ssedist(data[i],data[j]) dRow[index] = distance return dRow, runLogs
def distance_mse(data): ''' The MSE (mean squared-error) distance is equal to the SSE distance divided by the number of data points in data series. The SSE distance between two data series is equal to the sum of squared-errors between corresponding data points of these two data series. Let the data series be of length N; Then SSE distance between ds1 and ds2 equals to the sum of the square of error terms from 1 to N, where error_term(i) equals to ds1(i)-ds2(i) Given that SSE is calculated as given above, MSE equals SSE divided by N. As SSE distance, the MSE distance only works with data series of equal length. ''' runLogs = [] #Generates the feature vectors for all the time series that are contained in numpy array data info("calculating distances") dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), )) index = -1 for i in range(data.shape[0]): # For each run, a log is created # Log includes a description dictionary that has key information # for post-clustering analysis, and the data series itself. These # logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc['Index'] = str(i) behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i+1, data.shape[0]): index += 1 distance = msedist(data[i],data[j]) dRow[index] = distance return dRow, runLogs
def distance_euclidian(data): ''' The Euclidian distance is equal to the square root of (the sum of squared-differences between corresponding dimensions of two N-dimensional vectors) (i.e. two data series of length N). Let the data series be of length N; Then Euclidian distance between ds1 and ds2 equals to sqrt(the sum of the square of error terms from 1 to N), where error_term(i) equals to ds1(i)-ds2(i) ''' runLogs = [] #Generates the feature vectors for all the time series that are contained in numpy array data info("calculating distances") dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), )) index = -1 for i in range(data.shape[0]): # For each run, a log is created # Log includes a description dictionary that has key information # for post-clustering analysis, and the data series itself. These # logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc['Index'] = str(i) behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i+1, data.shape[0]): index += 1 distance = eucldist(data[i],data[j]) dRow[index] = distance return dRow, runLogs
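# The distance functions above all fill dRow in the same order: the pairs
# (i, j) with i < j, row by row, which is the condensed (upper triangle)
# layout scipy uses for distance matrices. If a full symmetric matrix is
# needed, for example for inspection, it can therefore be expanded with
# scipy's squareform.
from scipy.spatial.distance import squareform

def expand_distance_row(dRow):
    # full, symmetric n-by-n distance matrix with zeros on the diagonal
    return squareform(dRow)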
def _generate_cases(self, nrOfCases): ''' Generate the cases to run the experiments on. nrOfCases specifies the number of cases to generate in case of Monte Carlo and Latin Hypercube sampling. In case of full factorial sampling it specifies the resolution on non categorical uncertainties. In case of multiple model structures, the uncertainties over which to explore are the intersection of the sets of uncertainties of the model interface instances. :param nrOfCases: In case of Latin Hypercube sampling and Monte Carlo sampling, nrOfCases specifies the number of cases to generate. In case of Full Factorial sampling, nrOfCases specifies the resolution to use for sampling continuous uncertainties. ''' #get the intersection of the uncertainties of the different models if len(self._modelStructures) >1: uncertainties = [msi.uncertainties for msi in self._modelStructures] uncertainties = set(uncertainties[0]).intersection(*uncertainties[1:]) info("intersection contains %s uncertainties" %len(uncertainties)) else: uncertainties = set(self._modelStructures[0].uncertainties) info("generating cases") designs = self.sampler.generate_design(uncertainties, nrOfCases) information = designs[1] designs = designs[0] cases = [] for design in designs: case = {} for i, name in enumerate(information): case[name] = design[i] cases.append(case) info(str(len(cases)) + " cases generated") return cases, uncertainties
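# An illustrative sketch, not the sampler used by self.sampler above, of the
# kind of design a Latin Hypercube sampler produces for continuous
# uncertainties: one row per case, one column per uncertainty, each column
# stratified over the (low, high) range of that uncertainty.
import numpy as np

def lhs_design_sketch(uncertainty_ranges, nrOfCases):
    # uncertainty_ranges is assumed to be a list of (name, (low, high)) pairs
    names = [name for name, _ in uncertainty_ranges]
    design = np.empty((nrOfCases, len(names)))
    for j, (_, (low, high)) in enumerate(uncertainty_ranges):
        # one draw in each of nrOfCases equally sized strata, then shuffled
        strata = (np.arange(nrOfCases) + np.random.rand(nrOfCases)) / nrOfCases
        np.random.shuffle(strata)
        design[:, j] = low + strata * (high - low)
    return design, names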
def find_boxes(x_remaining, y_remaining, box_init, peel_alpha, paste_alpha, mass_min, threshold, pasting, k, k_max, n): ''' Finds box Parameters x - matrix of explanatory variables y - vector of response variable box.init - initial box (should cover range of x) mass.min - min box mass threshold - min box mean pasting - TRUE - include pasting step (after peeling) - FALSE - dont include pasting Returns List with fields x - data still inside box after peeling y - corresponding response values y.mean - mean of y box - box limits mass - box mass ''' k+=1 info("%s points remaining" % (y_remaining.shape[0])) new_box = peel(x_remaining, y_remaining, box_init, peel_alpha, mass_min, threshold, n) info("peeling completed") if pasting: logical = in_box(x_remaining, new_box, bool=True) x_inside = x_remaining[logical] y_inside = y_remaining[logical] new_box = paste(x_inside, y_inside, x_remaining, y_remaining, new_box, paste_alpha, mass_min, threshold, n) info("pasting completed") logical = in_box(x_remaining, new_box, bool=True) x_inside = x_remaining[logical] y_inside = y_remaining[logical] box_mass = y_inside.shape[0]/n # update data in light of found box x_remaining_temp = x_remaining[logical==False] y_remaining_temp = y_remaining[logical==False] if (y_remaining_temp.shape[0] != 0) &\ (k < k_max) &\ (compare(box_init, new_box)==False): # make a primObject prim_object = Prim(x_inside, y_inside, new_box, box_mass) info("Found box %s: y_mean=%s, mass=%s" % (k, prim_object.y_mean, prim_object.box_mass)) info("%s points in new box" % (y_inside.shape[0])) boxes = find_boxes(x_remaining_temp, y_remaining_temp, box_init, peel_alpha, paste_alpha, mass_min, threshold, pasting, k, k_max, n) boxes.append(prim_object) return boxes else: info("Bump "+str(k)+" includes all remaining data") #make dump box box_mass = y_remaining.shape[0]/n dump_box = Prim(x_remaining, y_remaining, box_init, box_mass) return [dump_box]
def find_box(x, y, box, peel_alpha, paste_alpha, mass_min, threshold, d, n, pasting): ''' Finds box Parameters x - matrix of explanatory variables y - vector of response variable box.init - initial box (should cover range of x) mass.min - min box mass threshold - min box mean pasting - TRUE - include pasting step (after peeling) - FALSE - dont include pasting Returns List with fields x - data still inside box after peeling y - corresponding response values y.mean - mean of y box - box limits mass - box mass ''' y_mean = np.mean(y) mass = y.shape[0]/n if (y_mean >= threshold) & (mass >= mass_min): boxk_peel = peel_one(x, y, box, peel_alpha, mass_min, threshold, d, n) else: boxk_peel = None boxk_temp = None while boxk_peel: boxk_temp = copy.deepcopy(boxk_peel) boxk_peel = peel_one(boxk_temp.x, boxk_temp.y, boxk_temp.box, peel_alpha, mass_min, threshold, d, n) info("peeling completed") if pasting: boxk_paste = boxk_temp while boxk_paste: boxk_temp = boxk_paste boxk_paste = paste_one(boxk_temp.x, boxk_temp.y, x, y, boxk_temp.box, paste_alpha, mass_min, threshold, d, n) info("pasting completed") boxk = boxk_temp return boxk
def prim_hdr(prim, threshold, threshold_type): ''' Highest density region for PRIM boxes prim list of prim objects threshold threshold_type ''' n = 0 for entry in prim: n += entry.y.shape[0] info("number of items in boxes: %s" %n) y_means = np.asarray([entry.y_mean for entry in prim]) hdr_ind = np.where(y_means * threshold_type >= threshold*threshold_type)[0] if hdr_ind.shape[0] > 0: hdr_ind = np.max(hdr_ind) else: if threshold_type ==1: raise Warning("No prim box found with mean >= "+str(threshold)) elif threshold_type ==-1: raise Warning("No prim box found with mean <= "+str(threshold)) return None #highest density region x_prim_hdr = [] for k in range(hdr_ind+1): hdr = prim[k] x_prim_hdr.append(hdr) #combine non-hdr into a `dump' box if hdr_ind < len(prim)-1: info("making a dumpbox") x_temp = None for k in range(hdr_ind+1,len(prim)): #this could probably be done much faster with a slice if x_temp == None: x_temp = prim[k].x y_temp = prim[k].y else: x_temp = np.append(x_temp, prim[k].x, axis=0) y_temp = np.append(y_temp, prim[k].y, axis=0) dump_box = Prim(x_temp, y_temp, np.mean(y_temp), prim[-1].box, y_temp.shape[0]/n) x_prim_hdr.append(dump_box) #this does not work because hdr is a list x_prim_hdr_num_class = len(x_prim_hdr) x_prim_hdr_num_hdr_class = hdr_ind+1 x_prim_hdr_threshold = threshold x_prim_hdr_ind = np.zeros((x_prim_hdr_num_class)) x_prim_hdr_ind[:] = threshold_type return x_prim_hdr, x_prim_hdr_num_hdr_class
def prim_one(x, y, box_init = None, peel_alpha = 0.05, paste_alpha = 0.01, mass_min = 0.05, threshold = None, pasting = False, threshold_type = 1): d = x.shape[1] n = x.shape[0] k_max = np.ceil(1/mass_min) info("k max: %s" %(k_max)) num_boxes = int(k_max) y_mean = np.mean(y) mass_init = y.shape[0]/n #should default to 1 if I read the code correctly if box_init == None: box_init = np.array([np.min(x, axis=0),np.max(x, axis=0)]) box_diff = box_init[1, :] - box_init[0, :] box_init[0,:] = box_init[0, :] - 10*paste_alpha*box_diff box_init[1,:] = box_init[1, :] + 10*paste_alpha*box_diff # find first box k = 1 a = x.shape[0] debug("remaining items: %s" % (a)) boxk = find_box(x=x, y=y, box=box_init, peel_alpha=peel_alpha, paste_alpha=paste_alpha, mass_min=mass_min, threshold=np.min(y)-0.1*np.abs(np.min(y)), d=d, n=n, pasting=pasting) if boxk == None: info("unable to find box 1") x_prim = Prim(x, threshold_type*y, y_mean=threshold_type*y_mean, box=box_init, box_mass=mass_init, threshold=np.mean(y)) return x_prim else: b = boxk.x.shape[0] debug("removed items: %s" %b) info("Found box %s: y_mean=%s, mass=%s" % (k, threshold_type*boxk.y_mean, boxk.box_mass)) boxes = [] boxes.append(boxk) # find subsequent boxes if num_boxes > 1: # data still under consideration x_out_ind_mat = np.empty(x.shape) for j in range(d): x_out_ind_mat[:, j] = (x[:, j] < boxk.box[0,j]) | (x[:,j] >boxk.box[1,j]) x_out_ind = np.any(x_out_ind_mat,axis=1) x_out = x[x_out_ind,:] y_out = y[x_out_ind] a = x_out.shape[0] debug("remaining items: %s" % (a)) while (y_out.shape[0] > 0) & (k < num_boxes): k = k+1 boxk = find_box(x=x_out, y=y_out, box=box_init, peel_alpha=peel_alpha, paste_alpha=paste_alpha, mass_min=mass_min, threshold=np.min(y)-0.1*np.abs(np.min(y)), d=d, n=n, pasting=pasting) if boxk == None: info("Bump "+str(k)+" includes all remaining data") boxk = Prim(x_out, y_out, np.mean(y_out), box_init, y_out.shape[0]/n) b += boxk.x.shape[0] debug("removed items: %s" %b) boxes.append(boxk) break else: b += boxk.x.shape[0] debug("removed items: %s" %b) # update x and y debug("Found box %s: y_mean=%s, mass=%s" % (k, threshold_type*boxk.y_mean, boxk.box_mass)) # data still under consideration x_out_ind_mat = np.empty(x.shape) for j in range(d): x_out_ind_mat[:, j] = (x[:, j] < boxk.box[0,j]) | (x[:,j] >boxk.box[1,j]) x_out_ind_mat = np.any(x_out_ind_mat,axis=1) x_out_ind = x_out_ind & x_out_ind_mat x_out = x[x_out_ind, :] y_out = y[x_out_ind] a = x_out.shape[0] debug("remaining items: %s" %(a)) boxes.append(boxk) # adjust for negative hdr for box in boxes: box.y = threshold_type*box.y box.y_mean = threshold_type*box.y_mean prim_res, num_hdr= prim_hdr(boxes, threshold, threshold_type) return prim_res, num_hdr
def cluster(data, outcome, distanceSetup={}, clusterSetup={}): ''' Method that clusters time-series data from :meth:`perform_experiments` according to a selected distance measure :param data: return from :meth:`perform_experiments` :param outcome: Name of outcome/variable whose behavior is being analyzed :param distanceSetup: Dictionary that specifies the distance to be used in clustering, and the configuration of the distance (optional) :param clusterSetup: Dictionary that specifies the configuration of the hierarchical clustering algorithm (optional) :rtype: A list of integers, which specify the clusters to which runs' results are allocated The keys that can be specified in the distanceSetup are as follows; * 'distance': String that specifies the distance to be used. Options: bmd (default), mse, sse * 'filter?': Boolean that specifies whether the data series will be filtered (for bmd distance) * 'slope filter': A float number that specifies the filtering threshold for the slope (for every data point if change_in_the_outcome/average_value_of_the_outcome < threshold, consider slope = 0) (for bmd distance) * 'curvature filter': A float number that specifies the filtering threshold for the curvature (for every data point if change_in_the_slope/average_value_of_the_slope < threshold, consider curvature = 0) (for bmd distance) * 'no of sisters': 50 (for bmd distance) The keys that can be specified in the clusterSetup are as follows: * 'plotClusters?': True * 'Plot type': 'single-window' #other option is 'multi-window' * 'plotDendrogram?': True * 'inter-cluster distance': 'complete' # Other options are 'single' and 'average' * 'cutoff criteria': 'inconsistent' # Other options are 'distance' and 'maxclust' * 'cutoff criteria value': 0.5 ''' global varName varName = outcome dataSeries = data[1][outcome] # Construct an n-by-n matrix of distances between behaviors of the selected # outcome. It is a symmetrical matrix, with 0's along the # northwest-southeast diagonal. distanceSetup is a dictionary that specifies # (and customizes) the distance function to be used. distMatrix = constructDMatrix(dataSeries, distanceSetup) info('done with distances') pyplot.ion() # Allocate individual runs into clusters using hierarchical agglomerative # clustering. clusterSetup is a dictionary that customizes the clustering # algorithm to be used. clusters = flatcluster(distMatrix, clusterSetup) if 'Plot type' in clusterSetup.keys(): if clusterSetup['Plot type'] == 'multi-window': groupPlot = False elif clusterSetup['Plot type'] == 'single-window': groupPlot = True else: groupPlot = False else: groupPlot = False # Plots the clusters, unless it is specified not to be done in the setup if 'plotClusters?' in clusterSetup.keys(): if clusterSetup['plotClusters?']: plotClusters(groupPlot) else: pass else: plotClusters(groupPlot) return clusters
def envelopes3d_group_by(results, outcome, groupBy = 'policy', discretesize = None, logSpace=False, ymin = None, ymax = None): ''' Function for making 3d envelopes. In contrast to the envelopes in :mod:`graphs`, this version shows the density for every time step, instead of only for the end state. Note that this function makes an envelope for only 1 outcome. This envelopes will group the results based on the specified uncertainty. The user can supply a discretesize function to control the grouping in case of parameterUncertainties. This function will make a separate envelope for each group. :param results: The return from :meth:`run experiments`. :param outcome: Specify the name of outcome of interest for which you want to make the 3d envelopes. :param groupBy: The uncertainty to group by. (default=policy) :param discretesize: a discretesize function to control the grouping in case of parameterUncertainties :param logSpace: Boolean, if true, the log of the input data is used :param ymin: If provided, lower bound for the KDE, if not, ymin = np.min(results.get(outcome)) :param ymax: If provided, lower bound for the KDE, if not, ymax = np.max(results.get(outcome)) ''' def f(x, y, results): """ function that performs the kde for each timestep """ x1 = x[:,0] y1 = y[0,:] results = np.asarray(results) z = [] for i in range(len(list(x1))): data = results[:, i] try: z1 = kde.gaussian_kde(data) z1 = z1.evaluate(y1) except: z1 = np.zeros(shape=y1.shape) z.append(z1) z = np.asarray(z) z = np.log(z+1) return z #prepare the data experiments, results = results #get time axis try: time = results.pop('TIME')[0, :] except: time = np.arange(0, results.values()[0].shape[1]) def make_logical(cases, criterion, interval=False): if interval: return (cases[groupBy] >= criterion[0]) & (cases[groupBy] < criterion[1]) else: return cases[groupBy]== criterion #get the results for the specific outcome of interest results = results.get(outcome) #log results if logSpace: results = np.log(results+1) #generate the grid if ymin == None: ymin = np.min(results) info("ymin: %s" % ymin) if ymax == None: ymax = np.max(results) info("ymax: %s" % ymax) length = min(100, results.shape[1]) y = np.arange(ymin, ymax, (ymax-ymin)/length) X, Y = np.meshgrid(time, y) z = [] #do the preparation for grouping by interval=False if (experiments[groupBy].dtype == np.float32) |\ (experiments[groupBy].dtype == np.float64) |\ ((experiments[groupBy].dtype == np.int) & (len(set(experiments[groupBy])) > 5)): interval=True if discretesize: categories = discretesize(experiments[groupBy]) else: categories = __discretesize(experiments[groupBy]) else: categories = set(experiments[groupBy]) for category in categories: if interval: info("calculating kde for (%s, %s)" % (category)) else: info("calculating kde for %s" % (category)) logical = make_logical(experiments, category, interval) Z = f(X.T,Y.T, results=results[logical]) z.append(Z) #calculate the kde for the grid #visualize results fig = mlab.figure(1, bgcolor=(1, 1, 1), fgcolor=(0, 0, 0)) fig.scene.disable_render = True for i, category in enumerate(categories): if interval: info("plotting (%s, %s)" % (category)) else: info("plotting %s" % (category)) Z = z[i] extent = (-14+i*10,-6+i*10, 0,10, 0,5) s = mlab.mesh(X,Y, Z.T, extent=extent) mlab.outline(s, color=(.7, .7, .7), extent=extent) if i==0: mlab.axes(s, extent=extent, xlabel = '', ylabel = '', zlabel = 'density', x_axis_visibility=False, y_axis_visibility=False, z_axis_visibility=False) category_name = repr(category) mlab.text(-16+10*i, i+10, 
category_name, z=-2, width=0.14) fig.scene.disable_render = False mlab.title(outcome, line_width=0.5) mlab.show()
def tree(data, classify, sameMajorityPruning=False, mForPruning=0, maxMajority= 1, minSubset = 0, minExamples = 0): ''' make a classification tree using orange For more details see `orange tree <http://orange.biolab.si/doc/modules/orngTree.htm>`_ :param data: data from :meth:`perform_experiments` :param classify: function for classifying runs :param sameMajorityPruning: If true, invokes a bottom-up post-pruning by removing the subtrees of which all leaves classify to the same class (default: False). :param mForPruning: If non-zero, invokes an error-based bottom-up post-pruning, where m-estimate is used to estimate class probabilities (default: 0). :param maxMajority: Induction stops when the proportion of majority class in the node exceeds the value set by this parameter (default: 1.0). :param minSubset: Minimal number of examples in non-null leaves (default: 0). :param minExamples: Data subsets with less than minExamples examples are not split any further, that is, all leaves in the tree will contain at least that many examples (default: 0). :rtype: a classification tree. In order to print the results one can for example use `graphviz <http://www.graphviz.org/>`_. >>> import orngTree >>> tree = tree(input, classify) >>> orngTree.printDot(tree, r'..\..\models\\tree.dot', leafStr="%V (%M out of %N)") this generates a .dot file that can be opened and displayed using graphviz. The leafStr keyword argument specifies the format of the string for each leaf. See on this also the more detailed discussion on the orange web site. At some future stage, a convenience function might be added for turning a tree into a `networkx graph <http://networkx.lanl.gov/>`_. ''' data = build_orange_data(data, classify) #make the actual tree, for details on the meaning of the parameters, see #the orange webpage info("executing tree learner") tree = orngTree.TreeLearner(data, sameMajorityPruning=sameMajorityPruning, mForPruning=mForPruning, maxMajority=maxMajority, minSubset = minSubset , minExamples = minExamples) info("tree contains %s leaves" % orngTree.countLeaves(tree)) return tree
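# The docstring above mentions turning a tree into a networkx graph as a
# possible future addition. The sketch below shows one way that could look.
# It assumes the Orange 2.x tree node attributes `branches` and
# `branchDescriptions`; it is not part of the workbench and the attribute
# names should be checked against the orange version in use.
import networkx as nx

def tree_to_networkx_sketch(tree):
    graph = nx.DiGraph()

    def visit(node, node_id):
        graph.add_node(node_id)
        if node.branches:
            for i, child in enumerate(node.branches):
                if child is None:
                    continue
                child_id = "%s.%s" % (node_id, i)
                # edge labeled with the branch condition, e.g. '<=5.5'
                graph.add_edge(node_id, child_id, label=node.branchDescriptions[i])
                visit(child, child_id)

    visit(tree.tree, "root")
    return graph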
def distance_gonenc( data, sisterCount=50, wSlopeError=1, wCurvatureError=1, filterSlope=True, tHoldSlope=0.1, filterCurvature=True, tHoldCurvature=0.1, addMidExtension=True, addEndExtension=True, ): """ The distance measures the proximity of data series in terms of their qualitative pattern features. In other words, it quantifies the proximity between two different dynamic behaviour modes. It is designed to work mainly on non-stationary data. Its current version does not perform well in catching the proximity of two cyclic/repetitive patterns with different numbers of cycles (e.g. oscillation with 4 cycles versus oscillation with 6 cycles). :param data: the data series (one time series per row) for which to calculate the distances. :param sisterCount: Number of long-versions that will be created for the short vector while comparing two data series with unequal feature vector lengths. :param wSlopeError: Weight of the error between the 1st dimensions of the two feature vectors (i.e. Slope). (default=1) :param wCurvatureError: Weight of the error between the 2nd dimensions of the two feature vectors (i.e. Curvature). (default=1) :param filterSlope: Boolean, indicating whether the slope vectors should be filtered for minor fluctuations, or not. (default=True) :param tHoldSlope: The threshold value to be used in filtering out fluctuations in the slope. (default=0.1) :param filterCurvature: Boolean, indicating whether the curvature vectors should be filtered for minor fluctuations, or not. (default=True) :param tHoldCurvature: The threshold value to be used in filtering out fluctuations in the curvature. (default=0.1) :param addMidExtension: Boolean, indicating whether the feature vectors should be extended by introducing transition sections along the vector. (default=True) :param addEndExtension: Boolean, indicating whether the feature vectors should be extended by introducing startup/closing sections at the beginning/end of the vector. (default=True) """ runLogs = [] # Generates the feature vectors for all the time series that are contained # in numpy array data features = construct_features( data, filterSlope, tHoldSlope, filterCurvature, tHoldCurvature, addMidExtension, addEndExtension ) info("calculating distances") dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])),)) index = -1 for i in range(data.shape[0]): feature_i = features[i] # For each run, a log is created # Log includes a description dictionary that has key information # for post-clustering analysis, and the data series itself. These # logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc["Index"] = str(i) # this may not work due to data type mismatch featVector = feature_i behaviorDesc["Feature vector"] = str(featVector) behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i + 1, data.shape[0]): index += 1 feature_j = features[j] if feature_i.shape[1] == feature_j.shape[1]: distance = distance_same_length(feature_i, feature_j, wSlopeError, wCurvatureError) else: distance = distance_different_lenght(feature_i, feature_j, wSlopeError, wCurvatureError, sisterCount) dRow[index] = distance return dRow, runLogs
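# A small, self-contained usage sketch for distance_gonenc with synthetic
# data: ten logistic curves, one time series per row, which is the shape of
# the data that perform_experiments produces for a single outcome. The
# parameter values are illustrative.
import numpy as np

def _demo_distance_gonenc():
    t = np.linspace(0, 10, 100)
    data = np.array([1.0 / (1 + np.exp(-s * (t - 5))) for s in np.linspace(0.5, 2, 10)])
    dRow, runLogs = distance_gonenc(data, sisterCount=20)
    # dRow holds the 10*9/2 pairwise distances, runLogs one entry per series
    return dRow, runLogs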
def cluster(data, outcome, distance='gonenc', interClusterDistance='complete', cMethod='inconsistent', cValue=2.5, plotDendrogram=True, plotClusters=True, groupPlot=False, **kwargs): ''' Method that clusters time-series data from :meth:`perform_experiments` according to a selected distance measure. :param data: return from :meth:`perform_experiments`. :param outcome: Name of outcome/variable whose behavior is being analyzed :param distance: The distance metric to be used. :param interClusterDistance: How to calculate inter cluster distance. see `linkage <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_ for details. :param cMethod: Cutoff method, see `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_ for details. :param cValue: Cutoff value, see `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_ for details. :param plotDendrogram: Boolean, if true, plot dendrogram. :param plotClusters: Boolean, true if you want to plot clusters. :param groupPlot: Boolean, if true plot clusters in a single window, else the clusters are plotted in separate windows. :rtype: A tuple containing the list of distances, the cluster allocation, a list of logged distance metrics for each time series, and the linkage information (z) returned by the hierarchical clustering. The remainder of the arguments are passed on to the specified distance function. See the distance functions for details on these parameters. ''' global varName varName = outcome data = data[1][outcome] # Construct a list with distances. This list is the upper triangle # of the distance matrix dRow, runLogs = construct_distances(data, distance, **kwargs) info('finished distances') # Allocate individual runs into clusters using hierarchical agglomerative # clustering. clusterSetup is a dictionary that customizes the clustering # algorithm to be used. z, clusters, runLogs = flatcluster(dRow, runLogs, plotDendrogram=plotDendrogram, interClusterDistance=interClusterDistance, cMethod=cMethod, cValue=cValue) sample_indices = pick_csamples(clusters, dRow) # if 'Plot type' in clusterSetup.keys(): # if clusterSetup['Plot type'] == 'multi-window': # groupPlot = False # elif clusterSetup['Plot type'] == 'single-window': # groupPlot = True # else: # groupPlot = False # else: # groupPlot = False # Plots the clusters, unless it is specified not to be done in the setup # if 'plotClusters?' in clusterSetup.keys(): # if clusterSetup['plotClusters?']: # plotClusters(groupPlot, runLogs) # else: # pass if plotClusters: plot_clusters(groupPlot, runLogs) return dRow, clusters, runLogs, z
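# A usage sketch for cluster: `results` is assumed to be the tuple returned
# by perform_experiments and the outcome name is illustrative. Keyword
# arguments that cluster itself does not use (tHoldSlope below) are passed on
# to the selected distance function, here the gonenc distance defined above.
def _demo_cluster(results):
    dRow, clusters, runLogs, z = cluster(results, 'infected fraction',
                                         distance='gonenc',
                                         cMethod='maxclust',
                                         cValue=5,
                                         plotDendrogram=False,
                                         plotClusters=False,
                                         tHoldSlope=0.05)
    return clusters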
def distance_willem(data, trendThold=0.001, crisisThold=0.02, wIfCrisis=1, wNoOfCrises=1, wTrend=1, wBandwith=1, wSevCrises=1, wTriDist=0.5): ''' :param data: the time series for which to calculate the distances :param trendThold: threshold for trend :param crisisThold: threshold for crisis :param wIfCrisis: weight of crisis :param wNoOfCrises: weight of number of crises :param wTrend: weight of trend :param wSevCrises: weight of the severity of crises :param wTriDist: weight of the triangle distance component ''' runLogs = [] features = construct_features(data, trendThold, crisisThold) #normalize norm_features = features.copy() np.log(norm_features[:, 1]+1) minimum = np.min(features, axis=0) maximum = np.max(features, axis=0) a = 1/(maximum-minimum) b = -minimum/(maximum-minimum) norm_features = a*features+b info('calculating distances') dRow = np.zeros(shape=(np.sum(np.arange(data.shape[0])), )) index = 0 weights = np.array([wIfCrisis, wNoOfCrises, wSevCrises]) max_distance = 0 for i in range(data.shape[0]): feature_i = norm_features[i] # For each run, a log is created # Log includes a description dictionary that has key information for post-clustering analysis, and the data series itself # These logs are stored in a global array named runLogs behaviorDesc = {} behaviorDesc['Index'] = str(i) featVector = features[i] #this may not work due to data type mismatch featVector = tuple(featVector) behaviorDesc['Feature vector'] = "%d, %d, %s" % featVector behavior = data[i] localLog = (behaviorDesc, behavior) runLogs.append(localLog) for j in range(i+1, data.shape[0]): distance_tri = trdist(data[i],data[j]) max_distance = max((max_distance, distance_tri)) feature_j = norm_features[j] distance = np.abs(feature_i -feature_j) distance = weights*distance distance = np.sum(distance)+(distance_tri*wTriDist) dRow[index] = distance index += 1 # distance = np.abs(feature_i - norm_features[i+1::]) # distance = weights*distance # distance = np.sum(distance, axis=1) # dRow[index:index+distance.shape[0]] = distance # index += distance.shape[0] print max_distance info('distances determined') return dRow, runLogs
def perform_prim(results, classify, peel_alpha = 0.05, paste_alpha = 0.05, mass_min = 0.05, threshold = None, pasting=True, threshold_type=1, obj_func=def_obj_func): r''' perform Patient Rule Induction Method (PRIM). This function performs the PRIM algorithm on the data. It uses a Python implementation of PRIM inspired by the `R <http://www.oga-lab.net/RGM2/func.php?rd_id=prim:prim-package>`_ algorithm. Compared to the R version, the Python version is data type aware. That is, real valued, ordinal, and categorical data are treated differently. Moreover, the pasting phase of PRIM in the R algorithm is not consistent with the literature. The Python version is. the PRIM algorithm tries to find subspaces of the input space that share some characteristic in the output space. The characteristic that the current implementation looks at is the mean of the results. Thus, the output space is 1-D, and the characteristic is assumed to be continuous. As a work around, to deal with classes, the user can supply a classify function. This function should return a binary classification (i.e. 1 or 0). Then, the mean of the box is indicative of the concentration of cases of class 1. That is, if the specified threshold is say 0.8 and the threshold_type is 1, PRIM looks for subspaces of the input space that contains at least 80\% cases of class 1. :param results: the return from :meth:`perform_experiments`. :param classify: either a string denoting the outcome of interest to use or a function. In case of a string and time series data, the end state is used. :param peel_alpha: parameter controlling the peeling stage (default = 0.05). :param paste_alpha: parameter controlling the pasting stage (default = 0.05). :param mass_min: minimum mass of a box (default = 0.05). :param threshold: the threshold of the output space that boxes should meet. :param pasting: perform pasting stage (default=True) :param threshold_type: If 1, the boxes should go above the threshold, if -1 the boxes should go below the threshold, if 0, the algorithm looks for both +1 and -1. :param obj_func: The objective function to use. Default is :func:`def_obj_func` :return: a list of PRIM objects. for each box, the scenario discovery metrics *coverage* and *density* are also calculated: .. math:: coverage=\frac {{\displaystyle\sum_{y_{i}\in{B}}y_{i}{'}}} {{\displaystyle\sum_{y_{i}\in{B^I}}y_{i}{'}}} where :math:`y_{i}{'}=1` if :math:`x_{i}\in{B}` and :math:`y_{i}{'}=0` otherwise. .. math:: density=\frac {{\displaystyle\sum_{y_{i}\in{B}}y_{i}{'}}} {{\displaystyle\left|{y_{i}}\right|\in{B}}} where :math:`y_{i}{'}=1` if :math:`x_{i}\in{B}` and :math:`y_{i}{'}=0` otherwise, and :math:`{\displaystyle\left|{y_{i}}\right|\in{B}}` is the cardinality of :math:`y_{i}`. Density is the ratio of the cases of interest in a box to the total number of cases in that box. *density* is identical to the mean in case of a binary classification. For more detail on these metrics see `Bryant and Lempert (2010) <http://www.sciencedirect.com/science/article/pii/S004016250900105X>`_ .. rubric:: references to relevant papers * `original PRIM paper <http://www.springerlink.com/content/x3gpv05t34620878/>`_ * `paper on ordinal data and PRIM <http://www.sciencedirect.com/science/article/pii/S095741740700231X>`_ **ema application** * `Lempert et al. 
(2006) <http://mansci.journal.informs.org/content/52/4/514>`_ * `Groves and Lempert (2007) <http://www.sciencedirect.com/science/article/pii/S0959378006000896#ref_bib19>`_ * `Bryant and Lempert (2010) <http://www.sciencedirect.com/science/article/pii/S004016250900105X>`_ ''' experiments, results = results #make y if type(classify) == StringType: results = results.get(classify) if len(results.shape) == 2: y = results[:, -1] else: y = results count = np.zeros(y.shape) count[y*threshold_type > threshold] = 1 cases_of_interest = np.sum(count) info("number of cases of interest is %d" % (np.sum(count))) elif callable(classify): y = classify(results) cases_of_interest = np.sum(y) info("number of cases of interest is %d" % (np.sum(y))) else: raise EMAError("incorrect specification of classify, this should be a function or a string") x = experiments #perform prim boxes = recursivePrim.perform_prim(x, y, box_init=None, peel_alpha=peel_alpha, paste_alpha=paste_alpha, mass_min=mass_min, threshold=threshold, pasting=pasting, threshold_type=threshold_type,obj_func=obj_func, cases_of_interest=cases_of_interest) #calculate scenario discovery metrics and add these to boxes boxes = calculate_sd_metrics(boxes, y, threshold, threshold_type) #return prim return boxes
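# A sketch of the coverage and density formulas given in the docstring above,
# for a single box: y is the binary classification of all cases (1 means a
# case of interest) and in_box is a boolean mask marking the cases that fall
# inside the box. The calculate_sd_metrics helper actually used above is
# defined elsewhere in the package.
import numpy as np

def sd_metrics_sketch(y, in_box):
    cases_of_interest_in_box = np.sum(y[in_box])
    coverage = cases_of_interest_in_box / float(np.sum(y))
    density = cases_of_interest_in_box / float(np.sum(in_box))
    return coverage, density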
def perform_experiments(self, cases, callback = util.DefaultCallback, kwargs = None): """ Method responsible for running the experiments on a structure. In case of multiple model structures, the outcomes are set to the intersection of the sets of outcomes of the various models. :param cases: In case of Latin Hypercube sampling and Monte Carlo sampling, cases specifies the number of cases to generate. In case of Full Factorial sampling, cases specifies the resolution to use for sampling continuous uncertainties. Alternatively, one can supply a list of dicts, where each dict contains a case. That is, an uncertainty name as key, and its value. :param callback: Class that will be called after finishing a single experiment. :param kwargs: generic keyword arguments to pass to the model_init :returns: a `structured numpy array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_ containing the experiments, and a dict with the names of the outcomes as keys and a numpy array as value. .. rubric:: suggested use In general, analysis scripts require both the structured array of the experiments and the dictionary of arrays containing the results. The recommended use is the following:: >>> results = ensemble.perform_experiments(10000) #recommended use >>> experiments, output = ensemble.perform_experiments(10000) #will work fine The latter option will work fine, but most analysis scripts require wrapping it up into a tuple again:: >>> data = (experiments, output) Another reason for the recommended use is that you can save this tuple directly:: >>> import expWorkbench.util as util >>> util.save_results(results, file) """ if type(cases) == types.IntType: cases, uncertainties = self._generate_cases(cases) if type(cases) == types.ListType: #get the intersection of uncertainties if len(self._modelStructures) >1: uncertainties = [msi.uncertainties for msi in self._modelStructures] uncertainties = set(uncertainties[0]).intersection(*uncertainties[1:]) info("intersection contains %s uncertainties" %len(uncertainties)) else: uncertainties = self._modelStructures[0].uncertainties #keep only the uncertainties that appear in the keys of the cases uncertaintyNames = cases[0].keys() uncertainties = [uncertainty for uncertainty in uncertainties if uncertainty.name in uncertaintyNames] if not self._policies: self._policies.append({"name": "None"}) nrOfExperiments =len(cases)*len(self._policies)*len(self._modelStructures) info(str(nrOfExperiments) + " experiments will be executed") #set outcomes to the intersection of outcomes across models outcomes = [msi.outcomes for msi in self._modelStructures] outcomes = set(outcomes[0]).intersection(*outcomes[1:]) for msi in self._modelStructures: msi.outcomes = list(outcomes) #initialize the callback object callback = callback(uncertainties, outcomes, nrOfExperiments).callback if self.parallel: info("starting to perform experiments in parallel") pool = CalculatorPool(self._modelStructures, processes=self.processes, callback=callback, kwargs=kwargs) results = pool.runExperiments(cases, self._policies) for entry in results: try: result = entry.get() except EMAParallelError as e: exception(e) except Exception as e: raise results = results[-1].get() del pool else: info("starting to perform experiments sequentially") def cleanup(modelInterfaces): for msi in modelInterfaces: msi.cleanup() del msi for policy in self._policies: for msi in self._modelStructures: try: msi.model_init(policy, kwargs) except (EMAError, NotImplementedError) as inst: exception(inst) cleanup(self._modelStructures) raise for case in cases: caseToRun = copy.deepcopy(case) try: msi.run_model(caseToRun) except CaseError as e: warning(str(e)) result = msi.retrieve_output() msi.reset_model() results = callback( caseToRun, policy, msi.name, result ) cleanup(self._modelStructures) info("experiments finished") return results
def find_boxes(x_remaining, y_remaining, box_init, peel_alpha, paste_alpha, mass_min, threshold, pasting, k, k_max, n, cases_of_interest, obj_func): ''' Finds box Parameters x - matrix of explanatory variables y - vector of response variable box.init - initial box (should cover range of x) mass.min - min box mass threshold - min box mean pasting - TRUE - include pasting step (after peeling) - FALSE - dont include pasting Returns List with fields x - data still inside box after peeling y - corresponding response values y.mean - mean of y box - box limits mass - box mass ''' k+=1 info("%s points remaining" % (y_remaining.shape[0])) new_box = peel(x_remaining, y_remaining, box_init, peel_alpha, mass_min, threshold, n, obj_func) info("peeling completed") if pasting: logical = in_box(x_remaining, new_box) x_inside = x_remaining[logical] y_inside = y_remaining[logical] new_box = paste(x_inside, y_inside, x_remaining, y_remaining, box_init, new_box, paste_alpha, mass_min, threshold, n, obj_func) info("pasting completed") logical = in_box(x_remaining, new_box) x_inside = x_remaining[logical] y_inside = y_remaining[logical] box_mass = y_inside.shape[0]/n # update data in light of found box x_remaining_temp = x_remaining[logical==False] y_remaining_temp = y_remaining[logical==False] if (y_remaining_temp.shape[0] != 0) &\ (k < k_max) &\ (equal(box_init, new_box)==False): # #some debugging stuff # # name = 'total productivity growth main switch' # print "new box" # print new_box[name][0] # print "data" # # for entry in set(x_remaining_temp[name]): # # nr = x_remaining_temp[x_remaining_temp[name]==entry].shape[0] # print "%s\t%s" %(entry, nr) # # #end of debugging stuff # make a primObject prim_object = Prim(x_inside, y_inside, new_box, box_mass) coverage = (n * prim_object.y_mean * prim_object.box_mass)/cases_of_interest info("Found box %s: y_mean=%s, mass=%s, coverage=%s" % (k, prim_object.y_mean, prim_object.box_mass, coverage)) info("%s points in new box" % (y_inside.shape[0])) box_init = make_box(x_remaining) boxes = find_boxes(x_remaining_temp, y_remaining_temp, box_init, peel_alpha, paste_alpha, mass_min, threshold, pasting, k, k_max, n, cases_of_interest, obj_func) boxes.append(prim_object) return boxes else: info("Bump "+str(k)+" includes all remaining data") #make dump box box_mass = y_remaining.shape[0]/n dump_box = Prim(x_remaining, y_remaining, box_init, box_mass) return [dump_box]
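# A sketch of the box comparison used as stopping condition above (the actual
# `equal` helper lives elsewhere in the package): two boxes are taken to be
# equal when every field has identical limits. For categorical (object)
# fields the sets of allowed values are compared, in line with how __filter
# above inspects boxes. The representation assumed here is the same two
# record structured array used in the other sketches.
import numpy as np

def boxes_equal_sketch(box1, box2):
    for name in box1.dtype.names:
        if box1.dtype.fields[name][0] == np.dtype(object):
            if set(box1[name][0]) != set(box2[name][0]):
                return False
        elif (box1[name][0] != box2[name][0]) or (box1[name][1] != box2[name][1]):
            return False
    return True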