def distance_sse(data):
    '''
    Pairwise SSE (sum of squared-errors) distances between data series.

    The SSE distance between two data series ds1 and ds2 of length N is the
    sum, for i from 1 to N, of the squared error terms ds1(i)-ds2(i). It
    equals the square of the Euclidian distance, a commonly used metric in
    time series comparisons. Because the comparison is point by point, the
    data series should be of equal length. The distance of a single pair is
    delegated to ssedist.

    Parameters
    ----------
    data : ndarray
           the data series, one series per entry along the first axis

    Returns
    -------
    tuple
        the row of pairwise distances, ordered (0,1), (0,2), ..., (1,2),
        ..., and a list containing one (description dict, data series)
        log per data series

    '''
    info("calculating distances")

    nr_series = data.shape[0]

    # one log per run: a description dictionary with key information for
    # post-clustering analysis, and the data series itself
    run_logs = [({'Index': str(row)}, data[row]) for row in range(nr_series)]

    d_row = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    pairs = ((first, second)
             for first in range(nr_series)
             for second in range(first + 1, nr_series))
    for position, (first, second) in enumerate(pairs):
        d_row[position] = ssedist(data[first], data[second])

    return d_row, run_logs
def log_stats(self, gen):
    '''
    Log statistics on the progress of the evolution

    For every objective (column of the array returned by
    __get_hof_in_array) the minimum, maximum, mean, and standard deviation
    are calculated and logged as a table at the info level.

    Parameters
    ----------
    gen : int
          the generation number shown in the log message

    '''
    functions = {"min": self.minima,
                 "max": self.maxima,
                 "std": self.std,
                 "mean": self.mean}

    hof = self.__get_hof_in_array()
    info_message = pd.DataFrame(
        index=['min', 'max', 'mean', 'std'],
        columns=['obj_{}'.format(i) for i in range(hof.shape[1])])

    # items() instead of iteritems(): the latter does not exist on
    # Python 3, while items() works on both major versions
    for key, value in functions.items():
        info_message.loc[key] = value(hof)

    # let pandas do the formatting for us. to_string() renders only the
    # table, so there is no trailing size summary to strip; slicing lines
    # off str(info_message) removed real rows whenever pandas printed no
    # such summary
    message = info_message.to_string()

    line = "\ngeneration {}\n{}".format(gen, message)
    ema_logging.info(line)
def determine_time_dimension(outcomes):
    '''
    helper function for determining or creating time dimension

    If outcomes has a 'TIME' entry, its first row is used as time dimension
    and the entry is removed from outcomes. Otherwise, the time dimension is
    a simple range as long as the second axis of the first two dimensional
    outcome. If neither is available, this is logged and time is None.

    Parameters
    ----------
    outcomes : dict
               modified in place, 'TIME' is removed when present

    Returns
    -------
    tuple
        the time dimension (ndarray or None) and the outcomes dict

    '''
    try:
        time = outcomes['TIME'][0, :]
    except KeyError:
        # no explicit time, derive it from the first 2d outcome
        time = next((np.arange(0, entry.shape[1])
                     for entry in outcomes.values()
                     if len(entry.shape) == 2), None)
    else:
        outcomes.pop('TIME')

    if time is None:
        info("no time dimension found in results")

    return time, outcomes
def __init__(self, gui=False, thd=False):
    '''
    Create a link with netlogo. Underneath, the netlogo jvm is started
    through jpype, unless a jvm is already running.

    :param gui: boolean, if true run netlogo with gui, otherwise run in
                headless mode. Defaults to false. On mac, the gui is
                always switched off.
    :param thd: boolean, if true start netlogo in 3d mode. Defaults to
                false

    '''
    on_mac = sys.platform == 'darwin'

    if not jpype.isJVMStarted():
        # netlogo jars, relative to the netlogo home directory
        netlogo_jars = (r'/lib/scala-library.jar',
                        r'/lib/asm-all-3.3.1.jar',
                        r'/lib/picocontainer-2.13.6.jar',
                        r'/lib/log4j-1.2.16.jar',
                        r'/lib/jmf-2.1.1e.jar',
                        r'/lib/pegdown-1.1.0.jar',
                        r'/lib/parboiled-core-1.0.2.jar',
                        r'/lib/parboiled-java-1.0.2.jar',
                        r'/lib/mrjadapter-1.2.jar',
                        r'/lib/jhotdraw-6.0b1.jar',
                        r'/lib/quaqua-7.3.4.jar',
                        r'/lib/swing-layout-7.3.4.jar',
                        r'/lib/jogl-1.1.1.jar',
                        r'/lib/gluegen-rt-1.1.1.jar',
                        r'/NetLogo.jar')
        jars = [NETLOGO_HOME + jar for jar in netlogo_jars]
        jars.append(PYNETLOGO_HOME + r'/external_files/netlogoLink.jar')

        # format jars in right format for starting java virtual machine
        # TODO the use of the jre here is only relevant under windows
        # apparently
        # might be solvable by setting netlogo home user.dir
        jarpath = '-Djava.class.path={}'.format(jar_separator.join(jars))

        jpype.startJVM(jpype.getDefaultJVMPath(), jarpath,
                       "-Xms128M", "-Xmx1024m")

        system = jpype.java.lang.System
        system.setProperty('user.dir', NETLOGO_HOME)
        if on_mac:
            system.setProperty("java.awt.headless", "true")

        debug("jvm started")

    link = jpype.JClass('netlogoLink.NetLogoLink')
    debug('NetLogoLink class found')

    if on_mac and gui:
        info('on mac only headless mode is supported')
        gui = False

    self.link = link(gui, thd)
    debug('NetLogoLink class instantiated')
def __call__(self, case_id, case, policy, name, result):
    '''
    Method responsible for storing results. The implementation here only
    counts the completed runs and logs the count: always at the debug
    level, and at the info level every reporting_interval runs. None of
    the arguments is used.

    Any extension of AbstractCallback needs to implement this method. If
    one wants to use the logging provided here, call it via super.

    Parameters
    ----------
    case_id: int
             the job id
    case: dict
          the case to be stored
    policy: str
            the name of the policy being used
    name: str
          the name of the model being used
    result: dict
            the result dict

    '''
    self.i += 1

    progress = str(self.i) + " cases completed"
    ema_logging.debug(progress)

    at_reporting_point = self.i % self.reporting_interval == 0
    if at_reporting_point:
        ema_logging.info(progress)
def determine_time_dimension(outcomes):
    '''
    helper function for determining or creating time dimension

    If outcomes has a 'TIME' entry, its first row is used as time dimension
    and the entry is removed from outcomes. Otherwise, the time dimension is
    a simple range as long as the second axis of the first two dimensional
    outcome. If neither is available, this is logged and time is None.

    Parameters
    ----------
    outcomes : dict
               modified in place, 'TIME' is removed when present

    Returns
    -------
    tuple
        the time dimension (ndarray or None) and the outcomes dict

    '''
    try:
        time = outcomes['TIME'][0, :]
    except KeyError:
        # no explicit time, derive it from the first 2d outcome
        time = next((np.arange(0, entry.shape[1])
                     for entry in outcomes.values()
                     if len(entry.shape) == 2), None)
    else:
        outcomes.pop('TIME')

    if time is None:
        info("no time dimension found in results")

    return time, outcomes
def __init__(self, gui=False, thd=False):
    '''
    Create a link with netlogo. Underneath, the netlogo jvm is started
    through jpype, unless a jvm is already running.

    :param gui: boolean, if true run netlogo with gui, otherwise run in
                headless mode. Defaults to false. On mac, the gui is
                always switched off.
    :param thd: boolean, if true start netlogo in 3d mode. Defaults to
                false

    '''
    on_mac = sys.platform == 'darwin'

    if not jpype.isJVMStarted():
        # netlogo jars, relative to the netlogo home directory
        netlogo_jars = (r'/lib/scala-library.jar',
                        r'/lib/asm-all-3.3.1.jar',
                        r'/lib/picocontainer-2.13.6.jar',
                        r'/lib/log4j-1.2.16.jar',
                        r'/lib/jmf-2.1.1e.jar',
                        r'/lib/pegdown-1.1.0.jar',
                        r'/lib/parboiled-core-1.0.2.jar',
                        r'/lib/parboiled-java-1.0.2.jar',
                        r'/lib/mrjadapter-1.2.jar',
                        r'/lib/jhotdraw-6.0b1.jar',
                        r'/lib/quaqua-7.3.4.jar',
                        r'/lib/swing-layout-7.3.4.jar',
                        r'/lib/jogl-1.1.1.jar',
                        r'/lib/gluegen-rt-1.1.1.jar',
                        r'/NetLogo.jar')
        jars = [NETLOGO_HOME + jar for jar in netlogo_jars]
        jars.append(PYNETLOGO_HOME + r'/external_files/netlogoLink.jar')

        # format jars in right format for starting java virtual machine
        # TODO the use of the jre here is only relevant under windows
        # apparently
        # might be solvable by setting netlogo home user.dir
        jarpath = '-Djava.class.path={}'.format(jar_separator.join(jars))

        jpype.startJVM(jpype.getDefaultJVMPath(), jarpath,
                       "-Xms128M", "-Xmx1024m")

        system = jpype.java.lang.System
        system.setProperty('user.dir', NETLOGO_HOME)
        if on_mac:
            system.setProperty("java.awt.headless", "true")

        debug("jvm started")

    link = jpype.JClass('netlogoLink.NetLogoLink')
    debug('NetLogoLink class found')

    if on_mac and gui:
        info('on mac only headless mode is supported')
        gui = False

    self.link = link(gui, thd)
    debug('NetLogoLink class instantiated')
def do_text_ticks_labels(ax, i, j, field1, field2, ylabels, outcomes_to_show):
    '''
    Helper function for setting the tick labels on the axes correctly on
    and off

    On the diagonal (i == j) the name of the outcome is written in the
    middle of the axes. The x tick labels are removed unless i is the last
    index of outcomes_to_show, in which case the x axis is labelled. The y
    tick labels are removed unless j is 0, in which case the y axis is
    labelled.

    Parameters
    ----------
    ax : axes
    i : int
    j : int
    field1 : str
             name used for the text on the diagonal and the y label
    field2 : str
             name used for the x label
    ylabels : dict, optional
              alternative labels, keyed by field name. If a field is
              missing from ylabels, this is logged and the axis label is
              left untouched.
    outcomes_to_show : sequence
                       only its length is used
                       (NOTE(review): originally documented as str,
                       presumably a list of outcome names - confirm)

    '''
    # text and labels
    if i == j:
        # only plot the name in the middle
        text = ylabels[field1] if ylabels else field1
        ax.text(0.5, 0.5, text,
                horizontalalignment='center',
                verticalalignment='center',
                transform=ax.transAxes)

    # are we at the end of the row?
    if i != len(outcomes_to_show) - 1:
        # xaxis off
        ax.set_xticklabels([])
    elif ylabels:
        # index rather than .get(): .get() never raises, which made the
        # KeyError branch unreachable and passed None on as label
        try:
            label = ylabels[field2]
        except KeyError:
            info("no label specified for " + field2)
        else:
            ax.set_xlabel(label)
    else:
        ax.set_xlabel(field2)

    # are we at the end of the column?
    if j != 0:
        # yaxis off
        ax.set_yticklabels([])
    elif ylabels:
        try:
            label = ylabels[field1]
        except KeyError:
            info("no label specified for " + field1)
        else:
            ax.set_ylabel(label)
    else:
        ax.set_ylabel(field1)
def do_text_ticks_labels(ax, i, j, field1, field2, ylabels, outcomes_to_show):
    '''
    Helper function for setting the tick labels on the axes correctly on
    and off

    On the diagonal (i == j) the name of the outcome is written in the
    middle of the axes. The x tick labels are removed unless i is the last
    index of outcomes_to_show, in which case the x axis is labelled. The y
    tick labels are removed unless j is 0, in which case the y axis is
    labelled.

    Parameters
    ----------
    ax : axes
    i : int
    j : int
    field1 : str
             name used for the text on the diagonal and the y label
    field2 : str
             name used for the x label
    ylabels : dict, optional
              alternative labels, keyed by field name. If a field is
              missing from ylabels, this is logged and the axis label is
              left untouched.
    outcomes_to_show : sequence
                       only its length is used
                       (NOTE(review): originally documented as str,
                       presumably a list of outcome names - confirm)

    '''
    # text and labels
    if i == j:
        # only plot the name in the middle
        text = ylabels[field1] if ylabels else field1
        ax.text(0.5, 0.5, text,
                horizontalalignment='center',
                verticalalignment='center',
                transform=ax.transAxes)

    # are we at the end of the row?
    if i != len(outcomes_to_show) - 1:
        # xaxis off
        ax.set_xticklabels([])
    elif ylabels:
        # index rather than .get(): .get() never raises, which made the
        # KeyError branch unreachable and passed None on as label
        try:
            label = ylabels[field2]
        except KeyError:
            info("no label specified for " + field2)
        else:
            ax.set_xlabel(label)
    else:
        ax.set_xlabel(field2)

    # are we at the end of the column?
    if j != 0:
        # yaxis off
        ax.set_yticklabels([])
    elif ylabels:
        try:
            label = ylabels[field1]
        except KeyError:
            info("no label specified for " + field1)
        else:
            ax.set_ylabel(label)
    else:
        ax.set_ylabel(field1)
def _run_optimization(self, generate_individual, evaluate_population,
                      algorithm=None, obj_function=None, weights=None,
                      levers=None, pop_size=None, reporting_interval=None,
                      nr_of_generations=None, crossover_rate=None,
                      mutation_rate=None, caching=False, **kwargs):
    '''
    Helper function that runs the actual optimization

    The algorithm is instantiated and stored on self.algorithm, after which
    its get_population method is called once per generation.

    Parameters
    ----------
    generate_individual : callable
                          helper function for generating an individual
    evaluate_population : callable
                          helper function for evaluating the population
    algorithm : callable
                called with the other arguments to create the algorithm
                instance
    obj_function : callable
                   the objective function
    weights : tuple
              the weights on the outcomes
    levers : dict
             a dictionary with param keys as keys, and as values info used
             in mutation.
    pop_size : int
               the size of the population
    reporting_interval : int
                         the interval for reporting progress, passed on to
                         perform_experiments
    nr_of_generations : int
                        number of generations for which the GA will be run
    crossover_rate : float
                     the crossover rate of the GA
    mutation_rate : float
                    the mutation rate of the GA
    caching : bool
              passed on to the algorithm
    kwargs : dict
             any additional keyword arguments are passed on to the
             algorithm

    Returns
    -------
    tuple
        the stats_callback of the algorithm and the population of the last
        generation (None if nr_of_generations is 0)

    '''
    self.algorithm = algorithm(weights, levers, generate_individual,
                               obj_function, pop_size, evaluate_population,
                               nr_of_generations, crossover_rate,
                               mutation_rate, reporting_interval, self,
                               caching, **kwargs)

    # without this, zero generations would leave pop unbound and the
    # return statement would fail with an UnboundLocalError
    pop = None

    # Begin the generational process
    for _ in range(nr_of_generations):
        pop = self.algorithm.get_population()

    info("-- End of (successful) evolution --")

    return self.algorithm.stats_callback, pop
def _terminate_pool(cls, taskqueue, inqueue, outqueue, pool, task_handler, result_handler, cache, working_dirs, ): ema_logging.info("terminating pool") # this is guaranteed to only be called once ema_logging.debug('finalizing pool') TERMINATE = 2 task_handler._state = TERMINATE for p in pool: taskqueue.put(None) # sentinel time.sleep(1) ema_logging.debug('helping task handler/workers to finish') cls._help_stuff_finish(inqueue, task_handler, len(pool)) assert result_handler.is_alive() or len(cache) == 0 result_handler._state = TERMINATE outqueue.put(None) # sentinel if pool and hasattr(pool[0], 'terminate'): ema_logging.debug('terminating workers') for p in pool: p.terminate() ema_logging.debug('joining task handler') task_handler.join(1e100) ema_logging.debug('joining result handler') result_handler.join(1e100) if pool and hasattr(pool[0], 'terminate'): ema_logging.debug('joining pool workers') for p in pool: p.join() # cleaning up directories # TODO investigate whether the multiprocessing.util tempdirectory # functionality can be used instead for directory in working_dirs: ema_logging.debug("deleting "+str(directory)) shutil.rmtree(directory)
def make_data_structure(clusters, distRow, runLogs):
    '''
    Create a Cluster instance for each of the clusters.

    For every cluster label from 1 up to and including the highest label in
    clusters, the distances between the members of the cluster are
    collected from distRow. The member with the lowest summed distance to
    the other members is passed to Cluster, together with the label, the
    indices of the members, their run logs, and the collected distances.

    Parameters
    ----------
    clusters : ndarray
               the cluster label of each run; labels are assumed to start
               at 1
    distRow : ndarray
              row with the pairwise distances between all runs, indexed
              via get_drow_index
    runLogs : list
              one log per run, in the same order as clusters

    Returns
    -------
    list
        the Cluster instances, ordered by cluster label

    '''
    nr_clusters = np.max(clusters)
    nr_runs = clusters.shape[0]
    cluster_list = []

    for i in range(1, nr_clusters + 1):
        info("starting with cluster %s" % i)

        # determine the indices for cluster i
        indices = np.where(clusters == i)[0]
        size = indices.shape[0]

        # floor division, because true division results in a float, which
        # numpy does not accept as the size of an array
        drow_indices = np.zeros((size**2 - size) // 2, dtype=int)

        # get the indices for the distance for the runs in the cluster
        s = 0
        for q in range(size):
            for r in range(q + 1, size):
                drow_indices[s] = get_drow_index(indices[r], indices[q],
                                                 nr_runs)
                s += 1

        # get the distance for the runs in the cluster
        dist_clust = distRow[drow_indices]

        # make a distance matrix
        dist_matrix = squareform(dist_clust)

        # sum across the rows
        row_sum = dist_matrix.sum(axis=0)

        # get the index of the result with the lowest sum of distances
        min_cIndex = row_sum.argmin()

        # convert this cluster specific index back to the overall cluster
        # list of indices
        originalIndex = indices[min_cIndex]

        # NOTE(review): debugging output, consider using the logger
        print(originalIndex)

        members = [int(entry) for entry in indices]
        cluster = Cluster(i, indices, originalIndex,
                          [runLogs[entry] for entry in members],
                          dist_clust)
        cluster_list.append(cluster)

    return cluster_list
def make_data_structure(clusters, distRow, runLogs):
    '''
    Create a Cluster instance for each of the clusters.

    For every cluster label from 1 up to and including the highest label in
    clusters, the distances between the members of the cluster are
    collected from distRow. The member with the lowest summed distance to
    the other members is passed to Cluster, together with the label, the
    indices of the members, their run logs, and the collected distances.

    Parameters
    ----------
    clusters : ndarray
               the cluster label of each run; labels are assumed to start
               at 1
    distRow : ndarray
              row with the pairwise distances between all runs, indexed
              via get_drow_index
    runLogs : list
              one log per run, in the same order as clusters

    Returns
    -------
    list
        the Cluster instances, ordered by cluster label

    '''
    nr_clusters = np.max(clusters)
    nr_runs = clusters.shape[0]
    cluster_list = []

    for i in range(1, nr_clusters + 1):
        info("starting with cluster %s" % i)

        # determine the indices for cluster i
        indices = np.where(clusters == i)[0]
        size = indices.shape[0]

        # floor division, because true division results in a float, which
        # numpy does not accept as the size of an array
        drow_indices = np.zeros((size**2 - size) // 2, dtype=int)

        # get the indices for the distance for the runs in the cluster
        s = 0
        for q in range(size):
            for r in range(q + 1, size):
                drow_indices[s] = get_drow_index(indices[r], indices[q],
                                                 nr_runs)
                s += 1

        # get the distance for the runs in the cluster
        dist_clust = distRow[drow_indices]

        # make a distance matrix
        dist_matrix = squareform(dist_clust)

        # sum across the rows
        row_sum = dist_matrix.sum(axis=0)

        # get the index of the result with the lowest sum of distances
        min_cIndex = row_sum.argmin()

        # convert this cluster specific index back to the overall cluster
        # list of indices
        originalIndex = indices[min_cIndex]

        # NOTE(review): debugging output, consider using the logger
        print(originalIndex)

        members = [int(entry) for entry in indices]
        cluster = Cluster(i, indices, originalIndex,
                          [runLogs[entry] for entry in members],
                          dist_clust)
        cluster_list.append(cluster)

    return cluster_list
def _run_through_cache(self, individuals):
    '''
    Helper function, check whether individuals already have been
    evaluated, if so use the cached value

    The cache is keyed by a tuple with the repr of the value of each lever
    in self.lever_keys. The number of individuals with an invalid fitness
    is logged before and after the cache has been checked.

    Parameters
    ----------
    individuals : list
                  the individuals to check; the fitness of an individual
                  found in the cache is set in place

    Returns
    -------
    list
        the individuals whose fitness is still invalid after checking the
        cache, so these still have to be evaluated

    '''
    invalid_inds = [ind for ind in individuals if not ind.fitness.valid]
    ema_logging.info(
        'nr. of invalid individuals before checking cache: {}'.format(
            len(invalid_inds)))

    for invalid_ind in invalid_inds:
        # construct key
        key = tuple(repr(invalid_ind.get(entry))
                    for entry in self.lever_keys)

        try:
            cached_values = self.cache[key]
        except KeyError:
            # not in the cache, so the individual remains invalid
            continue

        # set value if in caching
        invalid_ind.fitness.values = cached_values

    invalid_inds = [ind for ind in individuals if not ind.fitness.valid]
    ema_logging.info(
        'nr. of invalid individuals after checking cache: {}'.format(
            len(invalid_inds)))

    # callers use the result to decide what to evaluate; without this
    # return they receive None and never evaluate anything
    return invalid_inds
def construct_features(data, filterSlope, tHoldSlope, filterCurvature, tHoldCurvature, addMidExtension, addEndExtension):
    '''
    Constructs a feature vector for each of the data-series contained in
    the data.

    Every row of data is handed to construct_feature_vector, together with
    the filter, threshold, and extension settings.

    Returns
    -------
    list
        the feature vectors, in the same order as the rows of data

    '''
    info("calculating features")

    settings = (filterSlope, tHoldSlope, filterCurvature, tHoldCurvature,
                addMidExtension, addEndExtension)

    # TODO, the casting of each feature to a list of tuples might be
    # removed at some stage, it will lead to a speed up, for you
    # can vectorize the calculations that use the feature vector
    return [construct_feature_vector(data[row, :], *settings)
            for row in range(data.shape[0])]
def distance_triangle(data):
    '''
    Pairwise triangle distances between data series.

    The triangle distance is calculated as follows; let ds1(.) and ds2(.)
    be two data series of length N. Then;
    A equals to the summation of ds1(i).ds2(i) from i=1 to N
    B equals to the square-root of the (summation ds1(i)^2 from i=1 to N)
    C equals to the square-root of the (summation ds2(i)^2 from i=1 to N)
    (NOTE(review): the original text used ds1 for both B and C, presumably
    C is meant for ds2 - confirm against trdist)

    distance_triangle = A/(B.C)

    The triangle distance works only with data series of the same length.
    In the literature, it is claimed that the triangle distance can deal
    with noise and amplitude scaling very well, and may yield poor results
    in cases of offset translation and linear drift. The distance of a
    single pair is delegated to trdist.

    Parameters
    ----------
    data : ndarray
           the data series, one series per entry along the first axis

    Returns
    -------
    tuple
        the row of pairwise distances, ordered (0,1), (0,2), ..., (1,2),
        ..., and a list containing one (description dict, data series)
        log per data series

    '''
    info("calculating distances")

    nr_series = data.shape[0]

    # one log per run: a description dictionary with key information for
    # post-clustering analysis, and the data series itself
    run_logs = [({'Index': str(row)}, data[row]) for row in range(nr_series)]

    d_row = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    pairs = ((first, second)
             for first in range(nr_series)
             for second in range(first + 1, nr_series))
    for position, (first, second) in enumerate(pairs):
        d_row[position] = trdist(data[first], data[second])

    return d_row, run_logs
def _get_population(self):
    '''
    Return the population of the next generation.

    If no restart is required, this is left to the parent class. In case
    of a restart, the population is resized to desired_labda times the
    size of the archive and rebuilt. The new individuals are evaluated,
    using the cache where possible, and the next generation is selected
    from the current and the new population. Statistics on the selected
    population are logged.

    Returns
    -------
    the population

    '''
    if not self._restart_required():
        return super(epsNSGA2, self)._get_population()

    self.called += 1
    self.last_eps_progress = 0
    self.pop_size = self.desired_labda * len(self.archive.items)
    new_pop = self._rebuild_population()

    # the message still shows the tournament size prior to the update
    restart_message = self.message.format(self.pop_size,
                                          len(self.archive.items),
                                          self.tournament_size)
    ema_logging.info(restart_message)

    # run new population through cache
    to_evaluate = self._run_through_cache(new_pop) if self.cache else new_pop

    # update selection pressure...
    pressure = self.selection_presure * self.pop_size
    self.tournament_size = int(max(2, pressure))

    if to_evaluate:
        # Evaluate the individuals with an invalid fitness
        self.evaluate_population(to_evaluate, self.reporting_interval,
                                 self.toolbox, self.ensemble)

        # update cache with newly analysed population
        if self.caching:
            self._update_cache(to_evaluate)

    # Select the next generation population
    candidates = self.pop + new_pop
    self.pop = self.toolbox.select(candidates, self.pop_size)

    self.stats_callback(self.pop)
    self.stats_callback.log_stats(self.called)
    return self.pop
def _get_population(self):
    '''
    Return the population of the next generation.

    If no restart is required, this is left to the parent class. In case
    of a restart, the population is resized to desired_labda times the
    size of the archive and rebuilt. The new individuals are evaluated,
    using the cache where possible, and the next generation is selected
    from the current and the new population. Statistics on the selected
    population are logged.

    Returns
    -------
    the population

    '''
    if not self._restart_required():
        return super(epsNSGA2, self)._get_population()

    self.called += 1
    self.last_eps_progress = 0
    self.pop_size = self.desired_labda * len(self.archive.items)
    new_pop = self._rebuild_population()

    # the message still shows the tournament size prior to the update
    restart_message = self.message.format(self.pop_size,
                                          len(self.archive.items),
                                          self.tournament_size)
    ema_logging.info(restart_message)

    # run new population through cache
    to_evaluate = self._run_through_cache(new_pop) if self.cache else new_pop

    # update selection pressure...
    pressure = self.selection_presure * self.pop_size
    self.tournament_size = int(max(2, pressure))

    if to_evaluate:
        # Evaluate the individuals with an invalid fitness
        self.evaluate_population(to_evaluate, self.reporting_interval,
                                 self.toolbox, self.ensemble)

        # update cache with newly analysed population
        if self.caching:
            self._update_cache(to_evaluate)

    # Select the next generation population
    candidates = self.pop + new_pop
    self.pop = self.toolbox.select(candidates, self.pop_size)

    self.stats_callback(self.pop)
    self.stats_callback.log_stats(self.called)
    return self.pop
def distance_triangle(data):
    '''
    Pairwise triangle distances between data series.

    The triangle distance is calculated as follows; let ds1(.) and ds2(.)
    be two data series of length N. Then;
    A equals to the summation of ds1(i).ds2(i) from i=1 to N
    B equals to the square-root of the (summation ds1(i)^2 from i=1 to N)
    C equals to the square-root of the (summation ds2(i)^2 from i=1 to N)
    (NOTE(review): the original text used ds1 for both B and C, presumably
    C is meant for ds2 - confirm against trdist)

    distance_triangle = A/(B.C)

    The triangle distance works only with data series of the same length.
    In the literature, it is claimed that the triangle distance can deal
    with noise and amplitude scaling very well, and may yield poor results
    in cases of offset translation and linear drift. The distance of a
    single pair is delegated to trdist.

    Parameters
    ----------
    data : ndarray
           the data series, one series per entry along the first axis

    Returns
    -------
    tuple
        the row of pairwise distances, ordered (0,1), (0,2), ..., (1,2),
        ..., and a list containing one (description dict, data series)
        log per data series

    '''
    info("calculating distances")

    nr_series = data.shape[0]

    # one log per run: a description dictionary with key information for
    # post-clustering analysis, and the data series itself
    run_logs = [({'Index': str(row)}, data[row]) for row in range(nr_series)]

    d_row = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    pairs = ((first, second)
             for first in range(nr_series)
             for second in range(first + 1, nr_series))
    for position, (first, second) in enumerate(pairs):
        d_row[position] = trdist(data[first], data[second])

    return d_row, run_logs
def test_log_messages(self):
    '''
    Check that each of the logging functions of ema_logging hands the
    message over to the method with the same name on the module level
    logger.
    '''
    ema_logging.log_to_stderr(ema_logging.DEBUG)

    levels = ('debug', 'info', 'warning', 'error', 'exception', 'critical')

    with mock.patch('util.ema_logging._logger') as mocked_logger:
        message = 'test message'

        for level in levels:
            log_function = getattr(ema_logging, level)
            log_function(message)

            logger_method = getattr(mocked_logger, level)
            logger_method.assert_called_with(message)
def log_stats(self, gen):
    '''
    Log statistics on the progress of the evolution

    For every objective (column of the array returned by
    __get_hof_in_array) the minimum, maximum, mean, and standard deviation
    are calculated and logged as a table at the info level.

    Parameters
    ----------
    gen : int
          the generation number shown in the log message

    '''
    functions = {"min": self.minima,
                 "max": self.maxima,
                 "std": self.std,
                 "mean": self.mean}

    hof = self.__get_hof_in_array()
    info_message = pd.DataFrame(
        index=['min', 'max', 'mean', 'std'],
        columns=['obj_{}'.format(i) for i in range(hof.shape[1])])

    # items() instead of iteritems(): the latter does not exist on
    # Python 3, while items() works on both major versions
    for key, value in functions.items():
        info_message.loc[key] = value(hof)

    # let pandas do the formatting for us. to_string() renders only the
    # table, so there is no trailing size summary to strip; slicing lines
    # off str(info_message) removed real rows whenever pandas printed no
    # such summary
    message = info_message.to_string()

    line = "\ngeneration {}\n{}".format(gen, message)
    ema_logging.info(line)
def filter_scalar_outcomes(outcomes):
    '''
    Helper function that removes non time series outcomes from all the
    outcomes. An outcome is considered a time series if it has at least
    two dimensions. Each removal is logged.

    Parameters
    ----------
    outcomes : dict
               modified in place

    Returns
    -------
    dict
        the filtered outcomes, this is the same object as was passed in

    '''
    # removal is postponed until after the loop, a dict cannot change size
    # while being iterated over
    scalar_outcomes = []
    for name in outcomes:
        if len(outcomes[name].shape) >= 2:
            continue
        scalar_outcomes.append(name)
        info("%s not shown because it is not time series data" % name)

    for name in scalar_outcomes:
        outcomes.pop(name)

    return outcomes
def _run_through_cache(self, individuals):
    '''
    Helper function, check whether individuals already have been
    evaluated, if so use the cached value

    The cache is keyed by a tuple with the repr of the value of each lever
    in self.lever_keys. The number of individuals with an invalid fitness
    is logged before and after the cache has been checked.

    Parameters
    ----------
    individuals : list
                  the individuals to check; the fitness of an individual
                  found in the cache is set in place

    Returns
    -------
    list
        the individuals whose fitness is still invalid after checking the
        cache, so these still have to be evaluated

    '''
    invalid_inds = [ind for ind in individuals if not ind.fitness.valid]
    ema_logging.info(
        'nr. of invalid individuals before checking cache: {}'.format(
            len(invalid_inds)))

    for invalid_ind in invalid_inds:
        # construct key
        key = tuple(repr(invalid_ind.get(entry))
                    for entry in self.lever_keys)

        try:
            cached_values = self.cache[key]
        except KeyError:
            # not in the cache, so the individual remains invalid
            continue

        # set value if in caching
        invalid_ind.fitness.values = cached_values

    invalid_inds = [ind for ind in individuals if not ind.fitness.valid]
    ema_logging.info(
        'nr. of invalid individuals after checking cache: {}'.format(
            len(invalid_inds)))

    # callers use the result to decide what to evaluate; without this
    # return they receive None and never evaluate anything
    return invalid_inds
def filter_scalar_outcomes(outcomes):
    '''
    Helper function that removes non time series outcomes from all the
    outcomes. An outcome is considered a time series if it has at least
    two dimensions. Each removal is logged.

    Parameters
    ----------
    outcomes : dict
               modified in place

    Returns
    -------
    dict
        the filtered outcomes, this is the same object as was passed in

    '''
    # removal is postponed until after the loop, a dict cannot change size
    # while being iterated over
    scalar_outcomes = []
    for name in outcomes:
        if len(outcomes[name].shape) >= 2:
            continue
        scalar_outcomes.append(name)
        info("%s not shown because it is not time series data" % name)

    for name in scalar_outcomes:
        outcomes.pop(name)

    return outcomes
def distance_mse(data):
    '''
    Pairwise MSE (mean squared-error) distances between data series.

    The MSE distance is equal to the SSE distance divided by the number of
    data points in the data series. The SSE distance between two data
    series ds1 and ds2 of length N is the sum, for i from 1 to N, of the
    squared error terms ds1(i)-ds2(i). Given that SSE is calculated as
    given above, MSE equals SSE divided by N.

    As the SSE distance, the MSE distance only works with data series of
    equal length. The distance of a single pair is delegated to msedist.

    Parameters
    ----------
    data : ndarray
           the data series, one series per entry along the first axis

    Returns
    -------
    tuple
        the row of pairwise distances, ordered (0,1), (0,2), ..., (1,2),
        ..., and a list containing one (description dict, data series)
        log per data series

    '''
    info("calculating distances")

    nr_series = data.shape[0]

    # one log per run: a description dictionary with key information for
    # post-clustering analysis, and the data series itself
    run_logs = [({'Index': str(row)}, data[row]) for row in range(nr_series)]

    d_row = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    pairs = ((first, second)
             for first in range(nr_series)
             for second in range(first + 1, nr_series))
    for position, (first, second) in enumerate(pairs):
        d_row[position] = msedist(data[first], data[second])

    return d_row, run_logs
def distance_mse(data):
    '''
    Pairwise MSE (mean squared-error) distances between data series.

    The MSE distance is equal to the SSE distance divided by the number of
    data points in the data series. The SSE distance between two data
    series ds1 and ds2 of length N is the sum, for i from 1 to N, of the
    squared error terms ds1(i)-ds2(i). Given that SSE is calculated as
    given above, MSE equals SSE divided by N.

    As the SSE distance, the MSE distance only works with data series of
    equal length. The distance of a single pair is delegated to msedist.

    Parameters
    ----------
    data : ndarray
           the data series, one series per entry along the first axis

    Returns
    -------
    tuple
        the row of pairwise distances, ordered (0,1), (0,2), ..., (1,2),
        ..., and a list containing one (description dict, data series)
        log per data series

    '''
    info("calculating distances")

    nr_series = data.shape[0]

    # one log per run: a description dictionary with key information for
    # post-clustering analysis, and the data series itself
    run_logs = [({'Index': str(row)}, data[row]) for row in range(nr_series)]

    d_row = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    pairs = ((first, second)
             for first in range(nr_series)
             for second in range(first + 1, nr_series))
    for position, (first, second) in enumerate(pairs):
        d_row[position] = msedist(data[first], data[second])

    return d_row, run_logs
class AbstractCallback(object):
    '''
    Abstract base class from which different call back classes can be
    derived. Callback is responsible for storing the results of the runs.

    Parameters
    ----------
    uncertainties : list
                    a list of the uncertainties over which the experiments
                    are being run. Not used by this base class.
    outcomes : list
               a list of outcomes. Not used by this base class.
    nr_experiments : int
                     the total number of experiments to be executed. Not
                     used by this base class.
    reporting_interval : int, optional
                         the interval at which to provide progress
                         information via logging.

    Attributes
    ----------
    i : int
        a counter that keeps track of how many experiments have been saved
    reporting_interval : int
                         the frequency at which to log progress

    '''
    __metaclass__ = abc.ABCMeta

    # class level defaults, shadowed on the instance once they are set
    i = 0
    reporting_interval = 100

    def __init__(self, uncertainties, outcomes, nr_experiments,
                 reporting_interval=100):
        self.reporting_interval = reporting_interval

    @abc.abstractmethod
    def __call__(self, case_id, case, policy, name, result):
        '''
        Method responsible for storing results. The implementation in this
        class only counts the completed runs and logs the count: always at
        the debug level, and at the info level every reporting_interval
        runs.

        Any extension of AbstractCallback needs to implement this method.
        If one wants to use the logging provided here, call it via super.

        Parameters
        ----------
        case_id: int
                 the job id
        case: dict
              the case to be stored
        policy: str
                the name of the policy being used
        name: str
              the name of the model being used
        result: dict
                the result dict

        '''
        self.i += 1

        progress = str(self.i) + " cases completed"
        ema_logging.debug(progress)

        at_reporting_point = self.i % self.reporting_interval == 0
        if at_reporting_point:
            ema_logging.info(progress)
def _run_optimization(self, generate_individual, evaluate_population,
                      algorithm=None, obj_function=None, weights=None,
                      levers=None, pop_size=None, reporting_interval=None,
                      nr_of_generations=None, crossover_rate=None,
                      mutation_rate=None, caching=False, **kwargs):
    '''
    Helper function that runs the actual optimization

    The algorithm is instantiated and stored on self.algorithm, after which
    its get_population method is called once per generation.

    Parameters
    ----------
    generate_individual : callable
                          helper function for generating an individual
    evaluate_population : callable
                          helper function for evaluating the population
    algorithm : callable
                called with the other arguments to create the algorithm
                instance
    obj_function : callable
                   the objective function
    weights : tuple
              the weights on the outcomes
    levers : dict
             a dictionary with param keys as keys, and as values info used
             in mutation.
    pop_size : int
               the size of the population
    reporting_interval : int
                         the interval for reporting progress, passed on to
                         perform_experiments
    nr_of_generations : int
                        number of generations for which the GA will be run
    crossover_rate : float
                     the crossover rate of the GA
    mutation_rate : float
                    the mutation rate of the GA
    caching : bool
              passed on to the algorithm
    kwargs : dict
             any additional keyword arguments are passed on to the
             algorithm

    Returns
    -------
    tuple
        the stats_callback of the algorithm and the population of the last
        generation (None if nr_of_generations is 0)

    '''
    self.algorithm = algorithm(weights, levers, generate_individual,
                               obj_function, pop_size, evaluate_population,
                               nr_of_generations, crossover_rate,
                               mutation_rate, reporting_interval, self,
                               caching, **kwargs)

    # without this, zero generations would leave pop unbound and the
    # return statement would fail with an UnboundLocalError
    pop = None

    # Begin the generational process
    for _ in range(nr_of_generations):
        pop = self.algorithm.get_population()

    info("-- End of (successful) evolution --")

    return self.algorithm.stats_callback, pop
def perform_experiments(self, cases, callback=DefaultCallback,
                        reporting_interval=100, model_kwargs=None,
                        which_uncertainties=INTERSECTION,
                        which_outcomes=INTERSECTION, **kwargs):
    """
    Method responsible for running the experiments on a structure. In case
    of multiple model structures, the outcomes are set to the intersection
    of the sets of outcomes of the various models.

    Parameters
    ----------
    cases : int or iterable
            In case of Latin Hypercube sampling and Monte Carlo sampling,
            cases specifies the number of cases to generate. In case of
            Full Factorial sampling, cases specifies the resolution to use
            for sampling continuous uncertainties. Alternatively, one can
            supply a list of dicts, where each dicts contains a case. That
            is, an uncertainty name as key, and its value.
    callback : callback, optional
               callable that will be called after finishing a single
               experiment (default is :class:`~callbacks.DefaultCallback`)
    reporting_interval : int, optional
                         parameter for specifying the frequency with which
                         the callback reports the progress.
                         (Default is 100)
    model_kwargs : dict, optional
                   dictionary of keyword arguments to be passed to
                   model_init (default is an empty dict)
    which_uncertainties : {INTERSECTION, UNION}, optional
                          keyword argument for controlling whether, in
                          case of multiple model structure interfaces, the
                          intersection or the union of uncertainties
                          should be used.
    which_outcomes : {INTERSECTION, UNION}, optional
                     keyword argument for controlling whether, in case of
                     multiple model structure interfaces, the intersection
                     or the union of outcomes should be used.
    kwargs : dict, optional
             generic keyword arguments to pass on to the callback

    Returns
    -------
    tuple
        a `structured numpy array <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_
        containing the experiments, and a dict with the names of the
        outcomes as keys and an numpy array as value.

    Raises
    ------
    ValueError
        if which_outcomes is neither UNION nor INTERSECTION

    .. rubric:: suggested use

    In general, analysis scripts require both the structured array of the
    experiments and the dictionary of arrays containing the results. The
    recommended use is the following::

    >>> results = ensemble.perform_experiments(10000) #recommended use
    >>> experiments, output = ensemble.perform_experiments(10000)

    The latter option will work fine, but most analysis scripts require
    to wrap it up into a tuple again::

    >>> data = (experiments, output)

    Another reason for the recommended use is that you can save this tuple
    directly::

    >>> import util as util
    >>> util.save_results(results, filename)

    """
    # a fresh dict per call, a dict as default value would be shared
    # between all calls
    if model_kwargs is None:
        model_kwargs = {}

    return_val = self._generate_experiments(cases, which_uncertainties)
    experiments, nr_of_exp, uncertainties = return_val

    # identify the outcomes that are to be included
    overview_dict, element_dict = self._determine_unique_attributes(
        "outcomes")
    if which_outcomes == UNION:
        outcomes = element_dict.keys()
    elif which_outcomes == INTERSECTION:
        outcomes = overview_dict[tuple(
            [msi.name for msi in self.model_structures])]
        outcomes = [outcome.name for outcome in outcomes]
    else:
        raise ValueError("unknown value for which_outcomes")

    info(str(nr_of_exp) + " experiment will be executed")

    # initialize the callback object
    callback = callback(uncertainties, outcomes, nr_of_exp,
                        reporting_interval=reporting_interval, **kwargs)

    if self.parallel:
        info("preparing to perform experiment in parallel")

        if not self.pool:
            self.pool = MultiprocessingPool(self.model_structures,
                                            model_kwargs=model_kwargs,
                                            nr_processes=self.processes)
        info("starting to perform experiments in parallel")

        self.pool.perform_experiments(callback, experiments)
    else:
        info("starting to perform experiments sequentially")

        cwd = os.getcwd()
        try:
            runner = ExperimentRunner(self._msis, model_kwargs)
            for experiment in experiments:
                run_result = runner.run_experiment(experiment)
                experiment_id, case, policy, model_name, result = run_result
                callback(experiment_id, case, policy, model_name, result)
        finally:
            # restore the working directory, also if an experiment fails
            os.chdir(cwd)

    results = callback.get_results()
    info("experiments finished")

    return results
def distance_gonenc(data,
                    sisterCount=50,
                    wSlopeError=1,
                    wCurvatureError=1,
                    filterSlope=True,
                    tHoldSlope=0.1,
                    filterCurvature=True,
                    tHoldCurvature=0.1,
                    addMidExtension=True,
                    addEndExtension=True
                    ):
    '''
    The distance measures the proximity of data series in terms of their
    qualitative pattern features. In other words, it quantifies the
    proximity between two different dynamic behaviour modes.

    It is designed to work mainly on non-stationary data. Its current
    version does not perform well in catching the proximity of two
    cyclic/repetitive patterns with different number of cycles (e.g.
    oscillation with 4 cycle versus oscillation with 6 cycles).

    :param data: numpy array holding the data series; one series per entry
                 along the first axis.
    :param sisterCount: Number of long-versions that will be created for the
                        short vector while comparing two data series with
                        unequal feature vector lengths.
    :param wSlopeError: Weight of the error between the 1st dimensions of
                        the two feature vectors (i.e. Slope). (default=1)
    :param wCurvatureError: Weight of the error between the 2nd dimensions
                            of the two feature vectors (i.e. Curvature).
                            (default=1)
    :param filterSlope: Boolean, indicating whether the slope vectors
                        should be filtered for minor fluctuations, or not.
                        (default=True)
    :param tHoldSlope: The threshold value to be used in filtering out
                       fluctuations in the slope. (default=0.1)
    :param filterCurvature: Boolean, indicating whether the curvature
                            vectors should be filtered for minor
                            fluctuations, or not. (default=True)
    :param tHoldCurvature: The threshold value to be used in filtering out
                           fluctuations in the curvature. (default=0.1)
    :param addMidExtension: Boolean, indicating whether the feature vectors
                            should be extended by introducing transition
                            sections along the vector. (default=True)
    :param addEndExtension: Boolean, indicating whether the feature vectors
                            should be extended by introducing
                            startup/closing sections at the beginning/end
                            of the vector. (default=True)
    :returns: a tuple (dRow, runLogs). dRow is a 1-d numpy array with the
              distance for every pair (i, j), i < j, in row-major order;
              runLogs holds one (description dict, data series) tuple per
              series.
    '''
    # Feature vectors for all the time series contained in the numpy
    # array data
    features = construct_features(data, filterSlope, tHoldSlope,
                                  filterCurvature, tHoldCurvature,
                                  addMidExtension, addEndExtension)
    info("calculating distances")

    nr_series = data.shape[0]

    # One log per run: a description dictionary with key information for
    # post-clustering analysis, paired with the data series itself.
    runLogs = []
    for run in range(nr_series):
        description = {'Index': str(run),
                       # this may not work due to data type mismatch
                       'Feature vector': str(features[run])}
        runLogs.append((description, data[run]))

    # Condensed distance vector: one slot per unordered pair of series
    dRow = np.zeros(shape=(np.sum(np.arange(nr_series)), ))
    slot = 0
    for first in range(nr_series):
        left = features[first]
        for second in range(first + 1, nr_series):
            right = features[second]
            if left.shape[1] != right.shape[1]:
                # unequal feature vector lengths need the sister versions
                dRow[slot] = distance_different_lenght(left,
                                                       right,
                                                       wSlopeError,
                                                       wCurvatureError,
                                                       sisterCount)
            else:
                dRow[slot] = distance_same_length(left,
                                                  right,
                                                  wSlopeError,
                                                  wCurvatureError)
            slot += 1

    return dRow, runLogs
def cluster(data,
            outcome,
            distance='gonenc',
            interClusterDistance='complete',
            cMethod='inconsistent',
            cValue=2.5,
            plotDendrogram=True,
            plotClusters=True,
            groupPlot=False,
            **kwargs):
    '''
    Cluster the time-series data of a single outcome according to a
    selected distance measure.

    Parameters
    ----------
    data : tuple
           return from meth:`perform_experiments`. Only ``data[1][outcome]``
           is used.
    outcome : str
              Name of outcome/variable whose behavior is being analyzed.
              It is also stored in the module level ``varName``.
    distance : {'gonenc','triangle', 'sse', 'mse'}
               The distance metric to be used.
    interClusterDistance : str
                           How to calculate inter cluster distance.
                           see `linkage <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_
                           for details.
    cMethod : str
              Cutoff method,
              see `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_
              for details.
    cValue : float
             Cutoff value, see
             `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_
             for details.
    plotDendrogram : bool
                     passed on to the flat clustering step.
    plotClusters : bool
                   if true, the clusters are plotted at the end.
    groupPlot : bool
                passed on to the cluster plotting step.

    Returns
    -------
    dRow
        the distances, as returned by the distance construction step
    clusters
        the clusters, as a list-like data structure
    z
        first return value of the flat clustering step


    The remainder of the arguments are passed on to the specified distance
    function.

    NOTE(review): the option list below is reproduced from the earlier
    documentation; the names could not be checked against the distance
    functions from this block alone -- confirm.

    Gonenc Distance:

    * 'distance': String that specifies the distance to be used.
                  Options: bmd (default), mse, sse
    * 'filter?': Boolean that specifies whether the data series will be
                 filtered (for bmd distance)
    * 'slope filter': A float number that specifies the filtering threshold
                     for the slope (for every data point if
                     change__in_the_outcome/average_value_of_the_outcome
                     < threshold, consider slope = 0) (for bmd distance)
    * 'curvature filter': A float number that specifies the filtering
                          threshold for the curvature (for every data point
                          if change__in_the_slope/average_value_of_the_slope
                          < threshold, consider curvature = 0)
                          (for bmd distance)
    * 'no of sisters': 50 (for bmd distance)

    '''
    # publish the outcome name at module level
    global varName
    varName = outcome

    series = data[1][outcome]

    # The distances form the upper triangle of the distance matrix
    distances, run_logs = construct_distances(series, distance, **kwargs)
    info('finished distances')

    # Hierarchical agglomerative clustering allocates the individual runs
    # to clusters; these options customize the algorithm.
    flat_options = dict(plotDendrogram=plotDendrogram,
                        interClusterDistance=interClusterDistance,
                        cMethod=cMethod,
                        cValue=cValue)
    linkage_result, assignment, run_logs = flatcluster(distances,
                                                       run_logs,
                                                       **flat_options)

    info("tranforming to list of clusters")
    cluster_list = make_data_structure(assignment, distances, run_logs)

    if plotClusters:
        plot_clusters(groupPlot, run_logs)

    return distances, cluster_list, linkage_result
def cluster(data,
            outcome,
            distance='gonenc',
            interClusterDistance='complete',
            cMethod='inconsistent',
            cValue=2.5,
            plotDendrogram=True,
            plotClusters=True,
            groupPlot=False,
            **kwargs):
    '''
    Cluster the time-series data of a single outcome according to a
    selected distance measure.

    Parameters
    ----------
    data : tuple
           return from meth:`perform_experiments`. Only ``data[1][outcome]``
           is used.
    outcome : str
              Name of outcome/variable whose behavior is being analyzed.
              It is also stored in the module level ``varName``.
    distance : {'gonenc','triangle', 'sse', 'mse'}
               The distance metric to be used.
    interClusterDistance : str
                           How to calculate inter cluster distance.
                           see `linkage <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage>`_
                           for details.
    cMethod : str
              Cutoff method,
              see `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_
              for details.
    cValue : float
             Cutoff value, see
             `fcluster <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_
             for details.
    plotDendrogram : bool
                     passed on to the flat clustering step.
    plotClusters : bool
                   if true, the clusters are plotted at the end.
    groupPlot : bool
                passed on to the cluster plotting step.

    Returns
    -------
    dRow
        the distances, as returned by the distance construction step
    clusters
        the clusters, as a list-like data structure
    z
        first return value of the flat clustering step


    The remainder of the arguments are passed on to the specified distance
    function.

    NOTE(review): the option list below is reproduced from the earlier
    documentation; the names could not be checked against the distance
    functions from this block alone -- confirm.

    Gonenc Distance:

    * 'distance': String that specifies the distance to be used.
                  Options: bmd (default), mse, sse
    * 'filter?': Boolean that specifies whether the data series will be
                 filtered (for bmd distance)
    * 'slope filter': A float number that specifies the filtering threshold
                     for the slope (for every data point if
                     change__in_the_outcome/average_value_of_the_outcome
                     < threshold, consider slope = 0) (for bmd distance)
    * 'curvature filter': A float number that specifies the filtering
                          threshold for the curvature (for every data point
                          if change__in_the_slope/average_value_of_the_slope
                          < threshold, consider curvature = 0)
                          (for bmd distance)
    * 'no of sisters': 50 (for bmd distance)

    '''
    # publish the outcome name at module level
    global varName
    varName = outcome

    outcome_series = data[1][outcome]

    # The distances form the upper triangle of the distance matrix
    pair_distances, logs = construct_distances(outcome_series,
                                               distance,
                                               **kwargs)
    info('finished distances')

    # Hierarchical agglomerative clustering allocates the individual runs
    # to clusters; these options customize the algorithm.
    clustering_setup = dict(plotDendrogram=plotDendrogram,
                            interClusterDistance=interClusterDistance,
                            cMethod=cMethod,
                            cValue=cValue)
    z_result, flat_clusters, logs = flatcluster(pair_distances,
                                                logs,
                                                **clustering_setup)

    info("tranforming to list of clusters")
    structured = make_data_structure(flat_clusters, pair_distances, logs)

    if plotClusters:
        plot_clusters(groupPlot, logs)

    return pair_distances, structured, z_result
def perform_experiments(self,
                        cases,
                        callback=DefaultCallback,
                        reporting_interval=100,
                        model_kwargs=None,
                        which_uncertainties=INTERSECTION,
                        which_outcomes=INTERSECTION,
                        **kwargs):
    """
    Method responsible for running the experiments on a structure. In case
    of multiple model structures, `which_outcomes` controls whether the
    intersection or the union of the outcomes of the various models is
    used.

    Parameters
    ----------
    cases : int or iterable
            In case of Latin Hypercube sampling and Monte Carlo sampling,
            cases specifies the number of cases to generate. In case of
            Full Factorial sampling, cases specifies the resolution to use
            for sampling continuous uncertainties. Alternatively, one can
            supply a list of dicts, where each dicts contains a case.
            That is, an uncertainty name as key, and its value.
    callback : callback, optional
               callable that is instantiated once with the uncertainties,
               the outcomes, the number of experiments,
               `reporting_interval` and `kwargs`. The resulting object is
               called after finishing a single experiment and must offer
               `get_results()` (default is
               :class:`~callbacks.DefaultCallback`)
    reporting_interval : int, optional
                         parameter for specifying the frequency with which
                         the callback reports the progress.
                         (Default is 100)
    model_kwargs : dict, optional
                   dictionary of keyword arguments to be passed to
                   model_init. When omitted, a new empty dict is used for
                   every call.
    which_uncertainties : {INTERSECTION, UNION}, optional
                          keyword argument for controlling whether, in
                          case of multiple model structure interfaces, the
                          intersection or the union of uncertainties
                          should be used.
    which_outcomes : {INTERSECTION, UNION}, optional
                     keyword argument for controlling whether, in case of
                     multiple model structure interfaces, the intersection
                     or the union of outcomes should be used.
    kwargs : dict, optional
             generic keyword arguments to pass on to the callback

    Returns
    -------
    tuple
        the value of `callback.get_results()`. For the default callback
        this is documented as a `structured numpy array
        <http://docs.scipy.org/doc/numpy/user/basics.rec.html>`_
        containing the experiments, and a dict with the names of the
        outcomes as keys and an numpy array as value.

    Raises
    ------
    ValueError
        if `which_outcomes` is neither UNION nor INTERSECTION.


    .. rubric:: suggested use

    In general, analysis scripts require both the structured array of the
    experiments and the dictionary of arrays containing the results. The
    recommended use is the following::

    >>> results = ensemble.perform_experiments(10000) #recommended use
    >>> experiments, output = ensemble.perform_experiments(10000)

    The latter option will work fine, but most analysis scripts require
    to wrap it up into a tuple again::

    >>> data = (experiments, output)

    Another reason for the recommended use is that you can save this tuple
    directly::

    >>> import util as util
    >>> util.save_results(results, filename)

    """
    # A mutable default would be one dict shared by every call; create a
    # fresh one per call instead.
    if model_kwargs is None:
        model_kwargs = {}

    return_val = self._generate_experiments(cases, which_uncertainties)
    experiments, nr_of_exp, uncertainties = return_val

    # identify the outcomes that are to be included
    overview_dict, element_dict = self._determine_unique_attributes(
        "outcomes")
    if which_outcomes == UNION:
        outcomes = element_dict.keys()
    elif which_outcomes == INTERSECTION:
        # overview_dict is looked up with the tuple of all model names
        outcomes = overview_dict[tuple(
            [msi.name for msi in self.model_structures])]
        outcomes = [outcome.name for outcome in outcomes]
    else:
        raise ValueError("unknown value for which_outcomes")

    info(str(nr_of_exp) + " experiment will be executed")

    # initialize the callback object
    callback = callback(uncertainties,
                        outcomes,
                        nr_of_exp,
                        reporting_interval=reporting_interval,
                        **kwargs)

    if self.parallel:
        info("preparing to perform experiment in parallel")

        # the pool is created lazily and kept on the instance for reuse
        if not self.pool:
            self.pool = MultiprocessingPool(self.model_structures,
                                            model_kwargs=model_kwargs,
                                            nr_processes=self.processes)
        info("starting to perform experiments in parallel")

        self.pool.perform_experiments(callback, experiments)
    else:
        info("starting to perform experiments sequentially")

        cwd = os.getcwd()
        try:
            runner = ExperimentRunner(self._msis, model_kwargs)
            for experiment in experiments:
                (experiment_id, case, policy,
                 model_name, result) = runner.run_experiment(experiment)
                callback(experiment_id, case, policy, model_name, result)
        finally:
            # restore the working directory even when an experiment fails
            os.chdir(cwd)

    results = callback.get_results()
    info("experiments finished")

    return results
def __init__(self, msis, processes=None, kwargs=None):
    '''
    Set up the pool: queues, log handling, one worker process per
    requested process, and the task and result handler threads.

    Parameters
    ----------
    msis : list
           iterable of model structure interface instances
    processes : int
                nr. of processes to spawn, if none, it is set to equal the
                nr. of cores (or 1 when the core count is unavailable)
    kwargs : dict
             kwargs to be passed to :meth:`model_init`

    '''
    if processes is None:
        try:
            processes = multiprocessing.cpu_count()
        except NotImplementedError:
            processes = 1
    ema_logging.info("nr of processes is "+str(processes))

    # setup queues etc.
    self._setup_queues()
    self._taskqueue = queue.Queue(processes*2)
    self._cache = {}
    self._state = pool.RUN

    # handling of logging
    self.log_queue = multiprocessing.Queue()
    h = ema_logging.NullHandler()
    logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

    log_queue_reader = LogQueueReader(self.log_queue)
    log_queue_reader.start()

    # setup of the actual pool
    self._pool = []
    working_dirs = []

    ema_logging.debug('generating workers')

    worker_root = None
    for i in range(processes):
        ema_logging.debug('generating worker '+str(i))

        workername = self._get_worker_name(i)

        # setup working directories for parallel_ema
        for msi in msis:
            if msi.working_directory is not None:
                if worker_root is None:
                    # all worker directories are created next to the
                    # working directory of the first msi
                    wd = msis[0].working_directory
                    abs_wd = os.path.abspath(wd)
                    worker_root = os.path.dirname(abs_wd)

                wd_name = workername + msi.name
                working_directory = os.path.join(worker_root, wd_name)

                working_dirs.append(working_directory)
                # NOTE(review): set_working_directory presumably updates
                # msi.working_directory, in which case later workers copy
                # from the previous worker's copy -- confirm.
                shutil.copytree(msi.working_directory, working_directory)
                msi.set_working_directory(working_directory)

        # workers log at the level that is in effect for the ema logger
        level = logging.getLogger(
            ema_logging.LOGGER_NAME).getEffectiveLevel()
        w = LoggingProcess(self.log_queue,
                           level=level,
                           target=worker,
                           args=(self._inqueue,
                                 self._outqueue,
                                 msis,
                                 kwargs))
        self._pool.append(w)

        w.name = w.name.replace('Process', workername)
        w.daemon = True
        w.start()
        ema_logging.debug(' worker '+str(i) + ' generated')

    # thread for handling tasks
    self._task_handler = threading.Thread(
        target=CalculatorPool._handle_tasks,
        name='task handler',
        args=(self._taskqueue,
              self._quick_put,
              self._outqueue,
              self._pool))
    self._task_handler.daemon = True
    self._task_handler._state = pool.RUN
    self._task_handler.start()

    # thread for handling results
    self._result_handler = threading.Thread(
        target=CalculatorPool._handle_results,
        name='result handler',
        args=(self._outqueue,
              self._quick_get,
              self._cache,
              self.log_queue))
    self._result_handler.daemon = True
    self._result_handler._state = pool.RUN
    self._result_handler.start()

    # function for cleaning up when finalizing object; it also receives
    # the worker directories created above
    self._terminate = Finalize(self,
                               self._terminate_pool,
                               args=(self._taskqueue,
                                     self._inqueue,
                                     self._outqueue,
                                     self._pool,
                                     self._task_handler,
                                     self._result_handler,
                                     self._cache,
                                     working_dirs),
                               exitpriority=15)

    ema_logging.info("pool has been set up")
vensim_single = ctypes.windll.vendll32 except AttributeError: vensim_single = None except WindowsError: vensim_single = None try: vensim_double = ctypes.windll.LoadLibrary('C:\Windows\SysWOW64\VdpDLL32.dll') except AttributeError: vensim_double = None except WindowsError: vensim_double = None if vensim_single and vensim_double: vensim = vensim_single info("both single and double precision vensim available, using single") elif vensim_single: vensim = vensim_single info('using single precision vensim') elif vensim_double: vensim = vensim_double info('using single precision vensim') else: message = "vensim dll not found, vensim functionality not available" sys.stderr.write(message+"\n") warning(message) del sys def be_quiet(quietflag): '''
def start(self):
    '''Start the log watcher.

    Logs the url being watched and registers ``self.log_message`` as the
    receive handler of ``self.stream``.
    '''
    announcement = 'start watching on {}'.format(self.url)
    ema_logging.info(announcement)

    self.stream.on_recv(self.log_message)