def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                        train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them
    in the target collection
    """
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except KeyError:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })
    # merge the data of all other collections into the target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                # the actual data is stored in a list that has to be extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in the key tuple for "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    target_collection.store(target_collection_path)
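# A minimal standalone sketch of the test-to-train relabeling performed at
# the end of _merge_pickle_files above: only the third element of each
# (run, split, set) key tuple changes. Toy dict data, not the pySPACE
# BaseDataset API.
data = {(0, 0, "test"): ["sample_a"], (1, 0, "test"): ["sample_b"]}

for key in list(data.keys()):  # list() so we can pop while iterating
    assert key[2] == "test"
    value = data.pop(key)
    data[(key[0], key[1], "train")] = value

assert sorted(data.keys()) == [(0, 0, "train"), (1, 0, "train")]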
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections, command_template):
    # For each combination of classifier, input collection and
    # run number, create one WEKA process
    for dataset_dir in input_collections:
        collection = BaseDataset.load(dataset_dir)
        # Determine the number of iterations and splits to be used
        iterations = collection.meta_data["runs"]
        splits = collection.meta_data["splits"]
        if "runs" in operation_spec:
            assert(iterations in [1, operation_spec["runs"]])
            iterations = operation_spec["runs"]
        if "cv_folds" in operation_spec:
            assert(splits in [1, operation_spec["cv_folds"]])
            splits = operation_spec["cv_folds"]

        for parametrization in parameter_settings:
            for run_number in range(iterations):
                process = WEKAClassificationProcess(dataset_dir,
                                                    command_template,
                                                    parametrization,
                                                    splits,
                                                    run_number,
                                                    result_directory)
                processes.put(process)
    # give the executing process the signal that creation is now finished
    processes.put(False)
def __init__(self, dataset_dir, command_template, parametrization,
             run_number, split_number, operation_result_dir,
             hide_parameters=[]):
    super(WEKAFilterProcess, self).__init__()
    # Determine the directory in which the process' results are stored
    result_collection_name = dataset_dir.split(os.sep)[-2]
    for parameter_name, parameter_value in parametrization.iteritems():
        # If this is a parameter that should not be hidden, then we have to
        # encode it in the result collection name
        if not parameter_name in hide_parameters:
            result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                       parameter_value)
    self.result_directory = os.path.join(operation_result_dir,
                                         result_collection_name)
    # Create directory for intermediate results if it does not exist yet
    create_directory(self.result_directory + os.sep
                     + "data_run%s" % run_number)
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"dataset_name": dataset_dir.replace('/', '_'),
                   "dataset_dir": dataset_dir,
                   "run_number": run_number,
                   "split_number": split_number,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory}
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    abbreviations_file.close()
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders have been replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not yet converged
            command_template = instantiated_template
    self.handler_class = None
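# The while-loop above instantiates the command template to a fixed point,
# so parameter values may themselves contain further %(...)s placeholders.
# A self-contained sketch of the same convergence idiom; the template and
# parameter values are hypothetical.
params = {
    "classifier": "java -cp %(weka_class_path)s weka.classifiers.trees.J48",
    "weka_class_path": "/opt/weka/weka.jar",
}
template = "%(classifier)s -t data.arff"
while True:
    instantiated = template % params
    if instantiated == template:  # nothing was replaced any more: converged
        break
    template = instantiated
assert instantiated == ("java -cp /opt/weka/weka.jar "
                        "weka.classifiers.trees.J48 -t data.arff")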
def _copy_file(self, source_collection_path, target_collection_path,
               train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            The path to the dataset that has to be copied.

        :target_collection_path:
            The path to where the dataset should be copied.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies if the target dataset is
            handled as training or testing data.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in the key tuple for "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    # we store the data in the same format as before
    source_collection.store(target_collection_path,
                            source_collection.meta_data["storage_format"])
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates a WEKA operation based on the
    information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "weka_classification")
    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)
    # Read the command template from a file
    template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                      "operations", "weka_templates",
                                      operation_spec["template"]), 'r')
    command_template = template_file.read()
    template_file.close()
    # Determine the number of processes
    if "runs" in operation_spec:
        number_processes = len(input_paths) * len(parameter_settings) * \
            operation_spec["runs"]
    else:
        # approximate the number of processes
        runs = []
        for dataset_dir in input_paths:
            collection = BaseDataset.load(dataset_dir)
            runs.append(collection.meta_data["runs"])
        runs = max(runs)
        number_processes = len(input_paths) * len(parameter_settings) * runs

    if debug:
        # To better debug the creation of processes we don't limit the
        # queue and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, input_paths,
                             command_template)
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # in parallel. A queue with a maximum size is used to guarantee
        # that not too many objects are created at once (because this
        # costs memory). However, the actual number of 100 is arbitrary
        # and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes,
                                                  result_directory,
                                                  operation_spec,
                                                  parameter_settings,
                                                  input_paths,
                                                  command_template))
        create_process.start()
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
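# The non-debug branch above bounds the queue at 100 entries so the
# producer blocks instead of materializing every process object at once.
# A minimal sketch of that bounded producer/consumer pattern with the
# stdlib multiprocessing module; toy integer payloads stand in for the
# process objects.
import multiprocessing

def _produce(queue):
    for i in range(100):
        queue.put(i)    # blocks once 10 items are waiting in the queue
    queue.put(False)    # sentinel: creation is finished

if __name__ == '__main__':
    queue = multiprocessing.Queue(10)
    producer = multiprocessing.Process(target=_produce, args=(queue,))
    producer.start()
    while True:
        item = queue.get()
        if item is False:
            break
        # ... execute the item here, in parallel with its creation ...
    producer.join()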
def __call__(self):
    """ Executes this process on the respective modality """
    # Restore configuration
    pySPACE.configuration = self.configuration
    # reduce log_level for processing a second time and
    # set communication possibility for nodes to backend
    pySPACE.configuration.min_log_level = self.min_log_level
    pySPACE.configuration.logging_com = self.handler_args
    pySPACE.configuration.backend_com = self.backend_com

    ############## Prepare benchmarking ##############
    super(NodeChainProcess, self).pre_benchmarking()

    # Load the data and check that it can be processed
    # Note: This cannot be done in the object's constructor since in
    # that case the whole input would need to be pickled
    # when doing the remote call
    abs_dataset_dir = os.sep.join([self.storage, self.rel_dataset_dir])
    input_collection = BaseDataset.load(abs_dataset_dir)

    # We have to remember the parameters used for generating this specific
    # input dataset
    if 'parameter_setting' in input_collection.meta_data.keys():
        # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
        for k, v in input_collection.meta_data['parameter_setting'].items():
            if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                self.parameter_setting[k] = v

    NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                           input_collection)

    ############## Do the actual benchmarking ##############
    self._log("Start benchmarking run %s of node_chain %s on dataset %s"
              % (self.run, self.node_chain_spec, self.rel_dataset_dir))
    # Do the actual benchmarking for this collection/node_chain combination
    try:
        result_collection = \
            self.node_chain.benchmark(
                input_collection=input_collection,
                run=self.run,
                persistency_directory=self.persistency_dir,
                store_node_chain=self.store_node_chain)
    except Exception as exception:
        # Send the exception to the logger
        import traceback
        self._log(traceback.format_exc(), level=logging.ERROR)
        raise
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates a statistic operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the statistic processes will not
    happen in a separate thread.
    """
    assert(operation_spec["type"] == "statistic")
    input_path = operation_spec["input_path"]
    tabular = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path)).data

    if "filter" in operation_spec:
        conditions = csv_analysis.empty_dict(tabular)
        for key, l in operation_spec["filter"].items():
            conditions[key].extend(l)
        tabular = csv_analysis.strip_dict(tabular, conditions)
    metric = operation_spec.get("metric", "Balanced_accuracy")
    parameter = operation_spec.get("parameter", "__Dataset__")
    rel_par = operation_spec.get("related_parameters",
                                 ["__Dataset__", "Key_Run", "Key_Fold"])
    average = operation_spec.get("average", None)

    if average in rel_par:
        rel_par.remove(average)
    if metric in rel_par:
        rel_par.remove(metric)
    if parameter in rel_par:
        rel_par.remove(parameter)

    reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                         parameter, average)
    number_processes = 1
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, reduced_tabular)

    import shutil
    shutil.copy2(os.path.join(pySPACE.configuration.storage,
                              input_path, "results.csv"),
                 os.path.join(result_directory, "results.csv"))
    shutil.copy2(os.path.join(pySPACE.configuration.storage,
                              input_path, "metadata.yaml"),
                 os.path.join(result_directory, "metadata.yaml"))
    # create and return the statistic operation object
    return cls(processes, operation_spec, result_directory,
               number_processes)
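# csv_analysis.empty_dict and csv_analysis.strip_dict are pySPACE helpers;
# conceptually they filter a column-oriented results table with per-column
# keep-lists. A rough standalone sketch of that column-wise filtering on
# plain dicts (an assumption about the semantics, not the actual
# csv_analysis implementation):
tabular = {"__Dataset__": ["A", "B", "A"],
           "Balanced_accuracy": [0.7, 0.8, 0.9]}
conditions = {"__Dataset__": ["A"]}  # keep only rows with __Dataset__ "A"

keep = [i for i in range(len(tabular["__Dataset__"]))
        if all(tabular[col][i] in allowed
               for col, allowed in conditions.items())]
filtered = dict((col, [values[i] for i in keep])
                for col, values in tabular.items())

assert filtered == {"__Dataset__": ["A", "A"],
                    "Balanced_accuracy": [0.7, 0.9]}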
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
    information given in the operation specification operation_spec
    """
    assert(operation_spec["type"] == "analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine the metrics that should be plotted
    metrics = operation_spec["metrics"]
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param]))
                               for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1

    if debug:
        # To better debug the creation of processes we don't limit the
        # queue and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict,
                             parameters, metrics, True)
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all plot processes by calling a recursive helper method
        # in another thread so that already created processes can be
        # executed although the creation of processes is not finished yet.
        # A queue with a limited size is used to guarantee that not too
        # many objects are created (since this costs memory). However, the
        # actual number of 100 is arbitrary and might be changed according
        # to the system at hand.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes,
                                                  result_directory,
                                                  data_dict, parameters,
                                                  metrics, True))
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
def _copy_pickle_file(self, source_collection_path, target_collection_path,
                      train_set_name_suffix):
    source_collection = BaseDataset.load(source_collection_path)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in the key tuple for "train" before storing
        for key in source_collection.data.keys():
            assert("test" == key[2])
            value = source_collection.data.pop(key)
            key = (key[0], key[1], "train")
            source_collection.data[key] = value
    source_collection.store(target_collection_path)
def test_time_series_storing(self):
    if not os.path.exists('tmp'):
        os.makedirs('tmp')

    source = SimpleTimeSeriesSourceNode()
    sink = TimeSeriesSinkNode()
    sink.register_input_node(source)
    sink.set_run_number(0)
    sink.process_current_split()
    result_collection = sink.get_result_dataset()
    result_collection.store('tmp')
    #sink.store_results("test_time_series_storing.tmp")

    reloaded_collection = BaseDataset.load('tmp')

    reloader = TimeSeriesSourceNode()
    reloader.set_input_dataset(reloaded_collection)
    #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")

    orig_data = list(source.request_data_for_testing())
    restored_data = list(reloader.request_data_for_testing())

    # Check that the two lists have the same length
    self.assertEqual(len(orig_data), len(restored_data),
                     "Numbers of time series before storing and after "
                     "reloading are not equal!")

    # Check that there is a one-to-one correspondence
    for orig_datapoint, orig_label in orig_data:
        found = False
        for restored_datapoint, restored_label in restored_data:
            found |= (orig_datapoint.view(numpy.ndarray)
                      == restored_datapoint.view(numpy.ndarray)).all() \
                and (orig_label == restored_label)
            if found:
                break
        self.assertTrue(found,
                        "One of the original time series cannot be found "
                        "after reloading")

    shutil.rmtree('tmp')  # Cleaning up...
def prepare_training(self, training_files, potentials, operation,
                     nullmarker_stride_ms=None):
    """ Prepares pySPACE live for training.

    Prepares everything for the training of pySPACE live,
    i.e. creates the flows based on the dataflow specs
    and configures them.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn('Nullmarker stride interval is %s. You can '
                           'specify it in your parameter file.'
                           % self.nullmarker_stride_ms)
    else:
        online_logger.info('Nullmarker stride interval is set to %s ms '
                           % self.nullmarker_stride_ms)

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        spec_base = self.potentials[key]["configuration"].spec_dir
        if self.operation == "train":
            self.potentials[key]["node_chain"] = os.path.join(
                spec_base, self.potentials[key]["node_chain"])
            online_logger.info("node_chain_spec:"
                               + self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            self.potentials[key]["prewindowing_flow"] = os.path.join(
                spec_base, self.potentials[key]["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: "
                               + self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.potentials[key]["postprocess_flow"] = os.path.join(
                spec_base, self.potentials[key]["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: "
                               + self.potentials[key]["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes; we send the time series
    # windows to these processes via one queue per potential
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    # Create the actual data flows
    for key in self.potentials.keys():
        if self.operation == "train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["node_chain"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info(
                "file: " + str(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["prewindowing_flow"]))
            self.node_chains[key][0].set_generator(flow_generator(key))
            flow = open(self.potentials[key]["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                Flow_Class=NodeChain,
                flow_spec=open(self.potentials[key]["postprocess_flow"]))
            replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete the previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append their data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error(
                    "Couldn't find data, please do prewindowing first!")
                raise Exception("no prewindowed data found")
            online_logger.info("concatenating prewindowed data from "
                               + str(prewindowed_sets))

            for s, d in enumerate(prewindowed_sets):
                collection = BaseDataset.load(d)
                data = collection.get_data(0, 0, "train")
                for d, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        for k in sample.marker_name.keys():
                            # find 'S 8'/'s 8' or 'S 9'/'s 9'
                            m = re.match("^s\s{0,2}[89]$", k, re.IGNORECASE)
                            if m is not None:
                                online_logger.info(str(
                                    "remove %s from %d %d"
                                    % (m.group(), s, d)))
                                del(sample.marker_name[m.group()])

                        if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info("added endmarker"
                                               + str(s) + " " + str(d))

                        if s == 0 and d == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info("added startmarker"
                                               + str(s) + " " + str(d))

                    final_collection.add_sample(sample, label, True)

            # save the final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at "
                               + final_collection_path)

            # load the final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])

            flow = open(self.potentials[key]["postprocess_flow"])

        # create a window_stream for every potential
        if self.operation == "prewindowing":
            window_spec_file = os.path.join(
                spec_base, "node_chains", "windower",
                self.potentials[key]["windower_spec_path_train"])
            self.window_stream[key] = \
                self.stream_manager.request_window_stream(
                    window_spec_file,
                    nullmarker_stride_ms=self.nullmarker_stride_ms)
        elif self.operation == "prewindowing_offline":
            pass
        elif self.operation == "train":
            pass

        self.node_chain_definitions[key] = yaml.load(flow)
        flow.close()

    # TODO: check if the prewindowing flow is still needed
    # when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(str(
                        "recursively deleting files in '%s'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception as e:
                        # TODO: find a smart solution for this!
                        pass  # dir was probably already deleted..

            if os.path.exists(os.path.join(
                    self.prewindowed_data_directory, key,
                    "all_train_data")):
                shutil.rmtree(os.path.join(
                    self.prewindowed_data_directory, key,
                    "all_train_data"))
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
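# The marker handling above hinges on the regular expression that
# recognizes start/end marker names such as 'S 8' or 's  9'. A small
# sketch of what the pattern accepts and rejects:
import re

marker_pattern = re.compile(r"^s\s{0,2}[89]$", re.IGNORECASE)

assert marker_pattern.match("S 8") is not None
assert marker_pattern.match("s  9") is not None
assert marker_pattern.match("S 7") is None   # only markers 8 and 9 qualify
assert marker_pattern.match("S 89") is None  # exactly one digit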
def _merge_files(self, target_collection_path, source_collection_pathes,
                 train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them
    in the target collection

    **Parameters**

        :target_collection_path:
            Path of the dataset, in which the data of all other datasets
            is assembled.

        :source_collection_pathes:
            Paths of the datasets to be merged.

        :train_set_name_suffix:
            Either 'train' or 'test'.
            Specifies if datasets are merged for training or testing.

        :target_collection_params:
            Dictionary with all the parameters of the target dataset.
    """
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_collection.meta_data.pop("node_chain_file_name")
    except KeyError:
        pass
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_dataset_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })
    # merge the data of all other collections into the target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                if self.set_flag:
                    for ts, l in target_data:
                        if ts.specs is None:
                            ts.specs = {"new_set": False}
                        elif "new_set" in ts.specs:
                            break
                        else:
                            ts.specs["new_set"] = False
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                if self.set_flag:
                    for i, (ts, l) in enumerate(data):
                        # flag the first element of the concatenated
                        # data list
                        if ts.specs is None:
                            ts.specs = {"new_set": i == 0}
                        else:
                            ts.specs["new_set"] = (i == 0)
                # the actual data is stored in a list that has to be
                # extended
                target_data.extend(data)
    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in the key tuple for "train" before storing
        for key in target_collection.data.keys():
            assert("test" == key[2])
            value = target_collection.data.pop(key)
            key = (key[0], key[1], "train")
            target_collection.data[key] = value
    # we store the data in the same format as before
    target_collection.store(target_collection_path,
                            target_collection.meta_data["storage_format"])
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
    information given in the operation specification operation_spec.
    If debug is TRUE the creation of the Analysis processes will not
    happen in a separate thread.
    """
    assert(operation_spec["type"] == "comp_analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data
    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]
    # Determine the dependent parameters, which don't get extra resolution
    try:
        dep_par = operation_spec["dep_par"]
    except KeyError:
        dep_par = []
    # Determine the metrics that should be plotted
    import warnings
    spec_metrics = operation_spec["metrics"]
    metrics = []
    for metric in spec_metrics:
        if metric in data_dict:
            metrics.append(metric)
        else:
            warnings.warn('The metric "' + metric
                          + '" is not contained in the results csv file.')
    if len(metrics) == 0:
        warnings.warn('No metric available from spec file, '
                      'defaulting to the first dict entry.')
        metrics.append(data_dict.keys()[0])
    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param]))
                               for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1

    logscale = False
    if 'logscale' in operation_spec:
        logscale = operation_spec['logscale']
    markertype = 'x'
    if 'markertype' in operation_spec:
        markertype = operation_spec['markertype']

    if debug:
        # To better debug the creation of processes we don't limit the
        # queue and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict,
                             parameters, dep_par, metrics, logscale,
                             markertype, True)
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all plot processes by calling a recursive helper method
        # in another thread so that already created processes can be
        # executed although the creation of processes is not finished yet.
        # A queue with a limited size is used to guarantee that not too
        # many objects are created (since this costs memory). However, the
        # actual number of 100 is arbitrary and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=(processes,
                                                  result_directory,
                                                  data_dict, parameters,
                                                  dep_par, metrics,
                                                  logscale, markertype,
                                                  True))
        create_process.start()
        # create and return the comp_analysis operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
    """ Concatenate all datasets in source_dataset_pathes and store them
    in the target dataset
    """
    # sort the datasets
    source_dataset_pathes.sort()
    # load a first dataset, in which the data of all other datasets
    # is assembled
    target_dataset = BaseDataset.load(source_dataset_pathes[0])

    # Determine author and date
    try:
        author = getpass.getuser()
    except Exception:
        author = "Unknown"
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name
    try:
        target_dataset.meta_data.pop("node_chain_file_name")
    except KeyError:
        pass
    # Update meta data and store it
    params = target_dataset.meta_data.pop("parameter_setting")
    params["__INPUT_DATASET__"] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
    params["__RESULT_DIRECTORY__"] = self.result_directory
    target_dataset.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_dataset_path,
        "train_test": False,
        "parameter_setting": params,
        "changed_time": self.change_time,
        "input_dataset_name":
            source_dataset_pathes[0][len(pySPACE.configuration.storage):]
    })

    # Concatenate the data of all other datasets to the target dataset
    for source_dataset_path in source_dataset_pathes[1:]:
        source_dataset = BaseDataset.load(source_dataset_path)
        for run in source_dataset.get_run_numbers():
            for split in source_dataset.get_split_numbers():
                target_data = target_dataset.get_data(run, split, "test")

                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    target_data.sort(key=lambda t: t[0].end_time)
                    last_end_time = target_data[-1][0].end_time

                for ts, l in target_data:
                    if ts.specs is None:
                        ts.specs = {"new_set": False}
                    elif "new_set" in ts.specs:
                        break
                    else:
                        ts.specs["new_set"] = False

                data = source_dataset.get_data(run, split, "test")

                if self.change_time:
                    # ensure sorted data
                    # TODO: encode this in meta data?
                    data.sort(key=lambda t: t[0].end_time)

                # flag the first element of the concatenated data list
                for i, (ts, l) in enumerate(data):
                    if ts.specs is None:
                        ts.specs = {"new_set": i == 0}
                    else:
                        ts.specs["new_set"] = (i == 0)
                    if self.change_time:
                        ts.start_time = last_end_time + ts.start_time
                        ts.end_time = last_end_time + ts.end_time

                # the actual data is stored in a list that has to be
                # extended
                target_data.extend(data)

    target_dataset.store(target_dataset_path)
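# When change_time is set, the loop above shifts every appended window by
# the end time of the last window already in the target list, yielding one
# monotone timeline. A toy sketch of that shift; Window is a hypothetical
# stand-in for the pySPACE TimeSeries objects.
class Window(object):
    def __init__(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time

target = [Window(0, 1000), Window(1000, 2000)]
source = [Window(0, 500), Window(500, 1000)]

last_end_time = target[-1].end_time  # 2000
for w in source:
    w.start_time += last_end_time
    w.end_time += last_end_time
target.extend(source)

assert [(w.start_time, w.end_time) for w in target] == \
    [(0, 1000), (1000, 2000), (2000, 2500), (2500, 3000)]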
def __call__(self):
    """ Executes this process on the respective modality """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check that the input data is not split, i.e. there is only a
        # single test file in the source directory
        source_files = glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])

        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_" + self.name_pattern)
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                self.name_pattern + "_vs_" + test_base_collection_name)
            key = "test"
        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])

        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from the input collection to the target
            # collection
            source_test_file_path = os.sep.join(
                [source_test_collection_path, "data_run0",
                 "features_sp0_" + train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        else:
            file_ending = source_file_name.split(".")[-1]
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path

        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections
            if not all(eval(constraint_template %
                            {'source_train_collection_name':
                                 source_train_collection_name,
                             'source_test_collection_name':
                                 source_test_collection_name})
                       for constraint_template
                       in self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_collection_params.items()
                 and param[0] not in ["__INPUT_DATASET__",
                                      "__RESULT_DIRECTORY__",
                                      "__OUTPUT_BUNDLE__",
                                      "__INPUT_COLLECTION__"]]  # for old data
            if remaining_params != []:
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v

            if "arff" == file_ending:
                source_train_file_path = os.sep.join(
                    [source_train_collection_path, "data_run0",
                     "features_sp0_" + train_set_name_suffix + ".arff"])
            else:
                source_train_file_path = source_train_collection_path
            source_train_pathes.append(source_train_file_path)

        if "arff" == file_ending:
            target_train_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        else:
            target_train_file_path = target_collection_path

        if len(source_train_pathes) == 0:
            continue

        create_directory(os.sep.join([target_collection_path,
                                      "data_run0"]))

        if "arff" == file_ending:
            self._copy_arff_file(source_test_file_path,
                                 target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)
            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        else:
            self._copy_file(source_test_collection_path,
                            target_collection_path,
                            train_set_name_suffix)
            self._merge_files(target_train_file_path,
                              source_train_pathes,
                              train_set_name_suffix,
                              target_collection_params)

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
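# The collection constraints above are spec strings that are %-formatted
# with the two collection names and then passed to eval(), so a constraint
# can be an arbitrary Python predicate over the names. A minimal sketch
# with a hypothetical constraint string:
constraint_template = (
    "'%(source_train_collection_name)s'.split('_')[0] == "
    "'%(source_test_collection_name)s'.split('_')[0]")

names = {'source_train_collection_name': 'subj1_sessionA',
         'source_test_collection_name': 'subj1_sessionB'}
assert eval(constraint_template % names) is True   # same subject

names['source_test_collection_name'] = 'subj2_sessionB'
assert eval(constraint_template % names) is False  # different subject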
def __init__(self, dataset_dir, command_template, parametrization,
             cv_folds, run_number, operation_result_dir):
    super(WEKAClassificationProcess, self).__init__()
    # Load the abbreviations
    abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                           'operations/weka_templates',
                                           'abbreviations.yaml'), 'r')
    self.abbreviations = yaml.load(abbreviations_file)
    abbreviations_file.close()
    # Determine the directory in which the process' results are stored
    self.result_directory = operation_result_dir
    # Create collection
    collection = BaseDataset.load(dataset_dir)
    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {
        "collection_name": dataset_dir.strip(os.sep).split(os.sep)[-1],
        "run_number": run_number,
        "cv_folds": cv_folds,
        "weka_class_path": pySPACE.configuration.weka_class_path,
        "temp_results": self.result_directory,
        "unique_id": WEKAClassificationProcess.unique_id}
    # Collection dependent parameters
    if not collection.meta_data["train_test"] \
            and collection.meta_data["splits"] == 1:
        raise NotImplementedError()
    else:
        # The pattern of the train and test files generated by
        # crossvalidation
        data_pattern = os.path.join(dataset_dir,
                                    collection.meta_data["data_pattern"])
        # One example arff file in which WEKA can look up the relation
        # name etc.
        sample_dataset = data_pattern.replace("_run", "_run0")\
                                     .replace("_sp_", "_sp0_")\
                                     .replace("_tt", "_train")
        self.params.update({"sample_dataset": sample_dataset,
                            "data_pattern": data_pattern})
    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        self.params[parameter_name + "_abbr"] = parameter_value
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        elif parameter_name == 'classifier':
            import warnings
            warnings.warn("Did not find classifier abbreviation %s. "
                          "Expecting full name." % parameter_value)
        self.params[parameter_name] = parameter_value
    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders have been replaced
            self.weka_command = instantiated_template
            break
        else:
            # We have to continue since we have not yet converged
            command_template = instantiated_template
    self.handler_class = None
    WEKAClassificationProcess.unique_id += 1
def prepare_training(self, training_files, potentials, operation): """ Prepares pyspace live for training. Prepares everything for training of pyspace live, i.e. creates flows based on the dataflow specs and configures them. """ online_logger.info( "Preparing Training") self.potentials = potentials self.operation = operation online_logger.info( "Creating flows..") for key in self.potentials.keys(): spec_base = self.potentials[key]["configuration"].spec_dir if self.operation == "train": self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"]) online_logger.info( "node_chain_spec:" + self.potentials[key]["node_chain"]) elif self.operation in ("prewindowing", "prewindowing_offline"): if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True: self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"]) else: self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"]) online_logger.info( "prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"]) elif self.operation == "prewindowed_train": if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True: self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"]) else: self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"]) online_logger.info( "postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"]) self.training_active_potential[key] = multiprocessing.Value("b",False) online_logger.info("Path variables set for NodeChains") # check if multiple potentials are given for training if isinstance(training_files, list): self.training_data = training_files else: self.training_data = [training_files] # Training is done in separate processes, we send the time series # windows to these threads via two queues online_logger.info( "Initializing Queues") for key in self.potentials.keys(): self.queue[key] = multiprocessing.Queue() def flow_generator(key): """create a generator to yield all the abri flow windows""" # Yield all windows until a None item is found in the queue while True: window = self.queue[key].get(block = True, timeout = None) if window == None: break yield window # Create the actual data flows for key in self.potentials.keys(): if self.operation == "train": self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["node_chain"])) self.node_chains[key][0].set_generator(flow_generator(key)) flow = open(self.potentials[key]["node_chain"]) elif self.operation in ("prewindowing", "prewindowing_offline"): online_logger.info("loading prewindowing flow..") online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"])) self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["prewindowing_flow"])) self.node_chains[key][0].set_generator(flow_generator(key)) flow = open(self.potentials[key]["prewindowing_flow"]) elif self.operation == "prewindowed_train": if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True: self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"])) # create windower online_logger.info( "Creating Windower") 
online_logger.info(self.potentials[key]["windower_spec_path_train"]) self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"])) replace_start_and_end_markers = True else: self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"])) replace_start_and_end_markers = False final_collection = TimeSeriesDataset() final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data") # delete previous training collection if os.path.exists(final_collection_path): online_logger.info("deleting old training data collection for " + key) shutil.rmtree(final_collection_path) # load all prewindowed collections and # append data to the final collection prewindowed_sets = \ glob.glob(os.path.join(self.prewindowed_data_directory, key, "*")) if len(prewindowed_sets) == 0: online_logger.error("Couldn't find data, please do prewindowing first!") raise Exception online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets)) for s,d in enumerate(prewindowed_sets): collection = BaseDataset.load(d) data = collection.get_data(0, 0, "train") for d,(sample,label) in enumerate(data): if replace_start_and_end_markers: # in case we concatenate multiple 'Window' labeled # sets we have to remove every start- and endmarker for k in sample.marker_name.keys(): # find '{S,s} 8' or '{S,s} 9' m = re.match("^s\s{0,2}[8,9]{1}$", k, re.IGNORECASE) if m is not None: online_logger.info(str("remove %s from %d %d" % (m.group(), s, d))) del(sample.marker_name[m.group()]) if s == len(prewindowed_sets)-1 and \ d == len(data)-1: # insert endmarker sample.marker_name["S 9"] = [0.0] online_logger.info("added endmarker" + str(s) + " " + str(d)) if s == 0 and d == 0: # insert startmarker sample.marker_name["S 8"] = [0.0] online_logger.info("added startmarker" + str(s) + " " + str(d)) final_collection.add_sample(sample, label, True) # save final collection (just for debugging) os.mkdir(final_collection_path) final_collection.store(final_collection_path) online_logger.info("stored final collection at " + final_collection_path) # load final collection again for training online_logger.info("loading data from " + final_collection_path) self.prewindowed_data[key] = BaseDataset.load(final_collection_path) self.node_chains[key][0].set_input_dataset(self.prewindowed_data[key]) flow = open(self.potentials[key]["postprocess_flow"]) self.node_chain_definitions[key] = yaml.load(flow) flow.close() # TODO: check if the prewindowing flow is still needed # when using the stream mode! 
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        "recursively deleting files in '%s'" % f)
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception:
                        # TODO: find a smart solution for this!
                        pass  # directory was probably already deleted

            all_train_data_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            if os.path.exists(all_train_data_path):
                shutil.rmtree(all_train_data_path)
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
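
# ---------------------------------------------------------------------------
# Illustration (not part of the original source): prepare_training above
# feeds windows to each node chain through a per-potential queue that a
# generator drains until it sees a None sentinel.  A minimal, self-contained
# sketch of that hand-off, with the hypothetical helper name
# `window_generator` standing in for the inline flow_generator:
# ---------------------------------------------------------------------------
import multiprocessing

def window_generator(queue):
    """ Yield items from the queue until a None sentinel arrives """
    while True:
        window = queue.get(block=True, timeout=None)
        if window is None:
            break
        yield window

def _demo_window_queue():
    queue = multiprocessing.Queue()
    # producer side: enqueue some windows, then the sentinel
    for i in range(3):
        queue.put("window_%d" % i)
    queue.put(None)
    # consumer side: the generator terminates once it sees the sentinel
    assert list(window_generator(queue)) == \
        ["window_0", "window_1", "window_2"]

if __name__ == "__main__":
    _demo_window_queue()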
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
        information given in the operation specification operation_spec.

        If debug is True, the creation of the Analysis processes will not
        be done in a separate thread.
    """
    assert(operation_spec["type"] == "comp_analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                            input_path))
    data_dict = summary.data

    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]

    # Determine dependent parameters, which do not get extra resolution
    try:
        dep_par = operation_spec["dep_par"]
    except KeyError:
        dep_par = []

    # Determine the metrics that should be plotted
    import warnings
    spec_metrics = operation_spec["metrics"]
    metrics = []
    for metric in spec_metrics:
        if metric in data_dict:
            metrics.append(metric)
        else:
            warnings.warn('The metric "' + metric +
                          '" is not contained in the results csv file.')
    if len(metrics) == 0:
        warnings.warn('No metric available from spec file, '
                      'defaulting to first dict entry.')
        metrics.append(list(data_dict.keys())[0])

    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param]))
                               for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1

    logscale = operation_spec.get('logscale', False)
    markertype = operation_spec.get('markertype', 'x')

    if debug:
        # To debug the creation of processes more easily, we do not limit
        # the queue and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict,
                             parameters, dep_par, metrics, logscale,
                             markertype, True)
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all plot processes by calling a recursive helper method in
        # another thread so that already created processes can be executed
        # although the creation of processes is not finished yet. Therefore
        # a queue is used whose size is limited to guarantee that not too
        # many objects are created (since this costs memory). However, the
        # actual limit of 100 is arbitrary and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(
            target=cls._createProcesses,
            args=(processes, result_directory, data_dict, parameters,
                  dep_par, metrics, logscale, markertype, True))
        create_process.start()
        # create and return the comp_analysis operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
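
# ---------------------------------------------------------------------------
# Sketch (not original source): the non-debug branch of create() above builds
# processes in a background worker while the consumer already drains them,
# using a bounded queue as back-pressure.  The helper names `fill` and
# `drain` are hypothetical; the pattern with standard-library primitives is:
# ---------------------------------------------------------------------------
import threading
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2

def fill(q, n):
    """ Producer: create n work items; blocks when the queue is full """
    for i in range(n):
        q.put("process_%d" % i)
    q.put(None)  # sentinel: creation finished

def drain(q):
    """ Consumer: collect items as they become available """
    results = []
    while True:
        item = q.get()
        if item is None:
            break
        results.append(item)
    return results

q = queue.Queue(maxsize=100)   # bounded, like processing.Queue(100) above
worker = threading.Thread(target=fill, args=(q, 5))
worker.start()
print(drain(q))                # items appear while the producer still runs
worker.join()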
def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
    """ Concatenate all datasets in source_dataset_pathes and store them \
        in the target dataset """
    # sort the dataset paths
    source_dataset_pathes.sort()
    # load a first dataset, in which the data of all other datasets
    # is assembled
    target_dataset = BaseDataset.load(source_dataset_pathes[0])

    # Determine author and date
    try:
        author = getpass.getuser()
    except:
        author = "Unknown"
    date = time.strftime("%Y%m%d_%H_%M_%S")

    # Delete node_chain file name
    try:
        target_dataset.meta_data.pop("node_chain_file_name")
    except:
        pass

    # Update meta data and store it
    params = target_dataset.meta_data.pop("parameter_setting")
    params["__INPUT_DATASET__"] = \
        [s_d_p.split(os.sep)[-2] for s_d_p in source_dataset_pathes]
    params["__RESULT_DIRECTORY__"] = self.result_directory
    target_dataset.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_dataset_path,
        "train_test": False,
        "parameter_setting": params,
        "changed_time": self.change_time,
        "input_dataset_name": source_dataset_pathes[0][len(
            pySPACE.configuration.storage):]
    })

    # Concatenate data of all other datasets to the target dataset
    for source_dataset_path in source_dataset_pathes[1:]:
        source_dataset = BaseDataset.load(source_dataset_path)
        for run in source_dataset.get_run_numbers():
            for split in source_dataset.get_split_numbers():
                target_data = target_dataset.get_data(run, split, "test")

                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    target_data.sort(key=lambda t: t[0].end_time)
                    last_end_time = target_data[-1][0].end_time

                # windows already in the target keep new_set == False
                for ts, l in target_data:
                    if ts.specs is None:
                        ts.specs = {"new_set": False}
                    elif "new_set" in ts.specs:
                        break
                    else:
                        ts.specs["new_set"] = False

                data = source_dataset.get_data(run, split, "test")
                if self.change_time:
                    # ensure sorted data
                    # TODO: encode this in meta data?
                    data.sort(key=lambda t: t[0].end_time)
                # flag the first element of the concatenated data list
                for i, (ts, l) in enumerate(data):
                    if ts.specs is None:
                        ts.specs = {"new_set": i == 0}
                    else:
                        ts.specs["new_set"] = (i == 0)
                    if self.change_time:
                        ts.start_time = last_end_time + ts.start_time
                        ts.end_time = last_end_time + ts.end_time

                # actual data is stored in a list that has to be extended
                target_data.extend(data)

    target_dataset.store(target_dataset_path)
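
# ---------------------------------------------------------------------------
# Sketch (illustrative only): when change_time is set, _merge_pickle_files
# above shifts every appended window by the end time of the data merged so
# far, so the concatenated dataset keeps a monotone time axis.  With
# hypothetical (start, end) tuples instead of time series objects, the
# arithmetic reduces to:
# ---------------------------------------------------------------------------
def shift_windows(target, new_windows):
    """ Append new_windows to target, offset by the last end time """
    last_end = target[-1][1] if target else 0.0
    for start, end in new_windows:
        target.append((last_end + start, last_end + end))
    return target

merged = shift_windows([(0.0, 1.0), (1.5, 2.5)], [(0.0, 1.0), (1.2, 2.0)])
# the second list is shifted by 2.5, the previous end time
assert merged == [(0.0, 1.0), (1.5, 2.5), (2.5, 3.5), (3.7, 4.5)]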
def __call__(self):
    """ Executes this process on the respective modality """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check that the input data is not split, i.e. that only a single
        # test file is in the source directory
        source_files = glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])

        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_Rest")
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                "Rest_vs_" + test_base_collection_name)
            key = "test"
        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])

        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection
            source_test_file_path = os.sep.join(
                [source_test_collection_path, "data_run0",
                 "features_sp0_" + train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        elif source_file_name.endswith("pickle"):
            file_ending = "pickle"
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")

        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections
            if not all(eval(constraint_template %
                            {'source_train_collection_name':
                                 source_train_collection_name,
                             'source_test_collection_name':
                                 source_test_collection_name})
                       for constraint_template in self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_collection_params.items() and
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__",
                                  "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"]]  # for old data
            if remaining_params != []:
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v

            if "arff" == file_ending:
                source_train_file_path = \
                    os.sep.join([source_train_collection_path,
                                 "data_run0", "features_sp0_" +
                                 train_set_name_suffix + ".arff"])
            elif "pickle" == file_ending:
                source_train_file_path = source_train_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                          "MergeOperation")

            source_train_pathes.append(source_train_file_path)

        if "arff" == file_ending:
            target_train_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        elif "pickle" == file_ending:
            target_train_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")

        if len(source_train_pathes) == 0:
            continue

        create_directory(os.sep.join([target_collection_path,
                                      "data_run0"]))

        if "arff" == file_ending:
            self._copy_arff_file(source_test_file_path,
                                 target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)
            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        elif "pickle" == file_ending:
            self._copy_pickle_file(source_test_collection_path,
                                   target_collection_path,
                                   train_set_name_suffix)
            self._merge_pickle_files(target_train_file_path,
                                     source_train_pathes,
                                     train_set_name_suffix,
                                     target_collection_params)
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
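
# ---------------------------------------------------------------------------
# Sketch (hypothetical constraint, not from the original source): the
# collection_constraints evaluated in __call__ above are Python expression
# templates instantiated with the train/test collection names via
# %-formatting.  A constraint that forbids pairing collections of the same
# subject might look like this:
# ---------------------------------------------------------------------------
constraint_template = (
    "'%(source_train_collection_name)s'.split('_')[0] != "
    "'%(source_test_collection_name)s'.split('_')[0]")

names = {"source_train_collection_name": "SubjectA_Session1",
         "source_test_collection_name": "SubjectA_Session2"}
assert eval(constraint_template % names) == False  # same subject -> rejected

names["source_train_collection_name"] = "SubjectB_Session1"
assert eval(constraint_template % names) == True   # different subject -> kept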