def _merge_pickle_files(self, target_collection_path,
                        source_collection_pathes, train_set_name_suffix,
                        target_collection_params):
    """ Merge all collections in source_collection_pathes and store them
    in the target collection

    **Parameters**

        :target_collection_path:
            Directory in which the merged collection is stored.

        :source_collection_pathes:
            Paths of the collections to merge. The first one is loaded as
            the container to which the data of all others is appended.

        :train_set_name_suffix:
            Passed to ``get_data`` to select the data that is merged and
            compared against "test" to decide whether keys are relabeled.

        :target_collection_params:
            Parameter dictionary stored in the meta data. It is modified
            in place (entries "__INPUT_DATASET__" and
            "__RESULT_DIRECTORY__").
    """
    # Load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name; a missing entry is fine
    target_collection.meta_data.pop("node_chain_file_name", None)
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })

    # Merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)

    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # Entries are removed and re-inserted inside the loop, so iterate
        # over a snapshot of the keys instead of the live mapping.
        for key in list(target_collection.data.keys()):
            assert ("test" == key[2])
            value = target_collection.data.pop(key)
            target_collection.data[(key[0], key[1], "train")] = value

    target_collection.store(target_collection_path)
def _merge_pickle_files(self, target_collection_path,
                        source_collection_pathes, train_set_name_suffix,
                        target_collection_params):
    """ Merge all collections in source_collection_pathes and store them
    in the target collection

    **Parameters**

        :target_collection_path:
            Directory in which the merged collection is stored.

        :source_collection_pathes:
            Paths of the collections to merge. The first one is loaded as
            the container to which the data of all others is appended.

        :train_set_name_suffix:
            Passed to ``get_data`` to select the data that is merged and
            compared against "test" to decide whether keys are relabeled.

        :target_collection_params:
            Parameter dictionary stored in the meta data. It is modified
            in place (entries "__INPUT_DATASET__" and
            "__RESULT_DIRECTORY__").
    """
    # Load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        # Best effort only: fall back to a placeholder, but do not trap
        # KeyboardInterrupt or SystemExit as a bare except would
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name; a missing entry is fine
    target_collection.meta_data.pop("node_chain_file_name", None)
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })

    # Merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)

    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # Entries are removed and re-inserted inside the loop, so iterate
        # over a snapshot of the keys instead of the live mapping.
        for key in list(target_collection.data.keys()):
            assert ("test" == key[2])
            value = target_collection.data.pop(key)
            target_collection.data[(key[0], key[1], "train")] = value

    target_collection.store(target_collection_path)
def store(self, result_dir, s_format="None"):
    """ Store only the meta data of this dataset in *result_dir*

    No actual data is written. The only accepted value of *s_format* is
    the string "None"; for every other value a critical message is logged
    and nothing is stored.
    """
    if s_format == "None":
        # Record in the meta data that there is no data to be found
        meta_update = {"type": "only output of individual nodes stored",
                       "storage_format": s_format,
                       "author": get_author(),
                       "data_pattern": "no data stored"}
        self.update_meta_data(meta_update)
        # Write the meta data to the result directory
        BaseDataset.store_meta_data(result_dir, self.meta_data)
    else:
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections,
                     command_template):
    """ Fill the queue *processes* with WEKA classification processes

    One process is created for every combination of input collection,
    parametrization and run number. After the last process, ``False`` is
    put into the queue to signal that the creation is finished.
    """
    for dataset_dir in input_collections:
        collection = BaseDataset.load(dataset_dir)
        # Runs and splits come from the dataset meta data; the operation
        # specification may overwrite them as long as the dataset value
        # is either 1 or identical to the specified one
        num_runs = collection.meta_data["runs"]
        num_splits = collection.meta_data["splits"]
        if "runs" in operation_spec:
            assert (num_runs in [1, operation_spec["runs"]])
            num_runs = operation_spec["runs"]
        if "cv_folds" in operation_spec:
            assert (num_splits in [1, operation_spec["cv_folds"]])
            num_splits = operation_spec["cv_folds"]

        for parametrization in parameter_settings:
            for run_number in range(num_runs):
                processes.put(
                    WEKAClassificationProcess(dataset_dir,
                                              command_template,
                                              parametrization,
                                              num_splits,
                                              run_number,
                                              result_directory))

    # give executing process the sign that creation is now finished
    processes.put(False)
def _copy_file(self, source_collection_path, target_collection_path,
               train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            The path to the dataset that has to be copied.

        :target_collection_path:
            The path to where the dataset should be copied.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies if the target dataset is
            handled as training or testing data.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # Entries are removed and re-inserted inside the loop, so iterate
        # over a snapshot of the keys instead of the live mapping.
        for key in list(source_collection.data.keys()):
            assert ("test" == key[2])
            value = source_collection.data.pop(key)
            source_collection.data[(key[0], key[1], "train")] = value
    # We store the data in the same format as before
    source_collection.store(target_collection_path,
                            source_collection.meta_data["storage_format"])
def store(self, result_dir, s_format="None"):
    """ Write the meta data of this dataset to *result_dir*

    This dataset holds no data of its own, so only meta data is stored.
    *s_format* must be the string "None"; any other value is reported
    with a critical log message and the method returns without storing.
    """
    format_supported = (s_format == "None")
    if format_supported:
        author = get_author()
        # Update the meta data
        self.update_meta_data(
            {"type": "only output of individual nodes stored",
             "storage_format": s_format,
             "author": author,
             "data_pattern": "no data stored"})
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
        return
    message = "The format %s is not supported!" % s_format
    self._log(message, level=logging.CRITICAL)
def _copy_file(self, source_collection_path, target_collection_path,
               train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            The path to the dataset that has to be copied.

        :target_collection_path:
            The path to where the dataset should be copied.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies if the target dataset is
            handled as training or testing data.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # The loop pops and re-inserts entries, so it must not iterate
        # over the live mapping: work on a snapshot of the keys.
        for key in list(source_collection.data.keys()):
            assert ("test" == key[2])
            value = source_collection.data.pop(key)
            source_collection.data[(key[0], key[1], "train")] = value
    # We store the data in the same format as before
    source_collection.store(target_collection_path,
                            source_collection.meta_data["storage_format"])
def __init__(self, dataset_dir, command_template, parametrization,
             run_number, split_number, operation_result_dir,
             hide_parameters=[]):
    """ Prepare a WEKA filter process for one dataset and parametrization

    Determines and creates the result directory, collects the values for
    the placeholders of the command template and instantiates the
    template into ``self.weka_command``.

    **Parameters**

        :dataset_dir:
            Path of the input dataset; its second to last component
            (split at ``os.sep``) names the result collection.

        :command_template:
            Template string that is filled using the ``%`` operator with
            a dictionary of parameters.

        :parametrization:
            Mapping of parameter names to values for the template.

        :run_number:
            Run handled by this process; used for the name of the
            intermediate result directory.

        :split_number:
            Split handled by this process; passed to the template.

        :operation_result_dir:
            Directory of the operation, in which the result collection
            directory is created.

        :hide_parameters:
            Parameter names that are not encoded in the name of the result
            collection. The default list is only read, never modified.
    """
    super(WEKAFilterProcess, self).__init__()

    # Determine the directory in which the process' results are stored
    result_collection_name = dataset_dir.split(os.sep)[-2]
    for parameter_name, parameter_value in parametrization.iteritems():
        # If this is a parameter that should not be hidden, then we have
        # to encode it in the result collection name
        if parameter_name not in hide_parameters:
            result_collection_name += "{__%s__:%s}" % (
                parameter_name.upper(), parameter_value)

    self.result_directory = os.path.join(operation_result_dir,
                                         result_collection_name)

    # Create directory for intermediate results if it does not exist yet
    create_directory(self.result_directory + os.sep
                     + "data_run%s" % run_number)

    # NOTE(review): the loaded collection is never used afterwards. The
    # call is kept because loading may fail for an invalid dataset_dir;
    # confirm whether it can be dropped.
    BaseDataset.load(dataset_dir)

    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"dataset_name": dataset_dir.replace('/', '_'),
                   "dataset_dir": dataset_dir,
                   "run_number": run_number,
                   "split_number": split_number,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory}

    # Load the abbreviations; the context manager closes the file handle
    # even if parsing fails
    abbreviations_path = os.path.join(pySPACE.configuration.spec_dir,
                                      'operations/weka_templates',
                                      'abbreviations.yaml')
    with open(abbreviations_path, 'r') as abbreviations_file:
        # NOTE(review): yaml.load without an explicit safe loader can
        # construct arbitrary objects; acceptable only as long as the
        # specification directory is trusted.
        self.abbreviations = yaml.load(abbreviations_file)

    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        self.params[parameter_name] = parameter_value

    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template, since replaced values may contain placeholders again
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        # We have to continue since we are not converged
        command_template = instantiated_template

    self.handler_class = None
def store(self, result_dir, s_format="None"):
    """ Store only the meta data of this dataset in *result_dir*

    No actual data is written. The only accepted value of *s_format* is
    the string "None"; for every other value a critical message is logged
    and nothing is stored.

    The author entry of the meta data is taken from the password database
    entry of the current user; if it cannot be determined, "unknown" is
    used and a warning is logged.
    """
    if not s_format == "None":
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
        return
    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        # Best effort only: fall back to a placeholder, but do not trap
        # KeyboardInterrupt or SystemExit as a bare except would
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({"type": "only output of individual nodes stored",
                           "storage_format": s_format,
                           "author": author,
                           "data_pattern": "no data stored"})
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=[]):
    """ A factory method that creates a WEKA operation based on the
    information given in the operation specification operation_spec

    **Parameters**

        :operation_spec:
            Specification dictionary; its "type" has to be
            "weka_classification" and "template" names the command
            template file. An optional "runs" entry fixes the number of
            runs.

        :result_directory:
            Directory that is handed to the created processes and to the
            operation.

        :debug:
            If True, all processes are created in the calling process
            before the operation is returned. Otherwise they are created
            in a separate process that fills a queue limited to 100
            entries.

        :input_paths:
            Paths of the input datasets. The default list is only read.
    """
    assert (operation_spec["type"] == "weka_classification")

    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)

    # Read the command template from a file; the context manager closes
    # the file even if reading fails
    template_path = os.path.join(pySPACE.configuration.spec_dir,
                                 "operations",
                                 "weka_templates",
                                 operation_spec["template"])
    with open(template_path, 'r') as template_file:
        command_template = template_file.read()

    # number of processes
    if "runs" in operation_spec:
        number_processes = len(input_paths) * len(parameter_settings) * \
            operation_spec["runs"]
    else:
        # Approximate the number of processes with the largest number of
        # runs found in the input datasets
        runs = [BaseDataset.load(dataset_dir).meta_data["runs"]
                for dataset_dir in input_paths]
        # Without any input there is nothing to maximize over; the
        # product below is zero in that case anyway
        max_runs = max(runs) if runs else 0
        number_processes = len(input_paths) * len(parameter_settings) * \
            max_runs

    if debug == True:
        # To better debug creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, input_paths,
                             command_template)
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all processes by calling a helper method in another
        # process so that already created processes can be executed in
        # parallel. Therefore a queue is used whose size is limited to
        # guarantee that not too many objects are created (because this
        # costs memory). However, the actual number of 100 is arbitrary
        # and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(
            target=cls._createProcesses,
            args=(processes, result_directory, operation_spec,
                  parameter_settings, input_paths, command_template))
        create_process.start()
        # create and return the weka operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
def _get_result_dataset_dir(base_dir, input_dataset_dir,
                            parameter_setting, hide_parameters):
    """ Determine (and create) the result directory for one input dataset

    The directory name is composed of the name of the input dataset and
    of all parameters that are not listed in *hide_parameters*, each part
    enclosed in curly braces. The given *parameter_setting* is not
    modified; the parameters stored in the meta data of the input dataset
    are merged into a copy of it.

    Returns the path of the created directory below *base_dir*.
    """
    # Name of the input: last path component without enclosing braces. If
    # the input is already the result of an operation, only the part in
    # front of the first "}{" is used.
    input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    input_name = input_name.strip("{}").split("}{")[0]

    # Load the input meta data
    dataset_md = BaseDataset.load_meta_data(
        os.sep.join([pySPACE.configuration.storage, input_dataset_dir]))

    # Work on a copy so that later runs see the unchanged setting
    parameter_setting = copy.deepcopy(parameter_setting)
    # The pseudo parameter "__PREPARE_OPERATION__" is ignored
    parameter_setting.pop("__PREPARE_OPERATION__", None)
    # Parameters of the input dataset are added to the given setting
    if "parameter_setting" in dataset_md:
        parameter_setting.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them.
    # NOTE(review): eval executes arbitrary expressions; this is only
    # acceptable as long as parameter values come from trusted
    # specifications.
    for key, value in parameter_setting.iteritems():
        if isinstance(value, basestring) and value.count("'") > 1:
            parameter_setting[key] = eval(value)

    # Determine the result directory name. Key and value are separated by
    # "#" instead of ":" because of problems in windows and with windows
    # file servers.
    visible_parts = []
    for key, value in parameter_setting.iteritems():
        if key in hide_parameters:
            continue
        visible_parts.append("%s#%s" % (key, value))
    parameter_str = "}{".join(visible_parts)

    result_name = "{%s}" % input_name
    if parameter_str != "":
        result_name += "{%s}" % (parameter_str)

    # Determine the path where this result will be stored
    # and create the directory if necessary
    result_dir = base_dir + (os.sep + result_name)
    create_directory(result_dir)
    return result_dir
def __call__(self):
    """ Executes this process on the respective modality

    Restores the pySPACE configuration, loads the input dataset and
    benchmarks the node chain on it. If the benchmarking raises an
    exception, the traceback is printed and logged and the exception is
    re-raised.
    """
    # Restore configuration
    pySPACE.configuration = self.configuration

    # reduce log_level for processing a second time and
    # set communication possibility for nodes to backend
    pySPACE.configuration.min_log_level = self.min_log_level
    pySPACE.configuration.logging_com = self.handler_args
    pySPACE.configuration.backend_com = self.backend_com

    ############## Prepare benchmarking ##############
    super(NodeChainProcess, self).pre_benchmarking()

    # Load the data and check that it can be processed
    # Note: This can not be done in the objects constructor since in
    # that case the whole input would need to be pickled
    # when doing the remote call
    abs_dataset_dir = os.sep.join([self.storage, self.rel_dataset_dir])
    input_collection = BaseDataset.load(abs_dataset_dir)

    # We have to remember parameters used for generating this specific
    # input dataset
    if 'parameter_setting' in input_collection.meta_data:
        # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
        for k, v in input_collection.meta_data['parameter_setting'].items():
            if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                self.parameter_setting[k] = v

    NodeChainProcess._check_node_chain_dataset_consistency(
        self.node_chain, input_collection)

    ############## Do the actual benchmarking ##############
    self._log("Start benchmarking run %s of node_chain %s on dataset %s"
              % (self.run, self.node_chain_spec, self.rel_dataset_dir))

    # Do the actual benchmarking for this collection/node_chain combination
    try:
        # NOTE(review): result_collection is not used in the visible part
        # of this method.
        result_collection = self.node_chain.benchmark(
            input_collection=input_collection,
            run=self.run,
            persistency_directory=self.persistency_dir,
            store_node_chain=self.store_node_chain)
    except Exception:
        # Send Exception to Logger; the exception object itself is not
        # needed since the bare raise re-raises it unchanged
        import traceback
        trace = traceback.format_exc()
        print(trace)
        self._log(trace, level=logging.ERROR)
        raise
def __call__(self):
    """ Executes this process on the respective modality

    Restores the pySPACE configuration, loads the input dataset and
    benchmarks the node chain on it. If the benchmarking raises an
    exception, the traceback is logged and the exception is re-raised.
    """
    # Restore configuration
    pySPACE.configuration = self.configuration

    # reduce log_level for processing a second time and
    # set communication possibility for nodes to backend
    pySPACE.configuration.min_log_level = self.min_log_level
    pySPACE.configuration.logging_com = self.handler_args
    pySPACE.configuration.backend_com = self.backend_com

    ############## Prepare benchmarking ##############
    super(NodeChainProcess, self).pre_benchmarking()

    # Load the data and check that it can be processed
    # Note: This can not be done in the objects constructor since in
    # that case the whole input would need to be pickled
    # when doing the remote call
    abs_dataset_dir = os.sep.join([self.storage, self.rel_dataset_dir])
    input_collection = BaseDataset.load(abs_dataset_dir)

    # We have to remember parameters used for generating this specific
    # input dataset
    if 'parameter_setting' in input_collection.meta_data:
        # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
        for k, v in input_collection.meta_data['parameter_setting'].items():
            if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                self.parameter_setting[k] = v

    NodeChainProcess._check_node_chain_dataset_consistency(
        self.node_chain, input_collection)

    ############## Do the actual benchmarking ##############
    self._log("Start benchmarking run %s of node_chain %s on dataset %s"
              % (self.run, self.node_chain_spec, self.rel_dataset_dir))

    # Do the actual benchmarking for this collection/node_chain combination
    try:
        # NOTE(review): result_collection is not used in the visible part
        # of this method.
        result_collection = self.node_chain.benchmark(
            input_collection=input_collection,
            run=self.run,
            persistency_directory=self.persistency_dir,
            store_node_chain=self.store_node_chain)
    except Exception:
        # Send Exception to Logger; the exception object itself is not
        # needed since the bare raise re-raises it unchanged
        import traceback
        self._log(traceback.format_exc(), level=logging.ERROR)
        raise
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=[]):
    """ A factory method that creates a statistic operation based on the
    information given in the operation specification operation_spec.

    The tabular data found under ``operation_spec["input_path"]``
    (relative to the pySPACE storage) is loaded, optionally filtered,
    reduced and handed to the process creation. The files "results.csv"
    and "metadata.yaml" of the input are copied to *result_directory*.

    *debug* and *input_paths* are accepted but not used by this method:
    the processes are always created in the calling process.
    """
    assert (operation_spec["type"] == "statistic")
    input_path = operation_spec["input_path"]
    tabular = BaseDataset.load(
        os.path.join(pySPACE.configuration.storage, input_path)).data

    # Optional filter: keep only the rows that match the given values
    if "filter" in operation_spec:
        conditions = csv_analysis.empty_dict(tabular)
        for key, l in operation_spec["filter"].items():
            conditions[key].extend(l)
        tabular = csv_analysis.strip_dict(tabular, conditions)

    metric = operation_spec.get("metric", "Balanced_accuracy")
    parameter = operation_spec.get("parameter", "__Dataset__")
    rel_par = operation_spec.get("related_parameters",
                                 ["__Dataset__", "Key_Run", "Key_Fold"])
    average = operation_spec.get("average", None)

    # The averaged parameter, the metric and the analyzed parameter are
    # not treated as related parameters.
    # NOTE(review): if "related_parameters" is given, its list inside
    # operation_spec is modified in place here - confirm this is intended.
    for name in (average, metric, parameter):
        if name in rel_par:
            rel_par.remove(name)

    reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                         parameter, average)
    number_processes = 1
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, reduced_tabular)

    # Keep the input results and meta data next to the statistic results
    import shutil
    shutil.copy2(
        os.path.join(pySPACE.configuration.storage, input_path,
                     "results.csv"),
        os.path.join(result_directory, "results.csv"))
    shutil.copy2(
        os.path.join(pySPACE.configuration.storage, input_path,
                     "metadata.yaml"),
        os.path.join(result_directory, "metadata.yaml"))

    # create and return the operation object
    return cls(processes, operation_spec, result_directory,
               number_processes)
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=[]):
    """ Factory method that builds an analysis operation from the
    operation specification operation_spec

    The result summary found under ``operation_spec["input_path"]``
    (relative to the pySPACE storage) is loaded and the processes for the
    requested parameters and metrics are created from its data.

    If *debug* is True, all processes are created in the calling process
    before the operation is returned. Otherwise they are created in a
    separate process that fills a queue limited to 100 entries.
    """
    assert (operation_spec["type"] == "analysis")
    input_path = operation_spec["input_path"]
    summary_path = os.path.join(pySPACE.configuration.storage, input_path)
    data_dict = BaseDataset.load(summary_path).data

    # Parameters to analyze and metrics to plot
    parameters = operation_spec["parameters"]
    metrics = operation_spec["metrics"]

    # The number of processes follows from the number of distinct values
    # of every analyzed parameter
    number_parameter_values = []
    for param in parameters:
        number_parameter_values.append(len(set(data_dict[param])))
    number_processes = \
        cls._numberOfProcesses(0, number_parameter_values) + 1

    if debug == True:
        # To better debug creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict,
                             parameters, metrics, True)
        return cls(processes, operation_spec, result_directory,
                   number_processes)

    # Create all plot processes by calling a recursive helper method in
    # another process so that already created processes can be executed
    # although creation of processes is not finished yet. The queue size
    # is limited so that not too many objects exist at once (since this
    # costs memory). The actual number of 100 is arbitrary and might be
    # changed according to the system at hand.
    processes = processing.Queue(100)
    creation_args = (processes, result_directory, data_dict, parameters,
                     metrics, True)
    create_process = processing.Process(target=cls._createProcesses,
                                        args=creation_args)
    create_process.start()
    # create and return the operation object
    return cls(processes, operation_spec, result_directory,
               number_processes, create_process)
def _copy_pickle_file(self, source_collection_path, target_collection_path,
                      train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            Path of the dataset that is loaded.

        :target_collection_path:
            Path under which the dataset is stored again.

        :train_set_name_suffix:
            If this is "test" and ``self.reverse`` is set, the "test"
            entry of every data key is replaced by "train" before storing.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # Entries are removed and re-inserted inside the loop, so iterate
        # over a snapshot of the keys instead of the live mapping.
        for key in list(source_collection.data.keys()):
            assert ("test" == key[2])
            value = source_collection.data.pop(key)
            source_collection.data[(key[0], key[1], "train")] = value
    source_collection.store(target_collection_path)
def _copy_pickle_file(self, source_collection_path, target_collection_path,
                      train_set_name_suffix):
    """ Copy a dataset to a new destination

    **Parameters**

        :source_collection_path:
            Path of the dataset that is loaded.

        :target_collection_path:
            Path under which the dataset is stored again.

        :train_set_name_suffix:
            If this is "test" and ``self.reverse`` is set, the "test"
            entry of every data key is replaced by "train" before storing.
    """
    source_collection = BaseDataset.load(source_collection_path)
    # If only test data was given, the "Rest_vs" collection is stored as
    # training data
    if self.reverse and "test" == train_set_name_suffix:
        # Exchange the "test" in key tuple to "train" before storing.
        # The loop pops and re-inserts entries, so it must not iterate
        # over the live mapping: work on a snapshot of the keys.
        for key in list(source_collection.data.keys()):
            assert ("test" == key[2])
            value = source_collection.data.pop(key)
            source_collection.data[(key[0], key[1], "train")] = value
    source_collection.store(target_collection_path)
def test_time_series_storing(self):
    """ Stored time series can be reloaded without loss

    The output of a simple time series source is stored by a sink node in
    the directory 'tmp', reloaded from there and compared with the
    original data. The directory is removed afterwards, also when one of
    the checks fails.
    """
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    try:
        source = SimpleTimeSeriesSourceNode()
        sink = TimeSeriesSinkNode()
        sink.register_input_node(source)
        sink.set_run_number(0)
        sink.process_current_split()
        result_collection = sink.get_result_dataset()
        result_collection.store('tmp')

        reloaded_collection = BaseDataset.load('tmp')
        reloader = TimeSeriesSourceNode()
        reloader.set_input_dataset(reloaded_collection)

        orig_data = list(source.request_data_for_testing())
        restored_data = list(reloader.request_data_for_testing())

        # Check that the two lists have the same length
        self.assertEqual(
            len(orig_data), len(restored_data),
            "Numbers of time series before storing and after reloading "
            "are not equal!")

        # Check that every original time series has a restored
        # counterpart with identical values and label
        for orig_datapoint, orig_label in orig_data:
            found = any(
                (orig_datapoint.view(numpy.ndarray)
                 == restored_datapoint.view(numpy.ndarray)).all()
                and (orig_label == restored_label)
                for restored_datapoint, restored_label in restored_data)
            self.assertTrue(
                found,
                "One of the original time series cannot be found after "
                "reloading")
    finally:
        # Cleaning up, even if an assertion above failed
        shutil.rmtree('tmp')
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=[]):
    """ Create an analysis operation as described by operation_spec

    The summary stored under ``operation_spec["input_path"]`` (relative
    to the pySPACE storage) provides the data from which the processes
    for the given "parameters" and "metrics" are created.

    With *debug* set to True the processes are created synchronously in
    an unlimited queue; otherwise a separate process creates them and
    puts them into a queue that holds at most 100 entries.
    """
    assert (operation_spec["type"] == "analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(
        os.path.join(pySPACE.configuration.storage, input_path))
    data_dict = summary.data

    # Which parameters are analyzed and which metrics are plotted
    parameters = operation_spec["parameters"]
    metrics = operation_spec["metrics"]

    # How many processes will be created
    distinct_value_counts = [len(set(data_dict[name]))
                             for name in parameters]
    number_processes = \
        cls._numberOfProcesses(0, distinct_value_counts) + 1

    create_in_background = not (debug == True)
    if create_in_background:
        # Create all plot processes by calling a recursive helper method
        # in another process so that already created processes can be
        # executed although creation of processes is not finished yet.
        # The queue size is limited so that not too many objects exist at
        # once (since this costs memory). The actual number of 100 is
        # arbitrary and might be changed according to the system at hand.
        processes = processing.Queue(100)
        create_process = processing.Process(
            target=cls._createProcesses,
            args=(processes, result_directory, data_dict, parameters,
                  metrics, True))
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)

    # To better debug creation of processes we don't limit the queue
    # and create all processes before executing them
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, data_dict,
                         parameters, metrics, True)
    return cls(processes, operation_spec, result_directory,
               number_processes)
def test_time_series_storing(self):
    """ Stored time series can be reloaded without loss

    The output of a simple time series source is stored by a sink node in
    the directory 'tmp', reloaded from there and compared with the
    original data. The directory is removed afterwards, also when one of
    the checks fails.
    """
    if not os.path.exists('tmp'):
        os.makedirs('tmp')
    try:
        source = SimpleTimeSeriesSourceNode()
        sink = TimeSeriesSinkNode()
        sink.register_input_node(source)
        sink.set_run_number(0)
        sink.process_current_split()
        result_collection = sink.get_result_dataset()
        result_collection.store('tmp')

        reloaded_collection = BaseDataset.load('tmp')
        reloader = TimeSeriesSourceNode()
        reloader.set_input_dataset(reloaded_collection)

        orig_data = list(source.request_data_for_testing())
        restored_data = list(reloader.request_data_for_testing())

        # Check that the two lists have the same length
        self.assertEqual(
            len(orig_data), len(restored_data),
            "Numbers of time series before storing and after reloading "
            "are not equal!")

        # Check that every original time series has a restored
        # counterpart with identical values and label
        for orig_datapoint, orig_label in orig_data:
            found = any(
                (orig_datapoint.view(numpy.ndarray)
                 == restored_datapoint.view(numpy.ndarray)).all()
                and (orig_label == restored_label)
                for restored_datapoint, restored_label in restored_data)
            self.assertTrue(
                found,
                "One of the original time series cannot be found after "
                "reloading")
    finally:
        # Cleaning up, even if an assertion above failed
        shutil.rmtree('tmp')
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=[]):
    """ A factory method that creates a statistic operation based on the
    information given in the operation specification operation_spec.

    The tabular data found under ``operation_spec["input_path"]``
    (relative to the pySPACE storage) is loaded, optionally filtered,
    reduced and handed to the process creation. The files "results.csv"
    and "metadata.yaml" of the input are copied to *result_directory*.

    *debug* and *input_paths* are accepted but not used by this method:
    the processes are always created in the calling process.
    """
    assert (operation_spec["type"] == "statistic")
    input_path = operation_spec["input_path"]
    tabular = BaseDataset.load(
        os.path.join(pySPACE.configuration.storage, input_path)).data

    # Optional filter: keep only the rows that match the given values
    if "filter" in operation_spec:
        conditions = csv_analysis.empty_dict(tabular)
        for key, l in operation_spec["filter"].items():
            conditions[key].extend(l)
        tabular = csv_analysis.strip_dict(tabular, conditions)

    metric = operation_spec.get("metric", "Balanced_accuracy")
    parameter = operation_spec.get("parameter", "__Dataset__")
    rel_par = operation_spec.get("related_parameters",
                                 ["__Dataset__", "Key_Run", "Key_Fold"])
    average = operation_spec.get("average", None)

    # The averaged parameter, the metric and the analyzed parameter are
    # not treated as related parameters.
    # NOTE(review): if "related_parameters" is given, its list inside
    # operation_spec is modified in place here - confirm this is intended.
    for name in (average, metric, parameter):
        if name in rel_par:
            rel_par.remove(name)

    reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                         parameter, average)
    number_processes = 1
    processes = processing.Queue()
    cls._createProcesses(processes, result_directory, reduced_tabular)

    # Keep the input results and meta data next to the statistic results
    import shutil
    shutil.copy2(
        os.path.join(pySPACE.configuration.storage, input_path,
                     "results.csv"),
        os.path.join(result_directory, "results.csv"))
    shutil.copy2(
        os.path.join(pySPACE.configuration.storage, input_path,
                     "metadata.yaml"),
        os.path.join(result_directory, "metadata.yaml"))

    # create and return the operation object
    return cls(processes, operation_spec, result_directory,
               number_processes)
def store(self, result_dir, s_format="pickle"):
    """ Stores this collection in the directory *result_dir*.

    In contrast to *dump* this method stores the collection
    not in a single file but as a whole directory structure with meta
    information etc. The data sets are stored separately for each run,
    split, train/test combination. The file names start with the fixed
    prefix "time_series", followed by suffixes that encode split and
    train/test information.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            The format in which the actual data sets should be stored,
            either as a string or as a list ``[format, value_format]``.

            Possible formats are *pickle* (also *cpickle*, *cPickle*),
            *text*, *csv*, *mat* (MATLAB) and *eeg* (BrainVision files
            .eeg, .vmrk and .vhdr).

            In the MATLAB and text format, all time series objects are
            concatenated to a single large table.
            For the csv format comma separated values are taken as default
            or a specified Python format string.

            The MATLAB format is a struct that contains the data, the
            sampling frequency and the channel names.

            .. note:: For the text and MATLAB format, markers could be
                      added by using a Marker_To_Mux node before

            (*optional, default: "pickle"*)

    **Raises**

        :NotImplementedError:
            If the requested format is not one of the formats above.
            This is checked before anything is written.

    .. todo:: Put marker to the right time point and also write marker
              channel.
    """
    name = "time_series"
    if isinstance(s_format, list):
        s_type = s_format[1]
        s_format = s_format[0]
    else:
        s_type = "%.18e"
    if s_format in ["text", "matlab"]:
        s_type = "%i"
    if s_format == "csv" and s_type == "real":
        s_type = "%.18e"

    # Reject unknown formats before meta data or directories are touched
    if s_format not in ("pickle", "cpickle", "cPickle", "text", "csv",
                        "mat", "eeg"):
        raise NotImplementedError("Using unavailable storage format:%s!"
                                  % s_format)

    # Update the meta data
    author = get_author()
    self.update_meta_data({
        "type": "time_series",
        "storage_format": s_format,
        "author": author,
        "data_pattern": "data_run" + os.sep + name + "_sp_tt." + s_format
    })

    # The data set handled last provides channel names and sampling
    # frequency for the meta data
    last_time_series = None

    # Iterate through splits and runs in this dataset
    for key, time_series in self.data.iteritems():
        # load data, if necessary
        # (due to the lazy loading, the data might be not loaded already)
        if isinstance(time_series, basestring):
            time_series = self.get_data(key[0], key[1], key[2])
        if self.sort_string is not None:
            # NOTE(review): sort_string is evaluated as Python code and
            # must therefore come from a trusted specification
            time_series.sort(key=eval(self.sort_string))

        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)
        key_str = "_sp%s_%s" % key[1:]
        file_base = os.path.join(result_path, name + key_str)

        # Store data depending on the desired format
        if s_format in ["pickle", "cpickle", "cPickle"]:
            # HIGHEST_PROTOCOL is a binary protocol: write in binary mode
            with open(file_base + ".pickle", "wb") as result_file:
                cPickle.dump(time_series, result_file,
                             cPickle.HIGHEST_PROTOCOL)
        elif s_format in ["text", "csv"]:
            self.update_meta_data({
                "type": "stream",
                "marker_column": "marker"
            })
            with open(file_base + ".csv", "w") as result_file:
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(
                    time_series[0][0].channel_names)
                if s_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, label) in time_series:
                    if s_format == "text":
                        numpy.savetxt(result_file, data, delimiter=",",
                                      fmt=s_type)
                        if label is not None:
                            result_file.write(str(label))
                            result_file.flush()
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        marker = ""
                        if label is not None:
                            marker = str(label)
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            marker = str(data.marker_name)
                        for line in data:
                            csvwriter.writerow(list(line) + [marker])
                            # the marker is only attached to the first
                            # line of each window
                            marker = ""
                        result_file.flush()
        elif s_format in ["mat"]:
            # the first time series object provides the meta data;
            # the stored list itself is left untouched
            first_time_series = time_series[0][0]
            # collect all important information in the collection_object
            collection_object = {
                "sampling_frequency": first_time_series.sampling_frequency,
                "channel_names": first_time_series.channel_names
            }
            # merge all data
            merged_time_series = first_time_series
            for (data, _) in time_series[1:]:
                merged_time_series = numpy.vstack(
                    (merged_time_series, data))
            collection_object["data"] = merged_time_series
            mdict = dict()
            mdict[name + key_str] = collection_object
            import scipy.io
            with open(file_base + ".mat", "wb") as result_file:
                scipy.io.savemat(result_file, mdict=mdict)
        elif s_format in ["eeg"]:
            sf = None
            channel_names = None
            # NOTE(review): the data file is opened for appending, whereas
            # marker and header file are overwritten; storing twice to the
            # same directory leaves them inconsistent - confirm intent
            with open(file_base + ".eeg", "ab") as result_file:
                with open(file_base + ".vmrk", "w") as result_file_mrk:
                    result_file_mrk.write(
                        "Brain Vision Data Exchange Marker File, "
                        "Version 1.0\n")
                    result_file_mrk.write("; Data stored by pySPACE\n")
                    result_file_mrk.write("[Common Infos]\n")
                    result_file_mrk.write("Codepage=UTF-8\n")
                    result_file_mrk.write("DataFile=%s\n" %
                                          str(name + key_str + ".eeg"))
                    result_file_mrk.write("\n[Marker Infos]\n")

                    markerno = 1
                    datapoint = 1
                    for t in time_series:
                        if sf is None:
                            sf = t[0].sampling_frequency
                        if channel_names is None:
                            channel_names = t[0].get_channel_names()
                        # windows without markers may carry None
                        markers = t[0].marker_name or {}
                        for mrk in markers.keys():
                            for tm in markers[mrk]:
                                result_file_mrk.write(
                                    str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                        (markerno, mrk,
                                         datapoint + (tm * sf / 1000.0))))
                                markerno += 1
                        data_ = t[0].astype(numpy.int16)
                        data_.tofile(result_file)
                        datapoint += data_.shape[0]

            with open(file_base + ".vhdr", "w") as result_hdr:
                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                 str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                 str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" %
                                 len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" %
                                 (1000000 / sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [
                    u"0.1,\u03BCV",
                    u"0.5,\u03BCV",
                    u"10,\u03BCV",
                    u"152.6,\u03BCV"
                ]
                for i, channel_name in enumerate(channel_names):
                    result_hdr.write(
                        (u"Ch%d=%s,,%s\n" %
                         (i + 1, channel_name,
                          resolutions_str[0])).encode('utf-8'))

        last_time_series = time_series

    if last_time_series is not None:
        self.update_meta_data({
            "channel_names":
                copy.deepcopy(last_time_series[0][0].channel_names),
            "sampling_frequency": last_time_series[0][0].sampling_frequency
        })

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format="pickle"):
    """ Stores this collection in the directory *result_dir*.

    In contrast to *dump* this method stores the collection
    not in a single file but as a whole directory structure with meta
    information etc. The data sets are stored separately for each run,
    split, train/test combination. The file names start with the fixed
    prefix "time_series", followed by suffixes that encode split and
    train/test information.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            The format in which the actual data sets should be stored,
            either as a string or as a list ``[format, value_format]``.

            Possible formats are *pickle* (also *cpickle*, *cPickle*),
            *text*, *csv* and *mat* (MATLAB).

            In the MATLAB and text format, all time series objects are
            concatenated to a single large table.
            For the csv format comma separated values are taken as default
            or a specified Python format string.

            The MATLAB format is a struct that contains the data, the
            sampling frequency and the channel names.

            .. note:: For the text and MATLAB format, markers could be
                      added by using a Marker_To_Mux node before

            (*optional, default: "pickle"*)

    **Raises**

        :NotImplementedError:
            If the requested format is not one of the formats above.
            This is checked before anything is written.

    .. todo:: Put marker to the right time point and also write marker
              channel.
    """
    name = "time_series"
    if isinstance(s_format, list):
        s_type = s_format[1]
        s_format = s_format[0]
    else:
        s_type = "%.18e"
    if s_format in ["text", "matlab"]:
        s_type = "%i"
    if s_format == "csv" and s_type == "real":
        s_type = "%.18e"

    # Reject unknown formats before meta data or directories are touched
    supported_formats = ("pickle", "cpickle", "cPickle", "text", "csv", "mat")
    if s_format not in supported_formats:
        raise NotImplementedError("Using unavailable storage format:%s!"
                                  % s_format)

    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({"type": "time_series",
                           "storage_format": s_format,
                           "author": author,
                           "data_pattern": "data_run" + os.sep
                                           + name + "_sp_tt." + s_format})

    # The data set handled last provides channel names and sampling
    # frequency for the meta data
    last_time_series = None

    # Iterate through splits and runs in this dataset
    for key, time_series in self.data.iteritems():
        # load data, if necessary
        # (due to the lazy loading, the data might be not loaded already)
        if isinstance(time_series, basestring):
            time_series = self.get_data(key[0], key[1], key[2])
        if self.sort_string is not None:
            # NOTE(review): sort_string is evaluated as Python code and
            # must therefore come from a trusted specification
            time_series.sort(key=eval(self.sort_string))

        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)
        key_str = "_sp%s_%s" % key[1:]

        # Store data depending on the desired format
        if s_format in ["pickle", "cpickle", "cPickle"]:
            pickle_path = os.path.join(result_path,
                                       name + key_str + ".pickle")
            # HIGHEST_PROTOCOL is a binary protocol: write in binary mode
            with open(pickle_path, "wb") as result_file:
                cPickle.dump(time_series, result_file,
                             cPickle.HIGHEST_PROTOCOL)
        elif s_format in ["text", "csv"]:
            self.update_meta_data({"type": "stream",
                                   "marker_column": "marker"})
            csv_path = os.path.join(result_path, name + key_str + ".csv")
            with open(csv_path, "w") as result_file:
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(
                    time_series[0][0].channel_names)
                if s_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, label) in time_series:
                    has_marker_name = data.marker_name is not None \
                        and len(data.marker_name) > 0
                    if s_format == "text":
                        numpy.savetxt(result_file, data, delimiter=",",
                                      fmt=s_type)
                        if label is not None:
                            result_file.write(str(label))
                            result_file.flush()
                        elif has_marker_name:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        if label is not None:
                            marker = str(label)
                        elif has_marker_name:
                            marker = str(data.marker_name)
                        else:
                            marker = ""
                        for line in data:
                            csvwriter.writerow(list(line) + [marker])
                            # the marker is only attached to the first
                            # line of each window
                            marker = ""
                        result_file.flush()
        elif s_format in ["mat"]:
            # the first time series object provides the meta data;
            # the stored list itself is left untouched
            first_time_series = time_series[0][0]
            # collect all important information in the collection_object
            collection_object = {
                "sampling_frequency": first_time_series.sampling_frequency,
                "channel_names": first_time_series.channel_names}
            # merge all data
            merged_time_series = first_time_series
            for (data, _) in time_series[1:]:
                merged_time_series = numpy.vstack((merged_time_series,
                                                   data))
            collection_object["data"] = merged_time_series
            mdict = dict()
            mdict[name + key_str] = collection_object
            import scipy.io
            mat_path = os.path.join(result_path, name + key_str + ".mat")
            with open(mat_path, "wb") as result_file:
                scipy.io.savemat(result_file, mdict=mdict)

        last_time_series = time_series

    if last_time_series is not None:
        self.update_meta_data({
            "channel_names":
                copy.deepcopy(last_time_series[0][0].channel_names),
            "sampling_frequency": last_time_series[0][0].sampling_frequency
        })

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format="BrainVision"):
    """ Store the windows of this collection as BrainVision files

    For every (run, split, train/test) key of the data an .eeg data file,
    a .vmrk marker file and a .vhdr header file are written to the
    directory ``data_analyzer_run<run>`` below *result_dir*. Gaps before a
    window are filled with zeros, and one "Label" marker is written per
    window.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            Only "BrainVision" is supported. For any other value a
            critical message is logged and nothing is stored.

            (*optional, default: "BrainVision"*)
    """
    # Keep original file name, depends on the AnalyserSinkNode,
    # see its documentation.
    source_file_name = self.meta_data.get('eeg_src_file_name')
    keep_source_name = source_file_name is not None
    if keep_source_name:
        name = source_file_name
    # or use default name from this collection
    else:
        name = "Analyzer"

    if not s_format == "BrainVision":
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
        return

    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({
        "type": "only output of individual nodes stored",
        "storage_format": s_format,
        "author": author,
        "data_pattern": "Multiplexed"
    })
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)

    # End times of the windows accepted so far
    # NOTE(review): this list is shared by all keys and overlapping
    # windows are only reported, not removed, before writing - confirm
    # whether they should be dropped and the list reset per key
    slices = [0]

    for key, time_series in self.data.iteritems():
        # Sort the windows by their start time
        time_series.sort(key=lambda window: window[0].start_time)

        # Check for overlapping windows
        for ts in time_series:
            if ts[0].start_time >= slices[-1]:
                slices.append(ts[0].end_time)
            else:
                warnings.warn("Ignoring at least one overlapping window!",
                              UserWarning)

        # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
        result_path = result_dir + os.sep + "data_analyzer" \
            + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Keep original name or extend the default name by the key
        if keep_source_name:
            base_name = name
        else:
            base_name = name + key_str
        file_base = os.path.join(result_path, base_name)

        channel_names = []
        num_ch = 0
        sampling_int = 0
        with open(file_base + ".eeg", "wb") as result_file_eeg:
            with open(file_base + ".vmrk", "w") as result_file_mrk:
                # Write Marker header
                result_file_mrk.write(header_mrk % (base_name))

                result_file_ms = 0
                # Data for padding
                padding_bytes = None
                count_mrk = 2
                for ts in time_series:
                    if padding_bytes is None:
                        # the first window defines the channel layout
                        channel_names = ts[0].channel_names
                        num_ch = len(channel_names)
                        padding_bytes = numpy.zeros(
                            num_ch, dtype='int16').tostring()
                        sampling_int = 1000000 / ts[0].sampling_frequency

                    # Write Padding (zeros)
                    while result_file_ms < ts[0].start_time:
                        result_file_eeg.write(padding_bytes)
                        result_file_ms += ts[0]._samples_to_ms(1)
                    # Write window
                    ts[0].tofile(result_file_eeg)
                    result_file_ms += ts[0].end_time - ts[0].start_time
                    # Write Marker
                    result_file_mrk.write(
                        "Mk%d=Label,%s,%d,1,0\n" %
                        (count_mrk, ts[1],
                         ts[0]._ms_to_samples(ts[0].start_time)))
                    count_mrk += 1

        # WRITE HEADERFILE
        with open(file_base + ".vhdr", "w") as result_file_hdr:
            result_file_hdr.write(header_hdr % ((base_name), (base_name),
                                                num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,0.1,\xB5V\n" %
                                      (i + 1, channel_names[i]))
def consolidate(self):
    """ Consolidates the results obtained by the single processes

    The results are put into a consistent structure of collections that
    are stored on the file system:

    * author, date and a copy of the input meta data are added to every
      result dataset,
    * the number of runs is written to the meta data of datasets that
      consist of several runs,
    * if ``results_*`` entries exist, they are merged into one
      PerformanceResultSummary and, unless *compression* is ``False``,
      the result folders are zipped (or only removed, if *compression*
      is "delete") and all but one of them are deleted.
    """
    def archive_and_prune(allow_zip64):
        """ Zip all result folders and remove all but one of them """
        import zipfile
        folders = glob.glob(os.path.join(self.result_directory, "{*}"))
        if not self.compression == "delete":
            # The first attempt relies on the default zip mode
            zip_options = {"allowZip64": True} if allow_zip64 else {}
            save_file = zipfile.ZipFile(
                self.result_directory + '/result_folders.zip',
                mode="w", compression=self.compression, **zip_options)
            try:
                # we want to have the zipped file relative to the
                # result directory
                for folder in folders:
                    for node in os.walk(folder):
                        rel_path = os.path.relpath(node[0],
                                                   self.result_directory)
                        save_file.write(rel_path)
                        for data in node[2]:
                            save_file.write(os.path.join(rel_path, data))
            finally:
                save_file.close()
        # To still have an easy access to the history of the processing,
        # we keep one folder.
        for folder in folders[:-1]:
            shutil.rmtree(folder)

    # Consolidate the results
    directory_pattern = os.sep.join([self.result_directory, "{*"])
    dataset_pathes = glob.glob(directory_pattern)

    # For all collections found
    for dataset_path in dataset_pathes:
        # Load their meta_data
        meta_data = BaseDataset.load_meta_data(dataset_path)

        # Determine author and date
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            author = "unknown"
            self._log("Author could not be resolved.",
                      level=logging.WARNING)
        date = time.strftime("%Y%m%d_%H_%M_%S")

        # Update meta data and store it
        meta_data.update({"author": author, "date": date})
        BaseDataset.store_meta_data(dataset_path, meta_data)

        # Copy the input dataset specification file to the result
        # directory in order to make later analysis of
        # the results more easy
        input_meta_path = os.sep.join([pySPACE.configuration.storage,
                                       meta_data["input_collection_name"]])
        input_meta = BaseDataset.load_meta_data(input_meta_path)
        BaseDataset.store_meta_data(dataset_path, input_meta,
                                    file_name="input_metadata.yaml")

    # Check if some results consist of several runs
    # and update the meta data in this case
    # TODO: This is not a clean solution
    for dataset_dir in glob.glob(os.sep.join([self.result_directory, "*"])):
        if not os.path.isdir(dataset_dir):
            continue
        # There can be either run dirs, persistency dirs, or both of them.
        # Check of whichever there are more. If both exist, their numbers
        # are supposed to be equal.
        nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                                 "data_run*"])))
        nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                                 "persistency_run*"])))
        nr_runs = max(nr_run_dirs, nr_per_dirs)

        if nr_runs > 1:
            collection_meta = BaseDataset.load_meta_data(dataset_dir)
            collection_meta["runs"] = nr_runs
            BaseDataset.store_meta_data(dataset_dir, collection_meta)

    # If we don't create a feature vector or time series collection,
    # we evaluated our classification using a classification performance
    # sink. The resulting files should be merged to one csv tabular.
    pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
    if not pathlist:
        return

    # Do the consolidation the same way as for WekaClassificationOperation
    self._log("Consolidating results ...")
    # We load and store the results once into a PerformanceResultSummary
    # This does the necessary consolidation...
    self._log("Reading intermediate results...")
    result_collection = PerformanceResultSummary(
        dataset_dir=self.result_directory)
    self._log("done")
    self._log("Storing result collection")
    result_collection.store(self.result_directory)
    self._log("done")
    PerformanceResultSummary.merge_traces(self.result_directory)

    # The comparison with == is kept on purpose: a compression value of 0
    # is treated like False and also disables the zipping
    if not (self.compression == False):
        # Since we get one result summary,
        # we don't need the numerous folders.
        # So we zip them to make the whole folder more easy visible.
        cwd = os.getcwd()
        # The archive entries are addressed relative to the result directory
        os.chdir(self.result_directory)
        try:
            # If there are to many or to large folders, problems may occur.
            # This case we want to log, try 64 bit mode,
            # and then skip the zipping.
            try:
                archive_and_prune(allow_zip64=False)
            except Exception:
                self._log("Result files could not be compressed with 32 bit mode, switching to 64 bit mode.",
                          level=logging.CRITICAL)
                try:
                    archive_and_prune(allow_zip64=True)
                except Exception:
                    self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                              level=logging.CRITICAL)
        finally:
            os.chdir(cwd)
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates the processes which form an operation
    based on the information given in the operation specification, *operation_spec*.

    In debug mode this is done in serial. In the other default mode,
    at the moment 4 processes are created in parallel and can be immediately
    executed. So generation of processes and execution are made in parallel.
    This kind of process creation is done independently from the backend.

    For huge parameter spaces this is necessary!
    Otherwise numerous processes are created and corresponding data is loaded
    but the concept of spreading the computation to different processors
    can not really be used, because process creation is blocking only
    one processor and memory space, but nothing more is done,
    till the processes are all created.

    The node chain templates are taken from the ``templates`` entry of the
    specification or, if it is missing, from ``node_chain`` (or ``flow``).
    Templates given as file names are loaded from the ``node_chains``
    folder of the specification directory, and the file names are kept in
    ``template_files``. *operation_spec* is modified in place.

    **Raises**

        :Exception: If *input_paths* is empty.

    .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
              for empty data, when no input_path is given.
    """
    assert(operation_spec["type"] == "node_chain")

    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)

    ## Use node_chain parameter if no templates are given ##
    if "templates" not in operation_spec:
        if "node_chain" in operation_spec:
            operation_spec["templates"] = [operation_spec.pop("node_chain")]
        else:
            warnings.warn("Specify parameter 'templates' or 'node_chain' in your operation spec!")
            operation_spec["templates"] = [operation_spec.pop("flow")]
    elif "node_chain" in operation_spec:
        operation_spec.pop("node_chain")
        warnings.warn("node_chain parameter is ignored. Templates are used.")

    # load files in templates
    # (checked independently of the normalization above, so that file
    # names are resolved no matter which parameter provided them)
    if isinstance(operation_spec["templates"][0], str):
        operation_spec["template_files"] = \
            copy.deepcopy(operation_spec["templates"])
        for i, rel_node_chain_file in \
                enumerate(operation_spec["template_files"]):
            node_chain_path = os.sep.join([pySPACE.configuration.spec_dir,
                                           "node_chains",
                                           rel_node_chain_file])
            with open(node_chain_path, 'r') as abs_node_chain_file:
                # NOTE(review): yaml.load can construct arbitrary Python
                # objects; the template files have to be trusted
                node_chain = yaml.load(abs_node_chain_file)
            operation_spec["templates"][i] = node_chain

    storage = pySPACE.configuration.storage
    if not input_paths:
        raise Exception("No input datasets found in input_path %s in %s!"
                        % (operation_spec["input_path"], storage))

    # Get relative path
    rel_input_paths = [name[len(storage):] for name in input_paths]

    # Determine approximate number of runs
    if "runs" in operation_spec:
        runs = operation_spec["runs"]
    else:
        runs = max(
            BaseDataset.load_meta_data(
                storage + os.sep + dataset_dir).get('runs', 1)
            for dataset_dir in rel_input_paths)

    # Determine splits (taken from the first input dataset only)
    abs_collection_path = storage + os.sep + rel_input_paths[0]
    splits = BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)

    # Determine how many processes will be created
    number_processes = len(operation_spec["templates"]) * \
        len(parameter_settings) * len(rel_input_paths) * \
        runs * splits

    if debug:
        # To better debug creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, rel_input_paths)
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)

    # Create all processes by calling a recursive helper method in
    # another thread so that already created processes can be executed in
    # parallel. Therefore a queue is used which size is maximized to
    # guarantee that not to much objects are created (because this costs
    # memory). However, the actual number of 4 is arbitrary and might
    # be changed according to the system at hand.
    processes = processing.Queue(4)
    create_process = \
        processing.Process(target=cls._createProcesses,
                           args=(processes, result_directory,
                                 operation_spec, parameter_settings,
                                 rel_input_paths))
    create_process.start()
    # create and return the operation object
    return cls(processes, operation_spec, result_directory,
               number_processes, create_process)
def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
    """ Concatenate all datasets in source_dataset_pathes and store them in the target dataset

    The first dataset (in sorted order of the paths) is loaded and extended
    by the "test" data of all other datasets. The first element taken from
    each further dataset is flagged with ``specs["new_set"] = True``. If
    ``self.change_time`` is set, start and end times of the appended
    elements are shifted by the last end time of the data collected so far.

    **Parameters**

        :target_dataset_path:
            Directory in which the concatenated dataset is stored.

        :source_dataset_pathes:
            Paths of the datasets to concatenate. The list is sorted in
            place.
    """
    # sort the dataset
    source_dataset_pathes.sort()
    # load a first dataset, in which the data of all other datasets
    # is assembled
    target_dataset = BaseDataset.load(source_dataset_pathes[0])

    # Determine author and date
    try:
        author = getpass.getuser()
    except Exception:
        author = "Unknown"
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name (if there is one)
    target_dataset.meta_data.pop("node_chain_file_name", None)
    # Update meta data and store it
    params = target_dataset.meta_data.pop("parameter_setting")
    params["__INPUT_DATASET__"] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
    params["__RESULT_DIRECTORY__"] = self.result_directory
    target_dataset.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_dataset_path,
        "train_test": False,
        "parameter_setting": params,
        "changed_time": self.change_time,
        "input_dataset_name":
            source_dataset_pathes[0][len(pySPACE.configuration.storage):]
    })

    # Concatenate data of all other datasets to target dataset
    for source_dataset_path in source_dataset_pathes[1:]:
        source_dataset = BaseDataset.load(source_dataset_path)
        for run in source_dataset.get_run_numbers():
            for split in source_dataset.get_split_numbers():
                target_data = target_dataset.get_data(run, split, "test")

                if self.change_time:
                    # ensure sorted target_data
                    # TODO: encode this in meta data?
                    target_data.sort(key=lambda t: t[0].end_time)
                    last_end_time = target_data[-1][0].end_time

                # Mark the elements collected so far as continuation;
                # stop at the first element that is already flagged
                for ts, _ in target_data:
                    if ts.specs is None:
                        ts.specs = {"new_set": False}
                    elif "new_set" in ts.specs:
                        break
                    else:
                        ts.specs["new_set"] = False

                data = source_dataset.get_data(run, split, "test")

                if self.change_time:
                    # ensure sorted data
                    # TODO: encode this in meta data?
                    data.sort(key=lambda t: t[0].end_time)

                # flag the first element of the concatenated data list
                for i, (ts, _) in enumerate(data):
                    if ts.specs is None:
                        ts.specs = {"new_set": i == 0}
                    else:
                        ts.specs["new_set"] = (i == 0)
                    if self.change_time:
                        ts.start_time = last_end_time + ts.start_time
                        ts.end_time = last_end_time + ts.end_time

                # actual data is stored in a list that has to be extended
                target_data.extend(data)

    target_dataset.store(target_dataset_path)
def __call__(self):
    """ Executes this process on the respective modality

    Every input collection is used once as the single collection, and the
    data of all other input collections that fulfil
    ``self.collection_constraints`` is merged into its counterpart.
    The result is stored below ``self.result_directory`` in a collection
    named ``<name_pattern>_vs_<collection>`` (or
    ``<collection>_vs_<name_pattern>``, if ``self.reverse`` is set).
    Input collections without any matching counterpart are skipped.
    """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # Parameters that are not encoded in the name of the target collection
    hidden_params = ["__INPUT_DATASET__", "__RESULT_DIRECTORY__",
                     "__OUTPUT_BUNDLE__", "__INPUT_COLLECTION__"]

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check if the input data is splitted
        # e.g. only a single test file is in the source directory
        source_files = glob.glob(
            os.sep.join([source_test_collection_path, "data_run0",
                         "*test*"]))
        splitted = len(source_files) > 1
        assert (not splitted)
        source_file_name = str(source_files[-1])
        is_arff = source_file_name.endswith("arff")

        # check if train sets are also present
        train_data_present = len(glob.glob(
            os.sep.join([source_test_collection_path, "data_run0",
                         "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_" + self.name_pattern)
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                self.name_pattern + "_vs_" + test_base_collection_name)
            key = "test"

        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections
            # NOTE(review): the constraints are evaluated as Python code
            # and must therefore come from a trusted specification
            names = {
                'source_train_collection_name': source_train_collection_name,
                'source_test_collection_name': source_test_collection_name}
            if not all(eval(constraint_template % names)
                       for constraint_template
                       in self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            target_items = list(target_collection_params.items())
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_items
                 and param[0] not in hidden_params]
            # for old data
            for k, v in remaining_params:
                target_collection_path += "{%s#%s}" % (k, str(v))
                target_collection_params[k] = v

            if is_arff:
                source_train_file_path = \
                    os.sep.join([source_train_collection_path, "data_run0",
                                 "features_sp0_" + train_set_name_suffix
                                 + ".arff"])
            else:
                source_train_file_path = source_train_collection_path

            source_train_pathes.append(source_train_file_path)

        if len(source_train_pathes) == 0:
            continue

        create_directory(os.sep.join([target_collection_path, "data_run0"]))

        if is_arff:
            # Copy arff file from input collection to target collection.
            # The target paths are built here, after parameters may have
            # been appended to the target collection path above.
            source_test_file_path = os.sep.join([
                source_test_collection_path, "data_run0",
                "features_sp0_" + train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join([
                target_collection_path, "data_run0",
                "features_sp0_" + key + ".arff"])
            # NOTE(review): the merged data is written to the same file
            # name as the copied file - confirm that this is intended
            target_train_file_path = os.sep.join([
                target_collection_path, "data_run0",
                "features_sp0_" + key + ".arff"])

            self._copy_arff_file(source_test_file_path,
                                 target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)

            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        else:
            target_train_file_path = target_collection_path

            self._copy_file(source_test_collection_path,
                            target_collection_path,
                            train_set_name_suffix)

            self._merge_files(target_train_file_path,
                              source_train_pathes,
                              train_set_name_suffix,
                              target_collection_params)

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections):
    """ Create all node chain processes and put them into *processes*

    One :class:`NodeChainProcess` is created for every combination of
    template, parameter setting, input dataset, run and split that
    fulfills the optional ``old_parameter_constraints`` of the operation
    spec. When creation is finished (or has failed), ``False`` is put into
    the queue as end marker.

    **Parameters**

        :processes: Queue-like object; only its ``put`` method is used.
        :result_directory: Base directory for the result datasets.
        :operation_spec:
            Operation specification dictionary. The key
            ``hide_parameters`` is (re)written by this method.
        :parameter_settings: Iterable of parameter dictionaries.
        :input_collections:
            Dataset directories relative to ``pySPACE.configuration.storage``.
    """
    def check_constraint(constraint, parameters):
        """ Replace parameter names by their values and evaluate the result """
        for key, value in parameters.iteritems():
            constraint = constraint.replace(key, str(value))
        # NOTE(review): eval executes arbitrary text from the operation
        # spec; only use with trusted specification files.
        return eval(constraint)

    try:
        storage_format = operation_spec.get("storage_format")
        # Determine whether the node_chain should be stored after data
        # processing
        store_node_chain = operation_spec.get("store_node_chain", False)

        # Determine whether certain parameters should not be remembered
        hide_parameters = list(operation_spec.get("hide_parameters", []))
        hide_parameters.extend(["__INPUT_COLLECTION__",
                                "__INPUT_DATASET__",
                                "__RESULT_DIRECTORY__",
                                "__OUTPUT_BUNDLE__"])
        operation_spec["hide_parameters"] = hide_parameters

        # Create all combinations of collections, runs and splits
        collection_run_split_combinations = []
        for input_dataset_dir in input_collections:
            abs_collection_path = \
                pySPACE.configuration.storage + os.sep + input_dataset_dir
            # The meta data is read once per collection instead of once
            # per run, since it does not change in between
            collection_md = BaseDataset.load_meta_data(abs_collection_path)
            collection_runs = collection_md.get('runs', 1)
            collection_splits = collection_md.get('splits', 1)

            requested_runs = operation_spec.get("runs", collection_runs)
            assert collection_runs == requested_runs \
                or collection_runs == 1, \
                "Requested %s runs but input collection %s provides "\
                "data for %s runs." % (requested_runs, input_dataset_dir,
                                       collection_runs)

            for run in range(max(requested_runs, collection_runs)):
                for split in range(collection_splits):
                    collection_run_split_combinations.append(
                        (input_dataset_dir, run, split))

        # Shuffle order of dataset-run-split combinations. This should help
        # to avoid that all processes work on the same data which can cause
        # problems due to locking etc.
        random.shuffle(collection_run_split_combinations)

        templates = operation_spec["templates"]
        constraints = operation_spec.get('old_parameter_constraints', [])

        # enumerate (instead of list.index) keeps the template file lookup
        # correct even if two templates have identical content
        for template_index, node_chain_spec in enumerate(templates):
            # For all possible parameter instantiations of this template
            for parameter_setting in parameter_settings:
                # For all input collections-run combinations
                for input_dataset_dir, run, split in \
                        collection_run_split_combinations:
                    # We are going to change the parameter_setting and
                    # don't want to interfere with later runs so we work
                    # on a copy
                    parameter_setting_cp = copy.deepcopy(parameter_setting)

                    # Add input and output path to parameter setting
                    parameter_setting_cp["__INPUT_DATASET__"] = \
                        input_dataset_dir.split(os.sep)[-2]
                    parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                        result_directory
                    if len(templates) > 1:
                        parameter_setting_cp["__Template__"] = \
                            operation_spec["template_files"][template_index]

                    # Load the input meta data (freshly, because its
                    # parameter setting is modified below)
                    dataset_dir = os.sep.join(
                        [pySPACE.configuration.storage, input_dataset_dir])
                    dataset_md = BaseDataset.load_meta_data(dataset_dir)

                    # Add the input parameter's meta data to the given
                    # parameter setting
                    if "parameter_setting" in dataset_md:
                        dataset_md["parameter_setting"].update(
                            parameter_setting_cp)
                        all_parameters = dataset_md["parameter_setting"]
                    else:
                        all_parameters = parameter_setting_cp

                    if not all(check_constraint(constraint_def,
                                                all_parameters)
                               for constraint_def in constraints):
                        continue

                    # Determine directory in which the result of this
                    # process should be written
                    result_dataset_directory = \
                        NodeChainOperation._get_result_dataset_dir(
                            result_directory, input_dataset_dir,
                            parameter_setting_cp, hide_parameters)

                    # Create the respective process and put it to the
                    # executing-queue of processes
                    process = NodeChainProcess(
                        node_chain_spec=node_chain_spec,
                        parameter_setting=parameter_setting_cp,
                        rel_dataset_dir=input_dataset_dir,
                        run=run,
                        split=split,
                        storage_format=storage_format,
                        result_dataset_directory=result_dataset_directory,
                        store_node_chain=store_node_chain,
                        hide_parameters=hide_parameters)
                    processes.put(process)
    finally:
        # give executing process the sign that creation is now finished
        processes.put(False)
def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting,
                            hide_parameters):
    """ Determine, create and return the result directory of a dataset

    The directory name is built from the name of the input dataset and
    the parameter setting (merged with the parameter setting stored in the
    meta data of the input dataset). Parameters listed in
    *hide_parameters* do not appear in the name. Names that are too long
    are replaced by an md5 hash of the name.

    **Parameters**

        :base_dir: Directory in which the result directory is created.
        :input_dataset_dir:
            Input dataset directory, relative to
            ``pySPACE.configuration.storage``.
        :parameter_setting: Parameter dictionary (not modified, a copy is used).
        :hide_parameters: Parameter names to leave out of the directory name.
    """
    import errno
    import platform

    # String between key and value changed from ":" to "#",
    # because of problems in windows and with windows file servers
    def _get_result_dir_name(parameter_setting, hide_parameters,
                             method=None):
        """ internal function to create result dir name in different ways"""
        if not method:
            parameter_str = "}{".join(
                ("%s#%s" % (key, value))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        elif method == "hash":
            parameter_str = "}{".join(
                ("%s#%s" % (key, hash(str(value).replace(" ", ""))))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        else:
            raise ValueError("Unknown naming method: %s" % method)

        for unwanted in ("'", " ", "[", "]", os.sep):
            parameter_str = parameter_str.replace(unwanted, "")

        result_name = "{%s}" % input_name
        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        result_dir = base_dir + os.sep + result_name

        # If the name is too long (including an offset of 32 characters
        # for pyspace result csv naming conventions), an md5 hash of the
        # result name is used instead.
        if platform.system() == "Windows":
            # the maximum length for a filename on Windows is 255
            max_length = 255
        else:
            max_length = os.pathconf(os.curdir, "PC_NAME_MAX")
        if len(result_dir) > max_length - 32:
            result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
            result_dir = base_dir + os.sep + result_name
        return result_dir

    input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    input_name = input_name.strip("{}")
    # If the input is already the result of an operation
    if input_name.count("}{") > 0:
        input_name = input_name.split("}{")[0]

    # Load the input meta data
    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                               input_dataset_dir])
    dataset_md = BaseDataset.load_meta_data(dataset_dir)

    # We are going to change the parameter_setting and don't want to
    # interfere with later runs so we work on a copy
    parameter_setting = copy.deepcopy(parameter_setting)

    # Ignore pseudo parameter "__PREPARE_OPERATION__"
    parameter_setting.pop("__PREPARE_OPERATION__", None)

    # Add the input parameters meta data to the given parameter setting
    if "parameter_setting" in dataset_md:
        parameter_setting.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them
    # NOTE(review): eval executes the parameter value; only safe for
    # trusted specification files and meta data.
    for key, value in parameter_setting.iteritems():
        if isinstance(value, basestring) and value.count("'") > 1:
            parameter_setting[key] = eval(value)

    result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
    try:
        create_directory(result_dir)
    except OSError as e:
        # errno.ENAMETOOLONG instead of the literal 36, which is only
        # the Linux value of this error code
        if e.errno == errno.ENAMETOOLONG:
            result_dir = _get_result_dir_name(parameter_setting,
                                              hide_parameters, "hash")
            create_directory(result_dir)
        # NOTE(review): all other OSErrors are still ignored as before;
        # consider re-raising them.
    return result_dir
def consolidate(self, _=None):
    """ Consolidates the results obtained by the single processes into a
    consistent structure of collections that are stored on the file system.

    For every ``{*`` directory in the result directory the meta data is
    updated (author, date, number of runs) and the meta data of the input
    dataset is copied to ``input_metadata.yaml``. If ``results_*`` files
    exist, they are merged via :class:`PerformanceResultSummary` and, if
    ``self.compression`` is set, the ``{*}`` folders are zipped to
    ``result_folders.zip`` and all but one of them are deleted.
    """
    import zipfile

    def _archive_result_folders(**zip_options):
        """ Zip all "{*}" result folders and delete all but one of them """
        pathlist = glob.glob(os.path.join(self.result_directory, "{*}"))
        if not self.compression == "delete":
            save_file = zipfile.ZipFile(
                self.result_directory + '/result_folders.zip',
                mode="w", compression=self.compression, **zip_options)
            try:
                # we want to have the zipped file relative to the
                # result directory
                for path in pathlist:
                    for node in os.walk(path):
                        rel_path = os.path.relpath(node[0],
                                                   self.result_directory)
                        save_file.write(rel_path)
                        for data in node[2]:
                            save_file.write(os.path.join(rel_path, data))
            finally:
                # always release the file handle, also on failure
                save_file.close()
        # To still have an easy access to the history of the
        # processing, we keep one folder (if there is any).
        if pathlist:
            pathlist.pop()
        for path in pathlist:
            shutil.rmtree(path)

    # Consolidate the results
    directory_pattern = os.sep.join([self.result_directory, "{*"])
    dataset_pathes = glob.glob(directory_pattern)

    # For all collections found
    for dataset_path in dataset_pathes:
        try:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date, update meta data
            author = get_author()
            date = time.strftime("%Y%m%d_%H_%M_%S")
            meta_data.update({"author": author, "date": date})

            # There can be either run dirs, persistency dirs, or both of
            # them. Check of whichever there are more. If both exist,
            # their numbers are supposed to be equal.
            nr_run_dirs = len(
                glob.glob(os.path.join(dataset_path, "data_run*")))
            nr_per_dirs = len(
                glob.glob(os.path.join(dataset_path, "persistency_run*")))
            nr_runs = max(nr_run_dirs, nr_per_dirs)
            if nr_runs > 1:
                meta_data["runs"] = nr_runs

            # Store the metadata
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of the results
            # more easy.
            # A leading separator is removed from the input collection
            # name, because otherwise os.path.join would treat it as an
            # absolute path.
            input_collection_name = meta_data["input_collection_name"]
            if input_collection_name.startswith(os.sep):
                input_collection_name = input_collection_name[1:]
            input_meta_path = os.path.join(pySPACE.configuration.storage,
                                           input_collection_name)
            try:
                input_meta = BaseDataset.load_meta_data(input_meta_path)
                BaseDataset.store_meta_data(
                    dataset_path, input_meta,
                    file_name="input_metadata.yaml")
            except (IOError, OSError) as e:
                # format the exception itself: e.message is empty for
                # errno-style IOError/OSError
                self._log("Error copying the input_metadata.yaml: {error}".
                          format(error=e), level=logging.CRITICAL)
        except Exception as e:
            logging.getLogger("%s" % self).exception(
                "Error updating the metadata: {error!s}".format(error=e))
            # bare raise keeps the original traceback
            raise

    # If we don't create a feature vector or time series collection,
    # we evaluated our classification using a classification performance
    # sink. The resulting files should be merged to one csv tabular.
    pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
    if len(pathlist) > 0:
        # Do the consolidation the same way as for
        # WekaClassificationOperation
        self._log("Consolidating results ...")
        # We load and store the results once into a
        # PerformanceResultSummary. This does the necessary consolidation.
        self._log("Reading intermediate results...")
        try:
            result_collection = PerformanceResultSummary(
                dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)
        except Exception as e:
            logging.getLogger("%s" % self).exception(
                "Error merging the result collection: {error!s}".format(
                    error=e))

        if self.compression:
            # Since we get one result summary, we don't need the numerous
            # folders. So we zip them to make the whole folder more easy
            # visible.
            cwd = os.getcwd()
            os.chdir(self.result_directory)
            try:
                # If there are too many or too large folders, problems
                # may occur. This case we want to log, try 64 bit mode,
                # and then skip the zipping.
                try:
                    _archive_result_folders()
                except Exception:
                    self._log("Result files could not be compressed with 32"
                              + " bit mode, switching to 64 bit mode",
                              level=logging.CRITICAL)
                    try:
                        _archive_result_folders(allowZip64=True)
                    except Exception:
                        self._log(
                            "64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                            level=logging.CRITICAL)
            finally:
                # restore the working directory in any case
                os.chdir(cwd)
def __call__(self):
    """ Executes this process on the respective modality

    For every input collection a target collection is created in the
    result directory that contains the data of this collection on one
    side and the merged data of all other admissible input collections
    on the other side ("Rest_vs_<name>", or "<name>_vs_Rest" if
    ``self.reverse`` is set). Only arff and pickle files are supported,
    otherwise a NotImplementedError is raised.
    """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check if the input data is splitted
        # e.g. only a single test file is in the source directory
        source_files = glob.glob(os.sep.join([source_test_collection_path,
                                              "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])

        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_Rest")
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                "Rest_vs_" + test_base_collection_name)
            key = "test"

        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection.
            # The file name needs the "_" after "features_sp0", like
            # all other arff paths built in this method.
            source_test_file_path = os.sep.join(
                [source_test_collection_path, "data_run0",
                 "features_sp0_" + train_set_name_suffix + ".arff"])
            # NOTE(review): this path is built before
            # target_collection_path is possibly extended below -
            # confirm that this is intended.
            target_test_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        elif source_file_name.endswith("pickle"):
            file_ending = "pickle"
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in " \
                                      "MergeOperation")
        # From here on, file_ending is either "arff" or "pickle"
        is_arff = "arff" == file_ending

        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections
            # NOTE(review): eval executes the constraint text; only use
            # with trusted operation specifications.
            if not all(eval(constraint_template % \
                    {'source_train_collection_name':
                         source_train_collection_name,
                     'source_test_collection_name':
                         source_test_collection_name})
                    for constraint_template in
                    self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items() \
                 if param not in target_collection_params.items() and \
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__",
                                  "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"]]  # for old data
            for k, v in remaining_params:
                target_collection_path += "{%s#%s}" % (k, str(v))
                target_collection_params[k] = v

            if is_arff:
                source_train_file_path = \
                    os.sep.join([source_train_collection_path,
                                 "data_run0", "features_sp0_" + \
                                 train_set_name_suffix + ".arff"])
            else:
                source_train_file_path = source_train_collection_path

            source_train_pathes.append(source_train_file_path)

        if len(source_train_pathes) == 0:
            continue

        if is_arff:
            target_train_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        else:
            target_train_file_path = target_collection_path

        create_directory(os.sep.join([target_collection_path,
                                      "data_run0"]))

        if is_arff:
            self._copy_arff_file(source_test_file_path,
                                 target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)

            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        else:
            self._copy_pickle_file(source_test_collection_path,
                                   target_collection_path,
                                   train_set_name_suffix)

            self._merge_pickle_files(target_train_file_path,
                                     source_train_pathes,
                                     train_set_name_suffix,
                                     target_collection_params)

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting,
                            hide_parameters):
    """ Determine, create and return the result directory of a dataset

    The directory name is built from the name of the input dataset and
    the parameter setting (merged with the parameter setting stored in the
    meta data of the input dataset). Parameters listed in
    *hide_parameters* do not appear in the name. Names that are too long
    are replaced by an md5 hash of the name.

    **Parameters**

        :base_dir: Directory in which the result directory is created.
        :input_dataset_dir:
            Input dataset directory, relative to
            ``pySPACE.configuration.storage``.
        :parameter_setting: Parameter dictionary (not modified, a copy is used).
        :hide_parameters: Parameter names to leave out of the directory name.
    """
    import errno
    import platform

    # String between key and value changed from ":" to "#",
    # because of problems in windows and with windows file servers
    def _get_result_dir_name(parameter_setting, hide_parameters,
                             method=None):
        """ internal function to create result dir name in different ways"""
        if not method:
            parameter_str = "}{".join(
                ("%s#%s" % (key, value))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        elif method == "hash":
            parameter_str = "}{".join(
                ("%s#%s" % (key, hash(str(value).replace(' ', ''))))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        else:
            raise ValueError("Unknown naming method: %s" % method)

        for unwanted in ("'", " ", "[", "]", os.sep):
            parameter_str = parameter_str.replace(unwanted, "")

        result_name = "{%s}" % input_name
        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        result_dir = base_dir + os.sep + result_name

        # If the name is too long (including an offset of 32 characters
        # for pyspace result csv naming conventions), an md5 hash of the
        # result name is used instead.
        if platform.system() == "Windows":
            # the maximum length for a filename on Windows is 255
            max_length = 255
        else:
            max_length = os.pathconf(os.curdir, 'PC_NAME_MAX')
        if len(result_dir) > max_length - 32:
            result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
            result_dir = base_dir + os.sep + result_name
        return result_dir

    input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    input_name = input_name.strip("{}")
    # If the input is already the result of an operation
    if input_name.count("}{") > 0:
        input_name = input_name.split("}{")[0]

    # Load the input meta data
    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                               input_dataset_dir])
    dataset_md = BaseDataset.load_meta_data(dataset_dir)

    # We are going to change the parameter_setting and don't want to
    # interfere with later runs so we work on a copy
    parameter_setting = copy.deepcopy(parameter_setting)

    # Ignore pseudo parameter "__PREPARE_OPERATION__"
    parameter_setting.pop("__PREPARE_OPERATION__", None)

    # Add the input parameters meta data to the given parameter setting
    if "parameter_setting" in dataset_md:
        parameter_setting.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them
    # NOTE(review): eval executes the parameter value; only safe for
    # trusted specification files and meta data.
    for key, value in parameter_setting.iteritems():
        if isinstance(value, basestring) and value.count("'") > 1:
            parameter_setting[key] = eval(value)

    result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
    try:
        create_directory(result_dir)
    except OSError as e:
        # errno.ENAMETOOLONG instead of the literal 36, which is only
        # the Linux value of this error code
        if e.errno == errno.ENAMETOOLONG:
            result_dir = _get_result_dir_name(parameter_setting,
                                              hide_parameters, "hash")
            create_directory(result_dir)
        # NOTE(review): all other OSErrors are still ignored as before;
        # consider re-raising them.
    return result_dir
def create(cls, operation_spec, result_directory, debug=False,
           input_paths=None):
    """ A factory method that creates the processes which form an operation
    based on the information given in the operation specification,
    *operation_spec*.

    In debug mode this is done in serial. In the other default mode,
    at the moment 4 processes are created in parallel and can be
    immediately executed. So generation of processes and execution are
    made in parallel. This kind of process creation is done independently
    from the backend.

    For huge parameter spaces this is necessary!
    Otherwise numerous processes are created and corresponding data is
    loaded but the concept of spreading the computation to different
    processors can not really be used, because process creation is
    blocking only one processor and memory space, but nothing more is
    done, till the processes are all created.

    **Parameters**

        :operation_spec:
            Operation specification dictionary of type "node_chain".
            It is modified in place (templates are loaded from file,
            a ``node_chain`` entry is moved to ``templates`` or removed).
        :result_directory: Directory the results are written to.
        :debug: If True, all processes are created before returning.
        :input_paths:
            Absolute paths of the input datasets, each starting with
            ``pySPACE.configuration.storage``. An exception is raised
            if none are given.

    .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
              for empty data, when no input_path is given.
    """
    assert (operation_spec["type"] == "node_chain")

    # Determine all parameter combinations that should be tested
    parameter_settings = cls._get_parameter_space(operation_spec)

    if "templates" not in operation_spec:
        ## Use node_chain parameter if no templates are given ##
        if "node_chain" in operation_spec:
            operation_spec["templates"] = [
                operation_spec.pop("node_chain")
            ]
        else:
            warnings.warn(
                "Specify parameter 'templates' or 'node_chain' in your operation spec!"
            )
    else:
        if "node_chain" in operation_spec:
            operation_spec.pop("node_chain")
            warnings.warn(
                "node_chain parameter is ignored. Templates are used.")
        # Load files in templates. This is done for all given templates,
        # also if an (ignored) node_chain parameter was specified.
        if isinstance(operation_spec["templates"][0], str):
            operation_spec["template_files"] = \
                copy.deepcopy(operation_spec["templates"])
            for i, rel_node_chain_file in \
                    enumerate(operation_spec["template_files"]):
                abs_node_chain_file_name = os.sep.join([
                    pySPACE.configuration.spec_dir, "node_chains",
                    rel_node_chain_file
                ])
                with open(abs_node_chain_file_name, "r") as read_file:
                    # the raw file content is used as template
                    operation_spec["templates"][i] = read_file.read()

    storage = pySPACE.configuration.storage
    if not input_paths:
        raise Exception("No input datasets found in input_path %s in %s!" %
                        (operation_spec.get("input_path"), storage))

    # Get relative path
    rel_input_paths = [name[len(storage):] for name in input_paths]

    # Determine approximate number of runs
    if "runs" in operation_spec:
        runs = operation_spec["runs"]
    else:
        runs = max(
            BaseDataset.load_meta_data(
                pySPACE.configuration.storage + os.sep + dataset_dir
            ).get('runs', 1)
            for dataset_dir in rel_input_paths)

    # Determine splits (taken from the first input dataset only)
    abs_collection_path = \
        pySPACE.configuration.storage + os.sep + rel_input_paths[0]
    splits = BaseDataset.load_meta_data(abs_collection_path).get(
        'splits', 1)

    # Determine how many processes will be created
    number_processes = len(operation_spec["templates"]) * \
        len(parameter_settings) * len(rel_input_paths) * \
        runs * splits

    if debug:
        # To better debug creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, operation_spec,
                             parameter_settings, rel_input_paths)
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
    else:
        # Create all processes by calling a helper method in another
        # process so that already created processes can be executed in
        # parallel. Therefore a queue is used which size is limited to
        # guarantee that not too many objects are created (because this
        # costs memory). However, the actual number of 4 is arbitrary
        # and might be changed according to the system at hand.
        processes = processing.Queue(4)
        create_process = processing.Process(
            target=cls._createProcesses,
            args=(processes, result_directory, operation_spec,
                  parameter_settings, rel_input_paths))
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)
def __call__(self):
    """ Executes this process on the respective modality

    For every admissible pair of input datasets, a target dataset
    "<dataset1>_vs_<dataset2>" is created in the result directory, that
    uses the data of the first dataset for training and the data of the
    second dataset for testing.
    """
    def transfer(as_arff, source_file, target_file, base_name, mixed_name):
        """ Copy an arff file with adapted name or link any other file """
        if as_arff:
            self._copy_arff_file(source_file, target_file,
                                 base_name, mixed_name)
        else:
            os.symlink(source_file, target_file)

    ############## Prepare benchmarking ##############
    super(ShuffleProcess, self).pre_benchmarking()

    for dataset_dir1 in self.input_datasets:
        for dataset_dir2 in self.input_datasets:
            dataset_name1 = dataset_dir1.split(os.sep)[-2]
            dataset_name2 = dataset_dir2.split(os.sep)[-2]

            # The input data counts as split, if there is more than one
            # entry in the first run directory
            run_dir_content = glob.glob(
                os.sep.join([dataset_dir1, "data_run0", "*"]))
            splitted = len(run_dir_content) > 1

            # Skip this pair of input datasets, if one of the
            # constraints is not fulfilled
            names = {'dataset_name1': dataset_name1,
                     'dataset_name2': dataset_name2}
            if not all(eval(constraint_template % names)
                       for constraint_template in self.dataset_constraints):
                continue

            if dataset_name1 == dataset_name2:
                if splitted:
                    # Link the data of the dataset into the result
                    # directory
                    os.symlink(
                        dataset_dir1,
                        os.sep.join([self.result_directory, dataset_name1]))
                continue

            # Names of the original data sets the input datasets are
            # based on
            base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
            base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

            # Name and directory of the target dataset
            mixed_base_dataset = "%s_vs_%s" % (base_dataset1, base_dataset2)
            target_dataset_name = dataset_name1.replace(
                base_dataset1, mixed_base_dataset)
            target_dataset_dir = os.sep.join(
                [self.result_directory, target_dataset_name])
            create_directory(os.sep.join([target_dataset_dir, "data_run0"]))

            if splitted:
                # For each split, take the train data from dataset 1 and
                # the test data from dataset 2
                # TODO: We have $n$ train sets and $n$ test sets, we
                #       could use all $n*n$ combinations
                train_pattern = os.sep.join(
                    [dataset_dir1, "data_run0", "*_sp*_train.*"])
                for source_train_file_name in glob.glob(train_pattern):
                    is_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(is_arff, source_train_file_name,
                             target_train_file_name,
                             base_dataset1, mixed_base_dataset)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2).replace(
                        "train.", "test.")
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(is_arff, source_test_file_name,
                             target_test_file_name,
                             base_dataset2, mixed_base_dataset)
            else:
                # The test data of dataset 1 becomes the training set and
                # the test data of dataset 2 remains the test set
                test_pattern = os.sep.join(
                    [dataset_dir1, "data_run0", "*_sp*_test.*"])
                for source_train_file_name in glob.glob(test_pattern):
                    is_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        "test.", "train.").replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(is_arff, source_train_file_name,
                             target_train_file_name,
                             base_dataset1, mixed_base_dataset)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2)
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(is_arff, source_test_file_name,
                             target_test_file_name,
                             base_dataset2, mixed_base_dataset)

            # Write metadata.yaml based on the meta data of dataset 1
            output_dataset_meta = dict(
                BaseDataset.load_meta_data(dataset_dir1))
            output_dataset_meta['train_test'] = True
            output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
            output_dataset_meta['author'] = get_author()
            BaseDataset.store_meta_data(target_dataset_dir,
                                        output_dataset_meta)

    ############## Clean up after benchmarking ##############
    super(ShuffleProcess, self).post_benchmarking()
def store(self, result_dir, s_format=["pickle", "real"]):
    """ store the collection in *result_dir*

    **Parameters**

        :result_dir: Directory in which the data is stored.
        :s_format:
            Storage format, either a string or a list/tuple whose first
            entry is the format name ("pickle" or "csv").
            Unsupported formats are logged and replaced by "pickle".
    """
    name = "predictions"

    # The format may be given as list (like the default) or as string.
    # Only the first list entry names the file format.
    if isinstance(s_format, (list, tuple)):
        format_name = s_format[0] if s_format else None
        # copy, such that the default argument is never shared with
        # the meta data
        stored_format = list(s_format)
    else:
        format_name = s_format
        stored_format = s_format

    if format_name not in ["csv", "arff", "pickle"]:
        self._log("Storage format not supported! Using default.",
                  level=logging.ERROR)
        format_name = "pickle"
        stored_format = "pickle"
    # NOTE(review): "arff" is accepted, but no arff writer is
    # implemented below, so only the meta data is stored in that case.

    # Update the meta data (after the format check, such that the meta
    # data describes the files that are really written)
    author = get_author()
    self.update_meta_data({
        "type": "prediction_vector",
        "storage_format": stored_format,
        "author": author,
        "data_pattern": "data_run" + os.sep + name + "_sp_tt." + format_name
    })

    for key, prediction_vectors in self.data.iteritems():
        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Store data depending on the desired format
        if format_name == "pickle":
            # binary mode is required for the binary pickle protocol
            with open(os.path.join(result_path,
                                   name + key_str + ".pickle"),
                      "wb") as result_file:
                cPickle.dump(prediction_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
        elif format_name == "csv":
            # Write as Comma Separated Value
            num_predictors = self.meta_data["num_predictors"]
            with open(os.path.join(result_path, name + key_str + ".csv"),
                      "w") as result_file:
                if num_predictors == 1:
                    result_file.write(
                        "Predicted Label, Prediction Score, True Label \n")
                    for pv in prediction_vectors:
                        result_file.write(
                            "%s, %s, %s\n" % (pv[0].label[0],
                                              pv[0].prediction[0],
                                              pv[1]))
                else:
                    # we begin by dealing with the header of the csv file
                    base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                    base_result = "%(label)s, %(score)s,"
                    header = "".join(base_header % dict(index=i + 1)
                                     for i in range(num_predictors))
                    result_file.write(header + "True Label\n")

                    # and now we can write each of the prediction
                    # vectors in turn
                    for pv in prediction_vectors:
                        result = "".join(
                            base_result % dict(label=pv[0].label[i],
                                               score=pv[0].prediction[i])
                            for i in range(num_predictors))
                        result_file.write(result + str(pv[1]) + "\n")

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def _merge_files(self, target_collection_path, source_collection_pathes,
                 train_set_name_suffix, target_collection_params):
    """ Merge all collections in source_collection_pathes and store them \
        in the target collection

    The first source collection is loaded and used as container; the data
    of all remaining source collections is appended to it run by run and
    split by split.

    **Parameters**

        :target_collection_path:
            Path of the dataset, in which the data of all other datasets
            is assembled.

        :source_collection_pathes:
            Paths of the datasets to be merged.

        :train_set_name_suffix:
            Either 'train' or 'test'. Specifies if datasets are merged for
            training or testing.

        :target_collection_params:
            Dictionary with all the parameters of the target dataset.
            It is modified in place (``__INPUT_DATASET__`` and
            ``__RESULT_DIRECTORY__`` are set).
    """
    # load a first collection, in which the data of all other collections
    # is assembled
    target_collection = BaseDataset.load(source_collection_pathes[0])
    author = get_author()
    date = time.strftime("%Y%m%d_%H_%M_%S")
    # Delete node_chain file name (if there is one)
    target_collection.meta_data.pop("node_chain_file_name", None)
    # Update meta data and store it
    k = "test" if self.reverse else "train"
    target_collection_params["__INPUT_DATASET__"][k] = \
        [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
    target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
    target_collection.meta_data.update({
        "author": author,
        "date": date,
        "dataset_directory": target_collection_path,
        "train_test": True,
        "parameter_setting": target_collection_params,
        "input_dataset_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
    })

    # merge data of all other collections to target collection
    for source_collection_path in source_collection_pathes[1:]:
        source_collection = BaseDataset.load(source_collection_path)
        for run in source_collection.get_run_numbers():
            for split in source_collection.get_split_numbers():
                target_data = target_collection.get_data(
                    run, split, train_set_name_suffix)
                if self.set_flag:
                    # Mark everything collected so far as "not the start of
                    # a new set". As soon as an element already carries the
                    # flag, the list was handled in an earlier merge step.
                    for ts, _ in target_data:
                        if ts.specs is None:
                            ts.specs = {"new_set": False}
                        elif "new_set" in ts.specs:
                            break
                        else:
                            ts.specs["new_set"] = False

                data = source_collection.get_data(run, split,
                                                  train_set_name_suffix)
                if self.set_flag:
                    for i, (ts, _) in enumerate(data):
                        # flag first element of the concatenated data list
                        if ts.specs is None:
                            ts.specs = {"new_set": i == 0}
                        else:
                            ts.specs["new_set"] = (i == 0)
                # actual data is stored in a list that has to be extended
                target_data.extend(data)

    # if only test data was given, the "Rest_vs" collection is stored as
    # training data
    if not self.reverse and "test" == train_set_name_suffix:
        # exchange the "test" in key tuple to "train" before storing;
        # iterate over a snapshot since the dictionary is changed
        for key in list(target_collection.data.keys()):
            assert ("test" == key[2])
            value = target_collection.data.pop(key)
            target_collection.data[(key[0], key[1], "train")] = value

    # we store the data in the same format as before
    target_collection.store(target_collection_path,
                            target_collection.meta_data["storage_format"])
def __call__(self):
    """ Executes this process on the respective modality

    For every ordered pair of different input datasets that fulfils all
    dataset constraints, a new dataset ``<base1>_vs_<base2>`` is created in
    the result directory. It takes its training files from the first and its
    test files from the second dataset. Arff files are handed to
    ``self._copy_arff_file``, all other files are symlinked.
    """
    def transfer(source_file, target_file, source_base, target_base,
                 is_arff):
        """ Hand arff files to _copy_arff_file, symlink everything else """
        if is_arff:
            self._copy_arff_file(source_file, target_file,
                                 source_base, target_base)
        else:
            os.symlink(source_file, target_file)

    ############## Prepare benchmarking ##############
    super(ShuffleProcess, self).pre_benchmarking()

    for dataset_dir1 in self.input_datasets:
        for dataset_dir2 in self.input_datasets:
            dataset_name1 = dataset_dir1.split(os.sep)[-2]
            dataset_name2 = dataset_dir2.split(os.sep)[-2]

            # Check if the input data is split
            splitted = len(glob.glob(os.sep.join(
                [dataset_dir1, "data_run0", "*"]))) > 1

            # Check that all constraints are fulfilled for this pair of
            # input datasets
            # NOTE(review): the constraints are evaluated with eval(); they
            # must come from a trusted operation specification.
            names = {'dataset_name1': dataset_name1,
                     'dataset_name2': dataset_name2}
            if not all(eval(constraint_template % names)
                       for constraint_template in self.dataset_constraints):
                continue

            if dataset_name1 == dataset_name2:
                if splitted:
                    # Copy the data
                    os.symlink(dataset_dir1,
                               os.sep.join([self.result_directory,
                                            dataset_name1]))
                continue

            # Determine names of the original data sets the input
            # datasets are based on
            base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
            base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

            # Determine target dataset name and create directory
            # for it
            mixed_base_dataset = "%s_vs_%s" % (base_dataset1, base_dataset2)
            target_dataset_name = dataset_name1.replace(base_dataset1,
                                                        mixed_base_dataset)
            target_dataset_dir = os.sep.join([self.result_directory,
                                              target_dataset_name])
            create_directory(os.sep.join([target_dataset_dir, "data_run0"]))

            if splitted:
                # For each split, copy the train data from dataset 1 and
                # the test data from dataset 2 to the target dataset
                # TODO: We have $n$ train sets and $n$ test sets, we
                #       could use all $n*n$ combinations
                for source_train_file_name in glob.glob(os.sep.join(
                        [dataset_dir1, "data_run0", "*_sp*_train.*"])):
                    is_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(source_train_file_name, target_train_file_name,
                             base_dataset1, mixed_base_dataset, is_arff)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2)
                    source_test_file_name = source_test_file_name.replace(
                        "train.", "test.")
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(source_test_file_name, target_test_file_name,
                             base_dataset2, mixed_base_dataset, is_arff)
            else:
                # Use the data set from dataset 1 as training set and
                # the data set from dataset 2 as test data
                for source_train_file_name in glob.glob(os.sep.join(
                        [dataset_dir1, "data_run0", "*_sp*_test.*"])):
                    is_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        "test.", "train.")
                    target_train_file_name = target_train_file_name.replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(source_train_file_name, target_train_file_name,
                             base_dataset1, mixed_base_dataset, is_arff)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2)
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(source_test_file_name, target_test_file_name,
                             base_dataset2, mixed_base_dataset, is_arff)

            # Write metadata.yaml based on input meta data
            input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)
            output_dataset_meta = dict(input_dataset1_meta)
            output_dataset_meta['train_test'] = True
            output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
            try:
                output_dataset_meta['author'] = \
                    pwd.getpwuid(os.getuid())[4]
            except Exception:
                # best effort only: the author is not essential
                self._log("Author could not be resolved.",
                          level=logging.WARNING)
                output_dataset_meta['author'] = "unknown"
            BaseDataset.store_meta_data(target_dataset_dir,
                                        output_dataset_meta)

    ############## Clean up after benchmarking ##############
    super(ShuffleProcess, self).post_benchmarking()
def store(self, result_dir, s_format = "BrainVision"):
    """ Store the collected time series windows in BrainVision format

    For every key in ``self.data`` a data file (.eeg), a marker file (.vmrk)
    and a header file (.vhdr) are written to
    ``<result_dir>/data_analyzer_run<run>``. Gaps between consecutive
    windows are filled with zeros in the data file.

    **Parameters**

        :result_dir:
            Directory in which the files are stored.

        :s_format:
            Only "BrainVision" is supported. For any other value a critical
            message is logged and nothing is stored.

            (*optional, default: "BrainVision"*)
    """
    self.merged = False
    # Is used to scale up the eeg sample values. The data samples are
    # converted to int16 when saving, so scaling is necessary to maintain
    # the resolution.
    scale = 10.0

    # Keep original file name, depends on the AnalyserSinkNode,
    # see its documentation.
    keep_src_name = ('eeg_src_file_name' in self.meta_data and
                     self.meta_data['eeg_src_file_name'] is not None)
    if keep_src_name:
        name = self.meta_data['eeg_src_file_name']
    else:
        # or use default name from this collection
        name = "Analyzer"

    if not s_format == "BrainVision":
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
        return

    # Update the meta data
    author = get_author()
    self.update_meta_data({"type": "only output of individual nodes stored",
                           "storage_format": s_format,
                           "author": author,
                           "data_pattern": "Multiplexed"})
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)

    # end times of the accepted (non-overlapping) windows; deliberately
    # shared between all keys as before
    slices = [0]
    channel_names = []

    for key, time_series in self.data.items():
        # Sort the time series windows by their start time
        time_series.sort(key=lambda window: window[0].start_time)

        # Check for overlapping windows.
        # NOTE(review): despite the warning text, overlapping windows are
        # only reported here; they are still written below.
        for window in time_series:
            if window[0].start_time >= slices[-1]:
                slices.append(window[0].end_time)
            else:
                warnings.warn("Ignoring at least one overlapping window!",
                              UserWarning)

        # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
        result_path = result_dir + os.sep + "data_analyzer" \
            + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Keep original name or use default name from this collection
        file_stem = name if keep_src_name else name + key_str

        result_file_eeg = open(os.path.join(result_path,
                                            file_stem + ".eeg"), "wb")
        result_file_mrk = open(os.path.join(result_path,
                                            file_stem + ".vmrk"), "w")
        try:
            # Write Marker header
            result_file_mrk.write(header_mrk % file_stem)

            result_file_ms = 0
            # Data for padding
            padding = None
            count_mrk = 2
            num_ch = 0
            sampling_int = 0

            for ts in time_series:
                window = ts[0]
                ts0 = window * scale
                ts0 = ts0.astype(numpy.int16)

                # "is None" is required: comparing a numpy array with
                # "==" is done element-wise
                if padding is None:
                    padding = numpy.zeros(len(window.channel_names),
                                          dtype='int16')
                    num_ch = len(window.channel_names)
                    channel_names = window.channel_names
                    sampling_int = 1000000/window.sampling_frequency

                # Write Padding (zeros)
                while result_file_ms < \
                        window.start_time - sampling_int/1000.0:
                    result_file_eeg.write(padding.tostring())
                    result_file_ms += window._samples_to_ms(1)
                # Write window
                ts0.tofile(result_file_eeg)
                result_file_ms += window.end_time - \
                    (window.start_time - sampling_int/1000.0)

                # Write Marker
                if len(window.marker_name) > 0:
                    markers = [(mk_name, mk_time)
                               for mk_name, mk_times
                               in window.marker_name.items()
                               for mk_time in mk_times]
                    markers.sort(key=lambda tup: tup[1])
                    for mk_name, mk_time in markers:
                        if 'R' in mk_name:
                            event_type = 'Response'
                        elif 'S' in mk_name:
                            event_type = 'Stimulus'
                        else:
                            event_type = 'Label'
                        result_file_mrk.write(
                            "Mk%d=%s,%s,%d,1,0\n" %
                            (count_mrk, event_type, mk_name,
                             (window.start_time + mk_time)
                             * window.sampling_frequency/1000.0))
                        count_mrk += 1

            # WRITE HEADERFILE
            result_file_hdr = open(os.path.join(result_path,
                                                file_stem + ".vhdr"), "w")
            try:
                result_file_hdr.write(header_hdr % (file_stem, file_stem,
                                                    num_ch, sampling_int))
                # Format: Ch1=Fp1,,0.1,\xB5V
                for i in range(num_ch):
                    result_file_hdr.write(
                        "Ch%d=%s,,%.2f,\xB5V\n" %
                        (i+1, channel_names[i], 1./scale))
            finally:
                result_file_hdr.close()
        finally:
            # make sure the files are closed even if writing failed
            result_file_eeg.close()
            result_file_mrk.close()
def consolidate(self, _=None):
    """ Consolidates the results obtained by the single processes into a
    consistent structure of collections that are stored on the file system.

    1. The meta data of every result dataset ("{*" directories in the
       result directory) is updated (author, date, number of runs) and the
       meta data of its input dataset is copied as "input_metadata.yaml".
    2. If performance results ("results_*") exist, they are merged by
       loading and storing them via a PerformanceResultSummary.
    3. If ``self.compression`` is set, the result folders are zipped
       (unless the compression is "delete") and all of them but one are
       removed.
    """
    # Consolidate the results
    directory_pattern = os.sep.join([self.result_directory, "{*", ])
    dataset_pathes = glob.glob(directory_pattern)

    # For all collections found
    for dataset_path in dataset_pathes:
        try:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            author = get_author()
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author": author, "date": date})

            # There can be either run dirs, persistency dirs, or both of
            # them. Check of whichever there are more. If both exist, their
            # numbers are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.path.join(dataset_path,
                                                     "data_run*")))
            nr_per_dirs = len(glob.glob(os.path.join(dataset_path,
                                                     "persistency_run*")))
            nr_runs = max(nr_run_dirs, nr_per_dirs)
            if nr_runs > 1:
                meta_data["runs"] = nr_runs

            # Store the metadata
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results more easy
            # THA: Split the first "/" from the input collection name,
            # because otherwise it will be treated as an absolute path
            input_collection_name = meta_data["input_dataset_name"]
            if input_collection_name[0] == os.sep:
                input_collection_name = input_collection_name[1:]
            input_meta_path = os.path.join(pySPACE.configuration.storage,
                                           input_collection_name)
            try:
                input_meta = BaseDataset.load_meta_data(input_meta_path)
                BaseDataset.store_meta_data(dataset_path, input_meta,
                                            file_name="input_metadata.yaml")
            except (IOError, OSError) as e:
                # str(e) is used since e.message is empty for these errors
                self._log("Error copying the input_metadata.yaml: {error}"
                          .format(error=e), level=logging.CRITICAL)
        except Exception as e:
            logging.getLogger("%s" % self).exception(
                "Error updating the metadata: {error!s}".format(error=e))
            # re-raise with the original traceback
            raise

    # If we don't create a feature vector or time series collection,
    # we evaluated our classification using a classification performance
    # sink. The resulting files should be merged to one csv tabular.
    pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
    if len(pathlist) > 0:
        # Do the consolidation the same way as for
        # WekaClassificationOperation
        self._log("Consolidating results ...")
        # We load and store the results once into a
        # PerformanceResultSummary. This does the necessary consolidation...
        self._log("Reading intermediate results...")
        try:
            result_collection = PerformanceResultSummary(
                dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)
        except Exception as e:
            logging.getLogger("%s" % self).exception(
                "Error merging the result collection: {error!s}".format(
                    error=e))

        if self.compression:
            # Since we get one result summary,
            # we don't need the numerous folders.
            # So we zip them to make the whole folder more easy visible.
            import zipfile

            def compress_result_folders(allow_zip64):
                """ Zip the result folders and delete all but one of them """
                folders = glob.glob(os.path.join(self.result_directory,
                                                 "{*}"))
                if not self.compression == "delete":
                    # only pass allowZip64 when it is requested, so the
                    # first attempt uses the library default as before
                    zip_options = {"allowZip64": True} if allow_zip64 else {}
                    save_file = zipfile.ZipFile(
                        self.result_directory + '/result_folders.zip',
                        mode="w", compression=self.compression,
                        **zip_options)
                    try:
                        # we want to have the zipped file relative to the
                        # result directory (which is the current working
                        # directory at this point)
                        for path in folders:
                            for node in os.walk(path):
                                rel_path = os.path.relpath(
                                    node[0], self.result_directory)
                                save_file.write(rel_path)
                                for data in node[2]:
                                    save_file.write(
                                        os.path.join(rel_path, data))
                    finally:
                        save_file.close()
                # To still have an easy access to the history of the
                # processing, we keep one folder.
                if folders:
                    folders.pop()
                for path in folders:
                    shutil.rmtree(path)

            cwd = os.getcwd()
            os.chdir(self.result_directory)
            try:
                # If there are to many or to large folders, problems may
                # occur. This case we want to log, try 64 bit mode,
                # and then skip the zipping.
                try:
                    compress_result_folders(allow_zip64=False)
                except Exception:
                    self._log("Result files could not be compressed with 32"
                              " bit mode, switching to 64 bit mode",
                              level=logging.CRITICAL)
                    try:
                        compress_result_folders(allow_zip64=True)
                    except Exception:
                        self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                                  level=logging.CRITICAL)
            finally:
                # always restore the working directory
                os.chdir(cwd)
def _createProcesses(cls, processes, result_directory, operation_spec,
                     parameter_settings, input_collections):
    """ Create one NodeChainProcess per template, parameter setting,
    input dataset, run and split and put it into the *processes* queue

    After the last process, ``False`` is put into the queue as sign that
    the creation is finished.

    **Parameters**

        :processes: Queue-like object (``put`` is used) taking the processes.
        :result_directory: Directory in which the results are written.
        :operation_spec: Specification dictionary of the operation.
        :parameter_settings: Iterable of parameter dictionaries.
        :input_collections: Dataset directories relative to the storage.
    """
    storage_format = operation_spec["storage_format"] \
        if "storage_format" in operation_spec else None

    # Determine whether the node_chain should be stored after data processing
    store_node_chain = operation_spec["store_node_chain"] \
        if "store_node_chain" in operation_spec else False

    # Determine whether certain parameters should not be remembered
    hide_parameters = [] if "hide_parameters" not in operation_spec \
        else list(operation_spec["hide_parameters"])
    hide_parameters.append("__INPUT_COLLECTION__")
    hide_parameters.append("__INPUT_DATASET__")
    hide_parameters.append("__RESULT_DIRECTORY__")
    hide_parameters.append("__OUTPUT_BUNDLE__")

    # The meta data of every input dataset is needed several times,
    # so it is loaded only once per dataset.
    meta_data_cache = {}

    def load_input_meta_data(rel_dataset_dir):
        """ Return the (cached) meta data of the given input dataset """
        if rel_dataset_dir not in meta_data_cache:
            meta_data_cache[rel_dataset_dir] = BaseDataset.load_meta_data(
                os.sep.join([pySPACE.configuration.storage,
                             rel_dataset_dir]))
        return meta_data_cache[rel_dataset_dir]

    def check_constraint(constraint, parameters):
        """ Insert the parameter values into the constraint and evaluate it

        NOTE(review): eval() is used, so the constraints must come from a
        trusted operation specification.
        """
        for key, value in parameters.items():
            constraint = constraint.replace(key, str(value))
        return eval(constraint)

    # Create all combinations of collections, runs and splits
    collection_run_split_combinations = []
    for input_dataset_dir in input_collections:
        input_meta_data = load_input_meta_data(input_dataset_dir)
        # Determine number of runs to be conducted for this collection
        collection_runs = input_meta_data.get('runs', 1)
        collection_splits = input_meta_data.get('splits', 1)
        if "runs" not in operation_spec:
            requested_runs = collection_runs
        else:
            requested_runs = operation_spec["runs"]
            assert collection_runs == requested_runs \
                or collection_runs == 1, \
                "Requested %s runs but input collection %s provides " \
                "data for %s runs." % (requested_runs, input_dataset_dir,
                                       collection_runs)
        for run in range(max(requested_runs, collection_runs)):
            for split in range(collection_splits):
                collection_run_split_combinations.append(
                    (input_dataset_dir, run, split))

    # Shuffle order of dataset-run-split combinations. This should help to
    # avoid that all processes work on the same data which can cause
    # problems due to locking etc.
    random.shuffle(collection_run_split_combinations)

    templates = operation_spec["templates"]
    constraints = operation_spec.get('old_parameter_constraints', [])

    # For all templates
    # (enumerate gives the right template file also for equal templates)
    for template_index, node_chain_spec in enumerate(templates):
        # For all possible parameter instantiations of this template
        for parameter_setting in parameter_settings:
            # For all input collections-run combinations
            for input_dataset_dir, run, split in \
                    collection_run_split_combinations:
                # We are going to change the parameter_setting and don't
                # want to interfere with later runs so we work on a copy
                parameter_setting_cp = copy.deepcopy(parameter_setting)

                # Add input and output path to parameter setting
                parameter_setting_cp["__INPUT_DATASET__"] = \
                    input_dataset_dir.split(os.sep)[-2]
                parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                    result_directory
                if len(templates) > 1:
                    parameter_setting_cp["__Template__"] = \
                        operation_spec["template_files"][template_index]

                # Add the input parameters meta data to the given
                # parameter setting (on a copy, the cache stays untouched)
                dataset_md = load_input_meta_data(input_dataset_dir)
                if "parameter_setting" in dataset_md:
                    all_parameters = dict(dataset_md["parameter_setting"])
                    all_parameters.update(parameter_setting_cp)
                else:
                    all_parameters = parameter_setting_cp

                if not all(check_constraint(constraint_def, all_parameters)
                           for constraint_def in constraints):
                    continue

                # Determine directory in which the result of this
                # process should be written
                result_dataset_directory = \
                    NodeChainOperation._get_result_dataset_dir(
                        result_directory, input_dataset_dir,
                        parameter_setting_cp, hide_parameters)

                # Create the respective process and put it to the
                # executing-queue of processes
                process = NodeChainProcess(
                    node_chain_spec=node_chain_spec,
                    parameter_setting=parameter_setting_cp,
                    rel_dataset_dir=input_dataset_dir,
                    run=run,
                    split=split,
                    storage_format=storage_format,
                    result_dataset_directory=result_dataset_directory,
                    store_node_chain=store_node_chain)
                processes.put(process)

    # give executing process the sign that creation is now finished
    processes.put(False)
def prepare_training(self, training_files, potentials, operation,
                     nullmarker_stride_ms=None):
    """ Prepares pyspace live for training.

    Prepares everything for training of pyspace live,
    i.e. creates flows based on the dataflow specs
    and configures them.

    **Parameters**

        :training_files:
            A single training file or a list of training files.

        :potentials:
            Dictionary with one specification dictionary per potential.

        :operation:
            One of "train", "prewindowing", "prewindowing_offline" or
            "prewindowed_train".

        :nullmarker_stride_ms:
            Handed to the window stream request in "prewindowing" mode.

            (*optional, default: None*)

    Returns 0. A ValueError is raised for an unknown operation and an
    Exception if prewindowed data is required but not available.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation
    self.nullmarker_stride_ms = nullmarker_stride_ms
    if self.nullmarker_stride_ms is None:
        online_logger.warn(
            'Nullmarker stride interval is %s. You can specify it in your parameter file.' % self.nullmarker_stride_ms)
    else:
        online_logger.info(
            'Nullmarker stride interval is set to %s ms ' % self.nullmarker_stride_ms)

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        potential = self.potentials[key]
        spec_base = potential["configuration"].spec_dir
        # turn the relative flow specification into an absolute path
        if self.operation == "train":
            potential["node_chain"] = os.path.join(
                spec_base, potential["node_chain"])
            online_logger.info("node_chain_spec:" + potential["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            potential["prewindowing_flow"] = os.path.join(
                spec_base, potential["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " +
                               potential["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            potential["postprocess_flow"] = os.path.join(
                spec_base, potential["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " +
                               potential["postprocess_flow"])

        self.training_active_potential[key] = multiprocessing.Value(
            "b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            if window is None:
                break
            yield window

    def load_node_chain(flow_spec_path):
        """ Create a node chain from the given YAML specification file """
        # NOTE(review): assumes flow_from_yaml parses the file before it
        # returns, so the file can be closed afterwards - confirm.
        with open(flow_spec_path) as flow_spec:
            return NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                   flow_spec=flow_spec)

    # Create the actual data flows
    for key in self.potentials.keys():
        potential = self.potentials[key]
        # the specification directory of *this* potential
        spec_base = potential["configuration"].spec_dir

        if self.operation == "train":
            flow_spec_path = potential["node_chain"]
            self.node_chains[key] = load_node_chain(flow_spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(potential["prewindowing_flow"]))
            flow_spec_path = potential["prewindowing_flow"]
            self.node_chains[key] = load_node_chain(flow_spec_path)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation == "prewindowed_train":
            flow_spec_path = potential["postprocess_flow"]
            self.node_chains[key] = load_node_chain(flow_spec_path)
            replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info(
                    "deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                message = "Couldn't find data, please do prewindowing first!"
                online_logger.error(message)
                raise Exception(message)
            online_logger.info("concatenating prewindowed data from " +
                               str(prewindowed_sets))

            for s, set_path in enumerate(prewindowed_sets):
                collection = BaseDataset.load(set_path)
                data = collection.get_data(0, 0, "train")
                for d, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        for k in list(sample.marker_name.keys()):
                            # find '{S,s} 8' or '{S,s} 9'
                            m = re.match(r"^s\s{0,2}[8,9]{1}$", k,
                                         re.IGNORECASE)
                            if m is not None:
                                online_logger.info(
                                    str("remove %s from %d %d" %
                                        (m.group(), s, d)))
                                del (sample.marker_name[m.group()])

                        if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info("added endmarker" + str(s) +
                                               " " + str(d))

                        if s == 0 and d == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info("added startmarker" + str(s) +
                                               " " + str(d))

                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)

            online_logger.info("stored final collection at " +
                               final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(
                final_collection_path)
            self.node_chains[key][0].set_input_dataset(
                self.prewindowed_data[key])
        else:
            raise ValueError("Unknown operation: %s" % self.operation)

        # create window_stream for every potential
        # (only needed for online prewindowing)
        if self.operation == "prewindowing":
            window_spec_file = os.path.join(
                spec_base, "node_chains", "windower",
                potential["windower_spec_path_train"])
            self.window_stream[key] = \
                self.stream_manager.request_window_stream(
                    window_spec_file,
                    nullmarker_stride_ms=self.nullmarker_stride_ms)

        # NOTE(review): yaml.load can construct arbitrary objects; the
        # specification files must be trusted.
        with open(flow_spec_path) as flow:
            self.node_chain_definitions[key] = yaml.load(flow)

    # TODO: check if the prewindowing flow is still needed
    #       when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(
                        str("recursively deleting files in \'%s\'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception:
                        # TODO: find a smart solution for this!
                        pass  # dir was probably already deleted..

            all_train_data_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            if os.path.exists(all_train_data_path):
                shutil.rmtree(all_train_data_path)
                online_logger.info(
                    "deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
def store(self, result_dir, s_format="pickle"):
    """ Stores this collection in the directory *result_dir*.

    In contrast to *dump* this method stores the collection
    not in a single file but as a whole directory structure with meta
    information etc. The data sets are stored separately for each run,
    split, train/test combination. The file names start with
    "time_series" followed by suffixes that encode run, split and
    train/test information.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            The format in which the actual data sets should be stored.

            Possible formats are 'pickle', 'text', 'csv', 'mat' (matlab)
            and 'bp_eeg' (BrainVision) format. If s_format is a list, the
            second element further specifies additional options for
            storing.

            - pickle:
                Standard Python format

            - text:
                In the text format, all time series objects are
                concatenated to a single large table containing only
                integer values.

            - csv:
                For the csv format comma separated values are taken as
                default or a specified Python format string.

            - mat:
                Scipy's savemat function is used for storing. Thereby the
                data is stored as 3 dimensional array. Also meta data
                information, like sampling frequency and channel names are
                saved.
                As an additional parameter the orientation of the data
                arrays can be given as 'channelXtime' or 'timeXchannel'

            .. note:: For the text and MATLAB format, markers could be
                      added by using a Marker_To_Mux node before

            (*optional, default: "pickle"*)

    A NotImplementedError is raised for any other storage format.

    .. todo:: Put marker to the right time point and also write marker
              channel.

    .. todo:: Shouldn't be 'text' and 'csv' format part of the stream data
              set?!
    """
    name = "time_series"
    # for some storage procedures we need further specifications
    s_type = None
    if isinstance(s_format, (list, tuple)):
        # file format is first position
        f_format = s_format[0]
        if len(s_format) > 1:
            s_type = s_format[1]
    else:
        f_format = s_format
    if f_format == "text" and s_type is None:
        s_type = "%i"
    elif f_format == "csv" and s_type == "real":
        s_type = "%.18e"

    # Update the meta data
    author = get_author()
    self.update_meta_data({"type": "time_series",
                           "storage_format": s_format,
                           "author": author,
                           "data_pattern": "data_run" + os.sep
                                           + name + "_sp_tt." + f_format})

    # the data set that was stored last, used for the meta data below
    last_time_series = None

    # Iterate through splits and runs in this dataset
    for key, time_series in self.data.items():
        # load data, if necessary
        # (due to the lazy loading, the data might be not loaded already)
        if isinstance(time_series, basestring):
            time_series = self.get_data(key[0], key[1], key[2])
        if self.sort_string is not None:
            # NOTE(review): sort_string is evaluated with eval(); it must
            # come from a trusted specification.
            time_series.sort(key=eval(self.sort_string))
        last_time_series = time_series

        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        file_stem = os.path.join(result_path, name + key_str)

        # Store data depending on the desired format
        if f_format in ["pickle", "cpickle", "cPickle"]:
            # binary mode is required for the binary pickle protocol
            with open(file_stem + ".pickle", "wb") as result_file:
                cPickle.dump(time_series, result_file,
                             cPickle.HIGHEST_PROTOCOL)
        elif f_format in ["text", "csv"]:
            self.update_meta_data({
                "type": "stream",
                "marker_column": "marker"})
            with open(file_stem + ".csv", "w") as result_file:
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(
                    time_series[0][0].channel_names)
                if f_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, label) in time_series:
                    # the label is preferred as marker, the marker names
                    # of the window are the fall back
                    has_markers = data.marker_name is not None \
                        and len(data.marker_name) > 0
                    if f_format == "text":
                        numpy.savetxt(result_file, data, delimiter=",",
                                      fmt=s_type)
                        if label is not None:
                            result_file.write(str(label))
                            result_file.flush()
                        elif has_markers:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        marker = ""
                        if label is not None:
                            marker = str(label)
                        elif has_markers:
                            marker = str(data.marker_name)
                        for line in data:
                            row = list(line)
                            row.append(marker)
                            csvwriter.writerow(row)
                            # the marker is only written in the first line
                            marker = ""
                        result_file.flush()
        elif f_format in ["matlab", "mat", "MATLAB"]:
            # todo: handle all the other attributes of ts objects!
            import scipy.io
            result_file_name = file_stem + ".mat"
            # extract a first time series object to get meta data
            ts1 = time_series[0][0]

            # collect all important information in the collection_object
            dataset_dict = {
                "sampling_frequency": ts1.sampling_frequency,
                "channel_names": ts1.channel_names}

            # we have to extract the data and labels separatly
            if 'channelXtime' in s_format:
                dataset_dict["data"] = [data.T for data, _ in time_series]
            else:
                dataset_dict["data"] = [data for data, _ in time_series]
            dataset_dict["labels"] = [label for _, label in time_series]
            # construct numpy 3d array (e.g., channelXtimeXtrials)
            dataset_dict["data"] = numpy.rollaxis(numpy.array(
                dataset_dict["data"]), 0, 3)

            scipy.io.savemat(result_file_name, mdict=dataset_dict)
        elif f_format in ["bp_eeg"]:
            sf = None
            channel_names = None
            # the data is appended to an existing data file (as before),
            # binary mode since raw int16 samples are written
            with open(file_stem + ".eeg", "ab") as result_file:
                with open(file_stem + ".vmrk", "w") as result_file_mrk:
                    result_file_mrk.write(
                        "Brain Vision Data Exchange Marker File, "
                        "Version 1.0\n")
                    result_file_mrk.write("; Data stored by pySPACE\n")
                    result_file_mrk.write("[Common Infos]\n")
                    result_file_mrk.write("Codepage=UTF-8\n")
                    result_file_mrk.write("DataFile=%s\n" %
                                          str(name + key_str + ".eeg"))
                    result_file_mrk.write("\n[Marker Infos]\n")

                    markerno = 1
                    datapoint = 1

                    for t in time_series:
                        if sf is None:
                            sf = t[0].sampling_frequency
                        if channel_names is None:
                            channel_names = t[0].get_channel_names()
                        for mrk in t[0].marker_name.keys():
                            for tm in t[0].marker_name[mrk]:
                                result_file_mrk.write(
                                    str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                        (markerno, mrk,
                                         datapoint+(tm*sf/1000.0))))
                                markerno += 1
                        data_ = t[0].astype(numpy.int16)
                        data_.tofile(result_file)
                        datapoint += data_.shape[0]

            with open(file_stem + ".vhdr", "w") as result_hdr:
                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                 str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                 str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" %
                                 len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" % (1000000/sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [
                    unicode("0.1,%sV" % unicode(u"\u03BC")),
                    unicode("0.5,%sV" % unicode(u"\u03BC")),
                    unicode("10,%sV" % unicode(u"\u03BC")),
                    unicode("152.6,%sV" % unicode(u"\u03BC"))]
                for i in range(len(channel_names)):
                    result_hdr.write(unicode(
                        "Ch%d=%s,,%s\n" %
                        (i+1, channel_names[i],
                         unicode(resolutions_str[0]))).encode('utf-8'))
        else:
            raise NotImplementedError(
                "Using unavailable storage format:%s!" % f_format)

    # channel names and sampling frequency are taken from the data set
    # that was stored last (nothing to add if there is no data)
    if last_time_series is not None and len(last_time_series) > 0:
        self.update_meta_data({
            "channel_names":
                copy.deepcopy(last_time_series[0][0].channel_names),
            "sampling_frequency": last_time_series[0][0].sampling_frequency
        })

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def prepare_training(self, training_files, potentials, operation):
    """ Prepares pyspace live for training.

    Prepares everything for training of pyspace live,
    i.e. creates flows based on the dataflow specs
    and configures them.

    **Parameters**

        :training_files:
            A single training file or a list of training files;
            always stored as a list in ``self.training_data``.

        :potentials:
            Dictionary with one settings dictionary per potential.
            The flow entries of each settings dictionary are replaced
            in place by paths joined with the ``spec_dir`` of the
            potential's ``"configuration"`` entry.

        :operation:
            One of "train", "prewindowing", "prewindowing_offline"
            or "prewindowed_train".

    Returns 0. Raises an ``Exception`` if *operation* is
    "prewindowed_train" and no prewindowed data can be found.
    """
    online_logger.info("Preparing Training")
    self.potentials = potentials
    self.operation = operation

    online_logger.info("Creating flows..")
    for key in self.potentials.keys():
        potential = self.potentials[key]
        spec_base = potential["configuration"].spec_dir
        uses_stream = "stream" in potential and potential["stream"] == True
        if self.operation == "train":
            potential["node_chain"] = os.path.join(spec_base, potential["node_chain"])
            online_logger.info("node_chain_spec:" + potential["node_chain"])
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            if uses_stream:
                potential["prewindowing_flow"] = os.path.join(
                    spec_base, potential["stream_prewindowing_flow"])
            else:
                potential["prewindowing_flow"] = os.path.join(
                    spec_base, potential["prewindowing_flow"])
            online_logger.info("prewindowing_dataflow_spec: " + potential["prewindowing_flow"])
        elif self.operation == "prewindowed_train":
            if uses_stream:
                potential["postprocess_flow"] = os.path.join(
                    spec_base, potential["stream_postprocess_flow"])
            else:
                potential["postprocess_flow"] = os.path.join(
                    spec_base, potential["postprocess_flow"])
            online_logger.info("postprocessing_dataflow_spec: " + potential["postprocess_flow"])
        self.training_active_potential[key] = multiprocessing.Value("b", False)

    online_logger.info("Path variables set for NodeChains")

    # check if multiple potentials are given for training
    if isinstance(training_files, list):
        self.training_data = training_files
    else:
        self.training_data = [training_files]

    # Training is done in separate processes, we send the time series
    # windows to these threads via two queues
    online_logger.info("Initializing Queues")
    for key in self.potentials.keys():
        self.queue[key] = multiprocessing.Queue()

    def flow_generator(key):
        """create a generator to yield all the abri flow windows"""
        # Yield all windows until a None item is found in the queue
        while True:
            window = self.queue[key].get(block=True, timeout=None)
            # identity test: "== None" is ambiguous for array-like windows
            if window is None:
                break
            yield window

    # '{S,s} 8' or '{S,s} 9' (start- and endmarker), compiled once
    start_end_marker = re.compile(r"^s\s{0,2}[8,9]{1}$", re.IGNORECASE)

    # Create the actual data flows
    for key in self.potentials.keys():
        potential = self.potentials[key]
        # recomputed per potential; do not rely on the value left over
        # from the path-resolution loop above
        spec_base = potential["configuration"].spec_dir

        if self.operation == "train":
            definition_path = potential["node_chain"]
            # NOTE(review): assumes flow_from_yaml parses the stream
            # before returning, so the file can be closed right away
            with open(definition_path) as spec_file:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=spec_file)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            online_logger.info("loading prewindowing flow..")
            online_logger.info("file: " + str(potential["prewindowing_flow"]))
            definition_path = potential["prewindowing_flow"]
            with open(definition_path) as spec_file:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=spec_file)
            self.node_chains[key][0].set_generator(flow_generator(key))
        elif self.operation == "prewindowed_train":
            definition_path = potential["postprocess_flow"]
            with open(definition_path) as spec_file:
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=spec_file)
            if "stream" in potential and potential["stream"] == True:
                # create windower
                online_logger.info("Creating Windower")
                online_logger.info(potential["windower_spec_path_train"])
                self.node_chains[key][0].set_windower_spec_file(
                    os.path.join(spec_base, "node_chains", "windower",
                                 potential["windower_spec_path_train"]))
                replace_start_and_end_markers = True
            else:
                replace_start_and_end_markers = False

            final_collection = TimeSeriesDataset()
            final_collection_path = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            # delete previous training collection
            if os.path.exists(final_collection_path):
                online_logger.info("deleting old training data collection for " + key)
                shutil.rmtree(final_collection_path)

            # load all prewindowed collections and
            # append data to the final collection
            prewindowed_sets = glob.glob(
                os.path.join(self.prewindowed_data_directory, key, "*"))
            if len(prewindowed_sets) == 0:
                online_logger.error("Couldn't find data, please do prewindowing first!")
                raise Exception("Couldn't find data, please do prewindowing first!")
            online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets))

            last_set = len(prewindowed_sets) - 1
            for set_index, set_path in enumerate(prewindowed_sets):
                collection = BaseDataset.load(set_path)
                data = collection.get_data(0, 0, "train")
                for sample_index, (sample, label) in enumerate(data):
                    if replace_start_and_end_markers:
                        # in case we concatenate multiple 'Window' labeled
                        # sets we have to remove every start- and endmarker
                        # (iterate over a copy of the keys while deleting)
                        for marker in list(sample.marker_name.keys()):
                            m = start_end_marker.match(marker)
                            if m is not None:
                                online_logger.info(str("remove %s from %d %d" % (
                                    m.group(), set_index, sample_index)))
                                del sample.marker_name[marker]

                        if set_index == last_set and sample_index == len(data) - 1:
                            # insert endmarker
                            sample.marker_name["S 9"] = [0.0]
                            online_logger.info("added endmarker" + str(set_index)
                                               + " " + str(sample_index))

                        if set_index == 0 and sample_index == 0:
                            # insert startmarker
                            sample.marker_name["S 8"] = [0.0]
                            online_logger.info("added startmarker" + str(set_index)
                                               + " " + str(sample_index))

                    final_collection.add_sample(sample, label, True)

            # save final collection (just for debugging)
            os.mkdir(final_collection_path)
            final_collection.store(final_collection_path)
            online_logger.info("stored final collection at " + final_collection_path)

            # load final collection again for training
            online_logger.info("loading data from " + final_collection_path)
            self.prewindowed_data[key] = BaseDataset.load(final_collection_path)
            self.node_chains[key][0].set_input_dataset(self.prewindowed_data[key])

        # keep the parsed flow specification of every potential
        # NOTE(review): yaml.load without an explicit safe loader; only
        # acceptable as long as the spec files are trusted
        with open(definition_path) as flow:
            self.node_chain_definitions[key] = yaml.load(flow)

    # TODO: check if the prewindowing flow is still needed
    # when using the stream mode!
    if self.operation == "train":
        online_logger.info("Removing old flows...")
        try:
            shutil.rmtree(self.flow_storage)
        except Exception:
            online_logger.info("Could not delete flow storage directory")
        os.mkdir(self.flow_storage)
    elif self.operation in ("prewindowing", "prewindowing_offline"):
        # follow this policy:
        # - delete prewindowed data older than 12 hours
        # - always delete trained/stored flows
        now = datetime.datetime.now()
        then = now - datetime.timedelta(hours=12)

        if not os.path.exists(self.prewindowed_data_directory):
            os.mkdir(self.prewindowed_data_directory)
        if not os.path.exists(self.flow_storage):
            os.mkdir(self.flow_storage)

        for key in self.potentials.keys():
            found = self.find_files_older_than(
                then, os.path.join(self.prewindowed_data_directory, key))
            if found is not None:
                for f in found:
                    online_logger.info(str("recursively deleting files in \'%s\'" % f))
                    try:
                        shutil.rmtree(os.path.abspath(f))
                    except Exception as e:
                        # TODO: find a smart solution for this!
                        # dir was probably already deleted..
                        online_logger.info("could not delete '%s': %s" % (f, e))

            concatenated = os.path.join(
                self.prewindowed_data_directory, key, "all_train_data")
            if os.path.exists(concatenated):
                shutil.rmtree(concatenated)
                online_logger.info("deleted concatenated training data for " + key)

    online_logger.info("Training preparations finished")
    return 0
def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
    """ A factory method that creates an Analysis operation based on the
    information given in the operation specification operation_spec.

    If debug is TRUE the creation of the Analysis Processes will not
    be in a separated thread.

    **Parameters**

        :operation_spec:
            Specification dictionary. Its "type" has to be
            "comp_analysis"; "input_path", "parameters" and "metrics"
            are read, "dep_par", "logscale" and "markertype" are
            optional.

        :result_directory:
            Passed on to the created processes and the operation.

        :debug:
            If True, all processes are created synchronously in an
            unbounded queue.

        :input_paths:
            Not used by this method.
    """
    # imported up-front: both warning sites below need the name, also
    # when the list of requested metrics is empty
    import warnings

    assert (operation_spec["type"] == "comp_analysis")
    input_path = operation_spec["input_path"]
    summary = BaseDataset.load(
        os.path.join(pySPACE.configuration.storage, input_path))
    data_dict = summary.data

    # Determine the parameters that should be analyzed
    parameters = operation_spec["parameters"]

    # Determine dependent parameters, which don't get extra resolution
    try:
        dep_par = operation_spec["dep_par"]
    except KeyError:
        dep_par = []

    # Determine the metrics that should be plotted
    metrics = []
    for metric in operation_spec["metrics"]:
        if metric in data_dict:
            metrics.append(metric)
        else:
            warnings.warn('The metric "' + metric
                          + '" is not contained in the results csv file.')
    if len(metrics) == 0:
        warnings.warn(
            'No metric available from spec file, default to first dict entry.')
        metrics.append(list(data_dict.keys())[0])

    # Determine how many processes will be created
    number_parameter_values = [len(set(data_dict[param]))
                               for param in parameters]
    number_processes = cls._numberOfProcesses(0, number_parameter_values) + 1

    logscale = False
    if 'logscale' in operation_spec:
        logscale = operation_spec['logscale']

    markertype = 'x'
    if 'markertype' in operation_spec:
        markertype = operation_spec['markertype']

    if debug == True:
        # To better debug creation of processes we don't limit the queue
        # and create all processes before executing them
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, data_dict,
                             parameters, dep_par, metrics, logscale,
                             markertype, True)
        return cls(processes, operation_spec, result_directory,
                   number_processes)

    # Create all plot processes by calling a recursive helper method in
    # another thread so that already created processes can be executed
    # although creation of processes is not finished yet. Therefore a queue
    # is used which size is limited to guarantee that not to much objects
    # are created (since this costs memory). However, the actual number
    # of 100 is arbitrary and might be reviewed.
    processes = processing.Queue(100)
    create_process = processing.Process(
        target=cls._createProcesses,
        args=(processes, result_directory, data_dict, parameters,
              dep_par, metrics, logscale, markertype, True))
    create_process.start()
    # create and return the comp_analysis operation object
    return cls(processes, operation_spec, result_directory,
               number_processes, create_process)
def store(self, result_dir, s_format=["pickle", "real"]):
    """ Store the collection in *result_dir*

    One file per (run, split, train/test) key is written to
    ``<result_dir>/data_run<run>/predictions_sp<split>_<train|test>.<ext>``
    and the meta data is stored afterwards.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            Either the file format as a string or a list whose first
            entry is the file format. Formats with a writer are
            "pickle" and "csv"; everything else is reported as an
            error and stored as "pickle".

            (*optional, default: ["pickle", "real"]*)
    """
    name = "predictions"
    # The format may be given as a list (first entry: file format) or as
    # a plain string. Normalise it before it is compared with strings.
    if isinstance(s_format, (list, tuple)):
        file_format = s_format[0]
    else:
        file_format = s_format

    # Update the meta data
    author = get_author()
    self.update_meta_data({"type": "prediction_vector",
                           "storage_format": s_format,
                           "author": author,
                           "data_pattern": "data_run" + os.sep
                                           + name + "_sp_tt." + file_format})

    # only formats that are actually written below count as supported
    if file_format not in ["csv", "pickle"]:
        self._log("Storage format not supported! Using default.",
                  level=logging.ERROR)
        file_format = "pickle"

    for key, prediction_vectors in self.data.items():
        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Store data depending on the desired format
        if file_format == "pickle":
            # binary mode: the highest pickle protocol is a binary format
            with open(os.path.join(result_path,
                                   name + key_str + ".pickle"),
                      "wb") as result_file:
                cPickle.dump(prediction_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
            continue

        # Write as Comma Separated Value
        num_predictors = self.meta_data["num_predictors"]
        with open(os.path.join(result_path, name + key_str + ".csv"),
                  "w") as result_file:
            if num_predictors == 1:
                result_file.write(
                    "Predicted Label, Prediction Score, True Label \n")
                for pv in prediction_vectors:
                    result_file.write("%s, %s, %s\n" % (pv[0].label[0],
                                                        pv[0].prediction[0],
                                                        pv[1]))
            else:
                # we begin by dealing with the header of the csv file
                base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                base_result = "%(label)s, %(score)s,"
                header = "".join(base_header % dict(index=i + 1)
                                 for i in range(num_predictors))
                result_file.write(header + "True Label\n")

                # and now we can write each of the prediction vectors in turn
                for pv in prediction_vectors:
                    result = "".join(
                        base_result % dict(label=pv[0].label[i],
                                           score=pv[0].prediction[i])
                        for i in range(num_predictors))
                    result_file.write(result + str(pv[1]) + "\n")

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def store(self, result_dir, s_format=["pickle", "real"]):
    """ Stores this collection in the directory *result_dir*.

    In contrast to *dump* this method stores the collection
    not in a single file but as a whole directory structure with meta
    information etc. The data sets are stored separately for each run,
    split, train/test combination in files prefixed with "features".

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            A list with information about the format in which the
            actual data sets should be stored. The first entry specifies
            the file format ("pickle", "arff" or "csv"). If it is "arff"
            the second entry specifies the attribute format.

            Examples: ["arff", "real"], ["arff", "{0,1}"]

            To store the data in comma separated values, use
            ["csv", "real"]. A plain string is accepted as well and is
            treated like ``[s_format, "real"]``. Unknown file formats
            are reported as an error and stored as "pickle".

            (*optional, default: ["pickle", "real"]*)

    .. todo:: Someone could implement the format ["fasta"] for sax features

    .. todo:: Adapt storing of csv file to external library instead of
              doing it manually.
    """
    name = "features"
    # split the format specification into file format and attribute type
    if isinstance(s_format, (list, tuple)):
        file_format = s_format[0]
        attribute_type = s_format[1] if len(s_format) > 1 else "real"
    else:
        file_format = s_format
        attribute_type = "real"

    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({
        "type": "feature_vector",
        "storage_format": s_format,
        "author": author,
        "data_pattern": "data_run" + os.sep + name + "_sp_tt." + file_format
    })

    # without this fallback an unknown format would leave no open file
    # behind and fail when the file is closed
    if file_format not in ["csv", "arff", "pickle"]:
        self._log("Storage format not supported! Using default.",
                  level=logging.ERROR)
        file_format = "pickle"

    # Iterate through splits and runs in this dataset
    for key, feature_vectors in self.data.items():
        # Construct result directory
        result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        target = os.path.join(result_path,
                              name + key_str + "." + file_format)

        # Store data depending on the desired format
        if file_format == "pickle":
            # binary mode: the highest pickle protocol is a binary format
            with open(target, "wb") as result_file:
                cPickle.dump(feature_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
            continue

        # The first sample decides how every feature value is printed;
        # anything that is neither float nor integer is written via str.
        feature_format = "%s,"
        if len(feature_vectors) > 0:
            first_dtype = feature_vectors[0][0].dtype
            if numpy.issubdtype(first_dtype, numpy.floating):
                feature_format = "%f,"
            elif numpy.issubdtype(first_dtype, numpy.integer):
                feature_format = "%d,"

        with open(target, "w") as result_file:
            if file_format == "arff":  # Write as ARFF
                # Create the arff file header
                relation_name = result_dir.split(os.sep)[-1]
                result_file.write('@relation "%s"\n' % relation_name)
                # Write the type of all features
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write('@attribute %s %s\n'
                                      % (feature_name, attribute_type))
                classString = ",".join(
                    sorted(self.meta_data["classes_names"]))
                result_file.write("@attribute class {%s}\n" % classString)
                result_file.write("@data\n")
                # Write all given training data into the ARFF file
                for features, class_name in feature_vectors:
                    for feature in features[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            else:  # Write as Comma Separated Value
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write('%s,' % (feature_name))
                result_file.write('\n')
                for features, class_name in feature_vectors:
                    f = features.view(numpy.ndarray)
                    for feature in f[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def __init__(self, dataset_dir, command_template, parametrization,
             cv_folds, run_number, operation_result_dir):
    """ Set up one WEKA classification process

    Loads the abbreviation table and the dataset at *dataset_dir*,
    collects all values needed by *command_template* in ``self.params``
    and instantiates the template until no placeholder is left.

    **Parameters**

        :dataset_dir: Directory of the dataset that is classified.
        :command_template: Template of the WEKA command (%-formatting
            with named placeholders).
        :parametrization: Dictionary with custom parameter values;
            values found in the abbreviation table are expanded.
        :cv_folds: Stored as "cv_folds" in the command parameters.
        :run_number: Stored as "run_number" in the command parameters.
        :operation_result_dir: Directory in which the results of the
            process are stored.

    Raises ``NotImplementedError`` for datasets without train/test
    separation that consist of a single split.
    """
    super(WEKAClassificationProcess, self).__init__()
    import warnings

    # Load the abbreviations
    # NOTE(review): yaml.load without an explicit safe loader; only
    # acceptable as long as the spec directory is trusted
    abbreviations_path = os.path.join(pySPACE.configuration.spec_dir,
                                      'operations/weka_templates',
                                      'abbreviations.yaml')
    with open(abbreviations_path, 'r') as abbreviations_file:
        self.abbreviations = yaml.load(abbreviations_file)

    # Determine the directory in which the process' results
    # are stored
    self.result_directory = operation_result_dir

    # Create collection
    collection = BaseDataset.load(dataset_dir)

    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"collection_name": dataset_dir.strip(os.sep).split(os.sep)[-1],
                   "run_number": run_number,
                   "cv_folds": cv_folds,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory,
                   "unique_id": WEKAClassificationProcess.unique_id}

    # Collection dependent parameters
    if not collection.meta_data["train_test"] \
            and collection.meta_data["splits"] == 1:
        raise NotImplementedError(
            "Datasets without train/test separation and with only one "
            "split are not supported.")

    # The pattern of the train and test files generated by crossvalidation
    data_pattern = os.path.join(dataset_dir,
                                collection.meta_data["data_pattern"])
    # One example arff file in which WEKa can look up relation name etc.
    sample_dataset = data_pattern.replace("_run", "_run0")\
                                 .replace("_sp_", "_sp0_")\
                                 .replace("_tt", "_train")
    self.params.update({"sample_dataset": sample_dataset,
                        "data_pattern": data_pattern})

    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.items():
        self.params[parameter_name + "_abbr"] = parameter_value
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        elif parameter_name == 'classifier':
            warnings.warn("Did not find classifier abbreviation %s. "
                          " Expecting full name." % parameter_value)
        self.params[parameter_name] = parameter_value

    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template (parameter values may contain placeholders themselves)
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        # We have to continue since we are not converged
        command_template = instantiated_template

    self.handler_class = None

    WEKAClassificationProcess.unique_id += 1
def store(self, result_dir, s_format="BrainVision"):
    """ Store the collection as BrainVision files below *result_dir*

    For every (run, split, train/test) key an ``.eeg`` data file, a
    ``.vmrk`` marker file and a ``.vhdr`` header file are written to
    ``<result_dir>/data_analyzer_run<run>``. Gaps before a window are
    filled with zero samples and one "Label" marker is written per
    window.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            Only "BrainVision" is supported. For any other value a
            critical message is logged and nothing is stored.

            (*optional, default: "BrainVision"*)
    """
    # Keep original file name, depends on the AnalyserSinkNode, see its
    # documentation. Otherwise use the default name of this collection.
    keep_src_name = ("eeg_src_file_name" in self.meta_data and
                     self.meta_data["eeg_src_file_name"] is not None)
    if keep_src_name:
        name = self.meta_data["eeg_src_file_name"]
    else:
        name = "Analyzer"

    if not s_format == "BrainVision":
        self._log("The format %s is not supported!" % s_format,
                  level=logging.CRITICAL)
        return

    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({
        "type": "only output of individual nodes stored",
        "storage_format": s_format,
        "author": author,
        "data_pattern": "Multiplexed",
    })
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)

    # end times of the accepted (non-overlapping) windows
    slices = [0]
    channel_names = []

    for key, time_series in self.data.items():
        # Sort the time series list in place by window start time
        time_series.sort(key=lambda entry: entry[0].start_time)

        # Check for overlapping windows (they are only reported)
        for entry in time_series:
            if entry[0].start_time >= slices[-1]:
                slices.append(entry[0].end_time)
            else:
                warnings.warn("Ignoring at least one overlapping window!",
                              UserWarning)

        # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
        result_path = result_dir + os.sep + "data_analyzer" \
            + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Keep original name or extend the default name by the key
        if keep_src_name:
            base_name = name
        else:
            base_name = name + key_str

        num_ch = 0
        sampling_int = 0
        with open(os.path.join(result_path, base_name + ".eeg"),
                  "wb") as result_file_eeg:
            with open(os.path.join(result_path, base_name + ".vmrk"),
                      "w") as result_file_mrk:
                # Write Marker header
                result_file_mrk.write(header_mrk % (base_name))

                result_file_ms = 0
                # Data for padding
                padding = None
                count_mrk = 2
                for ts in time_series:
                    # identity test: "== None" is ambiguous as soon as
                    # padding is a numpy array
                    if padding is None:
                        padding = numpy.zeros(len(ts[0].channel_names),
                                              dtype="int16")
                        num_ch = len(ts[0].channel_names)
                        channel_names = ts[0].channel_names
                        sampling_int = 1000000 / ts[0].sampling_frequency

                    # Write Padding (zeros)
                    while result_file_ms < ts[0].start_time:
                        result_file_eeg.write(padding.tostring())
                        result_file_ms += ts[0]._samples_to_ms(1)
                    # Write window
                    ts[0].tofile(result_file_eeg)
                    result_file_ms += ts[0].end_time - ts[0].start_time
                    # Write Marker
                    result_file_mrk.write(
                        "Mk%d=Label,%s,%d,1,0\n"
                        % (count_mrk, ts[1],
                           ts[0]._ms_to_samples(ts[0].start_time)))
                    count_mrk += 1

        # WRITE HEADERFILE
        with open(os.path.join(result_path, base_name + ".vhdr"),
                  "w") as result_file_hdr:
            result_file_hdr.write(
                header_hdr % ((base_name), (base_name), num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,0.1,\xB5V\n"
                                      % (i + 1, channel_names[i]))
def store(self, result_dir, s_format = "bp_eeg"):
    """ Store the collection as BrainVision files below *result_dir*

    For every (run, split, train/test) key an ``.eeg`` data file, a
    ``.vmrk`` marker file and a ``.vhdr`` header file are written to
    ``<result_dir>/data_analyzer_run<run>``. The sample values are
    multiplied by 10 and converted to int16; the header states the
    inverse factor as channel resolution. Every entry of a window's
    ``marker_name`` dictionary is written as a marker.

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            Only "bp_eeg" is supported. For any other value a
            critical message is logged and nothing is stored.

            (*optional, default: "bp_eeg"*)
    """
    self.merged = False
    # is used to scale up the eeg sample values. The data samples are
    # converted to int16 when saving, so scaling is necessary to
    # maintain the resolution.
    scale = 10.0

    # Keep original file name, depends on the AnalyserSinkNode, see its
    # documentation. Otherwise use the default name of this collection.
    keep_src_name = ('eeg_src_file_name' in self.meta_data and
                     self.meta_data['eeg_src_file_name'] is not None)
    if keep_src_name:
        name = self.meta_data['eeg_src_file_name']
    else:
        name = "Analyzer"

    if not s_format == "bp_eeg":
        self._log("The format %s is not supported!"%s_format,
                  level=logging.CRITICAL)
        return

    # Update the meta data
    author = get_author()
    self.update_meta_data({"type": "only output of individual nodes stored",
                           "storage_format": s_format,
                           "author" : author,
                           "data_pattern": "Multiplexed"})
    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)

    # end times of the accepted (non-overlapping) windows
    slices = [0]
    channel_names = []

    for key, time_series in self.data.items():
        # Sort the time series list in place by window start time
        time_series.sort(key=lambda entry: entry[0].start_time)

        # Check for overlapping windows (they are only reported)
        for entry in time_series:
            if entry[0].start_time >= slices[-1]:
                slices.append(entry[0].end_time)
            else:
                warnings.warn("Ignoring at least one overlapping window!",
                              UserWarning)

        # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
        result_path = result_dir + os.sep + "data_analyzer" \
            + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        # Keep original name or extend the default name by the key
        if keep_src_name:
            base_name = name
        else:
            base_name = name + key_str

        num_ch = 0
        sampling_int = 0
        with open(os.path.join(result_path, base_name + ".eeg"),
                  "wb") as result_file_eeg:
            with open(os.path.join(result_path, base_name + ".vmrk"),
                      "w") as result_file_mrk:
                # Write Marker header
                result_file_mrk.write(header_mrk % (base_name))

                result_file_ms = 0
                # Data for padding
                padding = None
                count_mrk = 2
                for ts in time_series:
                    window = ts[0]
                    scaled = (window * scale).astype(numpy.int16)

                    # identity test: "== None" is ambiguous as soon as
                    # padding is a numpy array
                    if padding is None:
                        padding = numpy.zeros(len(window.channel_names),
                                              dtype='int16')
                        num_ch = len(window.channel_names)
                        channel_names = window.channel_names
                        sampling_int = 1000000/window.sampling_frequency

                    # Write Padding (zeros)
                    while result_file_ms < window.start_time - sampling_int/1000.0:
                        result_file_eeg.write(padding.tostring())
                        result_file_ms += window._samples_to_ms(1)
                    # Write window
                    scaled.tofile(result_file_eeg)
                    result_file_ms += window.end_time - \
                        (window.start_time - sampling_int/1000.0)

                    # Write Marker: flatten the marker dictionary into
                    # (name, time) pairs ordered by time
                    markers = []
                    for marker, times in window.marker_name.items():
                        for marker_time in times:
                            markers.append((marker, marker_time))
                    markers = sorted(markers, key=lambda tup: tup[1])

                    for marker, marker_time in markers:
                        if 'R' in marker:
                            event_type = 'Response'
                        elif 'S' in marker:
                            event_type = 'Stimulus'
                        else:
                            event_type = 'Label'
                        result_file_mrk.write(
                            "Mk%d=%s,%s,%d,1,0\n"
                            % (count_mrk, event_type, marker,
                               (window.start_time + marker_time)
                               * window.sampling_frequency/1000.0))
                        count_mrk += 1

        # WRITE HEADERFILE
        with open(os.path.join(result_path, base_name + ".vhdr"),
                  "w") as result_file_hdr:
            result_file_hdr.write(
                header_hdr % ((base_name), (base_name), num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n"
                                      % (i+1, channel_names[i], 1./scale))
def store(self, result_dir, s_format = ["pickle", "real"]):
    """ Stores this collection in the directory *result_dir*.

    In contrast to *dump* this method stores the collection
    not in a single file but as a whole directory structure with meta
    information etc. The data sets are stored separately for each run,
    split, train/test combination in files prefixed with "features".

    **Parameters**

        :result_dir:
            The directory in which the collection will be stored.

        :s_format:
            A list with information about the format in which the
            actual data sets should be stored. The first entry specifies
            the file format ("pickle", "arff" or "csv"). If it is "arff"
            the second entry specifies the attribute format.

            Examples: ["arff", "real"], ["arff", "{0,1}"]

            To store the data in comma separated values, use
            ["csv", "real"]. A plain string is treated like
            ``[s_format, "real"]``. Unknown file formats are reported
            as an error and stored as "pickle".

            (*optional, default: ["pickle", "real"]*)

    .. todo:: Someone could implement the format ["fasta"] for sax features

    .. todo:: Adapt storing of csv file to external library instead of
              doing it manually.
    """
    name = "features"
    # Split the specification first, so that the meta data below is
    # built from the file format and not from a part of it.
    requested_format = s_format
    if isinstance(s_format, (list, tuple)):
        s_type = s_format[1] if len(s_format) > 1 else "real"
        s_format = s_format[0]
    else:
        s_type = "real"

    # Update the meta data
    try:
        author = pwd.getpwuid(os.getuid())[4]
    except Exception:
        author = "unknown"
        self._log("Author could not be resolved.", level=logging.WARNING)
    self.update_meta_data({"type": "feature_vector",
                           "storage_format": requested_format,
                           "author": author,
                           "data_pattern": "data_run" + os.sep
                                           + name + "_sp_tt." + s_format})

    if not s_format in ["csv", "arff", "pickle"]:
        self._log("Storage format not supported! Using default.",
                  level=logging.ERROR)
        s_format = "pickle"

    # Iterate through splits and runs in this dataset
    for key, feature_vectors in self.data.items():
        # Construct result directory
        result_path = result_dir + os.sep + "data" \
            + "_run%s" % key[0]
        if not os.path.exists(result_path):
            os.mkdir(result_path)

        key_str = "_sp%s_%s" % key[1:]
        file_path = os.path.join(result_path,
                                 name + key_str + "." + s_format)

        # Store data depending on the desired format
        if s_format == "pickle":
            # binary mode: the highest pickle protocol is a binary format
            with open(file_path, "wb") as result_file:
                cPickle.dump(feature_vectors, result_file,
                             cPickle.HIGHEST_PROTOCOL)
            continue

        # The first sample decides how every feature value is printed;
        # anything that is neither float nor integer is written via str.
        # Without samples no value is written and the default is unused.
        feature_format = "%s,"
        if len(feature_vectors) > 0:
            fv = feature_vectors[0][0]
            if numpy.issubdtype(fv.dtype, numpy.floating):
                feature_format = "%f,"
            elif numpy.issubdtype(fv.dtype, numpy.integer):
                feature_format = "%d,"

        with open(file_path, "w") as result_file:
            if s_format == "arff":  # Write as ARFF
                # Create the arff file header
                relation_name = result_dir.split(os.sep)[-1]
                result_file.write('@relation "%s"\n' % relation_name)
                # Write the type of all features
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("@attribute %s %s\n"
                                      % (feature_name, s_type))
                classString = ",".join(
                    sorted(self.meta_data["classes_names"]))
                result_file.write("@attribute class {%s}\n" % classString)
                result_file.write("@data\n")
                # Write all given training data into the ARFF file
                for features, class_name in feature_vectors:
                    for feature in features[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))
            elif s_format == "csv":  # Write as Comma Separated Value
                for feature_name in self.meta_data["feature_names"]:
                    result_file.write("%s," % (feature_name))
                result_file.write("\n")
                for features, class_name in feature_vectors:
                    f = features.view(numpy.ndarray)
                    for feature in f[0]:
                        result_file.write(feature_format % feature)
                    result_file.write("%s\n" % str(class_name))

    # Store meta data
    BaseDataset.store_meta_data(result_dir, self.meta_data)
def consolidate(self):
    """ Consolidate the results of the single WEKA filter processes

    Every sub-directory of ``self.result_directory`` whose name starts
    with "{" is treated as one collection. For each such collection:

    * the relation name (first line) of every ``*train.arff`` file
      below ``data_run*`` is cut after its last "}" and extended by the
      parameter part of the collection name; the matching ``test.arff``
      file gets the same relation name,
    * the meta data of the input collection is loaded, stored as
      ``input_metadata.yaml`` and, after being updated (feature names,
      number of features, author, date, input collection name),
      stored as the meta data of the collection itself,
    * ``self.command_template`` is written to the file
      ``command_template``.

    For directories not starting with "{" only a warning is logged.
    Finally ``self.operation_spec`` is dumped to
    ``source_operation.yaml`` in the result directory.

    The last ``self.num_parameters`` "{...}" groups of a collection name
    are taken as parameters, the remaining ones as the name of the
    input collection.

    .. todo:: Some of the contents of this method should go into the
              :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
    """
    # Iterate over all collections and store the collection meta data etc.
    for entries in os.listdir(self.result_directory):
        fullpath = os.path.join(self.result_directory, entries)
        # Only directories are collections
        if not os.path.isdir(fullpath):
            continue
        if not entries.startswith("{"):
            # training and test arff need the same relation name
            # otherwise Weka can't relate it to each other; the collection
            # name and the parameters in {}{}-optic must be the relation
            # name for further processing
            self._log("WARNING: Collection name doesn't begin with '{'. Further processing may be collapse!", level= logging.WARNING)
            continue

        # Split the collection name once into the input collection part
        # and the parameter part used to adjust the relation name
        if self.num_parameters > 0:
            name_parts = entries.strip("}{").split("}{")
            parameter_postfix = \
                "{" + "}{".join(name_parts[-self.num_parameters:]) + "}"
            input_collection_name = \
                "{" + "}{".join(name_parts[:-self.num_parameters]) + "}"
        else:
            parameter_postfix = ""
            input_collection_name = entries

        # Reset per collection, so that a collection without train arff
        # files neither raises a NameError nor reuses the feature names
        # of the previously processed collection
        feature_names = []

        # Postprocessing of the arff files of this collection
        for train_arff_file in glob.glob(fullpath + os.sep + "data_run*"
                                         + os.sep + "*train.arff"):
            # Adjust the relation name of the train file
            with open(train_arff_file, 'r') as arff_file:
                content = arff_file.readlines()
            # We strip everything after the last "}"
            endindex = content[0].rfind("}")
            # NOTE(review): the rewritten relation line is written to the
            # train file without a trailing newline, whereas the test
            # file gets one below -- confirm that this is intended.
            content[0] = content[0][:endindex + 1] + parameter_postfix + "'"
            with open(train_arff_file, 'w') as arff_file:
                arff_file.writelines(content)

            # Use relation name of train data for test data
            test_arff_file = train_arff_file.replace("train.arff",
                                                     "test.arff")
            with open(test_arff_file, 'r') as arff_file:
                test_content = arff_file.readlines()
            test_content[0] = content[0] + "\n"
            with open(test_arff_file, 'w') as arff_file:
                arff_file.writelines(test_content)

            # Check which features are contained in the arff file.
            # The names are compared by value: an identity test ("is not")
            # against a string literal does not detect equal strings
            # reliably, so the class attribute would not be filtered out.
            attributes = (line.split()[1] for line in content
                          if line.startswith("@attribute"))
            feature_names = [attribute for attribute in attributes
                             if attribute != "class"]

        # Store the collection meta data etc.
        input_collection_path = os.path.join(
            self.operation_spec["input_path"], input_collection_name)
        input_collection_meta = BaseDataset.load_meta_data(
            pySPACE.configuration.storage + os.sep + input_collection_path)
        # Store the input collection
        BaseDataset.store_meta_data(fullpath, input_collection_meta,
                                    file_name="input_metadata.yaml")
        # Adjust collection metadata for the new collection
        input_collection_meta["feature_names"] = feature_names
        input_collection_meta["num_features"] = len(feature_names)
        input_collection_meta["author"] = get_author()
        input_collection_meta["date"] = time.strftime("%Y%m%d")
        input_collection_meta["input_collection_name"] = \
            input_collection_name
        # Write the collection meta information into the folder
        BaseDataset.store_meta_data(fullpath, input_collection_meta)
        # Store the command_template
        with open(os.path.join(fullpath, "command_template"),
                  'w') as command_template_file:
            command_template_file.write(self.command_template)

    # Write the specification of this operation
    # to the result directory in order to make later
    # analysis of results more easy
    with open(os.path.join(self.result_directory, "source_operation.yaml"),
              'w') as source_operation_file:
        yaml.dump(self.operation_spec, source_operation_file)