def store_state(self, result_dir, index=None):
    """ Stores the learned linear features in the given directory *result_dir*

    Nothing is written unless *self.store* is set and *self.kernel_type*
    is 'LINEAR'. The features are taken from *self.features* or, if this
    attribute does not exist, derived from the weights *self.w*.
    They are written twice into a sub directory of *result_dir* named after
    the class of this node: as pickle (protocol 2) and as their string
    representation (file ending ``.yaml``).

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not (self.store and self.kernel_type == 'LINEAR'):
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    from pySPACE.tools.filesystem import create_directory
    create_directory(node_dir)
    try:
        self.features
    except AttributeError:
        # The attribute has not been set yet: derive it from the weights
        if isinstance(self.w, FeatureVector):
            self.features = self.w
        elif self.w is not None:
            self.features = FeatureVector(self.w.T, self.feature_names)
        else:
            self.features = None
    if self.features is not None:
        # This node stores the learned features
        name = "%s_sp%s.pickle" % ("features", self.current_split)
        # The with statement closes the file even if writing fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(cPickle.dumps(self.features, protocol=2))
        name = "%s_sp%s.yaml" % ("features", self.current_split)
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(str(self.features))
        del self.features
def __init__(self, node_chain_spec, parameter_setting, rel_dataset_dir, run,
             split, storage_format, result_dataset_directory,
             store_node_chain=False, hide_parameters=None):
    """ Prepare the processing of one run and split with a node chain

    **Parameters**

        :node_chain_spec: Specification of the node chain. The parameters
            given in *parameter_setting* are replaced in it before the
            node chain is created.
        :parameter_setting: Mapping of parameter names to their values
        :rel_dataset_dir: Stored as attribute
        :run: Number of the run, used in the name of the persistency
            directory
        :split: Set as *current_split* at every node of the node chain
        :storage_format: Stored as attribute
        :result_dataset_directory: Directory in which the directory
            ``persistency_run<run>`` is created
        :store_node_chain: Stored as attribute
        :hide_parameters: Stored as attribute. If omitted, a new empty
            list is used for each instance.
    """
    super(NodeChainProcess, self).__init__()

    # A list as default value would be shared between all instances
    if hide_parameters is None:
        hide_parameters = []

    self.node_chain_spec = node_chain_spec
    self.parameter_setting = parameter_setting
    self.rel_dataset_dir = rel_dataset_dir
    self.storage = pySPACE.configuration.storage
    self.run = run
    self.storage_format = storage_format
    self.result_dataset_directory = result_dataset_directory
    self.persistency_dir = os.sep.join(
        [result_dataset_directory, "persistency_run%s" % run])
    create_directory(self.persistency_dir)
    self.store_node_chain = store_node_chain
    self.hide_parameters = hide_parameters

    # reduce_log_level for process creation
    # NOTE(review): the configured log levels are evaluated with eval(),
    # this is only safe as long as the configuration is trusted.
    try:
        console_log_level = eval(pySPACE.configuration.console_log_level) \
            if hasattr(pySPACE.configuration, "console_log_level") \
            else logging.WARNING
    except (AttributeError, NameError):
        console_log_level = logging.WARNING
    try:
        file_log_level = eval(pySPACE.configuration.file_log_level) \
            if hasattr(pySPACE.configuration, "file_log_level") \
            else logging.INFO
    except (AttributeError, NameError):
        file_log_level = logging.INFO

    self.min_log_level = min(console_log_level, file_log_level)
    pySPACE.configuration.min_log_level = self.min_log_level

    # Replace parameters in spec file
    self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                               self.parameter_setting)
    # Create node chain
    self.node_chain = NodeChainFactory.flow_from_yaml(
        Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)

    for node in self.node_chain:
        node.current_split = split

    # Remove pseudo parameter "__PREPARE_OPERATION__"
    # (on a copy, the mapping handed in by the caller is not changed)
    if "__PREPARE_OPERATION__" in self.parameter_setting:
        self.parameter_setting = copy.deepcopy(self.parameter_setting)
        self.parameter_setting.pop("__PREPARE_OPERATION__")
def store_state(self, result_dir, index=None):
    """ Stores the learned linear features in the given directory *result_dir*

    Nothing is written unless *self.store* is set and *self.kernel_type*
    is 'LINEAR'. The features are taken from *self.features* or, if this
    attribute does not exist, derived from the weights *self.w*.
    They are written twice into a sub directory of *result_dir* named after
    the class of this node: as pickle (protocol 2) and as their string
    representation (file ending ``.yaml``).

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not (self.store and self.kernel_type == 'LINEAR'):
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    from pySPACE.tools.filesystem import create_directory
    create_directory(node_dir)
    try:
        self.features
    except AttributeError:
        # The attribute has not been set yet: derive it from the weights
        if isinstance(self.w, FeatureVector):
            self.features = self.w
        elif self.w is not None:
            self.features = FeatureVector(self.w.T, self.feature_names)
        else:
            self.features = None
    if self.features is not None:
        # This node stores the learned features
        name = "%s_sp%s.pickle" % ("features", self.current_split)
        # The with statement closes the file even if writing fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(cPickle.dumps(self.features, protocol=2))
        name = "%s_sp%s.yaml" % ("features", self.current_split)
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(str(self.features))
        del self.features
def __init__(self, dataset_dir, command_template, parametrization,
             run_number, split_number, operation_result_dir,
             hide_parameters=[]):
    """ Prepare the WEKA command for one dataset, run and split

    **Parameters**

        :dataset_dir: Directory of the input dataset. Its second last
            path component is the base of the result collection name.
        :command_template: Template of the WEKA command. Placeholders are
            replaced repeatedly until the command does not change anymore.
        :parametrization: Mapping of parameter names to values. Values
            found in the abbreviations file are expanded.
        :run_number: Number of the run
        :split_number: Number of the split
        :operation_result_dir: Directory in which the result directory of
            this process is created
        :hide_parameters: Names of parameters which are not encoded in
            the result collection name (only read, never modified)
    """
    super(WEKAFilterProcess, self).__init__()

    # Determine the directory in which the process' results are stored
    result_collection_name = dataset_dir.split(os.sep)[-2]
    for parameter_name, parameter_value in parametrization.iteritems():
        # If this is a parameter that should not be hidden, then we have to
        # encode it in the result collection name
        if parameter_name not in hide_parameters:
            result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                       parameter_value)

    self.result_directory = os.path.join(operation_result_dir,
                                         result_collection_name)

    # Create directory for intermediate results if it does not exist yet
    create_directory(self.result_directory
                     + os.sep + "data_run%s" % run_number)

    # Create collection
    # (the loaded dataset itself is not used any further in this method)
    collection = BaseDataset.load(dataset_dir)

    # The parametrization that is independent of the collection type
    # and the specific weka command template that is executed
    self.params = {"dataset_name": dataset_dir.replace('/', '_'),
                   "dataset_dir": dataset_dir,
                   "run_number": run_number,
                   "split_number": split_number,
                   "weka_class_path": pySPACE.configuration.weka_class_path,
                   "temp_results": self.result_directory}

    # Load the abbreviations
    # The with statement closes the file, which was left open before.
    # NOTE(review): yaml.load can construct arbitrary objects, so the
    # abbreviations file has to be trusted.
    abbreviations_path = os.path.join(pySPACE.configuration.spec_dir,
                                      'operations/weka_templates',
                                      'abbreviations.yaml')
    with open(abbreviations_path, 'r') as abbreviations_file:
        self.abbreviations = yaml.load(abbreviations_file)

    # Add custom parameters for the weka command template
    for parameter_name, parameter_value in parametrization.iteritems():
        # Auto-expand abbreviations
        if parameter_value in self.abbreviations:
            parameter_value = self.abbreviations[parameter_value]
        self.params[parameter_name] = parameter_value

    # Build the WEKA command by repeatedly replacing all placeholders in
    # the template
    while True:
        instantiated_template = command_template % self.params
        if instantiated_template == command_template:
            # All placeholders replaced
            self.weka_command = instantiated_template
            break
        # We have to continue since we are not converged
        command_template = instantiated_template

    self.handler_class = None
def __init__(self, node_chain_spec, parameter_setting, rel_dataset_dir, run,
             split, storage_format, result_dataset_directory,
             store_node_chain=False, hide_parameters=None):
    """ Prepare the processing of one run and split with a node chain

    **Parameters**

        :node_chain_spec: Specification of the node chain. The parameters
            given in *parameter_setting* are replaced in it before the
            node chain is created.
        :parameter_setting: Mapping of parameter names to their values
        :rel_dataset_dir: Stored as attribute
        :run: Number of the run, used in the name of the persistency
            directory
        :split: Set as *current_split* at every node of the node chain
        :storage_format: Stored as attribute
        :result_dataset_directory: Directory in which the directory
            ``persistency_run<run>`` is created
        :store_node_chain: Stored as attribute
        :hide_parameters: Stored as attribute. If omitted, a new empty
            list is used for each instance.
    """
    super(NodeChainProcess, self).__init__()

    # A list as default value would be shared between all instances
    if hide_parameters is None:
        hide_parameters = []

    self.node_chain_spec = node_chain_spec
    self.parameter_setting = parameter_setting
    self.rel_dataset_dir = rel_dataset_dir
    self.storage = pySPACE.configuration.storage
    self.run = run
    self.storage_format = storage_format
    self.result_dataset_directory = result_dataset_directory
    self.persistency_dir = os.sep.join(
        [result_dataset_directory, "persistency_run%s" % run])
    create_directory(self.persistency_dir)
    self.store_node_chain = store_node_chain
    self.hide_parameters = hide_parameters

    # reduce_log_level for process creation
    # NOTE(review): the configured log levels are evaluated with eval(),
    # this is only safe as long as the configuration is trusted.
    try:
        console_log_level = eval(pySPACE.configuration.console_log_level) \
            if hasattr(pySPACE.configuration, "console_log_level") \
            else logging.WARNING
    except (AttributeError, NameError):
        console_log_level = logging.WARNING
    try:
        file_log_level = eval(pySPACE.configuration.file_log_level) \
            if hasattr(pySPACE.configuration, "file_log_level") \
            else logging.INFO
    except (AttributeError, NameError):
        file_log_level = logging.INFO

    self.min_log_level = min(console_log_level, file_log_level)
    pySPACE.configuration.min_log_level = self.min_log_level

    # Replace parameters in spec file
    self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                               self.parameter_setting)
    # Create node chain
    self.node_chain = NodeChainFactory.flow_from_yaml(
        Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)

    for node in self.node_chain:
        node.current_split = split

    # Remove pseudo parameter "__PREPARE_OPERATION__"
    # (on a copy, the mapping handed in by the caller is not changed)
    if "__PREPARE_OPERATION__" in self.parameter_setting:
        self.parameter_setting = copy.deepcopy(self.parameter_setting)
        self.parameter_setting.pop("__PREPARE_OPERATION__")
def store_state(self, result_dir, index=None):
    """ Stores the projection in the given directory *result_dir*

    If *self.store* is set, *self.projection* is pickled (protocol 2) to
    ``projection_sp<current_split>.pickle`` in a sub directory of
    *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)
    name = "%s_sp%s.pickle" % ("projection", self.current_split)
    # The with statement closes the file even if pickling or writing fails
    with open(os.path.join(node_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps(self.projection, protocol=2))
def _createProcesses(cls, processes, result_dir, data_dict, parameters,
                     metrics, top_level):
    """ Recursive function that is used to create the analysis processes

    Each process creates one plot for each numeric parameter, each pair of
    numeric parameters, and each nominal parameter based on the data
    contained in the *data_dict*. The results are stored in *result_dir*.
    The method calls itself recursively for each value of each parameter.

    **Parameters**

        :processes: Queue-like object; every created process is handed
            over with *put*. ``False`` is put at the end of the top level
            call to signal that the creation is finished.
        :result_dir: Directory for the results of the created process
        :data_dict: Mapping of column names to lists of values
        :parameters: Names of the parameters to split the data by
        :metrics: Passed on to the created processes
        :top_level: Flag, true only for the outermost call
    """
    # Create the analysis process for the given parameters and the
    # given data
    process = AnalysisProcess(result_dir, data_dict, parameters, metrics)
    processes.put(process)

    # If we have less than two parameters it does not make sense to
    # split further
    if len(parameters) < 2:
        if top_level:
            # If we have only one parameter to visualize,
            # we don't need to create any further processes,
            # and we have to finish the creating process.
            processes.put(False)
        return

    # For each parameter
    for proj_parameter in parameters:
        # We split the data based on the values of this parameter
        remaining_parameters = [parameter for parameter in parameters
                                if parameter != proj_parameter]
        proj_column = data_dict[proj_parameter]
        # For each value the respective projection parameter can take on
        for value in set(proj_column):
            # Project the result dict onto the rows where the respective
            # parameter takes on the given value.
            # The rows are taken from the column of the projection
            # parameter itself. Before, the length of the column of a
            # variable leaked from the list comprehension above was used.
            projected_dict = defaultdict(list)
            entries_added = False
            for i, entry in enumerate(proj_column):
                if entry == value:
                    entries_added = True
                    for column_key in data_dict.keys():
                        if column_key == proj_parameter:
                            continue
                        projected_dict[column_key].append(
                            data_dict[column_key][i])
            # If the projected_dict is empty we continue
            if not entries_added:
                continue

            # Create result_dir and do the recursive call for the
            # projected data
            # Parameter is separated via #
            proj_result_dir = result_dir + os.sep + "%s#%s" % (proj_parameter,
                                                               value)
            create_directory(proj_result_dir)
            cls._createProcesses(processes, proj_result_dir, projected_dict,
                                 remaining_parameters, metrics, False)

    if top_level:
        # give executing process the sign that creation is now finished
        processes.put(False)
def create(cls, operation_spec, base_result_dir=None):
    """ A factory method that calls the responsible method for creating an
    operation of the type specified in the operation specification
    dictionary (*operation_spec*).

    **Parameters**

        :operation_spec: Dictionary with the specification. The entry
            "type" is required and selects the operation module, the
            entry "input_path" is optional. A type with the deprecated
            ending "_operation" is corrected in place.
        :base_result_dir: Passed on to *get_unique_result_dir*

    Returns the result of the *create* method of the selected operation
    class. If the operation module cannot be imported, the node chain
    operation is used and a warning is issued.
    """
    # Determine result directory
    result_directory = cls.get_unique_result_dir(base_result_dir)
    print("--> Results will be stored at: \n\t\t %s"%str(result_directory))
    # Check if the required directories exist
    # and create them if necessary
    create_directory(result_directory)

    # Determine all input datasets (note: they can be specified by
    # extended syntax for the glob package)
    storage = pySPACE.configuration.storage
    if "input_path" not in operation_spec:
        warnings.warn("No input path found in operation specification.")
    input_path_pattern = os.sep.join([storage,
                                      operation_spec.get("input_path", ""),
                                      "*", ""])
    input_paths = glob.glob(input_path_pattern)
    # Folders without meta data are collected first and removed afterwards
    # to avoid changing the list during the iteration
    obsolete_paths = []
    for path in input_paths:
        if os.path.isfile(os.sep.join([path, "metadata.yaml"])):
            continue
        elif os.path.isfile(os.sep.join([path, "collection.yaml"])):
            continue  # warning comes, when data is loaded
        else:
            obsolete_paths.append(path)
            warnings.warn('Folder' + str(path) + ' seems not to be a pySPACE'+
                          ' dataset (no "metadata.yaml" found)! '+
                          'Skipping this folder in operation...')
    for path in obsolete_paths:
        input_paths.remove(path)

    op_type = operation_spec["type"]
    if op_type.endswith("_operation"):
        op_type = op_type[:-len("_operation")]
        operation_spec["type"] = op_type
        warnings.warn("'%s_operation' has the wrong ending. Using '%s' instead."%(op_type,op_type),DeprecationWarning)
    op_class_name = ''.join([x.title() for x in op_type.split('_')])
    op_class_name += "Operation"
    # dynamic class import: from data_mod_name import col_class_name
    try:
        op_module = __import__('pySPACE.missions.operations.%s' % op_type,
                               fromlist=[op_class_name])
    except Exception:
        # Fallback for unknown operation modules. The message was built
        # but never shown before, now it is issued as warning.
        msg = "Operation module %s is unknown. Trying to use node_chain." % (op_type)
        warnings.warn(msg)
        from pySPACE.missions.operations.node_chain import NodeChainOperation
        op_class = NodeChainOperation
    else:
        op_class = getattr(op_module, op_class_name)
    return op_class.create(operation_spec, result_directory,
                           input_paths=input_paths)
def _get_result_dataset_dir(base_dir, input_dataset_dir,
                            parameter_setting, hide_parameters):
    """ Determines the name of the result directory and creates it

    The name is composed of the name of the input dataset and of all
    parameters (given ones and the ones from the meta data of the input)
    which are not listed in *hide_parameters*.

    Returns the path of the created directory below *base_dir*.
    """
    last_component = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    # If the input is already the result of an operation,
    # only the part in the first pair of braces is the name
    input_name = last_component.strip("{}").split("}{")[0]

    # Load the input meta data
    dataset_md = BaseDataset.load_meta_data(
        os.sep.join([pySPACE.configuration.storage, input_dataset_dir]))

    # Work on a copy such that later runs do not see our changes
    settings = copy.deepcopy(parameter_setting)
    # Ignore pseudo parameter "__PREPARE_OPERATION__"
    settings.pop("__PREPARE_OPERATION__", None)

    # Add the input parameters meta data to the given parameter setting
    if "parameter_setting" in dataset_md:
        settings.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them
    for key, value in settings.items():
        if isinstance(value, basestring) and value.count("'") > 1:
            settings[key] = eval(value)

    # Determine the result_directory name
    # String between key and value changed from ":" to "#",
    # because of problems in windows and with windows file servers
    visible_parts = ["%s#%s" % (key, value)
                     for key, value in settings.iteritems()
                     if key not in hide_parameters]
    parameter_str = "}{".join(visible_parts)

    result_name = "{%s}" % input_name
    if parameter_str != "":
        result_name += "{%s}" % (parameter_str)

    # Determine the path where this result will be stored
    # and create the directory if necessary
    result_dir = base_dir + os.sep + result_name
    create_directory(result_dir)

    return result_dir
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*.

    If *self.store* is set, the tuple (*self.avg*, *self.v*) is pickled
    (protocol 2) to ``eigenmatrix_sp<current_split>.pickle`` in a sub
    directory of *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)
    # This node only stores the learned eigenvector and eigenvalues
    name = "%s_sp%s.pickle" % ("eigenmatrix", self.current_split)
    # The with statement closes the file even if pickling or writing fails
    with open(os.path.join(node_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps((self.avg, self.v), protocol=2))
def store_state(self, result_dir, index=None):
    """ Stores *scikit_alg*

    If *self.store* is set, *self.scikit_alg* is pickled (protocol 2) to
    ``Model_sp<current_split>.pickle`` in a sub directory of *result_dir*
    named after the class of this node. Afterwards, the storing is
    delegated to the parent class.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Passed on to the parent class
    """
    if self.store:
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        create_directory(node_dir)
        name = "%s_sp%s.pickle" % ("Model", self.current_split)
        # The with statement closes the file even if pickling fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(cPickle.dumps(self.scikit_alg, protocol=2))
    super(ScikitPredictor, self).store_state(result_dir, index)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    The string representations of all entries of *self.window_definition*
    are written one after another, without separator, to the file
    ``window_definitions.txt`` in a sub directory of *result_dir* named
    after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    from pySPACE.tools.filesystem import create_directory
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # The with statement closes the file even if writing fails
    with open(os.path.join(node_dir, "window_definitions.txt"),
              "w") as result_file:
        for window_def in self.window_definition:
            result_file.write(str(window_def))
def store_state(self, result_dir, index=None):
    """ Stores *scikit_alg*

    If *self.store* is set, *self.scikit_alg* is pickled (protocol 2) to
    ``Model_sp<current_split>.pickle`` in a sub directory of *result_dir*
    named after the class of this node. Afterwards, the storing is
    delegated to the parent class.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Passed on to the parent class
    """
    if self.store:
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        create_directory(node_dir)
        name = "%s_sp%s.pickle" % ("Model", self.current_split)
        # The with statement closes the file even if pickling fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(cPickle.dumps(self.scikit_alg, protocol=2))
    super(ScikitPredictor, self).store_state(result_dir, index)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, the string representation of
    *self.selected_channels* is written to
    ``electrode_selection_sp<current_split>.txt`` in a sub directory of
    *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)
    # This node only stores which electrodes have been selected
    name = "%s_sp%s.txt" % ("electrode_selection", self.current_split)
    # Plain text write mode: the mode "wi" used before is not a valid
    # mode string. The with statement closes the file in any case.
    with open(os.path.join(node_dir, name), "w") as result_file:
        result_file.write(str(self.selected_channels))
def store_state(self, result_dir, index=None):
    """ Stores transformation and feature names in the given directory *result_dir*

    If *self.store* is set, the tuple (*self.translation*, *self.mult*,
    *self.feature_names*) is pickled (protocol 2) to
    ``FN_sp<current_split>.pickle`` in a sub directory of *result_dir*
    named after the class of this node. Afterwards, the storing is
    delegated to the parent class.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Passed on to the parent class
    """
    if self.store:
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        create_directory(node_dir)
        name = "%s_sp%s.pickle" % ("FN", self.current_split)
        # The with statement closes the file even if pickling fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(cPickle.dumps((self.translation,
                                            self.mult,
                                            self.feature_names),
                                           protocol=2))
    # The index is forwarded now, it was dropped before
    super(FeatureNormalizationNode, self).store_state(result_dir, index)
def __init__(self, processes, operation_spec, result_directory):
    """ Store the arguments and document the operation in its result directory

    **Parameters**

        :processes: Stored as attribute
        :operation_spec: Specification of the operation. It is dumped as
            YAML to ``source_operation.yaml`` in *result_directory*.
        :result_directory: Directory for the results, created if necessary
    """
    self.processes = processes
    self.operation_spec = operation_spec
    self.result_directory = result_directory
    # Check if the required directories exist
    # and create them if necessary
    create_directory(self.result_directory)

    # Store the specification of this operation in the directory.
    # The with statement closes the file even if dumping fails.
    spec_path = os.sep.join([self.result_directory,
                             "source_operation.yaml"])
    with open(spec_path, "w") as source_operation_file:
        yaml.dump(self.operation_spec, source_operation_file)
def store_state(self, result_dir, index=None):
    """ Store this node in the given directory *result_dir*

    If *self.store* is set, every node of *self.flow* stores its state
    and *self.search_history* is pickled (highest protocol) to
    ``search_history_sp<current_split>.pickle`` in a sub directory of
    *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Passed on to the nodes of the flow
    """
    # ..todo :: mapping of flow_id and parameterization?!
    if not self.store:
        return
    for node in self.flow:
        node.store_state(result_dir, index)

    class_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(class_dir)

    # Store the search history
    name = "search_history_sp%d.pickle" % self.current_split
    # The with statement closes the file even if pickling or writing fails
    with open(os.path.join(class_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps(self.search_history,
                                        protocol=cPickle.HIGHEST_PROTOCOL))
def __init__(self, processes, operation_spec, result_directory):
    """ Store the arguments and document the operation in its result directory

    **Parameters**

        :processes: Stored as attribute
        :operation_spec: Specification of the operation. It is dumped as
            YAML to ``source_operation.yaml`` in *result_directory*.
        :result_directory: Directory for the results, created if necessary
    """
    self.processes = processes
    self.operation_spec = operation_spec
    self.result_directory = result_directory
    # Check if the required directories exist
    # and create them if necessary
    create_directory(self.result_directory)

    # Store the specification of this operation in the directory.
    # The with statement closes the file even if dumping fails.
    spec_path = os.sep.join([self.result_directory,
                             "source_operation.yaml"])
    with open(spec_path, 'w') as source_operation_file:
        yaml.dump(self.operation_spec, source_operation_file)
def store_state(self, result_dir, index=None):
    """ Store this node in the given directory *result_dir*

    If *self.store* is set, every node of *self.flow* stores its state
    and *self.search_history* is pickled (highest protocol) to
    ``search_history_sp<current_split>.pickle`` in a sub directory of
    *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Passed on to the nodes of the flow
    """
    # ..todo :: mapping of flow_id and parameterization?!
    if not self.store:
        return
    for node in self.flow:
        node.store_state(result_dir, index)

    class_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(class_dir)

    # Store the search history
    name = "search_history_sp%d.pickle" % self.current_split
    # The with statement closes the file even if pickling or writing fails
    with open(os.path.join(class_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps(self.search_history,
                                        protocol=cPickle.HIGHEST_PROTOCOL))
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, *self.filters* is pickled (highest protocol)
    to ``patterns_sp<current_split>.pickle``. If *self.visualize_pattern*
    is set, the spatial filter plots are stored, too. Both go into a sub
    directory of *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not (self.store or self.visualize_pattern):
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)
    if self.store:
        # This node only stores the learned CSP patterns
        name = "%s_sp%s.pickle" % ("patterns", self.current_split)
        # The with statement closes the file even if pickling fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(
                cPickle.dumps(self.filters,
                              protocol=cPickle.HIGHEST_PROTOCOL))

    # Store spatial filter plots if desired
    if self.visualize_pattern:
        CSPNode._store_spatial_filter_plots(self.filters,
                                            self.channel_names,
                                            node_dir)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, two files are written into a sub directory of
    *result_dir* named after the class of this node:
    *self.retained_feature_indices* as pickle (protocol 2) in
    ``selected_features_sp<current_split>.pickle`` and
    *self.feature_names* as text in
    ``feature_names_sp<current_split>.txt``.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # This node only stores the order of the selected features' indices
    name = "%s_sp%s.pickle" % ("selected_features", self.current_split)
    # The with statements close the files even if writing fails
    with open(os.path.join(node_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps(self.retained_feature_indices,
                                        protocol=2))

    # Store feature names
    name = "feature_names_sp%s.txt" % self.current_split
    with open(os.path.join(node_dir, name), "w") as result_file:
        result_file.write("%s" % self.feature_names)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, two files are written into a sub directory of
    *result_dir* named after the class of this node:
    *self.retained_feature_indices* as pickle (protocol 2) in
    ``selected_features_sp<current_split>.pickle`` and
    *self.feature_names* as text in
    ``feature_names_sp<current_split>.txt``.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # This node only stores the order of the selected features' indices
    name = "%s_sp%s.pickle" % ("selected_features", self.current_split)
    # The with statements close the files even if writing fails
    with open(os.path.join(node_dir, name), "wb") as result_file:
        result_file.write(cPickle.dumps(self.retained_feature_indices,
                                        protocol=2))

    # Store feature names
    name = "feature_names_sp%s.txt" % self.current_split
    with open(os.path.join(node_dir, name), "w") as result_file:
        result_file.write("%s" % self.feature_names)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, *self.filters* is pickled (highest protocol)
    to ``patterns_sp<current_split>.pickle``. If *self.visualize_pattern*
    is set, the spatial filter plots are stored, too. Both go into a sub
    directory of *result_dir* named after the class of this node.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if not (self.store or self.visualize_pattern):
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)
    if self.store:
        # This node only stores the learned CSP patterns
        name = "%s_sp%s.pickle" % ("patterns", self.current_split)
        # The with statement closes the file even if pickling fails
        with open(os.path.join(node_dir, name), "wb") as result_file:
            result_file.write(
                cPickle.dumps(self.filters,
                              protocol=cPickle.HIGHEST_PROTOCOL))

    # Store spatial filter plots if desired
    if self.visualize_pattern:
        CSPNode._store_spatial_filter_plots(self.filters,
                                            self.channel_names,
                                            node_dir)
def __init__(self, processes, operation_spec, result_directory):
    """ Store the arguments and document the operation in its result directory

    **Parameters**

        :processes: Stored as attribute
        :operation_spec: Specification of the operation. It is dumped as
            YAML, without its entry "base_file", to
            ``source_operation.yaml`` in *result_directory*.
        :result_directory: Directory for the results, created if necessary
    """
    self.processes = processes
    self.operation_spec = operation_spec
    self.result_directory = result_directory
    # Check if the required directories exist
    # and create them if necessary
    create_directory(self.result_directory)

    # Store the specification of this operation in the directory
    # without the base_file entry
    base_file = self.operation_spec.pop("base_file", None)
    spec_path = os.sep.join([self.result_directory,
                             "source_operation.yaml"])
    try:
        with open(spec_path, 'w') as source_operation_file:
            yaml.dump(self.operation_spec, source_operation_file)
    finally:
        # Put the entry back even if storing the specification failed,
        # such that the specification of the caller stays complete
        if base_file is not None:
            self.operation_spec["base_file"] = base_file
def __init__(self, processes, operation_spec, result_directory):
    """ Store the arguments and document the operation in its result directory

    **Parameters**

        :processes: Stored as attribute
        :operation_spec: Specification of the operation. It is dumped as
            YAML, without its entry "base_file", to
            ``source_operation.yaml`` in *result_directory*.
        :result_directory: Directory for the results, created if necessary
    """
    self.processes = processes
    self.operation_spec = operation_spec
    self.result_directory = result_directory
    # Check if the required directories exist
    # and create them if necessary
    create_directory(self.result_directory)

    # Store the specification of this operation in the directory
    # without the base_file entry
    base_file = self.operation_spec.pop("base_file", None)
    spec_path = os.sep.join([self.result_directory,
                             "source_operation.yaml"])
    try:
        with open(spec_path, 'w') as source_operation_file:
            yaml.dump(self.operation_spec, source_operation_file)
    finally:
        # Put the entry back even if storing the specification failed,
        # such that the specification of the caller stays complete
        if base_file is not None:
            self.operation_spec["base_file"] = base_file
def store_state(self, result_dir, index=None):
    """ Stores all generated plots in the given directory *result_dir*

    If *self.store* is set, the available plots are saved into a sub
    directory of *result_dir* named after the class of this node
    (extended by ``_<index>`` if an index is given) and all figures are
    closed afterwards.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Optional number which is appended to the directory name
    """
    if not self.store:
        return
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    # Identity comparison with None instead of "== None"
    if index is not None:
        node_dir += "_%d" % int(index)
    create_directory(node_dir)

    if self.ts_plot is not None:
        name = 'timeseries_sp%s.pdf' % self.current_split
        self.ts_plot.savefig(os.path.join(node_dir, name),
                             bbox_inches="tight")

    if self.histo_plot is not None:
        name = 'histo_sp%s.pdf' % self.current_split
        self.histo_plot.savefig(os.path.join(node_dir, name),
                                bbox_inches="tight")

    # For every label the correlation matrix is stored as text and the
    # feature development plot with the same label as pdf
    for label in self.labeled_corr_matrix.keys():
        name = 'Feature_Correlation_%s_sp%s.txt' % (label,
                                                    self.current_split)
        pylab.savetxt(os.path.join(node_dir, name),
                      self.labeled_corr_matrix[label], fmt='%s',
                      delimiter=' ')
        name = 'Feature_Development_%s_sp%s.pdf' % (label,
                                                    self.current_split)
        self.feature_development_plot[label].savefig(
            os.path.join(node_dir, name))

    for label in self.corr_plot.keys():
        name = 'Feature_Correlation_%s_sp%s.pdf' % (label,
                                                    self.current_split)
        self.corr_plot[label].savefig(os.path.join(node_dir, name))

    pylab.close("all")
def __call__(self):
    """ Executes this process on the respective modality

    Every collection in *self.input_collections* is used once as test
    collection. All other input collections which fulfill
    *self.collection_constraints* are merged to the corresponding
    training data. The result is written to a new collection below
    *self.result_directory*, whose name combines the base name of the
    test collection and *self.name_pattern* with "_vs_".
    If no other collection remains for a test collection, it is skipped.
    """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check if the input data is splitted
        # e.g. only a single test file is in the source directory
        source_files = glob.glob(
            os.sep.join(
                [source_test_collection_path, "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        # NOTE(review): the assert is skipped when running with -O, and an
        # empty file list raises an IndexError in the next statement
        assert (not splitted)
        source_file_name = str(source_files[-1])

        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path,"data_run0",\
             "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        # (the collection name is the second last path component)
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_" + self.name_pattern)
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                self.name_pattern + "_vs_" + test_base_collection_name)
            key = "test"

        target_collection_path = os.sep.join(
            [self.result_directory, target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection
            # NOTE(review): "features_sp0" has no trailing underscore here,
            # whereas all other file names in this method are built with
            # "features_sp0_" - looks like a typo, confirm the file names
            source_test_file_path = os.sep.join([
                source_test_collection_path, "data_run0",
                "features_sp0" + train_set_name_suffix + ".arff"
            ])
            target_test_file_path = os.sep.join([
                target_collection_path, "data_run0",
                "features_sp0_" + key + ".arff"
            ])
        else:
            file_ending = source_file_name.split(".")[-1]
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path

        # Collect the training data of all other input collections
        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections
            # NOTE(review): the constraints are evaluated with eval(),
            # so they have to come from a trusted specification
            if not all(eval(constraint_template % \
                            {'source_train_collection_name': source_train_collection_name,
                             'source_test_collection_name': source_test_collection_name})
                       for constraint_template in self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items() \
                 if param not in target_collection_params.items() and \
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__",
                                  "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"
                                  ]]  # for old data
            # NOTE(review): this extends target_collection_path after
            # target_test_file_path has been derived from it above
            if remaining_params != []:
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v

            if "arff" == file_ending:
                source_train_file_path = \
                    os.sep.join([source_train_collection_path,
                                 "data_run0", "features_sp0_" + \
                                 train_set_name_suffix + ".arff"])
            else:
                source_train_file_path = source_train_collection_path

            source_train_pathes.append(source_train_file_path)

        if "arff" == file_ending:
            target_train_file_path = os.sep.join([
                target_collection_path, "data_run0",
                "features_sp0_" + key + ".arff"
            ])
        else:
            target_train_file_path = target_collection_path

        # Nothing to merge for this test collection
        if len(source_train_pathes) == 0:
            continue

        create_directory(os.sep.join([target_collection_path, "data_run0"]))

        if "arff" == file_ending:
            self._copy_arff_file(source_test_file_path, target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)

            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        else:
            self._copy_file(source_test_collection_path,
                            target_collection_path, train_set_name_suffix)

            self._merge_files(target_train_file_path, source_train_pathes,
                              train_set_name_suffix,
                              target_collection_params)

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
def store_state(self, result_dir, index=None):
    """ Stores plots of score distribution and sigmoid fit.

    If *self.store* is set, a figure with four diagrams (training and
    test data, each before and after the mapping of the scores) is saved
    as ``reliable_diagrams_<current_split>.png`` in a sub directory of
    *result_dir* named after the class of this node.

    NOTE(review): the mapping applied in this method is piecewise linear
    and based on *self.max_range*; the "sigmoid" wording may be a left
    over from a related node - confirm.

    **Parameters**

        :result_dir: Base directory for the stored node states
        :index: Not used by this method
    """
    if self.store :
        # reliable plot of training (before linear fit)
        # scores and labels are sorted by increasing score
        sort_index = numpy.argsort(self.scores)
        labels = numpy.array(self.labels)[sort_index]
        predictions = numpy.array(self.scores)[sort_index]

        plot_scores_train,l_discrete_train=self._discretize(predictions, labels)
        len_list_train, plot_emp_prob_train = self._empirical_probability(l_discrete_train)

        # training data after linear fit
        # (negative scores are scaled with max_range[0], all other ones
        # with max_range[1]; no clipping is done for the training data)
        new_predictions = []
        for score in predictions:
            if score < 0.0:
                new_predictions.append((score + self.max_range[0]) / \
                                       (2.0 * self.max_range[0]))
            else:
                new_predictions.append((score + self.max_range[1]) / \
                                       (2.0 * self.max_range[1]))

        plot_scores_train_fit, l_discrete_train_fit = \
            self._discretize(new_predictions,labels)
        len_list_train_fit, plot_emp_prob_train_fit = \
            self._empirical_probability(l_discrete_train_fit)

        # test data before sigmoid fit
        # (the test data is requested from the input node; labels are
        # replaced by their position in self.class_labels)
        test_scores = []
        test_labels = []
        for data, label in self.input_node.request_data_for_testing():
            test_scores.append(data.prediction)
            test_labels.append(self.class_labels.index(label))

        sort_index = numpy.argsort(test_scores)
        labels = numpy.array(test_labels)[sort_index]
        predictions = numpy.array(test_scores)[sort_index]

        plot_scores_test,l_discrete_test = self._discretize(predictions, labels)
        len_list_test, plot_emp_prob_test = self._empirical_probability(l_discrete_test)

        # test data after sigmoid fit
        # (same mapping as for the training data, but scores below
        # -max_range[0] are mapped to 0.0 and scores of at least
        # max_range[1] are mapped to 1.0)
        new_predictions = []
        for score in predictions:
            if score < -1.0*self.max_range[0]:
                new_predictions.append(0.0)
            elif score < 0.0:
                new_predictions.append((score + self.max_range[0]) / \
                                       (2.0 * self.max_range[0]))
            elif score < self.max_range[1]:
                new_predictions.append((score + self.max_range[1]) / \
                                       (2.0 * self.max_range[1]))
            else:
                new_predictions.append(1.0)

        plot_scores_test_fit, l_discrete_test_fit = \
            self._discretize(new_predictions,labels)
        len_list_test_fit, plot_emp_prob_test_fit = \
            self._empirical_probability(l_discrete_test_fit)

        from pySPACE.tools.filesystem import create_directory
        import os
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        create_directory(node_dir)

        import pylab
        from matplotlib.transforms import offset_copy
        pylab.close()
        fig = pylab.figure(figsize=(10,10))

        # Upper left: training data, empirical probability over raw score
        ax = pylab.subplot(2,2,1)
        # the number next to each point is shifted by this offset
        transOffset=offset_copy(ax.transData,fig=fig,x=0.05,y=0.1,units='inches')
        for x,y,s in zip(plot_scores_train,plot_emp_prob_train[1],len_list_train[1]):
            pylab.plot((x,),(y,),'ro')
            pylab.text(x,y,'%d' % s, transform=transOffset)

        pylab.plot((plot_scores_train[0],plot_scores_train[-1]),(0,1),'-')
        # curve of the linear mapping on both sides of zero
        x1 = numpy.arange(-1.0*self.max_range[0],0.0,.02)
        x2 = numpy.arange(0.0,self.max_range[1],.02)
        y1 = (x1+self.max_range[0])/(2*self.max_range[0])
        y2 = (x2+self.max_range[1])/(2*self.max_range[1])
        pylab.plot(numpy.concatenate((x1,x2)),numpy.concatenate((y1,y2)),'-')
        pylab.xlim(plot_scores_train[0],plot_scores_train[-1])
        pylab.ylim(0,1)
        pylab.xlabel("SVM prediction Score (training data)")
        pylab.ylabel("Empirical Probability")

        # Upper right: training data after the mapping
        ax = pylab.subplot(2,2,2)
        transOffset=offset_copy(ax.transData,fig=fig,x=0.05,y=0.1,units='inches')
        for x, y, s in zip(plot_scores_train_fit, plot_emp_prob_train_fit[1],
                           len_list_train_fit[1]):
            pylab.plot((x,),(y,),'ro')
            pylab.text(x,y,'%d' % s, transform=transOffset)

        pylab.plot((plot_scores_train_fit[0],plot_scores_train_fit[-1]),(0,1),'-')
        pylab.xlim(plot_scores_train_fit[0],plot_scores_train_fit[-1])
        pylab.ylim(0,1)
        pylab.xlabel("SVM Probability (training data)")
        pylab.ylabel("Empirical Probability")

        # Lower left: test data, empirical probability over raw score
        ax = pylab.subplot(2,2,3)
        transOffset=offset_copy(ax.transData,fig=fig,x=0.05,y=0.1,units='inches')
        for x,y,s in zip(plot_scores_test,plot_emp_prob_test[1],len_list_test[1]):
            pylab.plot((x,),(y,),'ro')
            pylab.text(x,y,'%d' % s, transform=transOffset)

        pylab.plot((plot_scores_test[0],plot_scores_test[-1]),(0,1),'-')
        x1 = numpy.arange(-1.0*self.max_range[0],0.0,.02)
        x2 = numpy.arange(0.0,self.max_range[1],.02)
        y1 = (x1+self.max_range[0])/(2*self.max_range[0])
        y2 = (x2+self.max_range[1])/(2*self.max_range[1])
        # NOTE(review): the second x value is self.max_range[0] although
        # x1 starts at -1.0*self.max_range[0] - confirm the sign
        pylab.plot(numpy.concatenate([[plot_scores_test[0],self.max_range[0]],
                                      x1,x2,[self.max_range[1],plot_scores_test[-1]]]),
                   numpy.concatenate([[0.0,0.0],y1,y2,[1.0,1.0]]),'-')
        pylab.xlim(plot_scores_test[0],plot_scores_test[-1])
        pylab.ylim(0,1)
        pylab.xlabel("SVM prediction Score (test data)")
        pylab.ylabel("Empirical Probability")

        # Lower right: test data after the mapping
        ax = pylab.subplot(2,2,4)
        transOffset=offset_copy(ax.transData,fig=fig,x=0.05,y=0.1,units='inches')
        for x, y, s in zip(plot_scores_test_fit, plot_emp_prob_test_fit[1],
                           len_list_test_fit[1]):
            pylab.plot((x,),(y,),'ro')
            pylab.text(x,y,'%d' % s, transform=transOffset)

        pylab.plot((plot_scores_test_fit[0],plot_scores_test_fit[-1]),(0,1),'-')
        pylab.xlim(plot_scores_test_fit[0],plot_scores_test_fit[-1])
        pylab.ylim(0,1)
        pylab.xlabel("SVM Probability (test data)")
        pylab.ylabel("Empirical Probability")

        pylab.savefig(node_dir + "/reliable_diagrams_%d.png" % self.current_split)
def store_state(self, result_dir, index=None):
    """ Stores the calculated probabilities and/or reliability diagrams

    Nothing happens unless *self.store* is set. Otherwise a directory named
    after the node class is created in *result_dir* and, depending on the
    flags of the node, filled with

    - ``probabilities_<split>.pickle``: the pickled *self.probabilities*
      (if *self.store_probabilities* is set),
    - ``reliable_diagrams_<split>.png``: a 2x2 figure relating discretized
      scores to empirical probabilities for training and test data,
      before and after the sigmoid fit (if *self.store_plots* is set).

    **Parameters**

        :result_dir: base directory for the stored results
        :index: accepted for interface compatibility, not used here

    .. todo:: change plot calculations to upper if else syntax
    .. todo:: add the corresponding data point to the saved probabilities
    """
    if not self.store:
        return

    # Create the directory for the stored results
    from pySPACE.tools.filesystem import create_directory
    import os
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # Save the probabilities in a pickle file
    if self.store_probabilities:
        import pickle
        f_name = node_dir + "/probabilities_%d.pickle" % self.current_split
        # Binary mode and a context manager: the handle is closed even if
        # pickling fails and the file is valid for every pickle protocol.
        with open(f_name, 'wb') as pickle_file:
            pickle.dump(self.probabilities, pickle_file)

    if not self.store_plots:
        return

    def sigmoid(score):
        """ Evaluate 1/(1+exp(A*score+B)) without overflow """
        fApB = score * self.A + self.B
        if fApB >= 0:
            return numpy.exp(-fApB) / (1.0 + numpy.exp(-fApB))
        # The former one-line formula multiplied 0 with exp(-fApB) in
        # this branch, which is NaN as soon as the exponential overflows.
        return 1.0 / (1.0 + numpy.exp(fApB))

    def reliability(scores, labels):
        """ Discretize sorted scores before and after the sigmoid fit """
        sort_index = numpy.argsort(scores)
        sorted_labels = numpy.array(labels)[sort_index]
        sorted_scores = numpy.array(scores)[sort_index]
        bins, discrete = self._discretize(sorted_scores, sorted_labels)
        counts, emp_prob = self._empirical_probability(discrete)
        fitted_scores = [sigmoid(score) for score in sorted_scores]
        bins_fit, discrete_fit = self._discretize(fitted_scores,
                                                  sorted_labels)
        counts_fit, emp_prob_fit = \
            self._empirical_probability(discrete_fit)
        return (bins, emp_prob, counts), (bins_fit, emp_prob_fit, counts_fit)

    # reliable plot of training data (before and after sigmoid fit)
    train_raw, train_fit = reliability(self.scores, self.labels)

    # the same for the test data
    test_scores = []
    test_labels = []
    for data, label in self.input_node.request_data_for_testing():
        test_scores.append(data.prediction)
        test_labels.append(self.class_labels.index(label))
    test_raw, test_fit = reliability(test_scores, test_labels)

    import pylab
    from matplotlib.transforms import offset_copy
    pylab.close()
    fig = pylab.figure(figsize=(10, 10))

    def diagram(position, curve, x_label, draw_sigmoid):
        """ Draw one of the four reliability diagrams """
        bin_scores, emp_prob, counts = curve
        ax = pylab.subplot(2, 2, position)
        transOffset = offset_copy(ax.transData, fig=fig,
                                  x=0.05, y=0.1, units='inches')
        # Entry 1 of both structures is plotted, as before
        # NOTE(review): presumably the values of the second class - confirm
        for x, y, s in zip(bin_scores, emp_prob[1], counts[1]):
            pylab.plot((x,), (y,), 'ro')
            pylab.text(x, y, '%d' % s, transform=transOffset)
        # diagonal as reference line
        pylab.plot((bin_scores[0], bin_scores[-1]), (0, 1), '-')
        if draw_sigmoid:
            x = numpy.arange(bin_scores[0], bin_scores[-1], .02)
            y = 1/(1+numpy.exp(self.A*x+self.B))
            pylab.plot(x, y, '-')
        pylab.xlim(bin_scores[0], bin_scores[-1])
        pylab.ylim(0, 1)
        pylab.xlabel(x_label)
        pylab.ylabel("Empirical Probability")

    diagram(1, train_raw, "SVM prediction Score (training data)", True)
    diagram(2, train_fit, "SVM Probability (training data)", False)
    diagram(3, test_raw, "SVM prediction Scores (test data)", True)
    diagram(4, test_fit, "SVM Probability (test data)", False)

    pylab.savefig(node_dir + "/reliable_diagrams_%d.png" % self.current_split)
def __call__(self):
    """ Executes this process on the respective modality

    For every ordered pair of input datasets that fulfills all dataset
    constraints, a new dataset "<base1>_vs_<base2>" is created in the
    result directory. Its training files are taken from the first and its
    test files from the second dataset. Arff files are rewritten with
    *_copy_arff_file*, all other files are symlinked. A pair consisting
    of the same dataset twice is never mixed; the dataset is only linked
    into the result directory if it is already split.
    """
    ############## Prepare benchmarking ##############
    super(ShuffleProcess, self).pre_benchmarking()

    def transfer(source_file, target_file, base_dataset, mixed_dataset,
                 is_arff):
        """ Rewrite arff files, symlink everything else """
        if is_arff:
            self._copy_arff_file(source_file, target_file,
                                 base_dataset, mixed_dataset)
        else:
            os.symlink(source_file, target_file)

    for dataset_dir1 in self.input_datasets:
        for dataset_dir2 in self.input_datasets:
            name1 = dataset_dir1.split(os.sep)[-2]
            name2 = dataset_dir2.split(os.sep)[-2]

            # More than one entry in data_run0 means the input is split
            run_content = glob.glob(
                os.sep.join([dataset_dir1, "data_run0", "*"]))
            splitted = len(run_content) > 1

            # Skip pairs violating one of the constraints
            names = {'dataset_name1': name1, 'dataset_name2': name2}
            if not all(eval(constraint_template % names)
                       for constraint_template in self.dataset_constraints):
                continue

            if name1 == name2:
                if splitted:
                    # Link the data unchanged
                    os.symlink(dataset_dir1,
                               os.sep.join([self.result_directory, name1]))
                continue

            # Names of the original datasets the inputs are based on
            base1 = name1.strip("}{").split("}{")[0]
            base2 = name2.strip("}{").split("}{")[0]

            # Name and directory of the mixed dataset
            mixed_base = "%s_vs_%s" % (base1, base2)
            target_dir = os.sep.join(
                [self.result_directory, name1.replace(base1, mixed_base)])
            create_directory(os.sep.join([target_dir, "data_run0"]))

            # Split data: train files of dataset 1 are paired with the
            # test files of dataset 2. Otherwise the test files of
            # dataset 1 become the training data and those of dataset 2
            # stay test data.
            # TODO: We have $n$ train sets and $n$ test sets, we
            #       could use all $n*n$ combinations
            if splitted:
                file_pattern = "*_sp*_train.*"
            else:
                file_pattern = "*_sp*_test.*"

            for source_train in glob.glob(
                    os.sep.join([dataset_dir1, "data_run0", file_pattern])):
                if splitted:
                    target_train = source_train.replace(dataset_dir1,
                                                        target_dir)
                    source_test = source_train.replace(dataset_dir1,
                                                       dataset_dir2)
                    source_test = source_test.replace("train.", "test.")
                else:
                    target_train = source_train.replace("test.", "train.")
                    target_train = target_train.replace(dataset_dir1,
                                                        target_dir)
                    source_test = source_train.replace(dataset_dir1,
                                                       dataset_dir2)
                target_test = target_train.replace("train.", "test.")

                # The file type is decided once, on the first dataset
                is_arff = source_train.endswith("arff")
                transfer(source_train, target_train, base1, mixed_base,
                         is_arff)
                transfer(source_test, target_test, base2, mixed_base,
                         is_arff)

            # Write metadata.yaml based on the meta data of dataset 1
            output_dataset_meta = dict(
                BaseDataset.load_meta_data(dataset_dir1))
            output_dataset_meta['train_test'] = True
            output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
            output_dataset_meta['author'] = get_author()
            BaseDataset.store_meta_data(target_dir, output_dataset_meta)

    ############## Clean up after benchmarking ##############
    super(ShuffleProcess, self).post_benchmarking()
def store_state(self, result_dir, index=None):
    """ Stores the plots to *result_dir* and optionally creates a movie

    Used for offline plotting and for plotting of average values (online
    and offline). In offline mode one plot per trial is created for every
    trial which has not been skipped. Optionally a movie is created from
    the stored images. Called by base_node.

    **Parameters**

        :result_dir: string, directory the results are stored in
        :index: None or int, number of the node in the node chain;
                if given it is appended to the name of the node directory

    Returns nothing.
    """
    if self.store:
        # set the specific directory for this particular node
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        # Append the index only if there is one. The former check was
        # inverted and called int(None) for the default value.
        if index is not None:
            node_dir += "_%d" % int(index)
        create_directory(node_dir)
    else:
        # no specific directory
        node_dir = None

    can_plot = hasattr(self, "_plotValues")
    no_plot_function = (
        "VisualizationBase:: The node you are using for visualisation "
        "has no function _plotValues! This is most likely not what you intended!"
        "Plotting ignored!")

    # offline mode: one plot per evaluated trial
    if not self.online and (self.single_trial or self.accum_avg):
        if not can_plot:
            warnings.warn(no_plot_function)
        else:
            # position in st_list/accum_list; advanced only for
            # trials that were not skipped
            pos = 0
            for trial_num in range(1, self.trial_counter + 1):
                if trial_num in self.skipped_trials:
                    continue
                if self.single_trial:
                    self._plotValues(
                        values=self.st_list[pos],
                        plot_label="single_trial_no_" + str(trial_num),
                        fig_num=self.initial_fig_num + 2,
                        store_dir=node_dir,
                        counter=trial_num)
                if self.accum_avg:
                    self._plotValues(
                        values=self.accum_list[pos],
                        plot_label="accum_avg_no_" + str(trial_num),
                        fig_num=self.initial_fig_num + 3,
                        store_dir=node_dir,
                        counter=trial_num)
                pos += 1

    # plotting of the whole average or storage of the movie may also be
    # possible in online mode
    if self.online:
        # use the directory determined in __init__ instead
        node_dir = self.user_dir

    # is averaging intended?
    if self.averaging:
        if not self.avg_values:
            warnings.warn("VisualizationBase:: One of your averages has no "
                          "instances! Plotting ignored!")
        elif can_plot:
            self._plotValues(values=self.avg_values,
                             plot_label="average",
                             fig_num=self.initial_fig_num + 1,
                             store_dir=node_dir)
        else:
            warnings.warn(no_plot_function)

    # Finally create a movie if specified
    if self.create_movie and self.store_data:
        trials = range(1, self.trial_counter + 1)
        prefixes = []
        # NOTE(review): prefixes are also generated for skipped trials,
        # for which no plot was stored above - confirm _create_movie copes
        if self.single_trial:
            prefixes.extend("single_trial_no_" + str(trial)
                            for trial in trials)
        if self.accum_avg:
            prefixes.extend("accum_avg_no_" + str(trial)
                            for trial in trials)
        if self.averaging:
            prefixes.append('average')
        self._create_movie(prefixes=prefixes, directory=node_dir)

    # close the figure windows
    pylab.close('all')
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, the tuple (filters, wi, ai) is pickled to
    ``patterns_sp<split>.pickle`` in a directory named after the node
    class. If additionally *self.visualize_pattern* is set, one figure
    per retained filter and a figure of the summed signal components are
    saved there. Any exception is printed and re-raised. Finally the
    *store_state* of the base class is called.

    **Parameters**

        :result_dir: base directory for the stored results
        :index: accepted for interface compatibility; neither used here
                nor handed to the base class
    """
    if self.store:
        try:
            node_dir = os.path.join(result_dir, self.__class__.__name__)
            create_directory(node_dir)
            # This node only stores the learned spatial filters
            name = "%s_sp%s.pickle" % ("patterns", self.current_split)
            # The context manager closes the file even if pickling fails
            with open(os.path.join(node_dir, name), "wb") as result_file:
                result_file.write(
                    cPickle.dumps((self.filters, self.wi, self.ai),
                                  protocol=2))

            # Stores the signal to signal plus noise ratio resulted
            # by the spatial filter
            #fname = "SNR_sp%s.csv" % ( self.current_split)
            #numpy.savetxt(os.path.join(node_dir, fname), self.SNR,
            #              delimiter=',', fmt='%2.5e')

            # Store spatial filter plots if desired
            if self.visualize_pattern:
                from pySPACE.missions.nodes.spatial_filtering.csp \
                    import CSPNode
                # Compute, accumulate and analyze signal components
                # estimated by xDAWN
                vmin = numpy.inf
                vmax = -numpy.inf
                signal_components = []
                complete_signal = numpy.zeros((self.wi.shape[1],
                                               self.ai.shape[1]))
                for filter_index in range(self.retained_channels):
                    signal_component = numpy.outer(
                        self.wi[filter_index, :], self.ai[filter_index, :])
                    # common value range of all components
                    vmin = min(signal_component.min(), vmin)
                    vmax = max(signal_component.max(), vmax)
                    signal_components.append(signal_component)
                    complete_signal += signal_component

                # Plotting
                import pylab
                # A dedicated loop variable keeps the *index* parameter
                # from being overwritten
                for component_index, signal_component in \
                        enumerate(signal_components):
                    pylab.figure(0, figsize=(18, 8))
                    pylab.gcf().clear()
                    # Plot spatial distribution
                    ax = pylab.axes([0.0, 0.0, 0.2, 0.5])
                    CSPNode._plot_spatial_values(
                        ax, self.wi[component_index, :],
                        self.channel_names, 'Spatial distribution')
                    # Plot spatial filter
                    ax = pylab.axes([0.0, 0.5, 0.2, 0.5])
                    CSPNode._plot_spatial_values(
                        ax, self.filters[:, component_index],
                        self.channel_names, 'Spatial filter')
                    # Plot signal component in electrode coordinate system
                    self._plotTimeSeriesInEC(signal_component, vmin=vmin,
                                             vmax=vmax,
                                             bb=(0.2, 1.0, 0.0, 1.0))
                    pylab.savefig("%s%ssignal_component%02d.png"
                                  % (node_dir, os.sep, component_index))

                CSPNode._store_spatial_filter_plots(
                    self.filters[:, :self.retained_channels],
                    self.channel_names, node_dir)

                # Plot entire signal
                pylab.figure(0, figsize=(15, 8))
                pylab.gcf().clear()
                complete_file = "%s%ssignal_complete.png" % (node_dir,
                                                             os.sep)
                self._plotTimeSeriesInEC(complete_signal,
                                         file_name=complete_file)
                pylab.savefig(complete_file)
        except Exception as e:
            # Show the problem right away, then let the caller handle it
            print(e)
            raise
    super(XDAWNNode, self).store_state(result_dir)
def create(cls, operation_spec, base_result_dir=None):
    """ A factory method that calls the responsible method
    for creating an operation of the type specified in
    the operation specification dictionary (*operation_spec*).

    The module ``pySPACE.missions.operations.<type>`` is imported and its
    class ``<Type>Operation`` is used. If the import fails, a warning is
    issued and *NodeChainOperation* is used instead.

    **Parameters**

        :operation_spec: dictionary with the mandatory key "type" and the
                         optional key "input_path" (relative to the
                         storage directory). A deprecated "_operation"
                         suffix of the type is removed *in place*.
        :base_result_dir: handed to *get_unique_result_dir*

    Returns the result of the *create* method of the operation class.
    """
    # Determine result directory
    result_directory = cls.get_unique_result_dir(base_result_dir)
    print("--> Results will be stored at: \n\t\t %s" % str(result_directory))
    # Check if the required directories exist
    # and create them if necessary
    create_directory(result_directory)

    # Determine all input datasets (note: they can be specified by
    # extended syntax for the glob package)
    storage = pySPACE.configuration.storage
    if "input_path" not in operation_spec:
        warnings.warn("No input path found in operation specification.")
    input_path_pattern = os.sep.join(
        [storage, operation_spec.get("input_path", ""), "*", ""])

    # Keep only folders which look like a dataset. For folders with a
    # "collection.yaml" the warning comes when the data is loaded.
    input_paths = []
    for path in glob.glob(input_path_pattern):
        if os.path.isfile(os.sep.join([path, "metadata.yaml"])) or \
                os.path.isfile(os.sep.join([path, "collection.yaml"])):
            input_paths.append(path)
        else:
            warnings.warn('Folder ' + str(path) + ' seems not to be a pySPACE' +
                          ' dataset (no "metadata.yaml" found)! ' +
                          'Skipping this folder in operation...')

    op_type = operation_spec["type"]
    if op_type.endswith("_operation"):
        op_type = op_type[:-len("_operation")]
        operation_spec["type"] = op_type
        warnings.warn(
            "'%s_operation' has the wrong ending. Using '%s' instead."
            % (op_type, op_type), DeprecationWarning)
    op_class_name = ''.join([x.title() for x in op_type.split('_')])
    op_class_name += "Operation"

    # dynamic class import: from data_mod_name import col_class_name
    try:
        op_module = __import__('pySPACE.missions.operations.%s' % op_type,
                               fromlist=[op_class_name])
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit have to pass through.
        # The message was formerly built but never shown.
        warnings.warn(
            "Operation module %s is unknown. Trying to use node_chain."
            % (op_type))
        from pySPACE.missions.operations.node_chain import NodeChainOperation
        op_class = NodeChainOperation
    else:
        op_class = getattr(op_module, op_class_name)
    return op_class.create(operation_spec, result_directory,
                           input_paths=input_paths)
def __call__(self):
    """ Executes this process on the respective modality

    Every input dataset that has not been merged yet is combined with all
    other unmerged input datasets that fulfill the dataset constraints
    together with it. The merged data is written by *_merge_pickle_files*
    to a new dataset in the result directory. Its name is derived from
    the first dataset, either via *self.name_pattern* or by replacing the
    last character of the base name with "_all".

    Raises an AssertionError if an input dataset contains several test
    splits or any training data, since both are not supported.
    """
    ############## Prepare benchmarking ##############
    super(ConcatenateProcess, self).pre_benchmarking()

    # remember what has already been merged
    # (a set: fast lookup and no duplicate entries)
    merged_dataset_pathes = set()

    # For all input datasets
    for source_dataset_path1 in self.input_datasets:
        if source_dataset_path1 in merged_dataset_pathes:
            continue

        # At the moment split data is not supported, so there should be
        # only a single test file in the source directory
        source_files = glob.glob(os.sep.join([source_dataset_path1,
                                              "data_run0", "*test*"]))
        assert len(source_files) <= 1, \
            "Multiple test splits as in %s are not yet supported." \
            % str(source_files)

        # At the moment train data is not supported, so check if train
        # sets are also present
        train_files = glob.glob(os.sep.join([source_dataset_path1,
                                             "data_run0", "*train*"]))
        assert not train_files, "Using training data is not yet implemented."

        # We create the "All" dataset
        source_dataset_name1 = source_dataset_path1.split(os.sep)[-2]
        base_dataset_name = source_dataset_name1.strip("}{").split("}{")[0]
        if self.name_pattern is not None:
            # NOTE: name_pattern is evaluated as Python code. It must
            # only come from trusted operation specifications.
            new_base_name = eval(self.name_pattern
                                 % {"dataset_name": base_dataset_name})
        else:
            new_base_name = base_dataset_name[:-1] + "_all"
        target_dataset_name = source_dataset_name1.replace(
            base_dataset_name, new_base_name)
        target_dataset_path = os.sep.join([self.result_directory,
                                           target_dataset_name])

        source_pathes = [source_dataset_path1]
        for source_dataset_path2 in self.input_datasets:
            source_dataset_name2 = source_dataset_path2.split(os.sep)[-2]
            # Do not use data we have already in the source_path list
            if source_dataset_path2 == source_dataset_path1 or \
                    source_dataset_path2 in merged_dataset_pathes:
                continue
            # Check that all constraints are fulfilled for this pair of
            # input datasets
            # NOTE: the constraints are evaluated as Python code, too
            if not all(eval(constraint_template
                            % {'dataset_name1': source_dataset_name1,
                               'dataset_name2': source_dataset_name2})
                       for constraint_template in self.dataset_constraints):
                continue
            source_pathes.append(source_dataset_path2)
            merged_dataset_pathes.add(source_dataset_path1)
            merged_dataset_pathes.add(source_dataset_path2)

        create_directory(os.sep.join([target_dataset_path, "data_run0"]))
        self._merge_pickle_files(target_dataset_path, source_pathes)

    ############## Clean up after benchmarking ##############
    super(ConcatenateProcess, self).post_benchmarking()
def _get_result_dataset_dir(base_dir, input_dataset_dir,
                            parameter_setting, hide_parameters):
    """ Determines the name of the result directory and creates it

    The name is "{<input name>}{<key>#<value>}{...}" built from the name
    of the input dataset and the parameter setting, extended by the
    parameter setting stored in the meta data of the input dataset.
    If the name is too long, it is replaced by its md5 hash. If the file
    system still rejects it as too long, the parameter values are
    replaced by their hashes.

    **Parameters**

        :base_dir: directory in which the result directory is created
        :input_dataset_dir: input dataset directory, relative to the
                            pySPACE storage directory
        :parameter_setting: dictionary of parameters; it is copied and
                            not modified
        :hide_parameters: parameter names that are left out of the name

    Returns the path of the created directory. An OSError raised while
    creating the directory is passed on, unless it reports a too long
    file name and the fallback name works.
    """
    import errno

    # Determine the result_directory name
    # String between Key and value changed from ":" to "#",
    # because of problems in windows and with windows file servers
    def _get_result_dir_name(parameter_setting, hide_parameters, method=None):
        """ internal function to create result dir name in different ways"""
        if not method:
            parameter_str = "}{".join(
                ("%s#%s" % (key, value))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        elif method == "hash":
            parameter_str = "}{".join(
                ("%s#%s" % (key, hash(str(value).replace(' ', ''))))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        else:
            raise ValueError("Unknown naming method: %s" % method)

        for character in ("'", " ", "[", "]", os.sep):
            parameter_str = parameter_str.replace(character, "")

        result_name = "{%s}" % input_name
        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        result_dir = base_dir + os.sep + result_name

        # If the name is too long (longer than allowed including optional
        # offsets for pyspace result csv naming conventions), create
        # a md5 hash of the result name and use that one
        import platform
        if platform.system() == "Windows":
            # the maximum length for a filename on Windows is 255
            max_length = 255
        else:
            max_length = os.pathconf(os.curdir, 'PC_NAME_MAX')
        if len(result_dir) > max_length - 32:
            result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
            result_dir = base_dir + os.sep + result_name
        return result_dir

    input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    input_name = input_name.strip("{}")
    # If the input is already the result of an operation
    if input_name.count("}{") > 0:
        input_name = input_name.split("}{")[0]

    # Load the input meta data
    dataset_dir = os.sep.join(
        [pySPACE.configuration.storage, input_dataset_dir])
    dataset_md = BaseDataset.load_meta_data(dataset_dir)

    # We are going to change the parameter_setting and don't want to
    # interfere with later runs so we work on a copy
    parameter_setting = copy.deepcopy(parameter_setting)

    # Ignore pseudo parameter "__PREPARE_OPERATION__"
    if "__PREPARE_OPERATION__" in parameter_setting:
        parameter_setting.pop("__PREPARE_OPERATION__")

    # Add the input parameters meta data to the given parameter setting
    if "parameter_setting" in dataset_md:
        parameter_setting.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them
    # NOTE: the values are evaluated as Python code. They must only come
    #       from trusted specifications and meta data.
    for key, value in parameter_setting.iteritems():
        if isinstance(value, basestring) and value.count("'") > 1:
            parameter_setting[key] = eval(value)

    result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
    try:
        create_directory(result_dir)
    except OSError as e:
        # The numeric value of ENAMETOOLONG differs between platforms
        # (36 on Linux), so the symbolic constant is compared.
        if e.errno != errno.ENAMETOOLONG:
            # Anything else (e.g. missing permissions) must not end in
            # returning a directory that does not exist
            raise
        # filename is too long
        result_dir = _get_result_dir_name(parameter_setting,
                                          hide_parameters, "hash")
        create_directory(result_dir)

    return result_dir
def __init__(self, request_training=True, request_test=True,
             separate_training_and_test=False, averaging=True,
             accum_avg=False, single_trial=False, time_stamps=[-1],
             store=False, store_data=False, create_movie=False,
             timeshift=0, online=False, user_dir='./', limit2class=None,
             physiological_arrangement=True, history_index=None,
             use_FN=True, use_SF=True, SF_channels=None,
             use_transformation=False, rand_initial_fig=True,
             covariancing=False, **kwargs):
    """ Sets up the visualization environment

    Called by the child nodes of VisualizationBase. For the meaning of
    the parameters see the class documentation. Returns nothing.
    """
    # Handling training and test data separately needs both of them
    if separate_training_and_test:
        request_training = request_test = True

    # has to be known before the base class is initialized
    self.request_training = request_training
    super(VisualizationBase, self).__init__(store=store, **kwargs)

    # number of the first figure, random on request
    initial_fig_num = \
        int(numpy.random.rand() * 10000) if rand_initial_fig else 0

    # A movie is built from the graphics stored in the persistency
    # directory used in store_state, so storing is enforced
    if create_movie:
        store = True
    # without storing there is no data to store either
    if not store:
        store_data = False

    if not (online and store):
        # either offline plotting or store=False
        user_dir = None
    else:
        # Plots stored in online mode go to a folder with a timestamp,
        # located in the given directory or in the execution path
        if user_dir == './':
            user_dir = '%s/' % os.getcwd()
        timestamp_folder = \
            time.strftime("%Y%m%d_%H_%M_%S") + '_Visualization_Plot/'
        user_dir = os.path.join(user_dir, timestamp_folder)
        create_directory(user_dir)

    attributes = dict(
        # configuration, as given or adapted above
        request_training=request_training,
        request_test=request_test,
        separate_training_and_test=separate_training_and_test,
        averaging=averaging,
        accum_avg=accum_avg,
        single_trial=single_trial,
        time_stamps=time_stamps,
        create_movie=create_movie,
        timeshift=timeshift,
        online=online,
        limit2class=limit2class,
        user_dir=user_dir,
        store_data=store_data,
        store=store,
        physiological_arrangement=physiological_arrangement,
        history_index=history_index,
        use_FN=use_FN,
        use_SF=use_SF,
        SF_channels=SF_channels,
        use_transformation=use_transformation,
        initial_fig_num=initial_fig_num,
        covariancing=covariancing,
        # state collected while processing
        trial_counter=0,
        avg_values={},
        accum_list=[],
        st_list=[],
        label_counter=defaultdict(int),
        # list of not evaluated trials whenever _execute was called
        skipped_trials=[],
        current_trafo_TS=None,
    )
    self.set_permanent_attributes(**attributes)
def __init__(self, request_training=True, request_test=True,
             separate_training_and_test=False, averaging=True,
             accum_avg=False, single_trial=False, time_stamps=[-1],
             store=False, store_data=False, create_movie=False,
             timeshift=0, online=False, user_dir='./', limit2class=None,
             physiological_arrangement=True, rand_initial_fig=True,
             **kwargs):
    """ Used to initialize the environment.

    Called by VisualizationBase child-node.

    Parameters: See the class documentation. The following parameters
    are adapted before they are stored:

        :request_training, request_test:
            both forced to True if *separate_training_and_test* is set
        :store: forced to True if *create_movie* is set
        :store_data: forced to False if nothing is stored
        :user_dir: if plots are stored in online mode, a new folder
                   "<timestamp>_Visualization_Plot/" is created inside of
                   it (inside the execution path for the default './');
                   otherwise it is set to None

    Returns: Nothing.
    """
    # should training and test data be handled separately
    if separate_training_and_test:
        # if yes: all data has to be requested
        request_training = True
        request_test = True

    # modify request_training accordingly
    self.request_training = request_training

    super(VisualizationBase, self).__init__(store=store, **kwargs)

    if rand_initial_fig:
        initial_fig_num = int(numpy.random.rand() * 10000)
    else:
        initial_fig_num = 0

    if create_movie:
        # store the graphics to the persistency directory used in
        # store_state
        store = True
    if not store:
        store_data = False

    # if plots are stored in online mode a directory is either specified
    # or data is stored in execution path
    if online and store:
        # is user_dir not set explicitly?
        if user_dir == './':
            # set the user directory to the execution-path
            user_dir = '%s/' % os.getcwd()
        # Add a folder with a timestamp. os.path.join supplies the
        # separator if the given directory lacks a trailing one; plain
        # concatenation fused the folder name with the last path element.
        user_dir = os.path.join(
            user_dir,
            time.strftime("%Y%m%d_%H_%M_%S") + '_Visualization_Plot/')
        create_directory(user_dir)
    else:
        # either offline plotting or store=False
        user_dir = None

    self.set_permanent_attributes(
        request_training=request_training,
        request_test=request_test,
        separate_training_and_test=separate_training_and_test,
        averaging=averaging,
        accum_avg=accum_avg,
        single_trial=single_trial,
        time_stamps=time_stamps,
        create_movie=create_movie,
        timeshift=timeshift,
        online=online,
        limit2class=limit2class,
        user_dir=user_dir,
        store_data=store_data,
        store=store,
        trial_counter=0,
        avg_values=dict(),
        accum_list=list(),
        st_list=list(),
        label_counter=defaultdict(int),
        # list of not evaluated trials whenever _execute was called
        skipped_trials=list(),
        physiological_arrangement=physiological_arrangement,
        initial_fig_num=initial_fig_num)
def store_state(self, result_dir, index=None):
    """ Stores a plot of the metric as function of the threshold

    If *self.store* is set, the metric is evaluated on the test data for
    every prediction value used as threshold. The resulting curve is
    plotted together with the curve of the training data
    (*self.predictions_train*), the original and the optimized threshold.
    The figure is saved as ``threshold_metric.pdf`` in a directory named
    after the node class. As a side effect *self.predictions_test* is set
    to the pair of lists [threshold values, metric values].

    Plotting is best effort: if anything fails, a warning is logged and
    nothing is raised. Finally the *store_state* of the base class is
    called.

    **Parameters**

        :result_dir: base directory for the stored results
        :index: accepted for interface compatibility; neither used here
                nor handed to the base class
    """
    if self.store:
        try:
            # Create metric function lazily since it cannot be pickled
            metric_fct = self._get_metric_fct()

            # Determine curve on test data
            # TODO: Code duplication (mostly already in train)
            predictions_test = []
            labels_test = []
            for data, label in self.input_node.request_data_for_testing():
                predictions_test.append(data.prediction)
                labels_test.append(self.classes.index(label))
            sort_index = numpy.argsort(predictions_test)
            labels_test = numpy.array(labels_test)[sort_index]
            predictions_test = numpy.array(predictions_test)[sort_index]

            # Start with all samples on the same side of the threshold;
            # the side depends on the orientation of the hyperplane
            positives = list(labels_test).count(1)
            negatives = list(labels_test).count(0)
            if self.orientation_up:
                TP, FP, TN, FN = positives, negatives, 0, 0
            else:
                TP, FP, TN, FN = 0, 0, negatives, positives

            # Move the threshold over the sorted predictions, one sample
            # changes its side in every step
            self.predictions_test = [[], []]
            for label, prediction_value in zip(labels_test,
                                               predictions_test):
                if label == 0 and self.orientation_up:
                    TN += 1
                    FP -= 1
                elif label == 0 and not self.orientation_up:
                    TN -= 1
                    FP += 1
                elif label == 1 and self.orientation_up:
                    FN += 1
                    TP -= 1
                elif label == 1 and not self.orientation_up:
                    FN -= 1
                    TP += 1
                assert (TP >= 0 and FP >= 0 and TN >= 0 and FN >= 0), \
                    "TP: %s FP: %s TN: %s FN: %s" % (TP, FP, TN, FN)
                metric_value = metric_fct(TP, FP, TN, FN)
                self.predictions_test[0].append(prediction_value)
                self.predictions_test[1].append(metric_value)

            ### Plot ##
            import pylab
            pylab.close()
            # Get this from LaTeX using \showthe\columnwidth
            fig_width_pt = 307.28987*2
            inches_per_pt = 1.0/72.27                # Convert pt to inches
            fig_width = fig_width_pt*inches_per_pt   # width in inches
            fig_height = fig_width * 0.5             # height in inches
            fig_size = [fig_width, fig_height]
            # 'font.size' replaces 'text.fontsize', which newer
            # matplotlib versions reject as unknown parameter
            params = {'axes.labelsize': 10,
                      'font.size': 8,
                      'legend.fontsize': 8,
                      'xtick.labelsize': 10,
                      'ytick.labelsize': 10}
            pylab.rcParams.update(params)
            pylab.figure(0, dpi=400, figsize=fig_size)

            # common axis limits of both curves
            xmin = min(min(self.predictions_train[0]),
                       min(self.predictions_test[0]))
            xmax = max(max(self.predictions_train[0]),
                       max(self.predictions_test[0]))
            ymin = min(min(self.predictions_train[1]),
                       min(self.predictions_test[1]))
            ymax = max(max(self.predictions_train[1]),
                       max(self.predictions_test[1]))

            pylab.plot(self.predictions_train[0], self.predictions_train[1],
                       'b', label='Training data')
            pylab.plot(self.predictions_test[0], self.predictions_test[1],
                       'g', label='Unseen test data')
            pylab.plot([self.classifier_threshold, self.classifier_threshold],
                       [ymin, ymax], 'r', label='Original Threshold', lw=5)
            pylab.plot([self.threshold, self.threshold],
                       [ymin, ymax], 'c', label='Optimized Threshold', lw=5)
            pylab.legend(loc=0)
            pylab.xlim((xmin, xmax))
            pylab.ylim((ymin, ymax))
            pylab.xlabel("Threshold value")
            pylab.ylabel("Metric: %s" % self.metric)

            # Store plot
            from pySPACE.tools.filesystem import create_directory
            import os
            node_dir = os.path.join(result_dir, self.__class__.__name__)
            create_directory(node_dir)
            pylab.savefig(node_dir + os.sep + "threshold_metric.pdf")
        except Exception as e:
            # Still best effort, but Ctrl-C is no longer swallowed and
            # the log names the real problem instead of a message copied
            # from another node.
            self._log("Storing the threshold metric plot failed: %s" % e,
                      level=logging.WARNING)
    super(ThresholdOptimizationNode, self).store_state(result_dir)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If *self.store* is set, a directory named after the node class is
    created in *result_dir* and the tuple (filters, wi, ai) is pickled to
    ``patterns_sp<split>.pickle``. If *self.visualize_pattern* is set as
    well, a figure for each retained filter
    (``signal_component<NN>.png``) and one for the sum of all components
    (``signal_complete.png``) are added. Exceptions are printed and
    re-raised. Afterwards the *store_state* of the base class is called.

    **Parameters**

        :result_dir: base directory for the stored results
        :index: accepted for interface compatibility; neither used here
                nor handed to the base class
    """
    if self.store:
        try:
            node_dir = os.path.join(result_dir, self.__class__.__name__)
            create_directory(node_dir)

            # This node only stores the learned spatial filters
            pattern_file = os.path.join(
                node_dir, "%s_sp%s.pickle" % ("patterns", self.current_split))
            patterns = cPickle.dumps((self.filters, self.wi, self.ai),
                                     protocol=2)
            # closed by the context manager, also if writing fails
            with open(pattern_file, "wb") as result_file:
                result_file.write(patterns)

            # Stores the signal to signal plus noise ratio resulted
            # by the spatial filter
            #fname = "SNR_sp%s.csv" % ( self.current_split)
            #numpy.savetxt(os.path.join(node_dir, fname), self.SNR,
            #              delimiter=',', fmt='%2.5e')

            # Store spatial filter plots if desired
            if self.visualize_pattern:
                from pySPACE.missions.nodes.spatial_filtering.csp \
                    import CSPNode

                # Compute, accumulate and analyze signal components
                # estimated by xDAWN
                signal_components = [
                    numpy.outer(self.wi[filter_index, :],
                                self.ai[filter_index, :])
                    for filter_index in range(self.retained_channels)]
                complete_signal = numpy.zeros(
                    (self.wi.shape[1], self.ai.shape[1]))
                # value range shared by all component plots
                vmin = numpy.inf
                vmax = -numpy.inf
                for signal_component in signal_components:
                    vmin = min(signal_component.min(), vmin)
                    vmax = max(signal_component.max(), vmax)
                    complete_signal += signal_component

                # Plotting
                import pylab
                # The loop variable is not called *index* any more, which
                # silently replaced the parameter of the same name
                for filter_index, signal_component in enumerate(
                        signal_components):
                    pylab.figure(0, figsize=(18, 8))
                    pylab.gcf().clear()
                    # Plot spatial distribution
                    ax = pylab.axes([0.0, 0.0, 0.2, 0.5])
                    CSPNode._plot_spatial_values(ax,
                                                 self.wi[filter_index, :],
                                                 self.channel_names,
                                                 'Spatial distribution')
                    # Plot spatial filter
                    ax = pylab.axes([0.0, 0.5, 0.2, 0.5])
                    CSPNode._plot_spatial_values(ax,
                                                 self.filters[:, filter_index],
                                                 self.channel_names,
                                                 'Spatial filter')
                    # Plot signal component in electrode coordinate system
                    self._plotTimeSeriesInEC(signal_component, vmin=vmin,
                                             vmax=vmax,
                                             bb=(0.2, 1.0, 0.0, 1.0))
                    pylab.savefig("%s%ssignal_component%02d.png" %
                                  (node_dir, os.sep, filter_index))

                CSPNode._store_spatial_filter_plots(
                    self.filters[:, :self.retained_channels],
                    self.channel_names,
                    node_dir)

                # Plot entire signal
                pylab.figure(0, figsize=(15, 8))
                pylab.gcf().clear()
                self._plotTimeSeriesInEC(
                    complete_signal,
                    file_name="%s%ssignal_complete.png" % (node_dir, os.sep))
                pylab.savefig("%s%ssignal_complete.png" % (node_dir, os.sep))
        except Exception as e:
            # Show the problem right away, then let the caller handle it
            print(e)
            raise
    super(XDAWNNode, self).store_state(result_dir)
def store_state(self, result_dir, index=None):
    """ Main method which generates and stores the graphics

    Nothing is done unless ``self.store`` is set. The images are written
    to ``<result_dir>/<ClassName>`` (with ``_<index>`` appended if *index*
    is given). What is plotted depends on ``self.mode``:

    :FeatureVector:
        One contour plot per label of the normalized ``self.averages``
        (``<label>average.png``) and one per entry of ``self.inputs``
        (``sample<i>.png``).
    :PredictionVector:
        The last previous transformation (``classifier.png``) and the same
        transformation multiplied with the covariance
        (``classifier_cov.png``), each overlaid with contours of the
        averages when these are available.
    :nonlinear:
        As for *PredictionVector*, but per sample
        (``classifier_<i>.png`` / ``classifier_cov_<i>.png``), stopping
        after ``self.max_samples`` samples.

    .. note:: ``self.averages`` is normalized in place by ``self.counter``.

    **Parameters**

        :result_dir: directory in which the node directory is created
        :index: optional number of the node, appended to the directory name
    """
    if self.store:
        # set the specific directory for this particular node
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        # do we have an index-number?
        if not index is None:
            # add the index-number...
            node_dir += "_%d" % int(index)
        create_directory(node_dir)
        colors = ["white", "black", "blue", "red"]
        if self.mode == "FeatureVector":
            for label in self.averages:
                # turn the accumulated sum into a mean (in place)
                self.averages[label] *= 1.0/self.counter[label]
                # http://wiki.scipy.org/Cookbook/Matplotlib/Show_colormaps
                pylab.figure(figsize=(4, 4), dpi=300)
                pylab.contourf(self.averages[label], 50, cmap="jet",
                               origin="image")
                pylab.xticks(())
                pylab.yticks(())
                f_name = str(node_dir)+str(os.sep)+str(label)+"average"
                pylab.savefig(f_name + ".png", bbox_inches='tight')
            # Neither the builtin ``input`` nor the *index* parameter
            # is shadowed by the loop variables.
            for sample_no, sample in enumerate(self.inputs):
                pylab.figure(figsize=(4, 4), dpi=300)
                pylab.contourf(sample, 50, cmap="binary", origin="image")
                pylab.xticks(())
                pylab.yticks(())
                f_name = str(node_dir)+str(os.sep)+"sample"+str(sample_no)
                pylab.savefig(f_name + ".png", bbox_inches='tight')
        elif self.mode == "PredictionVector":
            trafos = self.get_previous_transformations()[-1]
            trafo = trafos[0]
            # NOTE(review): the result of this call is discarded, so it has
            # no visible effect here - confirm whether an assignment was
            # intended.
            trafo.view(numpy.ndarray)
            covariance = trafos[1][1]
            trafo_covariance = numpy.dot(covariance, trafo.flatten())

            # covariance free picture
            number_array = trafo.reshape((28, 28))
            fig = pylab.figure(figsize=(4, 4), dpi=300)
            pylab.contourf(number_array, 50, cmap="jet", origin="image",
                           vmax=abs(number_array).max(),
                           vmin=-abs(number_array).max())
            pylab.xticks(())
            pylab.yticks(())
            if not self.averages is None:
                # enumerate yields the same position as looking the label
                # up in the key list, without the repeated linear search
                for color_no, label in enumerate(self.averages):
                    self.averages[label] *= 1.0/self.counter[label]
                    pylab.contour(
                        self.averages[label],
                        levels=[50],
                        colors=colors[color_no],
                        linewidths=3,
                        origin="image")
            f_name = str(node_dir)+str(os.sep)+"classifier"
            pylab.savefig(f_name + ".png", bbox_inches='tight')
            pylab.close(fig)

            # covariance picture (similar code as before; the averages
            # are already normalized at this point)
            number_array = trafo_covariance.reshape((28, 28))
            fig = pylab.figure(figsize=(4, 4), dpi=300)
            pylab.contourf(number_array, 50, cmap="jet", origin="image",
                           vmax=abs(number_array).max(),
                           vmin=-abs(number_array).max())
            pylab.xticks(())
            pylab.yticks(())
            if not self.averages is None:
                for color_no, label in enumerate(self.averages):
                    pylab.contour(
                        self.averages[label],
                        levels=[50],
                        linewidths=3,
                        colors=colors[color_no],
                        origin="image")
            f_name = str(node_dir)+str(os.sep)+"classifier_cov"
            pylab.savefig(f_name + ".png", bbox_inches='tight')
            pylab.close(fig)
        elif self.mode == "nonlinear":
            from matplotlib.backends.backend_pdf import PdfPages
            # The PDF is opened but no page is written to it any more;
            # the plots are stored as single PNG files instead.
            with PdfPages(str(node_dir)+str(os.sep)+'sample_vis.pdf') as pdf:
                for sample_no, sample in enumerate(self.inputs, 1):
                    base_vector = sample.history[self.history_index-1]
                    trafos = self.get_previous_transformations(
                        base_vector)[-1]
                    trafo = trafos[0]
                    # NOTE(review): result discarded, see the
                    # "PredictionVector" branch.
                    trafo.view(numpy.ndarray)
                    covariance = trafos[1][1]
                    trafo_covariance = \
                        numpy.dot(covariance, trafo.flatten())
                    covariance_array = trafo_covariance.reshape((28, 28))
                    base_array = base_vector.reshape((28, 28))
                    trafo_array = trafo.reshape((28, 28))

                    # Picture 1: plot of the derivative
                    fig = pylab.figure(figsize=(4, 4), dpi=300)
                    pylab.contourf(trafo_array, 50, cmap="jet",
                                   origin="image",
                                   vmax=abs(trafo_array).max(),
                                   vmin=-abs(trafo_array).max())
                    pylab.xticks(())
                    pylab.yticks(())
                    pylab.contour(
                        base_array,
                        levels=[50],
                        colors=colors[1],
                        origin="image")
                    # store and clean
                    f_name = str(node_dir) + str(os.sep) + "classifier_" \
                        + str(sample_no)
                    pylab.savefig(f_name + ".png", bbox_inches='tight')
                    pylab.close(fig)

                    # Picture 2: plot of the derivative multiplied with
                    # covariance
                    fig = pylab.figure(figsize=(4, 4), dpi=300)
                    pylab.contourf(covariance_array, 50, cmap="jet",
                                   origin="image",
                                   vmax=abs(covariance_array).max(),
                                   vmin=-abs(covariance_array).max())
                    pylab.xticks(())
                    pylab.yticks(())
                    pylab.contour(
                        base_array,
                        levels=[50],
                        colors=colors[1],
                        origin="image")
                    # store and clean
                    f_name = str(node_dir) + str(os.sep) + \
                        "classifier_cov_" + str(sample_no)
                    pylab.savefig(f_name + ".png", bbox_inches='tight')
                    pylab.close(fig)
                    if sample_no == self.max_samples:
                        break
        pylab.close('all')
def __call__(self):
    """ Executes this process on the respective modality

    Every input collection is used once as the "test" collection. All other
    input collections that fulfil ``self.collection_constraints`` are
    merged into one set, and both parts are written to a new collection
    below ``self.result_directory``. The new collection is named
    ``Rest_vs_<name>``, or ``<name>_vs_Rest`` if ``self.reverse`` is set.

    Only unsplit collections stored as ``arff`` or ``pickle`` files are
    supported.

    **Raises**

        :AssertionError: if a collection contains more than one test file
        :NotImplementedError: if the file type is neither arff nor pickle
    """
    ############## Prepare benchmarking ##############
    super(MergeProcess, self).pre_benchmarking()

    # For all input collections
    for source_test_collection_path in self.input_collections:
        # Check if the input data is splitted
        # e.g. only a single test file is in the source directory
        source_files = glob.glob(os.sep.join([source_test_collection_path,
                                              "data_run0", "*test*"]))
        splitted = len(source_files) > 1
        assert(not splitted)
        source_file_name = str(source_files[-1])

        # check if train sets are also present
        train_data_present = len(glob.glob(os.sep.join(
            [source_test_collection_path, "data_run0", "*train*"]))) > 0

        # if training data is present -> use train and test sets separately
        if train_data_present:
            train_set_name_suffix = "train"
        else:
            train_set_name_suffix = "test"

        # We create the collection Rest_vs_Collection
        source_test_collection_name = \
            source_test_collection_path.split(os.sep)[-2]
        test_base_collection_name = \
            source_test_collection_name.strip("}{").split("}{")[0]
        if self.reverse:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                test_base_collection_name + "_vs_Rest")
            key = "train"
        else:
            target_collection_name = source_test_collection_name.replace(
                test_base_collection_name,
                "Rest_vs_" + test_base_collection_name)
            key = "test"

        target_collection_path = os.sep.join([self.result_directory,
                                              target_collection_name])
        # determine the parameter_settings of the test collection
        test_collection = BaseDataset.load(source_test_collection_path)
        target_collection_params = \
            test_collection.meta_data["parameter_setting"]
        target_collection_params["__INPUT_DATASET__"] = \
            {key: source_test_collection_name}

        if source_file_name.endswith("arff"):
            file_ending = "arff"
            # Copy arff file from input collection to target collection.
            # The file name needs the "_" separator ("features_sp0_test"),
            # exactly as for the train files built further below.
            source_test_file_path = os.sep.join(
                [source_test_collection_path, "data_run0",
                 "features_sp0_" + train_set_name_suffix + ".arff"])
            target_test_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        elif source_file_name.endswith("pickle"):
            file_ending = "pickle"
            source_test_file_path = source_test_collection_path
            target_test_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation")

        source_train_pathes = []
        for source_train_collection_path in self.input_collections:
            source_train_collection_name = \
                source_train_collection_path.split(os.sep)[-2]
            # We must not use data originating from the same input
            # collection both in train and test files
            if source_test_collection_name == source_train_collection_name:
                continue

            # Check that all constraints are fulfilled for this pair of
            # input collections.
            # NOTE(review): the constraints are evaluated with eval();
            # they must only come from a trusted operation specification.
            if not all(eval(constraint_template %
                            {'source_train_collection_name':
                                 source_train_collection_name,
                             'source_test_collection_name':
                                 source_test_collection_name})
                       for constraint_template in
                       self.collection_constraints):
                continue

            # check if all parameters are stored in the target path
            source_collection = \
                BaseDataset.load(source_train_collection_path)
            source_collection_params = \
                source_collection.meta_data["parameter_setting"]
            remaining_params = \
                [param for param in source_collection_params.items()
                 if param not in target_collection_params.items() and
                 param[0] not in ["__INPUT_DATASET__",
                                  "__RESULT_DIRECTORY__",
                                  "__OUTPUT_BUNDLE__",
                                  "__INPUT_COLLECTION__"]]  # for old data
            if remaining_params != []:
                # Parameters unknown so far become part of the target
                # directory name and of the target parameter setting
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v

            if "arff" == file_ending:
                source_train_file_path = \
                    os.sep.join([source_train_collection_path,
                                 "data_run0", "features_sp0_" +
                                 train_set_name_suffix + ".arff"])
            elif "pickle" == file_ending:
                source_train_file_path = source_train_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                          "MergeOperation!")
            source_train_pathes.append(source_train_file_path)

        if "arff" == file_ending:
            # NOTE(review): this is built with the same *key* as
            # target_test_file_path above, so both names can coincide -
            # confirm whether the opposite set name was intended here.
            target_train_file_path = os.sep.join(
                [target_collection_path, "data_run0",
                 "features_sp0_" + key + ".arff"])
        elif "pickle" == file_ending:
            target_train_file_path = target_collection_path
        else:
            raise NotImplementedError("File type not supported in "
                                      "MergeOperation!")

        # Nothing to merge for this test collection
        if len(source_train_pathes) == 0:
            continue

        create_directory(os.sep.join([target_collection_path, "data_run0"]))

        if "arff" == file_ending:
            self._copy_arff_file(source_test_file_path,
                                 target_test_file_path,
                                 source_test_collection_name,
                                 target_collection_name)
            self._merge_arff_files(target_train_file_path,
                                   source_train_pathes,
                                   target_collection_name)
            # Copy metadata.yaml
            # TODO: Adapt to new collection
            input_meta = BaseDataset.load_meta_data(
                source_test_collection_path)
            BaseDataset.store_meta_data(target_collection_path, input_meta)
        elif "pickle" == file_ending:
            self._copy_pickle_file(source_test_collection_path,
                                   target_collection_path,
                                   train_set_name_suffix)
            self._merge_pickle_files(target_train_file_path,
                                     source_train_pathes,
                                     train_set_name_suffix,
                                     target_collection_params)
        else:
            raise NotImplementedError(
                "File type not supported in merge_operation")

    ############## Clean up after benchmarking ##############
    super(MergeProcess, self).post_benchmarking()
def __call__(self):
    """ Executes this process on the respective modality

    For every ordered pair of input datasets that fulfils
    ``self.dataset_constraints``, a new dataset ``<base1>_vs_<base2>`` is
    created below ``self.result_directory``. Its train files are taken from
    the first dataset and its test files from the second one. ``arff``
    files are copied with :meth:`_copy_arff_file`, all other files are
    symlinked. A pair consisting of the same dataset twice is only
    symlinked as a whole, and only if the dataset is split.

    Finally the meta data of the first dataset is stored for the new
    dataset, with updated ``train_test``, ``date`` and ``author`` entries.
    """
    ############## Prepare benchmarking ##############
    super(ShuffleProcess, self).pre_benchmarking()

    def transfer(source, target, base_dataset, as_arff):
        """ Copy an arff file or symlink any other file to *target* """
        if as_arff:
            self._copy_arff_file(source, target, base_dataset,
                                 mixed_base_dataset)
        else:
            os.symlink(source, target)

    for dataset_dir1 in self.input_datasets:
        # Both values only depend on the first dataset of the pair
        dataset_name1 = dataset_dir1.split(os.sep)[-2]
        # Check if the input data is split
        splitted = len(glob.glob(os.sep.join([dataset_dir1, "data_run0",
                                              "*"]))) > 1

        for dataset_dir2 in self.input_datasets:
            dataset_name2 = dataset_dir2.split(os.sep)[-2]

            # Check that all constraints are fulfilled for this pair of
            # input datasets.
            # NOTE(review): the constraints are evaluated with eval();
            # they must only come from a trusted operation specification.
            if not all(eval(constraint_template %
                            {'dataset_name1': dataset_name1,
                             'dataset_name2': dataset_name2})
                       for constraint_template in self.dataset_constraints):
                continue

            if dataset_name1 == dataset_name2:
                if splitted:
                    # Copy the data
                    os.symlink(dataset_dir1,
                               os.sep.join([self.result_directory,
                                            dataset_name1]))
                continue

            # Determine names of the original data sets the input
            # datasets are based on
            base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
            base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

            # Determine target dataset name and create directory
            # for it
            mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
                                               base_dataset2)
            target_dataset_name = dataset_name1.replace(base_dataset1,
                                                        mixed_base_dataset)
            target_dataset_dir = os.sep.join([self.result_directory,
                                              target_dataset_name])
            create_directory(os.sep.join([target_dataset_dir, "data_run0"]))

            if splitted:
                # For each split, copy the train data from dataset 1 and
                # the test data from dataset 2 to the target dataset
                for source_train_file_name in glob.glob(
                        os.sep.join([dataset_dir1, "data_run0",
                                     "*_sp*_train.*"])):
                    # TODO: We have $n$ train sets and $n$ test sets, we
                    #       could use all $n*n$ combinations
                    as_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(source_train_file_name, target_train_file_name,
                             base_dataset1, as_arff)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2)
                    source_test_file_name = source_test_file_name.replace(
                        "train.", "test.")
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(source_test_file_name, target_test_file_name,
                             base_dataset2, as_arff)
            else:
                # Use the data set from dataset 1 as training set and
                # the data set from dataset 2 as test data
                for source_train_file_name in glob.glob(
                        os.sep.join([dataset_dir1, "data_run0",
                                     "*_sp*_test.*"])):
                    as_arff = source_train_file_name.endswith("arff")
                    target_train_file_name = source_train_file_name.replace(
                        "test.", "train.")
                    target_train_file_name = target_train_file_name.replace(
                        dataset_dir1, target_dataset_dir)
                    transfer(source_train_file_name, target_train_file_name,
                             base_dataset1, as_arff)

                    source_test_file_name = source_train_file_name.replace(
                        dataset_dir1, dataset_dir2)
                    target_test_file_name = target_train_file_name.replace(
                        "train.", "test.")
                    transfer(source_test_file_name, target_test_file_name,
                             base_dataset2, as_arff)

            # Write metadata.yaml based on input meta data
            input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)
            output_dataset_meta = dict(input_dataset1_meta)
            output_dataset_meta['train_test'] = True
            output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
            try:
                output_dataset_meta['author'] = \
                    pwd.getpwuid(os.getuid())[4]
            except Exception:
                # Best effort only; KeyboardInterrupt and SystemExit are
                # deliberately no longer swallowed here.
                self._log("Author could not be resolved.",
                          level=logging.WARNING)
                output_dataset_meta['author'] = "unknown"
            BaseDataset.store_meta_data(target_dataset_dir,
                                        output_dataset_meta)

    ############## Clean up after benchmarking ##############
    super(ShuffleProcess, self).post_benchmarking()
def _createProcesses(cls, processes, result_dir, data_dict, parameters,
                     dep_par, metrics, logscale, markertype, top_level):
    """Recursive function that is used to create the analysis processes

    Each process creates one plot for each numeric parameter, each pair of
    numeric parameters, and each nominal parameter based on the data
    contained in the *data_dict*. The results are stored in *result_dir*.
    The method calls itself recursively for each value of each parameter.

    **Parameters**

        :processes: queue-like object; created processes are ``put`` into it
        :result_dir: directory the process for this level writes to
        :data_dict: mapping from column name to the list of column values
        :parameters: names of the parameters that can still be projected
        :dep_par: parameters that are never used for a projection
        :metrics, logscale, markertype: passed on to the created processes
        :top_level: whether this is the outermost (non-recursive) call
    """
    # Create the analysis process for the given parameters and the
    # given data and put it in the executing-queue
    process = CompAnalysisProcess(result_dir, data_dict, parameters,
                                  metrics, logscale, markertype)
    processes.put(process)

    # If we have less than two parameters it does not make sense to
    # split further
    if len(parameters) < 2 or len(parameters) == len(dep_par):
        # If we have only one parameter to visualize,
        # we don't need to create any further processes,
        # and we have to finish the creating process.
        # NOTE(review): on this path no ``False`` marker is put into
        # *processes*, even for the top level call - confirm that the
        # consumer does not wait for it.
        return

    # For each parameter
    for proj_parameter in parameters:
        if proj_parameter in dep_par:
            continue
        # We split the data based on the values of this parameter
        remaining_parameters = [parameter for parameter in parameters
                                if parameter != proj_parameter]
        proj_column = data_dict[proj_parameter]
        # For each value the respective projection parameter can take on
        for value in set(proj_column):
            # Project the result dict onto the rows where the respective
            # parameter takes on the given value
            projected_dict = defaultdict(list)
            entries_added = False
            # Iterate the column of the projection parameter itself, not
            # the column of a name left over from the comprehension above
            for i, entry in enumerate(proj_column):
                if entry == value:
                    entries_added = True
                    for column_key in data_dict.keys():
                        if column_key == proj_parameter:
                            continue
                        projected_dict[column_key].append(
                            data_dict[column_key][i])
            # If the projected_dict is empty we continue
            if not entries_added:
                continue

            # Create result_dir and do the recursive call for the
            # projected data
            proj_result_dir = result_dir + os.sep + "%s#%s" % (
                proj_parameter, value)
            create_directory(proj_result_dir)
            cls._createProcesses(processes, proj_result_dir, projected_dict,
                                 remaining_parameters, dep_par, metrics,
                                 logscale, markertype, False)

    if top_level:
        # give executing process the sign that creation is now finished
        processes.put(False)
def store_state(self, result_dir, index=None):
    """ Stores this node in the given directory *result_dir*

    If ``self.store`` is set, the metric is evaluated on the test data for
    every possible threshold. The resulting curve is kept in
    ``self.predictions_test`` (a list of thresholds and a list of metric
    values) and plotted together with the curve on the training data
    (``self.predictions_train``), the original threshold and the optimized
    threshold. The plot is written to
    ``<result_dir>/<ClassName>/threshold_metric.pdf``.

    Storing the plot is best effort: if anything fails, a warning is logged
    and the method carries on. Finally the ``store_state`` method of the
    parent class is called.

    **Parameters**

        :result_dir: directory in which the node directory is created
        :index: accepted for interface compatibility; not used here
    """
    if self.store:
        try:
            # Create metric function lazily since it cannot be pickled
            metric_fct = self._get_metric_fct()

            # Determine curve on test data
            # TODO: Code duplication (mostly already in train)
            predictions_test = []
            labels_test = []
            for data, label in self.input_node.request_data_for_testing():
                predictions_test.append(data.prediction)
                labels_test.append(self.classes.index(label))

            sort_index = numpy.argsort(predictions_test)
            labels_test = numpy.array(labels_test)[sort_index]
            predictions_test = numpy.array(predictions_test)[sort_index]

            # Determine orientation of hyperplane: start with every sample
            # on one side of the threshold and move the samples one by one
            # to the other side while passing through the sorted predictions
            label_list = list(labels_test)
            num_negative = label_list.count(0)
            num_positive = label_list.count(1)
            if self.orientation_up:
                TP, FP, TN, FN = num_positive, num_negative, 0, 0
                step = 1
            else:
                TP, FP, TN, FN = 0, 0, num_negative, num_positive
                step = -1

            self.predictions_test = [[], []]
            for label, prediction_value in zip(labels_test,
                                               predictions_test):
                if label == 0:
                    TN += step
                    FP -= step
                elif label == 1:
                    FN += step
                    TP -= step
                assert (TP >= 0 and FP >= 0 and TN >= 0 and FN >= 0), \
                    "TP: %s FP: %s TN: %s FN: %s" % (TP, FP, TN, FN)
                metric_value = metric_fct(TP, FP, TN, FN)
                self.predictions_test[0].append(prediction_value)
                self.predictions_test[1].append(metric_value)

            ### Plot ##
            import pylab
            pylab.close()
            # Get this from LaTeX using \showthe\columnwidth
            fig_width_pt = 307.28987 * 2
            inches_per_pt = 1.0 / 72.27  # Convert pt to inches
            fig_width = fig_width_pt * inches_per_pt  # width in inches
            fig_height = fig_width * 0.5  # height in inches
            fig_size = [fig_width, fig_height]
            # 'font.size' replaces the former alias 'text.fontsize', which
            # current matplotlib versions reject as an invalid rc parameter
            params = {'axes.labelsize': 10,
                      'font.size': 8,
                      'legend.fontsize': 8,
                      'xtick.labelsize': 10,
                      'ytick.labelsize': 10}
            pylab.rcParams.update(params)
            fig = pylab.figure(0, dpi=400, figsize=fig_size)

            xmin = min(min(self.predictions_train[0]),
                       min(self.predictions_test[0]))
            xmax = max(max(self.predictions_train[0]),
                       max(self.predictions_test[0]))
            ymin = min(min(self.predictions_train[1]),
                       min(self.predictions_test[1]))
            ymax = max(max(self.predictions_train[1]),
                       max(self.predictions_test[1]))

            pylab.plot(self.predictions_train[0], self.predictions_train[1],
                       'b', label='Training data')
            pylab.plot(self.predictions_test[0], self.predictions_test[1],
                       'g', label='Unseen test data')
            pylab.plot([self.classifier_threshold,
                        self.classifier_threshold],
                       [ymin, ymax], 'r', label='Original Threshold', lw=5)
            pylab.plot([self.threshold, self.threshold], [ymin, ymax],
                       'c', label='Optimized Threshold', lw=5)
            pylab.legend(loc=0)
            pylab.xlim((xmin, xmax))
            pylab.ylim((ymin, ymax))
            pylab.xlabel("Threshold value")
            pylab.ylabel("Metric: %s" % self.metric)

            # Store plot
            from pySPACE.tools.filesystem import create_directory
            import os
            node_dir = os.path.join(result_dir, self.__class__.__name__)
            create_directory(node_dir)
            pylab.savefig(node_dir + os.sep + "threshold_metric.pdf")
        except Exception as e:
            # Best effort: the node itself is still stored below. Report
            # what actually went wrong instead of an unrelated message.
            self._log("Storing the threshold metric plot failed: %s" % e,
                      level=logging.WARNING)
    super(ThresholdOptimizationNode, self).store_state(result_dir)
def store_state(self, result_dir, index=None):
    """ Stores plots of score distribution and sigmoid fit or/and
    the calculated probabilities with the corresponding label.

    Nothing is done unless ``self.store`` is set. Everything is written to
    ``<result_dir>/<ClassName>``:

    * if ``self.store_probabilities`` is set, ``self.probabilities`` is
      pickled to ``probabilities_<split>.pickle``
    * if ``self.store_plots`` is set, a 2x2 figure with the empirical
      probabilities of training and test data, before and after the
      sigmoid fit, is saved as ``reliable_diagrams_<split>.png``

    **Parameters**

        :result_dir: directory in which the node directory is created
        :index: accepted for interface compatibility; not used here

    .. todo:: change plot calculations to upper if else syntax
    .. todo:: add the corresponding data point to the saved probabilities
    """
    if not self.store:
        return

    # Create the directory for the stored results
    from pySPACE.tools.filesystem import create_directory
    import os
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    # Save the probabilities in a pickle file
    if self.store_probabilities:
        import pickle
        f_name = node_dir + "/probabilities_%d.pickle" % self.current_split
        # Pickles belong into binary files; the context manager makes
        # sure the file is flushed and closed.
        with open(f_name, 'wb') as probability_file:
            pickle.dump(self.probabilities, probability_file)

    if not self.store_plots:
        return

    def apply_sigmoid(scores):
        """ Map *scores* to probabilities with the fitted sigmoid """
        fApB = scores * self.A + self.B
        # Two algebraically equal forms of 1/(1+exp(fApB)) are used,
        # depending on the sign, so that exp() is never called with a
        # large positive argument.
        return [(int(fApB[i] < 0) + int(fApB[i] >= 0) * numpy.exp(-fApB[i]))
                / (1.0 + numpy.exp((-1) ** int(fApB[i] >= 0) * fApB[i]))
                for i in range(len(fApB))]

    def sorted_by_score(scores, labels):
        """ Return *scores* and *labels* as arrays, sorted by score """
        sort_index = numpy.argsort(scores)
        return numpy.array(scores)[sort_index], \
            numpy.array(labels)[sort_index]

    # reliable plot of training (before sigmoid fit)
    predictions, labels = sorted_by_score(self.scores, self.labels)
    plot_scores_train, l_discrete_train = self._discretize(
        predictions, labels)
    len_list_train, plot_emp_prob_train = self._empirical_probability(
        l_discrete_train)

    # training data after sigmoid fit
    plot_scores_train_fit, l_discrete_train_fit = \
        self._discretize(apply_sigmoid(predictions), labels)
    len_list_train_fit, plot_emp_prob_train_fit = \
        self._empirical_probability(l_discrete_train_fit)

    # test data before sigmoid fit
    test_scores = []
    test_labels = []
    for data, label in self.input_node.request_data_for_testing():
        test_scores.append(data.prediction)
        test_labels.append(self.class_labels.index(label))
    predictions, labels = sorted_by_score(test_scores, test_labels)
    plot_scores_test, l_discrete_test = self._discretize(
        predictions, labels)
    len_list_test, plot_emp_prob_test = self._empirical_probability(
        l_discrete_test)

    # test data after sigmoid fit
    plot_scores_test_fit, l_discrete_test_fit = \
        self._discretize(apply_sigmoid(predictions), labels)
    len_list_test_fit, plot_emp_prob_test_fit = \
        self._empirical_probability(l_discrete_test_fit)

    import pylab
    from matplotlib.transforms import offset_copy
    pylab.close()
    fig = pylab.figure(figsize=(10, 10))

    def draw_panel(position, plot_scores, emp_prob, len_list, x_label,
                   with_sigmoid):
        """ Draw one of the four reliability diagrams """
        ax = pylab.subplot(2, 2, position)
        transOffset = offset_copy(ax.transData, fig=fig, x=0.05, y=0.1,
                                  units='inches')
        # One point per bin, annotated with the number of bin members
        for x, y, s in zip(plot_scores, emp_prob[1], len_list[1]):
            pylab.plot((x, ), (y, ), 'ro')
            pylab.text(x, y, '%d' % s, transform=transOffset)
        # Diagonal as reference
        pylab.plot((plot_scores[0], plot_scores[-1]), (0, 1), '-')
        if with_sigmoid:
            x = numpy.arange(plot_scores[0], plot_scores[-1], .02)
            y = 1 / (1 + numpy.exp(self.A * x + self.B))
            pylab.plot(x, y, '-')
        pylab.xlim(plot_scores[0], plot_scores[-1])
        pylab.ylim(0, 1)
        pylab.xlabel(x_label)
        pylab.ylabel("Empirical Probability")

    draw_panel(1, plot_scores_train, plot_emp_prob_train, len_list_train,
               "SVM prediction Score (training data)", True)
    draw_panel(2, plot_scores_train_fit, plot_emp_prob_train_fit,
               len_list_train_fit, "SVM Probability (training data)", False)
    draw_panel(3, plot_scores_test, plot_emp_prob_test, len_list_test,
               "SVM prediction Scores (test data)", True)
    draw_panel(4, plot_scores_test_fit, plot_emp_prob_test_fit,
               len_list_test_fit, "SVM Probability (test data)", False)

    pylab.savefig(node_dir + "/reliable_diagrams_%d.png"
                  % self.current_split)
def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting,
                            hide_parameters):
    """ Determines the name of the result directory

    Determines the name of the result directory based on the
    input_dataset_dir, the node_chain_name and the parameter setting,
    creates the directory and returns its path.

    The directory name has the form ``{<input name>}{<key>#<value>}{...}``.
    If that name is too long, it is replaced by an md5 hash of itself. If
    creating the directory still fails because the name is too long, the
    parameter values are replaced by their hashes.

    **Parameters**

        :base_dir: directory in which the result directory is created
        :input_dataset_dir: dataset directory, relative to the storage
        :parameter_setting: mapping of parameter names to values
            (not modified, a copy is used)
        :hide_parameters: parameter names left out of the directory name
    """
    import errno
    import platform

    # Determine the result_directory name
    # String between Key and value changed from ":" to "#",
    # because ot problems in windows and with windows file servers
    def _get_result_dir_name(parameter_setting, hide_parameters,
                             method=None):
        """ internal function to create result dir name in different ways"""
        if not method:
            parameter_str = "}{".join(
                ("%s#%s" % (key, value))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)
        elif method == "hash":
            parameter_str = "}{".join(
                ("%s#%s" % (key, hash(str(value).replace(" ", ""))))
                for key, value in parameter_setting.iteritems()
                if key not in hide_parameters)

        # Remove characters that are unwanted in directory names
        for unwanted in ("'", " ", "[", "]", os.sep):
            parameter_str = parameter_str.replace(unwanted, "")

        result_name = "{%s}" % input_name
        if parameter_str != "":
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        # and create the directory if necessary
        result_dir = base_dir
        result_dir += os.sep + result_name

        # filename is to long
        # (longer than allowed including optional offsets for pyspace
        # result csv naming conventions)
        # create a md5 hash of the result name and use that one
        if platform.system() == "Windows":
            # the maximum length for a filename on Windows is 255
            max_name_length = 255
        else:
            max_name_length = os.pathconf(os.curdir, "PC_NAME_MAX")
        if len(result_dir) > max_name_length - 32:
            result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
            result_dir = base_dir
            result_dir += os.sep + result_name
        return result_dir

    input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
    input_name = input_name.strip("{}")
    # If the input is already the result of an operation
    if input_name.count("}{") > 0:
        input_name_parts = input_name.split("}{")
        input_name = input_name_parts[0]

    # Load the input meta data
    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                               input_dataset_dir])
    dataset_md = BaseDataset.load_meta_data(dataset_dir)

    # We are going to change the parameter_setting and don't want to
    # interfere with later runs so we work on a copy
    parameter_setting = copy.deepcopy(parameter_setting)

    # Ignore pseudo parameter "__PREPARE_OPERATION__"
    if "__PREPARE_OPERATION__" in parameter_setting:
        parameter_setting.pop("__PREPARE_OPERATION__")

    # Add the input parameters meta data to the given parameter setting
    if "parameter_setting" in dataset_md:
        parameter_setting.update(dataset_md["parameter_setting"])

    # We have to remove ' characters from the parameter value since
    # Weka does ignore them
    # NOTE(review): eval() executes the parameter value as code; the
    # parameter settings must only come from trusted specifications.
    for key, value in parameter_setting.iteritems():
        if isinstance(value, basestring) and value.count("'") > 1:
            parameter_setting[key] = eval(value)

    result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
    try:
        create_directory(result_dir)
    except OSError as e:
        # The numeric value of ENAMETOOLONG differs between platforms,
        # so compare against the symbolic constant.
        if e.errno == errno.ENAMETOOLONG:
            # filename is too long
            result_dir = _get_result_dir_name(parameter_setting,
                                              hide_parameters, "hash")
            create_directory(result_dir)
        # NOTE(review): every other OSError is ignored here and the path
        # is returned although the directory may not exist - confirm.

    return result_dir
def store_state(self, result_dir, index=None):
    """ Store the plots in *result_dir*

    Used for offline plotting and for plotting of average values (online
    and offline). In offline mode every trial that has not been skipped is
    plotted. Optionally a movie is created from the stored images.
    Called by base_node.

    **Parameters**

        :result_dir: string, the results directory
        :index: None or int, the number of this node in the node chain

    Returns: Nothing.
    """
    no_plot_function = (
        "VisualizationBase:: The node you are using for visualisation "
        "has no function _plotValues! This is most likely not what you intended!"
        "Plotting ignored!")

    # Without storing there is no specific directory
    node_dir = None
    if self.store:
        # Directory of this particular node, extended by the index number
        # of the node if there is one
        node_dir = os.path.join(result_dir, self.__class__.__name__)
        if index is not None:
            node_dir = node_dir + "_%d" % int(index)
        create_directory(node_dir)

    # Offline mode: plot the single trials and/or accumulated averages
    if not self.online and (self.single_trial or self.accum_avg):
        if hasattr(self, "_plotValues"):
            # Position in the value lists; only counts non-skipped trials
            pos = 0
            for trial_num in range(1, self.trial_counter + 1):
                if trial_num in self.skipped_trials:
                    continue
                if self.single_trial:
                    self._plotValues(
                        values=self.st_list[pos],
                        plot_label="single_trial_no_" + str(trial_num),
                        fig_num=self.initial_fig_num + 2,
                        store_dir=node_dir,
                        counter=trial_num)
                if self.accum_avg:
                    self._plotValues(
                        values=self.accum_list[pos],
                        plot_label="accum_avg_no_" + str(trial_num),
                        fig_num=self.initial_fig_num + 3,
                        store_dir=node_dir,
                        counter=trial_num)
                pos += 1
        else:
            warnings.warn(no_plot_function)

    # Plotting the whole average and storing the movie is also possible in
    # online mode; there the execution path with a timestamp is used as
    # directory (see __init__)
    if self.online:
        node_dir = self.user_dir

    # Plot the overall average if averaging is intended
    if self.averaging:
        if not self.avg_values:
            warnings.warn("VisualizationBase:: One of your averages has no "
                          "instances! Plotting ignored!")
        elif hasattr(self, "_plotValues"):
            self._plotValues(values=self.avg_values,
                             plot_label="average",
                             fig_num=self.initial_fig_num + 1,
                             store_dir=node_dir)
        else:
            warnings.warn(no_plot_function)

    # Finally create a movie if specified
    if self.create_movie and self.store_data:
        movie_prefixes = []
        if self.single_trial:
            movie_prefixes.extend(
                "single_trial_no_" + str(trial)
                for trial in range(1, self.trial_counter + 1))
        if self.accum_avg:
            movie_prefixes.extend(
                "accum_avg_no_" + str(trial)
                for trial in range(1, self.trial_counter + 1))
        if self.averaging:
            movie_prefixes.append('average')
        self._create_movie(prefixes=movie_prefixes, directory=node_dir)

    # close the figure windows
    pylab.close('all')
def store_state(self, result_dir, index=None):
    """ Stores plots of score distribution and of the linear score fit

    If *self.store* is set, one figure with four panels (empirical
    probability over discretized scores) is written to
    ``<result_dir>/<class name>/reliable_diagrams_<current_split>.png``:

        1. training data, raw prediction scores, with the fitted mapping
        2. training data after applying the fit
        3. test data, raw prediction scores, with the clipped mapping
        4. test data after applying the clipped fit

    The mapping is piecewise linear: negative scores are scaled with
    ``max_range[0]`` and non-negative scores with ``max_range[1]``, such
    that a score of ``-max_range[0]`` maps to 0, 0 maps to 0.5 and
    ``max_range[1]`` maps to 1. For test data, results are additionally
    clipped to [0, 1].

    **Parameters**

        :result_dir: directory in which the node sub directory is created
        :index:      accepted for interface compatibility, not used here

    Returns: Nothing.
    """
    if not self.store:
        return

    neg_range = self.max_range[0]
    pos_range = self.max_range[1]

    def linear_fit(score, clip):
        """ Map *score* with the piecewise linear fit, optionally clipped """
        if clip and score < -1.0 * neg_range:
            return 0.0
        if score < 0.0:
            return (score + neg_range) / (2.0 * neg_range)
        if not clip or score < pos_range:
            return (score + pos_range) / (2.0 * pos_range)
        return 1.0

    # reliable plot of training (before linear fit)
    sort_index = numpy.argsort(self.scores)
    labels = numpy.array(self.labels)[sort_index]
    predictions = numpy.array(self.scores)[sort_index]
    plot_scores_train, l_discrete_train = self._discretize(
        predictions, labels)
    len_list_train, plot_emp_prob_train = self._empirical_probability(
        l_discrete_train)

    # training data after linear fit (no clipping, as before)
    new_predictions = [linear_fit(score, False) for score in predictions]
    plot_scores_train_fit, l_discrete_train_fit = self._discretize(
        new_predictions, labels)
    len_list_train_fit, plot_emp_prob_train_fit = \
        self._empirical_probability(l_discrete_train_fit)

    # test data before linear fit
    test_scores = []
    test_labels = []
    for data, label in self.input_node.request_data_for_testing():
        test_scores.append(data.prediction)
        test_labels.append(self.class_labels.index(label))
    sort_index = numpy.argsort(test_scores)
    labels = numpy.array(test_labels)[sort_index]
    predictions = numpy.array(test_scores)[sort_index]
    plot_scores_test, l_discrete_test = self._discretize(
        predictions, labels)
    len_list_test, plot_emp_prob_test = self._empirical_probability(
        l_discrete_test)

    # test data after linear fit, clipped to [0, 1]
    new_predictions = [linear_fit(score, True) for score in predictions]
    plot_scores_test_fit, l_discrete_test_fit = self._discretize(
        new_predictions, labels)
    len_list_test_fit, plot_emp_prob_test_fit = \
        self._empirical_probability(l_discrete_test_fit)

    # plotting dependencies are only imported when plots are stored
    from pySPACE.tools.filesystem import create_directory
    import os
    node_dir = os.path.join(result_dir, self.__class__.__name__)
    create_directory(node_dir)

    import pylab
    from matplotlib.transforms import offset_copy

    pylab.close()
    fig = pylab.figure(figsize=(10, 10))

    def draw_panel(position, scores, emp_prob, counts, xlabel, curve=None):
        """ Draw one reliability diagram into subplot *position* """
        ax = pylab.subplot(2, 2, position)
        # shift the count labels slightly away from their markers
        text_offset = offset_copy(ax.transData, fig=fig,
                                  x=0.05, y=0.1, units='inches')
        for x, y, s in zip(scores, emp_prob, counts):
            pylab.plot((x, ), (y, ), 'ro')
            pylab.text(x, y, '%d' % s, transform=text_offset)
        # reference diagonal over the plotted score range
        pylab.plot((scores[0], scores[-1]), (0, 1), '-')
        if curve is not None:
            pylab.plot(curve[0], curve[1], '-')
        pylab.xlim(scores[0], scores[-1])
        pylab.ylim(0, 1)
        pylab.xlabel(xlabel)
        pylab.ylabel("Empirical Probability")

    # sampled fit function between the two knees
    x1 = numpy.arange(-1.0 * neg_range, 0.0, .02)
    x2 = numpy.arange(0.0, pos_range, .02)
    y1 = (x1 + neg_range) / (2 * neg_range)
    y2 = (x2 + pos_range) / (2 * pos_range)

    draw_panel(1, plot_scores_train, plot_emp_prob_train[1],
               len_list_train[1], "SVM prediction Score (training data)",
               curve=(numpy.concatenate((x1, x2)),
                      numpy.concatenate((y1, y2))))

    draw_panel(2, plot_scores_train_fit, plot_emp_prob_train_fit[1],
               len_list_train_fit[1], "SVM Probability (training data)")

    # The clipped mapping is flat at 0 up to the lower knee and flat at 1
    # from the upper knee on. The lower knee lies at -max_range[0]
    # (previously +max_range[0] was used, which made the line double back).
    clipped_x = numpy.concatenate([
        [plot_scores_test[0], -1.0 * neg_range],
        x1, x2,
        [pos_range, plot_scores_test[-1]]])
    clipped_y = numpy.concatenate([[0.0, 0.0], y1, y2, [1.0, 1.0]])
    draw_panel(3, plot_scores_test, plot_emp_prob_test[1],
               len_list_test[1], "SVM prediction Score (test data)",
               curve=(clipped_x, clipped_y))

    draw_panel(4, plot_scores_test_fit, plot_emp_prob_test_fit[1],
               len_list_test_fit[1], "SVM Probability (test data)")

    pylab.savefig(os.path.join(
        node_dir, "reliable_diagrams_%d.png" % self.current_split))
    # release the figure instead of leaving it open until the next call
    pylab.close(fig)