Python BaseDataset Examples, pySPACE.resources.dataset_defs.base.BaseDataset Python Examples

Example #1

0

Show file

File: merge.py Project: ahmedadelhassan/pyspace

    def _merge_pickle_files(self, target_collection_path,
                            source_collection_pathes, train_set_name_suffix,
                            target_collection_params):
        """ Merge all collections in source_collection_pathes into one dataset

        The first source collection is loaded and used as container: the data
        of all remaining collections is appended to it for every run and
        split. The result is stored at *target_collection_path*.

        **Parameters**

            :target_collection_path:
                The path where the merged dataset is stored.

            :source_collection_pathes:
                Paths of the datasets to merge. The first one provides the
                meta data and the initial data of the merged dataset.

            :train_set_name_suffix:
                Suffix handed to ``get_data`` to select the data to merge.
                If it is "test" and ``self.reverse`` is not set, the merged
                data is stored under the "train" key instead.

            :target_collection_params:
                Parameter setting written to the meta data of the result.
                Modified in place (``__INPUT_DATASET__`` and
                ``__RESULT_DIRECTORY__`` entries are set).
        """

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name; a missing entry is fine, any other
        # error is no longer silently swallowed
        target_collection.meta_data.pop("node_chain_file_name", None)
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
            [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author": author,
            "date": date,
            "dataset_directory": target_collection_path,
            "train_test": True,
            "parameter_setting": target_collection_params,
            "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)
                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing.
            # Iterate over a snapshot of the keys, because entries are
            # removed and re-inserted inside the loop.
            for key in list(target_collection.data.keys()):
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                target_collection.data[(key[0], key[1], "train")] = value

        target_collection.store(target_collection_path)

Example #2

0

Show file

File: merge.py Project: BioinformaticsArchive/pyspace

 def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                               train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes into one dataset

     The first source collection is loaded and used as container: the data
     of all remaining collections is appended to it for every run and
     split. The result is stored at *target_collection_path*.

     **Parameters**

         :target_collection_path:
             The path where the merged dataset is stored.

         :source_collection_pathes:
             Paths of the datasets to merge. The first one provides the
             meta data and the initial data of the merged dataset.

         :train_set_name_suffix:
             Suffix handed to ``get_data`` to select the data to merge.
             If it is "test" and ``self.reverse`` is not set, the merged
             data is stored under the "train" key instead.

         :target_collection_params:
             Parameter setting written to the meta data of the result.
             Modified in place (``__INPUT_DATASET__`` and
             ``__RESULT_DIRECTORY__`` entries are set).
     """

     # load a first collection, in which the data of all other collections
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     try:
         author = pwd.getpwuid(os.getuid())[4]
     except Exception:
         # Best effort only; KeyboardInterrupt/SystemExit are no longer
         # swallowed here
         author = "unknown"
         self._log("Author could not be resolved.",level=logging.WARNING)
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name; a missing entry is fine, any other
     # error is no longer silently swallowed
     target_collection.meta_data.pop("node_chain_file_name", None)
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author,
             "date" : date,
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })

     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 data = source_collection.get_data(run, split,
                                                       train_set_name_suffix)
                 target_data = target_collection.get_data(run, split,
                                                       train_set_name_suffix)
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)

     # if only test data was given, the "Rest_vs" collection is stored as
     # training data
     if not self.reverse and "test" == train_set_name_suffix:
         # exchange the "test" in key tuple to "train" before storing.
         # Iterate over a snapshot of the keys, because entries are
         # removed and re-inserted inside the loop.
         for key in list(target_collection.data.keys()):
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             target_collection.data[(key[0], key[1], "train")] = value

     target_collection.store(target_collection_path)

Example #3

0

Show file

File: dummy.py Project: MMKrell/pyspace

    def store(self, result_dir, s_format = "None"):
        """ Store the meta data of this dataset in *result_dir*

        **Parameters**

            :result_dir:
                Directory handed to ``BaseDataset.store_meta_data``.

            :s_format:
                Only the string "None" is accepted. For any other value a
                critical message is logged and nothing is stored.
        """
        if s_format != "None":
            message = "The format %s is not supported!" % s_format
            self._log(message, level=logging.CRITICAL)
            return

        # Describe in the meta data that no actual data is written
        author_name = get_author()
        meta_update = {
            "type": "only output of individual nodes stored",
            "storage_format": s_format,
            "author": author_name,
            "data_pattern": "no data stored",
        }
        self.update_meta_data(meta_update)

        # Write the meta data to the result directory
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #4

0

Show file

File: weka_classification.py Project: pyspace/test

 def _createProcesses(cls, processes, result_directory, operation_spec, 
             parameter_settings, input_collections, command_template):     
  
     # For each combination of classifier, input-collection and
     # run number, create one WEKA_process
     for dataset_dir in input_collections:
         collection = BaseDataset.load(dataset_dir)
         # Determine the number of iterations and splits to be used
         iterations = collection.meta_data["runs"]
         splits = collection.meta_data["splits"] 
         if "runs" in operation_spec:
             assert(iterations in [1, operation_spec["runs"]])
             iterations = operation_spec["runs"]
         if "cv_folds" in operation_spec:
             assert(splits in [1, operation_spec["cv_folds"]])
             splits = operation_spec["cv_folds"]          
         
         for parametrization in parameter_settings: 
             for run_number in range(iterations):
                 process = WEKAClassificationProcess(dataset_dir,
                                                     command_template,
                                                     parametrization,
                                                     splits,
                                                     run_number,
                                                     result_directory)
                 processes.put(process)
     # give executing process the sign that creation is now finished                
     processes.put(False)

Example #5

0

Show file

File: merge.py Project: jhuebotter/pyspace

 def _copy_file(self, source_collection_path, target_collection_path,
                train_set_name_suffix):
     """ Copy a dataset to a new destination

     **Parameters**

         :source_collection_path:
             The path to the dataset that has to be copied.

         :target_collection_path:
             The path to where the dataset should be copied.

         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if the target dataset is
             handled as training or testing data.
     """
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as
     # training data
     if self.reverse and "test" == train_set_name_suffix:
         # exchange the "test" in key tuple to "train" before storing.
         # Iterate over a snapshot of the keys, because entries are
         # removed and re-inserted inside the loop.
         for key in list(source_collection.data.keys()):
             assert ("test" == key[2])
             value = source_collection.data.pop(key)
             source_collection.data[(key[0], key[1], "train")] = value
     # we store the data in the same format as before
     source_collection.store(target_collection_path,
                             source_collection.meta_data["storage_format"])

Example #6

0

Show file

File: dummy.py Project: jhuebotter/pyspace

    def store(self, result_dir, s_format="None"):
        """ Write only the meta data of this dataset to *result_dir*

        Any *s_format* other than the string "None" is rejected: a critical
        message is logged and the method returns without storing anything.
        """
        if s_format != "None":
            self._log("The format %s is not supported!" % s_format,
                      level=logging.CRITICAL)
            return

        # Record in the meta data that no actual data is stored
        creator = get_author()
        self.update_meta_data(dict([
            ("type", "only output of individual nodes stored"),
            ("storage_format", s_format),
            ("author", creator),
            ("data_pattern", "no data stored"),
        ]))

        # Persist the meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #7

0

Show file

File: merge.py Project: MMKrell/pyspace

 def _copy_file(self, source_collection_path, target_collection_path,
                train_set_name_suffix):
     """ Copy a dataset to a new destination

     **Parameters**

         :source_collection_path:
             The path to the dataset that has to be copied.

         :target_collection_path:
             The path to where the dataset should be copied.

         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if the target dataset is
             handled as training or testing data.
     """
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as
     # training data
     if self.reverse and "test" == train_set_name_suffix:
         # exchange the "test" in key tuple to "train" before storing.
         # Iterate over a snapshot of the keys, because entries are
         # removed and re-inserted inside the loop.
         for key in list(source_collection.data.keys()):
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             source_collection.data[(key[0],key[1],"train")] = value
     # we store the data in the same format as before
     source_collection.store(target_collection_path,
         source_collection.meta_data["storage_format"])

Example #8

0

Show file

    def __init__(self, dataset_dir, command_template, parametrization,
                 run_number, split_number, operation_result_dir,
                 hide_parameters = []):
        """ Set up the result directory and the WEKA command of this process

        **Parameters**

            :dataset_dir:
                Directory of the input dataset. Its second to last path
                component is used as base of the result collection name.

            :command_template:
                Template of the WEKA command. Placeholders are replaced
                repeatedly with the entries of ``self.params`` until the
                string does not change any more.

            :parametrization:
                Dictionary of custom parameters for the command template.
                Values found in the abbreviations file are expanded.

            :run_number:
                Number of the run; used for the name of the directory for
                intermediate results and as template parameter.

            :split_number:
                Number of the split; available as template parameter.

            :operation_result_dir:
                Directory below which the result directory is created.

            :hide_parameters:
                Names of parameters that are not encoded in the name of
                the result collection. The list is only read.
        """
        super(WEKAFilterProcess, self).__init__()

        # Determine the directory in which the process' results are stored
        result_collection_name = dataset_dir.split(os.sep)[-2]
        for parameter_name, parameter_value in parametrization.iteritems():
            # If this is a parameter that should not be hidden, then we have to
            # encode it in the result collection name
            if parameter_name not in hide_parameters:
                result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                           parameter_value)

        self.result_directory = os.path.join(operation_result_dir,
                                             result_collection_name)

        # Create directory for intermediate results if it does not exist yet
        create_directory(self.result_directory
                              + os.sep + "data_run%s" % run_number)

        # The dataset is still loaded as before, but the result is not
        # needed here, so it is no longer bound to an unused local
        BaseDataset.load(dataset_dir)

        # The parametrization that is independent of the collection type
        # and the specific weka command template that is executed
        self.params = {"dataset_name": dataset_dir.replace('/','_'),
                       "dataset_dir": dataset_dir,
                       "run_number": run_number,
                       "split_number": split_number,
                       "weka_class_path": pySPACE.configuration.weka_class_path,
                       "temp_results": self.result_directory}

        # Load the abbreviations and make sure the file handle is closed
        # again, even if parsing fails
        abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               'operations/weka_templates',
                                               'abbreviations.yaml'), 'r')
        try:
            # NOTE(review): yaml.load can construct arbitrary objects; this
            # is only acceptable as long as the spec directory is trusted
            self.abbreviations = yaml.load(abbreviations_file)
        finally:
            abbreviations_file.close()
        # Add custom parameters for the weka command template
        for parameter_name, parameter_value in parametrization.iteritems():
            # Auto-expand abbreviations
            if parameter_value in self.abbreviations:
                parameter_value = self.abbreviations[parameter_value]
            self.params[parameter_name] = parameter_value

        # Build the WEKA command by repeatedly replacing all placeholders in
        # the template
        while True:
            instantiated_template = command_template % self.params
            if instantiated_template == command_template:
                # All placeholders replaced
                self.weka_command = instantiated_template
                break
            else:
                # We have to continue since we are not converged
                command_template = instantiated_template

        self.handler_class = None

Example #9

0

Show file

File: dummy.py Project: pyspace/test

    def store(self, result_dir, s_format = "None"):
        """ Store the meta data of this dataset in *result_dir*

        **Parameters**

            :result_dir:
                Directory handed to ``BaseDataset.store_meta_data``.

            :s_format:
                Only the string "None" is accepted. For any other value a
                critical message is logged and nothing is stored.
        """
        if not s_format == "None":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            # Resolving the author is best effort only, but
            # KeyboardInterrupt/SystemExit must not be swallowed
            author = "unknown"
            self._log("Author could not be resolved.",level=logging.WARNING)
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "no data stored"})

        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)

Example #10

0

Show file

File: weka_classification.py Project: pyspace/test

    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """
        A factory method that creates an WEKA operation based on the
        information given in the operation specification operation_spec

        **Parameters**

            :operation_spec:
                Specification of the operation. Its "type" has to be
                "weka_classification"; "template" names the command
                template file and the optional "runs" entry fixes the
                number of runs.

            :result_directory:
                Directory handed to the created processes and operation.

            :debug:
                If True, all processes are created before the operation
                object is returned; otherwise they are created in a
                separate process using a queue limited to 100 entries.

            :input_paths:
                Directories of the input datasets. The list is only read.
        """
        assert(operation_spec["type"] == "weka_classification")
        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        # Read the command template from a file; the file is closed even
        # if reading fails
        template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               "operations",
                                               "weka_templates",
                                               operation_spec["template"]),
                             'r')
        try:
            command_template = template_file.read()
        finally:
            template_file.close()

        # number of processes
        if "runs" in operation_spec:
            number_processes = len(input_paths) * len(parameter_settings) * \
                           operation_spec["runs"]
        else: # approximate the number of processes
            runs = []
            for dataset_dir in input_paths:
                collection = BaseDataset.load(dataset_dir)
                runs.append(collection.meta_data["runs"])
            # max() raises a ValueError for an empty list; without input
            # datasets no process is created at all
            if runs:
                max_runs = max(runs)
            else:
                max_runs = 0
            number_processes = len(input_paths) * len(parameter_settings) * \
                               max_runs

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, input_paths,
                                 command_template)
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another thread so that already created processes can be executed in
            # parallel. Therefore a queue is used which size is maximized to
            # guarantee that not too many objects are created (because this costs
            # memory). However, the actual number of 100 is arbitrary and might
            # be reviewed.
            processes = processing.Queue(100)
            create_process = processing.Process(target=cls._createProcesses,
                             args=( processes, result_directory, operation_spec,
                                    parameter_settings, input_paths,
                                    command_template))
            create_process.start()
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)

Example #11

0

Show file

File: node_chain.py Project: AlexanderFabisch/pyspace

    def _get_result_dataset_dir(base_dir, input_dataset_dir,
                                   parameter_setting, hide_parameters):
        """ Determine, create and return the result directory for one input

        The directory name is composed of the name of the input dataset
        and the parameter setting, written as ``{name}{key#value}{...}``.

        **Parameters**

            :base_dir:
                Directory below which the result directory is created.

            :input_dataset_dir:
                Directory of the input dataset, relative to
                ``pySPACE.configuration.storage``.

            :parameter_setting:
                Parameters of the current run. A deep copy is modified,
                the given dictionary is left untouched.

            :hide_parameters:
                Parameter names that are not encoded in the directory name.
        """
        # Last path component without surrounding braces; if the input is
        # already the result of an operation, only the part in front of the
        # first "}{" is the name of the original input
        last_component = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = last_component.strip("{}").partition("}{")[0]

        # Load the input meta data
        dataset_md = BaseDataset.load_meta_data(
            os.sep.join([pySPACE.configuration.storage, input_dataset_dir]))

        # Work on a copy, since the setting is changed below and later runs
        # must not be affected
        parameter_setting = copy.deepcopy(parameter_setting)

        # The pseudo parameter "__PREPARE_OPERATION__" is ignored
        parameter_setting.pop("__PREPARE_OPERATION__", None)

        # Add the parameters stored in the meta data of the input
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        # NOTE(review): eval executes arbitrary expressions; this is only
        # acceptable as long as the parameter values are trusted
        for name, setting in parameter_setting.iteritems():
            if isinstance(setting, basestring) and setting.count("'") > 1:
                parameter_setting[name] = eval(setting)

        # Key and value are separated by "#" instead of ":" because of
        # problems in windows and with windows file servers
        visible_parameters = ["%s#%s" % (name, setting)
                              for name, setting in parameter_setting.iteritems()
                              if name not in hide_parameters]
        parameter_str = "}{".join(visible_parameters)

        result_name = "{%s}" % input_name
        if parameter_str:
            result_name += "{%s}" % (parameter_str)

        # Determine the path where this result will be stored
        # and create the directory if necessary
        result_dir = base_dir + os.sep + result_name
        create_directory(result_dir)

        return result_dir

Example #12

0

Show file

File: node_chain.py Project: AlexanderFabisch/pyspace

    def __call__(self):
        """ Executes this process on the respective modality

        Restores the pySPACE configuration stored with this process, loads
        the input dataset and benchmarks the node chain on it. An exception
        raised during benchmarking is printed, logged and re-raised.
        """
        # Restore configuration
        pySPACE.configuration = self.configuration

        # reduce log_level for processing a second time and
        # set communication possibility for nodes to backend
        pySPACE.configuration.min_log_level = self.min_log_level
        pySPACE.configuration.logging_com = self.handler_args
        pySPACE.configuration.backend_com = self.backend_com

        ############## Prepare benchmarking ##############
        super(NodeChainProcess, self).pre_benchmarking()

        # Load the data and check that it can be processed
        # Note: This can not be done in the objects constructor since in
        # that case the whole input would need to be pickled
        # when doing the remote call
        abs_dataset_dir = os.sep.join([self.storage,
                                          self.rel_dataset_dir])

        input_collection = BaseDataset.load(abs_dataset_dir)

        # We have to remember parameters used for generating this specific
        # input dataset
        if 'parameter_setting' in input_collection.meta_data.keys():
            # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
            for k, v in input_collection.meta_data['parameter_setting'].items():
                if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                    self.parameter_setting[k] = v

        NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                       input_collection)

        ############## Do the actual benchmarking ##############

        self._log("Start benchmarking run %s of node_chain %s on dataset %s"
                                % (self.run,
                                   self.node_chain_spec,
                                   self.rel_dataset_dir))


        # Do the actual benchmarking for this collection/node_chain combination
        try:
            result_collection = \
                self.node_chain.benchmark(input_collection = input_collection,
                                         run = self.run,
                                         persistency_directory = self.persistency_dir,
                                         store_node_chain = self.store_node_chain)
        except Exception:
            # The exception object itself is not needed: the formatted
            # traceback is printed and sent to the logger, then the
            # exception is re-raised
            import traceback
            print(traceback.format_exc())
            self._log(traceback.format_exc(), level = logging.ERROR)
            raise

Example #13

0

Show file

    def __call__(self):
        """ Executes this process on the respective modality

        Restores the pySPACE configuration stored with this process, loads
        the input dataset and benchmarks the node chain on it. An exception
        raised during benchmarking is logged and re-raised.
        """
        # Restore configuration
        pySPACE.configuration = self.configuration

        # reduce log_level for processing a second time and
        # set communication possibility for nodes to backend
        pySPACE.configuration.min_log_level = self.min_log_level
        pySPACE.configuration.logging_com = self.handler_args
        pySPACE.configuration.backend_com = self.backend_com

        ############## Prepare benchmarking ##############
        super(NodeChainProcess, self).pre_benchmarking()

        # Load the data and check that it can be processed
        # Note: This can not be done in the objects constructor since in
        # that case the whole input would need to be pickled
        # when doing the remote call
        abs_dataset_dir = os.sep.join([self.storage,
                                       self.rel_dataset_dir])

        input_collection = BaseDataset.load(abs_dataset_dir)

        # We have to remember parameters used for generating this specific
        # input dataset
        if 'parameter_setting' in input_collection.meta_data.keys():
            # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
            for k, v in input_collection.meta_data['parameter_setting'].items():
                if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                    self.parameter_setting[k] = v

        NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                               input_collection)

        ############## Do the actual benchmarking ##############

        self._log("Start benchmarking run %s of node_chain %s on dataset %s"
                                % (self.run,
                                   self.node_chain_spec,
                                   self.rel_dataset_dir))


        # Do the actual benchmarking for this collection/node_chain combination
        try:
            result_collection = \
                self.node_chain.benchmark(
                    input_collection=input_collection,
                    run=self.run,
                    persistency_directory=self.persistency_dir,
                    store_node_chain=self.store_node_chain)
        except Exception:
            # The exception object itself is not needed: the formatted
            # traceback is sent to the logger, then the exception is
            # re-raised
            import traceback
            self._log(traceback.format_exc(), level=logging.ERROR)
            raise

Example #14

0

Show file

    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """
        A factory method that creates a statistic operation based on the
        information given in the operation specification operation_spec.

        The tabular data found at ``operation_spec["input_path"]`` (relative
        to the pySPACE storage) is loaded, optionally filtered and reduced
        before the processes are created. The files "results.csv" and
        "metadata.yaml" of the input are copied to *result_directory*.

        The parameters *debug* and *input_paths* are accepted but not used
        by this method; the processes are always created before the
        operation object is returned.
        """
        assert (operation_spec["type"] == "statistic")
        input_path = operation_spec["input_path"]
        tabular = BaseDataset.load(
            os.path.join(pySPACE.configuration.storage, input_path)).data

        # "in" replaces the deprecated dict.has_key
        if "filter" in operation_spec:
            conditions = csv_analysis.empty_dict(tabular)
            for key, l in operation_spec["filter"].items():
                conditions[key].extend(l)
            tabular = csv_analysis.strip_dict(tabular, conditions)
        metric = operation_spec.get("metric", "Balanced_accuracy")
        parameter = operation_spec.get("parameter", "__Dataset__")
        # If operation_spec provides "related_parameters", that list is
        # modified in place by the removals below
        rel_par = operation_spec.get("related_parameters",
                                     ["__Dataset__", "Key_Run", "Key_Fold"])
        average = operation_spec.get("average", None)

        # The averaged parameter, the metric and the analyzed parameter
        # are not treated as related parameters
        if average in rel_par:
            rel_par.remove(average)
        if metric in rel_par:
            rel_par.remove(metric)
        if parameter in rel_par:
            rel_par.remove(parameter)

        reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                             parameter, average)
        number_processes = 1
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, reduced_tabular)

        import shutil
        shutil.copy2(
            os.path.join(pySPACE.configuration.storage, input_path,
                         "results.csv"),
            os.path.join(result_directory, "results.csv"))
        shutil.copy2(
            os.path.join(pySPACE.configuration.storage, input_path,
                         "metadata.yaml"),
            os.path.join(result_directory, "metadata.yaml"))
        # create and return the statistic operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)

Example #15

0

Show file

    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """ Factory method creating an Analysis operation from operation_spec

        The summary found at ``operation_spec["input_path"]`` (relative to
        the pySPACE storage) is loaded and its data is handed to the
        processes that are created for the given "parameters" and "metrics".

        If *debug* is True, all processes are created before the operation
        object is returned. Otherwise they are created in a separate
        process that fills a queue limited to 100 entries.
        """
        assert (operation_spec["type"] == "analysis")
        summary_path = os.path.join(pySPACE.configuration.storage,
                                    operation_spec["input_path"])
        data_dict = BaseDataset.load(summary_path).data

        # Parameters to analyze and metrics to plot
        parameters = operation_spec["parameters"]
        metrics = operation_spec["metrics"]

        # The number of processes is derived from the number of distinct
        # values of each analyzed parameter
        number_parameter_values = []
        for param in parameters:
            number_parameter_values.append(len(set(data_dict[param])))
        number_processes = 1 + cls._numberOfProcesses(0,
                                                      number_parameter_values)

        if debug == True:
            # For debugging the queue is unlimited and all processes are
            # created before any of them is executed
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict,
                                 parameters, metrics, True)
            return cls(processes, operation_spec, result_directory,
                       number_processes)

        # Create the plot processes in a separate process, so that already
        # created ones can be executed while the creation is still running.
        # The queue size is limited to keep the number of objects (and the
        # memory they cost) small; the value of 100 is arbitrary and might
        # be changed according to the system at hand.
        processes = processing.Queue(100)
        creation_args = (processes, result_directory, data_dict, parameters,
                         metrics, True)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=creation_args)
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)

Example #16

0

Show file

File: merge.py Project: ahmedadelhassan/pyspace

    def _copy_pickle_file(self, source_collection_path, target_collection_path,
                          train_set_name_suffix):
        """ Copy a dataset to a new destination

        **Parameters**

            :source_collection_path:
                The path to the dataset that is loaded.

            :target_collection_path:
                The path where the dataset is stored.

            :train_set_name_suffix:
                If this is "test" and ``self.reverse`` is set, the data is
                stored under the "train" key instead of the "test" key.
        """
        source_collection = BaseDataset.load(source_collection_path)
        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing.
            # Iterate over a snapshot of the keys, because entries are
            # removed and re-inserted inside the loop.
            for key in list(source_collection.data.keys()):
                assert ("test" == key[2])
                value = source_collection.data.pop(key)
                source_collection.data[(key[0], key[1], "train")] = value
        source_collection.store(target_collection_path)

Example #17

0

Show file

File: merge.py Project: BioinformaticsArchive/pyspace

 def _copy_pickle_file(self, source_collection_path, target_collection_path,
                       train_set_name_suffix):
     """ Copy a dataset to a new destination

     **Parameters**

         :source_collection_path:
             The path to the dataset that is loaded.

         :target_collection_path:
             The path where the dataset is stored.

         :train_set_name_suffix:
             If this is "test" and ``self.reverse`` is set, the data is
             stored under the "train" key instead of the "test" key.
     """
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as
     # training data
     if self.reverse and "test" == train_set_name_suffix:
         # exchange the "test" in key tuple to "train" before storing.
         # Iterate over a snapshot of the keys, because entries are
         # removed and re-inserted inside the loop.
         for key in list(source_collection.data.keys()):
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             source_collection.data[(key[0],key[1],"train")] = value
     source_collection.store(target_collection_path)

Example #18

0

Show file

    def test_time_series_storing(self):
        """ Store a time series dataset and check that reloading restores it

        The data of a SimpleTimeSeriesSourceNode is collected by a
        TimeSeriesSinkNode, stored in the directory 'tmp', loaded again with
        BaseDataset.load and compared to the original data.
        """
        if not os.path.exists('tmp'):
            os.makedirs('tmp')

        try:
            source = SimpleTimeSeriesSourceNode()
            sink = TimeSeriesSinkNode()
            sink.register_input_node(source)
            sink.set_run_number(0)
            sink.process_current_split()
            result_collection = sink.get_result_dataset()
            result_collection.store('tmp')

            reloaded_collection = BaseDataset.load('tmp')

            reloader = TimeSeriesSourceNode()
            reloader.set_input_dataset(reloaded_collection)

            orig_data = list(source.request_data_for_testing())
            restored_data = list(reloader.request_data_for_testing())

            # Check that the two list have the same length
            self.assertEqual(
                len(orig_data), len(restored_data),
                "Numbers of time series before storing and after reloading are not equal!"
            )

            # Check that there is a one-to-one correspondence: every original
            # (data, label) pair must have an equal pair in the reloaded data
            for orig_datapoint, orig_label in orig_data:
                found = False
                for restored_datapoint, restored_label in restored_data:
                    same_data = (orig_datapoint.view(numpy.ndarray) ==
                                 restored_datapoint.view(numpy.ndarray)).all()
                    if same_data and (orig_label == restored_label):
                        found = True
                        break
                self.assertTrue(
                    found,
                    "One of the original time series cannot be found after reloading"
                )
        finally:
            # Cleaning up, also when one of the checks above failed
            shutil.rmtree('tmp')

Example #19

0

Show file

File: analysis.py Project: AlexanderFabisch/pyspace

    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """ Factory method for an Analysis operation

        The operation is built from the specification *operation_spec*, whose
        "type" must be "analysis". The result summary found under the
        "input_path" of the specification (relative to the pySPACE storage)
        is loaded, and the plot processes for the given "parameters" and
        "metrics" are put into a queue that is handed to the new operation.

        With *debug* equal to True all processes are created up front in an
        unbounded queue; otherwise they are created by a separate process
        that fills a queue limited to 100 entries.
        """
        assert(operation_spec["type"] == "analysis")
        input_path = operation_spec["input_path"]
        summary_path = os.path.join(pySPACE.configuration.storage, input_path)
        data_dict = BaseDataset.load(summary_path).data

        # Parameters to be analyzed and metrics to be plotted
        parameters = operation_spec["parameters"]
        metrics = operation_spec["metrics"]

        # Number of distinct values per parameter determines how many
        # processes will be created
        number_parameter_values = []
        for param in parameters:
            number_parameter_values.append(len(set(data_dict[param])))
        number_processes = cls._numberOfProcesses(0, number_parameter_values)+1

        if debug == True:
            # Debug mode: unlimited queue, all processes are created before
            # any of them is executed
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict,
                                 parameters, metrics, True)
            return cls(processes, operation_spec, result_directory,
                       number_processes)

        # Regular mode: the recursive helper runs in another process so that
        # already created plot processes can be executed while creation is
        # still going on. The queue size is limited to keep the number of
        # objects (and thus the memory usage) bounded; the value of 100 is
        # arbitrary and might be changed according to the system at hand.
        processes = processing.Queue(100)
        creation_args = (processes, result_directory, data_dict,
                         parameters, metrics, True)
        create_process = processing.Process(target=cls._createProcesses,
                                            args=creation_args)
        create_process.start()
        # create and return the operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)

Example #20

0

Show file

File: test_time_series_sink.py Project: Crespo911/pyspace

    def test_time_series_storing(self):
        """ Store a time series dataset and check that reloading restores it

        The data of a SimpleTimeSeriesSourceNode is collected by a
        TimeSeriesSinkNode, stored in the directory 'tmp', loaded again with
        BaseDataset.load and compared to the original data.
        """
        if not os.path.exists('tmp'):
            os.makedirs('tmp')

        try:
            source = SimpleTimeSeriesSourceNode()
            sink = TimeSeriesSinkNode()
            sink.register_input_node(source)
            sink.set_run_number(0)
            sink.process_current_split()
            result_collection = sink.get_result_dataset()
            result_collection.store('tmp')

            reloaded_collection = BaseDataset.load('tmp')

            reloader = TimeSeriesSourceNode()
            reloader.set_input_dataset(reloaded_collection)

            orig_data = list(source.request_data_for_testing())
            restored_data = list(reloader.request_data_for_testing())

            # Check that the two list have the same length
            self.assertEqual(len(orig_data), len(restored_data),
                             "Numbers of time series before storing and after reloading are not equal!")

            # Check that there is a one-to-one correspondence: every original
            # (data, label) pair must have an equal pair in the reloaded data
            for orig_datapoint, orig_label in orig_data:
                found = False
                for restored_datapoint, restored_label in restored_data:
                    same_data = (orig_datapoint.view(numpy.ndarray) ==
                                 restored_datapoint.view(numpy.ndarray)).all()
                    if same_data and (orig_label == restored_label):
                        found = True
                        break
                self.assertTrue(found,
                                "One of the original time series cannot be found after reloading")
        finally:
            # Cleaning up, also when one of the checks above failed
            shutil.rmtree('tmp')

Example #21

0

Show file

File: statistic.py Project: AlexanderFabisch/pyspace

 def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
     """
     A factory method that creates a statistic operation based on the
     information given in the operation specification operation_spec.

     The tabular data found under the "input_path" of the specification
     (relative to the pySPACE storage) is loaded, optionally filtered by
     the "filter" entry, reduced via *reduce_tabular* and handed to
     *_createProcesses*. The files results.csv and metadata.yaml of the
     input are copied to *result_directory*.

     .. note:: *debug* and *input_paths* are accepted but not used here;
               the processes are always created directly in this call.
     """
     assert(operation_spec["type"] == "statistic")
     input_path = operation_spec["input_path"]
     source_directory = os.path.join(pySPACE.configuration.storage, input_path)
     tabular = BaseDataset.load(source_directory).data
     
     if "filter" in operation_spec:
         conditions = csv_analysis.empty_dict(tabular)
         for key, l in operation_spec["filter"].items():
             conditions[key].extend(l)
         tabular = csv_analysis.strip_dict(tabular, conditions)
     metric = operation_spec.get("metric", "Balanced_accuracy")
     parameter = operation_spec.get("parameter", "__Dataset__")
     # Work on a copy: entries are removed below and the list stored in the
     # caller's operation_spec must not be modified by that
     rel_par = list(operation_spec.get("related_parameters",
                                       ["__Dataset__", "Key_Run", "Key_Fold"]))
     average = operation_spec.get("average", None)
     
     # The averaged column, the metric and the parameter itself are no
     # "related" parameters
     for excluded in (average, metric, parameter):
         if excluded in rel_par:
             rel_par.remove(excluded)
         
     reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric, parameter, average)
     number_processes = 1
     processes = processing.Queue()
     cls._createProcesses(processes, result_directory, reduced_tabular)
     
     import shutil
     for file_name in ("results.csv", "metadata.yaml"):
         shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path, file_name),
                      os.path.join(result_directory, file_name))
     # create and return the statistic operation object
     return cls(processes, operation_spec, result_directory, number_processes)

Example #22

0

Show file

    def store(self, result_dir, s_format="pickle"):
        """ Stores this collection in the directory *result_dir*.

        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run,
        split, train/test combination. The file names start with the prefix
        "time_series", followed by a suffix built from the split and the
        train/test entry of the data key.

        **Parameters**

          :result_dir:
              The directory in which the collection will be stored.

          :s_format:
              The format in which the actual data sets should be stored,
              given either as format name or as list
              ``[format name, number format]``.

              Handled format names are *pickle* (also *cpickle*, *cPickle*),
              *text*, *csv*, *mat* (MATLAB) and *eeg* (BrainVision data,
              marker and header file).

              For the text format all values are written as integers, and
              all time series objects are concatenated into one table.
              For the mat format all time series objects are concatenated
              into one table.

              The MATLAB format is a struct that contains the data, the
              sampling frequency and the channel names.

              .. note:: For the text and MATLAB format, markers could be added
                        by using a Marker_To_Mux node before

              (*optional, default: "pickle"*)

        **Raises**

          :NotImplementedError: if the format name is none of the above.

        .. todo:: Put marker to the right time point and also write marker channel.
        """
        name = "time_series"
        if isinstance(s_format, list):
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "%.18e"
        if s_format in ["text", "matlab"]:
            s_type = "%i"
        if s_format == "csv" and s_type == "real":
            s_type = "%.18e"
        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type":
            "time_series",
            "storage_format":
            s_format,
            "author":
            author,
            "data_pattern":
            "data_run" + os.sep + name + "_sp_tt." + s_format
        })

        # Stays None if the dataset contains no data at all
        time_series = None
        # Iterate through splits and runs in this dataset
        for key, time_series in self.data.iteritems():
            # load data, if necessary
            # (due to the  lazy loading, the data might be not loaded already)
            if isinstance(time_series, basestring):
                time_series = self.get_data(key[0], key[1], key[2])
            if self.sort_string is not None:
                # NOTE(review): sort_string is evaluated as Python code; it
                # must never come from an untrusted source
                time_series.sort(key=eval(self.sort_string))
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Common part of all file names written for this key
            file_stem = os.path.join(result_path, name + key_str)
            # Store data depending on the desired format
            if s_format in ["pickle", "cpickle", "cPickle"]:
                # binary pickle protocol requires a file in binary mode
                with open(file_stem + ".pickle", "wb") as result_file:
                    cPickle.dump(time_series, result_file,
                                 cPickle.HIGHEST_PROTOCOL)
            elif s_format in ["text", "csv"]:
                self.update_meta_data({
                    "type": "stream",
                    "marker_column": "marker"
                })
                with open(file_stem + ".csv", "w") as result_file:
                    csvwriter = csv.writer(result_file)
                    channel_names = copy.deepcopy(
                        time_series[0][0].channel_names)
                    if s_format == "csv":
                        channel_names.append("marker")
                    csvwriter.writerow(channel_names)
                    for (data, label) in time_series:
                        if s_format == "text":
                            numpy.savetxt(result_file,
                                          data,
                                          delimiter=",",
                                          fmt=s_type)
                            if label is not None:
                                result_file.write(str(label))
                                result_file.flush()
                            elif data.marker_name is not None \
                                    and len(data.marker_name) > 0:
                                result_file.write(str(data.marker_name))
                                result_file.flush()
                        else:
                            marker = ""
                            if label is not None:
                                marker = str(label)
                            elif data.marker_name is not None \
                                    and len(data.marker_name) > 0:
                                marker = str(data.marker_name)
                            for line in data:
                                row = list(line)
                                row.append(marker)
                                csvwriter.writerow(row)
                                # only the first row of a window carries
                                # the marker
                                marker = ""
                            result_file.flush()
            elif s_format in ["mat"]:
                # use the first time series object to get meta data; the
                # list itself is left untouched, since it belongs to the
                # dataset and is needed again after the loop
                merged_time_series = time_series[0][0]
                # collect all important information in the collection_object
                collection_object = {
                    "sampling_frequency":
                    merged_time_series.sampling_frequency,
                    "channel_names": merged_time_series.channel_names
                }

                # merge all data
                for (data, _label) in time_series[1:]:
                    merged_time_series = numpy.vstack(
                        (merged_time_series, data))
                collection_object["data"] = merged_time_series
                mdict = dict()
                mdict[name + key_str] = collection_object
                import scipy.io
                with open(file_stem + ".mat", "wb") as result_file:
                    scipy.io.savemat(result_file, mdict=mdict)
            elif s_format in ["eeg"]:
                sf = None
                channel_names = None
                # NOTE(review): the data file is opened in append mode (as
                # before) while marker and header file are rewritten; confirm
                # that appending to an existing .eeg file is intended
                with open(file_stem + ".eeg", "a+b") as result_file:
                    with open(file_stem + ".vmrk", "w") as result_file_mrk:
                        result_file_mrk.write(
                            "Brain Vision Data Exchange Marker File, "
                            "Version 1.0\n")
                        result_file_mrk.write("; Data stored by pySPACE\n")
                        result_file_mrk.write("[Common Infos]\n")
                        result_file_mrk.write("Codepage=UTF-8\n")
                        result_file_mrk.write("DataFile=%s\n" %
                                              str(name + key_str + ".eeg"))
                        result_file_mrk.write("\n[Marker Infos]\n")

                        markerno = 1
                        datapoint = 1

                        for t in time_series:
                            if sf is None:
                                sf = t[0].sampling_frequency
                            if channel_names is None:
                                channel_names = t[0].get_channel_names()
                            for mrk in t[0].marker_name.keys():
                                for tm in t[0].marker_name[mrk]:
                                    result_file_mrk.write(
                                        str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                            (markerno, mrk, datapoint +
                                             (tm * sf / 1000.0))))
                                    markerno += 1
                            data_ = t[0].astype(numpy.int16)
                            data_.tofile(result_file)
                            datapoint += data_.shape[0]

                with open(file_stem + ".vhdr", "w") as result_hdr:
                    result_hdr.write("Brain Vision Data Exchange Header "
                                     "File Version 1.0\n")
                    result_hdr.write("; Data stored by pySPACE\n\n")
                    result_hdr.write("[Common Infos]\n")
                    result_hdr.write("Codepage=UTF-8\n")
                    result_hdr.write("DataFile=%s\n" %
                                     str(name + key_str + ".eeg"))
                    result_hdr.write("MarkerFile=%s\n" %
                                     str(name + key_str + ".vmrk"))
                    result_hdr.write("DataFormat=BINARY\n")
                    result_hdr.write("DataOrientation=MULTIPLEXED\n")
                    result_hdr.write("NumberOfChannels=%d\n" %
                                     len(channel_names))
                    result_hdr.write("SamplingInterval=%d\n\n" %
                                     (1000000 / sf))
                    result_hdr.write("[Binary Infos]\n")
                    result_hdr.write("BinaryFormat=INT_16\n\n")
                    result_hdr.write("[Channel Infos]\n")

                    # TODO: Add Resolutions to time_series
                    # 0 = 0.1 [micro]V,
                    # 1 = 0.5 [micro]V,
                    # 2 = 10 [micro]V,
                    # 3 = 152.6 [micro]V (seems to be unused!)
                    resolutions_str = [
                        unicode("0.1,%sV" % unicode(u"\u03BC")),
                        unicode("0.5,%sV" % unicode(u"\u03BC")),
                        unicode("10,%sV" % unicode(u"\u03BC")),
                        unicode("152.6,%sV" % unicode(u"\u03BC"))
                    ]
                    for i in range(len(channel_names)):
                        result_hdr.write(
                            unicode("Ch%d=%s,,%s\n" %
                                    (i + 1, channel_names[i],
                                     unicode(resolutions_str[0]))
                                    ).encode('utf-8'))
            else:
                raise NotImplementedError(
                    "Using unavailable storage format:%s!" % s_format)
        # Channel information is taken from the first window of the data set
        # handled last; skipped if there is nothing to take it from
        if time_series is not None and len(time_series) > 0:
            self.update_meta_data({
                "channel_names":
                copy.deepcopy(time_series[0][0].channel_names),
                "sampling_frequency":
                time_series[0][0].sampling_frequency
            })
        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #23

0

Show file

File: time_series.py Project: BioinformaticsArchive/pyspace

    def store(self, result_dir, s_format="pickle"):
        """ Stores this collection in the directory *result_dir*.

        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run,
        split, train/test combination. The file names start with the prefix
        "time_series", followed by a suffix built from the split and the
        train/test entry of the data key.

        **Parameters**

          :result_dir:
              The directory in which the collection will be stored.

          :s_format:
              The format in which the actual data sets should be stored,
              given either as format name or as list
              ``[format name, number format]``.

              Handled format names are *pickle* (also *cpickle*, *cPickle*),
              *text*, *csv* and *mat* (MATLAB).

              For the text format all values are written as integers, and
              all time series objects are concatenated into one table.
              For the mat format all time series objects are concatenated
              into one table.

              The MATLAB format is a struct that contains the data, the
              sampling frequency and the channel names.

              .. note:: For the text and MATLAB format, markers could be added
                        by using a Marker_To_Mux node before

              (*optional, default: "pickle"*)

        **Raises**

          :NotImplementedError: if the format name is none of the above.

        .. todo:: Put marker to the right time point and also write marker channel.
        """
        name = "time_series"
        if isinstance(s_format, list):
            s_type = s_format[1]
            s_format = s_format[0]
        else:
            s_type = "%.18e"
        if s_format in ["text", "matlab"]:
            s_type = "%i"
        if s_format == "csv" and s_type == "real":
            s_type = "%.18e"
        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            author = "unknown"
            self._log("Author could not be resolved.", level=logging.WARNING)
        self.update_meta_data({"type": "time_series",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep
                                               + name + "_sp_tt." + s_format})

        # Stays None if the dataset contains no data at all
        time_series = None
        # Iterate through splits and runs in this dataset
        for key, time_series in self.data.iteritems():
            # load data, if necessary
            # (due to the  lazy loading, the data might be not loaded already)
            if isinstance(time_series, basestring):
                time_series = self.get_data(key[0], key[1], key[2])
            if self.sort_string is not None:
                # NOTE(review): sort_string is evaluated as Python code; it
                # must never come from an untrusted source
                time_series.sort(key=eval(self.sort_string))
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Common part of all file names written for this key
            file_stem = os.path.join(result_path, name + key_str)
            # Store data depending on the desired format
            if s_format in ["pickle", "cpickle", "cPickle"]:
                # binary pickle protocol requires a file in binary mode
                with open(file_stem + ".pickle", "wb") as result_file:
                    cPickle.dump(time_series, result_file,
                                 cPickle.HIGHEST_PROTOCOL)
            elif s_format in ["text", "csv"]:
                self.update_meta_data({
                    "type": "stream",
                    "marker_column": "marker"})
                with open(file_stem + ".csv", "w") as result_file:
                    csvwriter = csv.writer(result_file)
                    channel_names = copy.deepcopy(
                        time_series[0][0].channel_names)
                    if s_format == "csv":
                        channel_names.append("marker")
                    csvwriter.writerow(channel_names)
                    for (data, label) in time_series:
                        if s_format == "text":
                            numpy.savetxt(result_file, data, delimiter=",",
                                          fmt=s_type)
                            if label is not None:
                                result_file.write(str(label))
                                result_file.flush()
                            elif data.marker_name is not None \
                                    and len(data.marker_name) > 0:
                                result_file.write(str(data.marker_name))
                                result_file.flush()
                        else:
                            marker = ""
                            if label is not None:
                                marker = str(label)
                            elif data.marker_name is not None \
                                    and len(data.marker_name) > 0:
                                marker = str(data.marker_name)
                            for line in data:
                                row = list(line)
                                row.append(marker)
                                csvwriter.writerow(row)
                                # only the first row of a window carries
                                # the marker
                                marker = ""
                            result_file.flush()
            elif s_format in ["mat"]:
                # use the first time series object to get meta data; the
                # list itself is left untouched, since it belongs to the
                # dataset and is needed again after the loop
                merged_time_series = time_series[0][0]
                # collect all important information in the collection_object
                collection_object = {
                    "sampling_frequency": merged_time_series.sampling_frequency,
                    "channel_names": merged_time_series.channel_names}

                # merge all data
                for (data, _label) in time_series[1:]:
                    merged_time_series = numpy.vstack((merged_time_series,
                                                       data))
                collection_object["data"] = merged_time_series
                mdict = dict()
                mdict[name + key_str] = collection_object
                import scipy.io
                with open(file_stem + ".mat", "wb") as result_file:
                    scipy.io.savemat(result_file, mdict=mdict)
            else:
                raise NotImplementedError(
                    "Using unavailable storage format:%s!" % s_format)
        # Channel information is taken from the first window of the data set
        # handled last; skipped if there is nothing to take it from
        if time_series is not None and len(time_series) > 0:
            self.update_meta_data({
                "channel_names": copy.deepcopy(time_series[0][0].channel_names),
                "sampling_frequency": time_series[0][0].sampling_frequency
            })
        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #24

0

Show file

File: analyzer_sink.py Project: pyspace/test

    def store(self, result_dir, s_format="BrainVision"):
        """ Store the collected windows as BrainVision files in *result_dir*

        For every entry of ``self.data`` a directory "data_analyzer_run<run>"
        is created below *result_dir*, containing the data (.eeg), the marker
        (.vmrk) and the header (.vhdr) file. Gaps before a window are filled
        with zeros, and one "Label" marker is written per window.

        **Parameters**

          :result_dir:
              The directory in which the collection will be stored.

          :s_format:
              Only "BrainVision" is supported. For any other value a
              critical message is logged and nothing is stored.

              (*optional, default: "BrainVision"*)

        If the meta data contain a non-None entry 'eeg_src_file_name', this
        name is used for the files; otherwise the files are called
        "Analyzer" followed by a suffix for split and train/test.
        """
        # Keep original file name, depends on the AnalyserSinkNode, see it's documentation.
        keep_src_name = ('eeg_src_file_name' in self.meta_data and
                         self.meta_data['eeg_src_file_name'] is not None)
        if keep_src_name:
            name = self.meta_data['eeg_src_file_name']
        # or use default name from this collection
        else:
            name = "Analyzer"
        if not s_format == "BrainVision":
            self._log("The format %s is not supported!" % s_format,
                      level=logging.CRITICAL)
            return
        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            author = "unknown"
            self._log("Author could not be resolved.", level=logging.WARNING)
        self.update_meta_data({
            "type": "only output of individual nodes stored",
            "storage_format": s_format,
            "author": author,
            "data_pattern": "Multiplexed"
        })
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
        slices = [0]
        channel_names = []

        for key, time_series in self.data.iteritems():
            # Sort the Times-Series Array by the start time of the windows
            time_series.sort(key=lambda window: window[0].start_time)
            # Check for overlapping Windows
            # NOTE(review): overlapping windows only trigger a warning here,
            # they are still written below - confirm whether they should
            # really be dropped
            for ts in time_series:
                if ts[0].start_time >= slices[-1]:
                    slices.append(ts[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!",
                                  UserWarning)
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Keep original name or use default name from this collection
            if keep_src_name:
                base_name = name
            else:
                base_name = name + key_str
            file_stem = os.path.join(result_path, base_name)

            num_ch = 0
            sampling_int = 0

            with open(file_stem + ".eeg", "wb") as result_file_eeg:
                with open(file_stem + ".vmrk", "w") as result_file_mrk:
                    # Write Marker header
                    result_file_mrk.write(header_mrk % (base_name))

                    result_file_ms = 0

                    # Data for padding
                    padding = None

                    count_mrk = 2

                    for ts in time_series:
                        # identity test: comparing an array with "== None"
                        # does not give a plain boolean
                        if padding is None:
                            padding = numpy.zeros(len(ts[0].channel_names),
                                                  dtype='int16')
                            num_ch = len(ts[0].channel_names)
                            channel_names = ts[0].channel_names
                            sampling_int = 1000000 / ts[0].sampling_frequency
                        # Write Padding (zeros)
                        while result_file_ms < ts[0].start_time:
                            result_file_eeg.write(padding.tostring())
                            result_file_ms += ts[0]._samples_to_ms(1)
                        # Write window
                        ts[0].tofile(result_file_eeg)
                        result_file_ms += ts[0].end_time - ts[0].start_time
                        # Write Marker
                        result_file_mrk.write(
                            "Mk%d=Label,%s,%d,1,0\n" %
                            (count_mrk, ts[1],
                             ts[0]._ms_to_samples(ts[0].start_time)))
                        count_mrk += 1
            # WRITE HEADERFILE
            with open(file_stem + ".vhdr", "w") as result_file_hdr:
                result_file_hdr.write(header_hdr %
                                      ((base_name), (base_name), num_ch,
                                       sampling_int))
                # Format: Ch1=Fp1,,0.1,\xB5V
                for i in range(num_ch):
                    result_file_hdr.write("Ch%d=%s,,0.1,\xB5V\n" %
                                          (i + 1, channel_names[i]))

Example #25

0

Show file

File: node_chain.py Project: AlexanderFabisch/pyspace

    def consolidate(self):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.

        The following steps are performed on ``self.result_directory``:

        1. For every entry matching ``{*`` the meta data is extended by
           author and date, and the meta data of the input dataset
           (``input_collection_name`` relative to the pySPACE storage) is
           stored next to it as ``input_metadata.yaml``.
        2. For every sub directory containing more than one ``data_run*`` or
           ``persistency_run*`` entry, the number of runs is written to
           its meta data.
        3. If ``results_*`` entries exist, they are merged with a
           :class:`PerformanceResultSummary`. Unless ``self.compression``
           compares equal to ``False``, the ``{*}`` folders are afterwards
           zipped into ``result_folders.zip`` (skipped if the compression is
           ``"delete"``) and all but one of them are removed.
        """
        def compress_result_folders(allow_zip64):
            """ Zip the ``{*}`` result folders and delete all but one of them

            Expects the current working directory to be the result
            directory, since the archive members are added by relative path.
            """
            import zipfile
            folders = glob.glob(os.path.join(self.result_directory, "{*}"))
            if self.compression != "delete":
                zip_arguments = {"mode": "w", "compression": self.compression}
                if allow_zip64:
                    # only passed when requested, so that the first attempt
                    # keeps using the default of the zipfile module
                    zip_arguments["allowZip64"] = True
                save_file = zipfile.ZipFile(
                    self.result_directory + '/result_folders.zip',
                    **zip_arguments)
                try:
                    # we want to have the zipped file relative to the
                    # result directory
                    for folder in folders:
                        for node in os.walk(folder):
                            rel_path = os.path.relpath(node[0],
                                                       self.result_directory)
                            save_file.write(rel_path)
                            for data in node[2]:
                                save_file.write(os.path.join(rel_path, data))
                finally:
                    # do not leak the file handle if writing fails
                    save_file.close()
            # To still have an easy access to the history of the processing,
            # we keep one folder. Slicing (instead of ``pop``) also copes
            # with an empty folder list.
            for folder in folders[:-1]:
                shutil.rmtree(folder)

        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            # Load their meta_data
            meta_data = BaseDataset.load_meta_data(dataset_path)

            # Determine author and date
            try:
                author = pwd.getpwuid(os.getuid())[4]
            except Exception:
                author = "unknown"
                self._log("Author could not be resolved.",level=logging.WARNING)
            date = time.strftime("%Y%m%d_%H_%M_%S")

            # Update meta data and store it
            meta_data.update({"author" : author, "date" : date})
            BaseDataset.store_meta_data(dataset_path, meta_data)

            # Copy the input dataset specification file to the result
            # directory in order to make later analysis of
            # the results easier
            input_meta_path = os.sep.join([pySPACE.configuration.storage,
                                          meta_data["input_collection_name"]])
            input_meta = BaseDataset.load_meta_data(input_meta_path)
            BaseDataset.store_meta_data(dataset_path,input_meta,
                                        file_name="input_metadata.yaml")
        # Check if some results consist of several runs
        # and update the meta data in this case
        # TODO: This is not a clean solution
        for dataset_dir in glob.glob(os.sep.join([self.result_directory,
                                                     "*"])):
            if not os.path.isdir(dataset_dir):
                continue
            # There can be either run dirs, persistency dirs, or both of them.
            # Check of whichever there are more. If both exist, their numbers
            # are supposed to be equal.
            nr_run_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "data_run*"])))
            nr_per_dirs = len(glob.glob(os.sep.join([dataset_dir,
                                              "persistency_run*"])))
            nr_runs = max(nr_run_dirs, nr_per_dirs)

            if nr_runs > 1:
                collection_meta = BaseDataset.load_meta_data(dataset_dir)
                collection_meta["runs"] = nr_runs
                BaseDataset.store_meta_data(dataset_dir,collection_meta)
        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
            self._log("done")
            self._log("Storing result collection")
            result_collection.store(self.result_directory)
            self._log("done")
            PerformanceResultSummary.merge_traces(self.result_directory)

            # NOTE: deliberately an equality check, so any value comparing
            # equal to False (including 0) disables the compression step.
            if not(self.compression == False):
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easily visible.
                cwd=os.getcwd()
                os.chdir(self.result_directory)
                try:
                    # If there are too many or too large folders, problems
                    # may occur. This case we want to log, try 64 bit mode,
                    # and then skip the zipping.
                    try:
                        compress_result_folders(allow_zip64=False)
                    except Exception:
                        self._log("Result files could not be compressed with 32 bit mode, switching to 64 bit mode.", level=logging.CRITICAL)
                        try:
                            compress_result_folders(allow_zip64=True)
                        except Exception:
                            self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL)
                finally:
                    # always return to the previous working directory
                    os.chdir(cwd)

Example #26

0

Show file

File: node_chain.py Project: AlexanderFabisch/pyspace

    def create(cls, operation_spec, result_directory, debug=False, input_paths=None):
        """ A factory method that creates the processes which form an operation
        based on the  information given in the operation specification, *operation_spec*.

        In debug mode this is done in serial. In the other default mode,
        at the moment 4 processes are created in parallel and can be immediately
        executed. So generation of processes and execution are made in parallel.
        This kind of process creation is done independently from the backend.

        For huge parameter spaces this is necessary!
        Otherwise numerous processes are created and corresponding data is loaded
        but the concept of spreading the computation to different processors
        can not really be used, because process creation is blocking only
        one processor and memory space, but nothing more is done,
        till the processes are all created.

        **Parameters**

            :operation_spec:
                Specification dictionary of the operation. Its ``type`` has
                to be ``node_chain``. The entry ``templates`` is filled
                from ``node_chain`` (or ``flow``) if it is missing, and
                template file names are replaced by the loaded node chain
                specifications (the names are kept in ``template_files``).
                The dictionary is modified in place.

            :result_directory:
                Directory handed over to the created processes and
                the returned operation.

            :debug:
                If it equals ``True``, all processes are created serially
                before the operation is returned.

            :input_paths:
                Paths of the input datasets, expected to start with the
                pySPACE storage path. An exception is raised if empty.

        .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
                  for empty data, when no input_path is given.
        """
        assert(operation_spec["type"] == "node_chain")

        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        ## Use node_chain parameter if no templates are given ##
        if "templates" not in operation_spec:
            if "node_chain" in operation_spec:
                operation_spec["templates"] = [operation_spec.pop("node_chain")]
            else:
                warnings.warn("Specify parameter 'templates' or 'node_chain' in your operation spec!")
                operation_spec["templates"] = [operation_spec.pop("flow")]
        elif "node_chain" in operation_spec:
            operation_spec.pop("node_chain")
            warnings.warn("node_chain parameter is ignored. Templates are used.")
        elif isinstance(operation_spec["templates"][0], str): # load files in templates
            operation_spec["template_files"] = \
                copy.deepcopy(operation_spec["templates"])
            for i, rel_node_chain_file in \
                    enumerate(operation_spec["template_files"]):
                abs_node_chain_path = os.sep.join(
                    [pySPACE.configuration.spec_dir, "node_chains",
                     rel_node_chain_file])
                # the context manager closes the file even if parsing fails
                with open(abs_node_chain_path, 'r') as node_chain_file:
                    # NOTE(review): yaml.load may construct arbitrary Python
                    # objects; only use it on trusted specification files.
                    operation_spec["templates"][i] = \
                        yaml.load(node_chain_file)

        storage = pySPACE.configuration.storage
        if not input_paths:
            raise Exception("No input datasets found in input_path %s in %s!"
                            % (operation_spec["input_path"],storage))

        # Get relative path
        rel_input_paths = [name[len(storage):] for name in input_paths]

        # Determine approximate number of runs
        if "runs" in operation_spec:
            runs = operation_spec["runs"]
        else:
            runs = max(
                BaseDataset.load_meta_data(
                    pySPACE.configuration.storage + os.sep
                    + dataset_dir).get('runs', 1)
                for dataset_dir in rel_input_paths)

        # Determine splits (taken from the first input dataset only)
        abs_collection_path = \
                pySPACE.configuration.storage + os.sep + rel_input_paths[0]
        splits = BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)

        # Determine how many processes will be created
        number_processes = len(operation_spec["templates"]) * \
                           len(parameter_settings) * len(rel_input_paths) * \
                           runs * splits

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, rel_input_paths)
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another thread so that already created processes can be executed in
            # parallel. Therefore a queue is used whose size is limited to
            # guarantee that not too many objects are created (because this
            # costs memory). However, the actual number of 4 is arbitrary and
            # might be changed according to the system at hand.
            processes = processing.Queue(4)
            create_process = \
                    processing.Process(target=cls._createProcesses,
                                       args=(processes, result_directory,
                                             operation_spec, parameter_settings,
                                             rel_input_paths))
            create_process.start()
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)

Example #27

0

Show file

    def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
        """ Concatenate all datasets in source_dataset_pathes and store 
            them in the target dataset

        The first dataset (after sorting the paths) is loaded and serves as
        target, to which the "test" data of all other datasets is appended
        for every run and split. The first element of each appended dataset
        is flagged with ``specs["new_set"] = True``. If ``self.change_time``
        is set, start and end times of the appended elements are shifted by
        the largest end time of the data assembled so far.

        **Parameters**

            :target_dataset_path:
                Path where the concatenated dataset is stored.

            :source_dataset_pathes:
                List of paths of the datasets to concatenate.
                The list is sorted in place.
        """
        # sort the dataset 
        source_dataset_pathes.sort()
        # load a first dataset, in which the data of all other datasets is assembled
        target_dataset = BaseDataset.load(source_dataset_pathes[0])
        
        # Determine author and date
        try:
            author = getpass.getuser()
        except Exception:
            author = "Unknown"
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name (it may legitimately be absent)
        try:
            target_dataset.meta_data.pop("node_chain_file_name")
        except KeyError:
            pass
        # Update meta data and store it
        params = target_dataset.meta_data.pop("parameter_setting")
        params["__INPUT_DATASET__"] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
        params["__RESULT_DIRECTORY__"] = self.result_directory
        target_dataset.meta_data.update({"author" : author, 
                      "date" : date, 
                      "dataset_directory" : target_dataset_path,
                      "train_test" : False,
                      "parameter_setting" : params,
                      "changed_time" : self.change_time,
                      "input_dataset_name" : source_dataset_pathes[0][len(
                                        pySPACE.configuration.storage):]
        })
    
        # Concatenate data of all other datasets to target dataset
        for source_dataset_path in source_dataset_pathes[1:]:
            source_dataset = BaseDataset.load(source_dataset_path)
            for run in source_dataset.get_run_numbers():
                for split in source_dataset.get_split_numbers():
                    target_data = target_dataset.get_data(run, split, "test")

                    if self.change_time:
                        # ensure sorted target_data 
                        # TODO: encode this in meta data?  
                        target_data.sort(key=lambda t: t[0].end_time)
                        last_end_time = target_data[-1][0].end_time

                    # Mark the data assembled so far as not starting a new
                    # set; stop at the first element that is already flagged.
                    for ts, l in target_data:
                        if ts.specs is None:
                            ts.specs = {"new_set": False}
                        elif "new_set" in ts.specs:
                            break
                        else:
                            ts.specs["new_set"] = False

                    data = source_dataset.get_data(run, split, "test")

                    if self.change_time:
                        # ensure sorted source data
                        # TODO: encode this in meta data?
                        data.sort(key=lambda t: t[0].end_time)
                    # flag the first element of the concatenated data list
                    for i, (ts, l) in enumerate(data):
                        if ts.specs is None:
                            ts.specs = {"new_set": i == 0}
                        else:
                            ts.specs["new_set"] = (i == 0)
                        if self.change_time:
                            # shift the element behind the existing data
                            ts.start_time = last_end_time + ts.start_time
                            ts.end_time = last_end_time + ts.end_time
                            
                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)
                
        target_dataset.store(target_dataset_path)

Example #28

0

Show file

File: merge.py Project: jhuebotter/pyspace

    def __call__(self):
        """ Executes this process on the respective modality

        Every input collection is used once as "test" collection (or as
        "train" collection, if ``self.reverse`` is set). All other input
        collections which fulfill ``self.collection_constraints`` are merged
        and form the counterpart. The result is written to a collection in
        ``self.result_directory`` whose name combines ``self.name_pattern``
        and the base name of the respective input collection with ``_vs_``.
        Input collections without any merge partner are skipped.
        """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()

        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check if the input data is splitted
            # e.g. only a single test file is in the source directory
            source_files = glob.glob(
                os.sep.join(
                    [source_test_collection_path, "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert (not splitted)
            source_file_name = str(source_files[-1])

            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0

            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix = "test"

            # We create the collection Rest_vs_Collection
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    test_base_collection_name + "_vs_" + self.name_pattern)
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    self.name_pattern + "_vs_" + test_base_collection_name)
                key = "test"

            target_collection_path = os.sep.join(
                [self.result_directory, target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}

            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection.
                # The file name needs the "_" separator in front of the
                # suffix, like all other arff paths built in this method.
                source_test_file_path = os.sep.join([
                    source_test_collection_path, "data_run0",
                    "features_sp0_" + train_set_name_suffix + ".arff"
                ])
                target_test_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])

            else:
                file_ending = source_file_name.split(".")[-1]
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path

            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue

                # Check that all constraints are fulfilled for this pair of
                # input collections
                # NOTE(review): the constraint templates are evaluated with
                # eval; they must come from a trusted specification.
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue

                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__",
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                # Parameters unknown to the target become part of its
                # directory name and of its parameter setting
                for k, v in remaining_params:
                    target_collection_path += "{%s#%s}" % (k, str(v))
                    target_collection_params[k] = v

                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path,
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                else:
                    source_train_file_path = source_train_collection_path

                source_train_pathes.append(source_train_file_path)

            if "arff" == file_ending:
                # NOTE(review): this is the same file name as
                # target_test_file_path (both use *key*) -- confirm that the
                # merged data is really meant to go to the same file.
                target_train_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])
            else:
                target_train_file_path = target_collection_path

            # Nothing to merge for this collection
            if len(source_train_pathes) == 0:
                continue

            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))

            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path,
                                     target_test_file_path,
                                     source_test_collection_name,
                                     target_collection_name)

                self._merge_arff_files(target_train_file_path,
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(
                    source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path, input_meta)
            else:
                self._copy_file(source_test_collection_path,
                                target_collection_path, train_set_name_suffix)

                self._merge_files(target_train_file_path, source_train_pathes,
                                  train_set_name_suffix,
                                  target_collection_params)

        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()

Example #29

0

Show file

    def _createProcesses(cls, processes, result_directory, operation_spec,
                         parameter_settings, input_collections):
        """ Create the processes of the operation and put them into *processes*

        One process is created for every combination of node chain template,
        parameter setting, input collection, run and split, unless one of
        the ``old_parameter_constraints`` of the operation specification
        evaluates to false for it. After the last process (or after a
        failure) ``False`` is put into the queue as end marker.

        **Parameters**

            :processes:
                Queue-like object (only ``put`` is used) that receives
                the created processes.

            :result_directory:
                Base directory for the results of the processes.

            :operation_spec:
                Specification dictionary of the operation. Its entry
                ``hide_parameters`` is replaced by a list extended by the
                internal double underscore parameters.

            :parameter_settings:
                Iterable of parameter dictionaries to instantiate
                the templates with.

            :input_collections:
                Input dataset directories, relative to the pySPACE storage.
        """
        def check_constraint(constraint, parameters):
            """ Insert the parameter values into *constraint* and evaluate it """
            for key, value in parameters.items():
                constraint = constraint.replace(key, str(value))
            # NOTE(review): eval is used on the constraint string, so the
            # operation specification has to be trusted.
            return eval(constraint)

        try:
            storage_format = operation_spec.get("storage_format")

            # Determine whether the node_chain should be stored after data processing
            store_node_chain = operation_spec.get("store_node_chain", False)

            # Determine whether certain parameters should not be remembered
            hide_parameters = list(operation_spec.get("hide_parameters", []))
            hide_parameters.extend(["__INPUT_COLLECTION__",
                                    "__INPUT_DATASET__",
                                    "__RESULT_DIRECTORY__",
                                    "__OUTPUT_BUNDLE__"])
            operation_spec["hide_parameters"] = hide_parameters

            # Create all combinations of collections, runs and splits
            collection_run_split_combinations = []
            for input_dataset_dir in input_collections:
                # Determine number of runs to be conducted for this collection
                abs_collection_path = \
                    pySPACE.configuration.storage + os.sep \
                        + input_dataset_dir
                collection_runs = \
                    BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)

                requested_runs = operation_spec.get("runs", collection_runs)

                assert collection_runs == requested_runs \
                    or collection_runs == 1, \
                    "Requested %s runs but input collection %s provides "\
                    "data for %s runs." % (requested_runs, input_dataset_dir,
                                           collection_runs)

                # The number of splits does not depend on the run,
                # so the meta data is read only once per collection
                collection_splits = BaseDataset.load_meta_data(
                    abs_collection_path).get('splits', 1)
                for run in range(max(requested_runs, collection_runs)):
                    for split in range(collection_splits):
                        collection_run_split_combinations.append(
                            (input_dataset_dir, run, split))

            # Shuffle order of dataset-run-split combinations. This should help to
            # avoid that all processes work on the same data which can cause
            # problems due to locking etc.
            random.shuffle(collection_run_split_combinations)

            several_templates = len(operation_spec["templates"]) > 1
            constraints = operation_spec.get('old_parameter_constraints', [])

            # For all templates
            # (enumerate yields the real position, also for equal templates)
            for template_index, node_chain_spec in \
                    enumerate(operation_spec["templates"]):
                # For all possible parameter instantiations of this template
                for parameter_setting in parameter_settings:
                    # For all input collections-run combinations
                    for input_dataset_dir, run, split in \
                            collection_run_split_combinations:
                        # We are going to change the parameter_setting and don't
                        # want to interfere with later runs so we work on a copy
                        parameter_setting_cp = copy.deepcopy(parameter_setting)

                        # Add input and output path to parameter
                        # setting
                        parameter_setting_cp["__INPUT_DATASET__"] = \
                                input_dataset_dir.split(os.sep)[-2]
                        parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                                result_directory
                        if several_templates:
                            parameter_setting_cp["__Template__"]=\
                                operation_spec["template_files"][template_index]

                        # Load the input meta data
                        # (freshly for every process, since it is modified)
                        dataset_dir = os.sep.join(
                            [pySPACE.configuration.storage, input_dataset_dir])
                        dataset_md = BaseDataset.load_meta_data(dataset_dir)
                        # Add the input parameter's meta data
                        # to the given parameter setting
                        if "parameter_setting" in dataset_md:
                            dataset_md["parameter_setting"].update(
                                parameter_setting_cp)
                            all_parameters = dataset_md["parameter_setting"]
                        else:
                            all_parameters = parameter_setting_cp

                        if not all(
                                check_constraint(constraint_def,
                                                 all_parameters)
                                for constraint_def in constraints):
                            continue

                        # Determine directory in which the result of this
                        # process should be written
                        result_dataset_directory = \
                            NodeChainOperation._get_result_dataset_dir(
                                result_directory,
                                input_dataset_dir,
                                parameter_setting_cp,
                                hide_parameters)

                        # Create the respective process and put it to the
                        # executing-queue of processes
                        process = NodeChainProcess(
                            node_chain_spec=node_chain_spec,
                            parameter_setting=parameter_setting_cp,
                            rel_dataset_dir=input_dataset_dir,
                            run=run,
                            split=split,
                            storage_format=storage_format,
                            result_dataset_directory=result_dataset_directory,
                            store_node_chain=store_node_chain,
                            hide_parameters=hide_parameters)

                        processes.put(process)
        finally:
            # give executing process the sign that creation is now finished
            processes.put(False)

Example #30

0

Show file

File: node_chain.py Project: Hansa064/pyspace

    def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting, hide_parameters):
        """ Determines the name of the result directory and creates it

        The name is built from the last path component of
        *input_dataset_dir* and from the parameter setting, i.e.
        ``{input_name}{key1#value1}{key2#value2}...``.

        **Parameters**

            :base_dir:
                Directory below which the result directory is created.

            :input_dataset_dir:
                Path of the input dataset, relative to
                ``pySPACE.configuration.storage``.

            :parameter_setting:
                Mapping from parameter name to value. It is deep-copied,
                so the caller's mapping is left untouched.

            :hide_parameters:
                Container of parameter names that shall not show up in
                the directory name.

        Returns the path of the created result directory.
        """
        # local imports, following the style this function already used
        import errno
        import platform

        # Determine the result_directory name
        # String between key and value changed from ":" to "#",
        # because of problems in Windows and with Windows file servers
        def _get_result_dir_name(parameter_setting, hide_parameters, method=None):
            """ internal function to create result dir name in different ways

            With *method* ``None`` the plain parameter values are used,
            with ``"hash"`` the hash of each (whitespace free) value.
            """
            if not method:
                parameter_str = "}{".join(
                    ("%s#%s" % (key, value))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                )
            elif method == "hash":
                parameter_str = "}{".join(
                    ("%s#%s" % (key, hash(str(value).replace(" ", ""))))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters
                )
            else:
                # previously this ended in an UnboundLocalError further down
                raise ValueError("Unknown naming method: %r" % (method,))

            # remove characters that are not wanted in a directory name
            for unwanted in ("'", " ", "[", "]", os.sep):
                parameter_str = parameter_str.replace(unwanted, "")
            result_name = "{%s}" % input_name

            if parameter_str != "":
                result_name += "{%s}" % (parameter_str)

            # Determine the path where this result will be stored
            result_dir = base_dir + os.sep + result_name

            if platform.system() == "Windows":
                # the maximum length for a filename on Windows is 255
                name_max = 255
            else:
                name_max = os.pathconf(os.curdir, "PC_NAME_MAX")

            # If the name is too long (longer than allowed including
            # optional offsets for pyspace result csv naming conventions),
            # create a md5 hash of the result name and use that one
            if len(result_dir) > name_max - 32:
                result_name = "{" + hashlib.md5(result_name).hexdigest() + "}"
                result_dir = base_dir + os.sep + result_name
            return result_dir

        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation,
        # only keep the name of the original dataset
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join([pySPACE.configuration.storage, input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        # NOTE(review): eval() executes the parameter value as Python code;
        # this is only acceptable as long as specs and meta data are trusted.
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
        try:
            create_directory(result_dir)
        except OSError as e:
            # ENAMETOOLONG is 36 on Linux only (e.g. 63 on macOS/BSD),
            # so compare against the symbolic constant
            if e.errno == errno.ENAMETOOLONG:
                # filename is too long
                result_dir = _get_result_dir_name(parameter_setting, hide_parameters, "hash")
            # for any other error the creation is simply tried once more
            create_directory(result_dir)

        return result_dir

Example #31

0

Show file

    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.

        For every ``{*`` folder in the result directory the meta data is
        updated (author, date, number of runs) and the meta data of the
        input dataset is copied to ``input_metadata.yaml``.
        If ``results_*`` files exist, they are merged via
        :class:`PerformanceResultSummary` and, if ``self.compression`` is
        set, the result folders are zipped and all but one are deleted.

        The parameter *_* is not used.

        Errors while updating the meta data of a dataset are logged and
        re-raised. Errors while merging results or compressing folders are
        only logged.
        """
        # Consolidate the results
        directory_pattern = os.sep.join([
            self.result_directory,
            "{*",
        ])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check of whichever there are more. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(
                    glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(
                    glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results more easy
                # THA: Split the first "/" from the input collection name,
                # because otherwise it will be treated as an absolute path
                input_collection_name = meta_data["input_collection_name"]
                if input_collection_name.startswith(os.sep):
                    input_collection_name = input_collection_name[1:]
                input_meta_path = os.path.join(pySPACE.configuration.storage,
                                               input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(
                        dataset_path,
                        input_meta,
                        file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    # str(e) instead of e.message: the latter is empty for
                    # errors that were raised with (errno, strerror)
                    self._log("Error copying the input_metadata.yaml: {error}".
                              format(error=str(e)),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error updating the metadata: {error!s}".format(error=e))
                # bare raise keeps the original traceback
                raise

        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory, "results_*"))
        if len(pathlist) > 0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(
                    dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception(
                    "Error merging the result collection: {error!s}".format(
                        error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easy visible.
                # The archive members are added by relative path, so the
                # working directory has to be the result directory meanwhile.
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                try:
                    self._compress_result_folders()
                finally:
                    # restore the working directory in any case
                    os.chdir(cwd)

    def _compress_result_folders(self):
        """ Zip and prune the result folders, retrying in 64 bit zip mode on failure """
        # If there are too many or too large folders, problems may occur.
        # This case we want to log, try 64 bit mode,
        # and then skip the zipping.
        try:
            self._archive_and_prune_result_folders(allow_zip64=False)
        except Exception:
            self._log("Result files could not be compressed with 32" +
                      " bit mode, switching to 64 bit mode",
                      level=logging.CRITICAL)
            try:
                self._archive_and_prune_result_folders(allow_zip64=True)
            except Exception:
                self._log(
                    "64 bit mode also failed. Please check your files and your code or contact your local programmer!",
                    level=logging.CRITICAL)

    def _archive_and_prune_result_folders(self, allow_zip64):
        """ Write all ``{*}`` folders to result_folders.zip and delete all but one of them

        If ``self.compression`` equals ``"delete"``, no archive is written
        and the folders are only deleted.
        Expects the current working directory to be the result directory.
        """
        import zipfile
        pathlist = glob.glob(os.path.join(self.result_directory, "{*}"))

        if not self.compression == "delete":
            save_file = zipfile.ZipFile(
                self.result_directory + '/result_folders.zip',
                mode="w",
                compression=self.compression,
                allowZip64=allow_zip64)
            try:
                # we want to have the zipped file relative to the
                # result directory
                for path in pathlist:
                    for node in os.walk(path):
                        rel_path = os.path.relpath(
                            node[0], self.result_directory)
                        save_file.write(rel_path)
                        for data in node[2]:
                            save_file.write(os.path.join(rel_path, data))
            finally:
                # do not leave the archive open if writing fails
                save_file.close()

        # To still have an easy access to the history of the
        # processing, we keep one folder.
        if pathlist:
            pathlist.pop()
        for path in pathlist:
            shutil.rmtree(path)

Example #32

0

Show file

File: merge.py Project: BioinformaticsArchive/pyspace

    def __call__(self):
        """ Executes this process on the respective modality

        Each input collection is used once as the "test" collection (or as
        the "train" collection if ``self.reverse`` is set). All other input
        collections that fulfill ``self.collection_constraints`` are merged
        into the opposite set of a new target collection in
        ``self.result_directory``. The target collection is called
        ``Rest_vs_<name>`` (or ``<name>_vs_Rest`` if ``self.reverse`` is set).

        Only collections stored as arff or pickle files are supported,
        otherwise a NotImplementedError is raised.
        """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()
        
        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check if the input data is split,
            # e.g. only a single test file is in the source directory 
            source_files = glob.glob(os.sep.join([source_test_collection_path,
                                                  "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert(not splitted)
            # NOTE(review): raises IndexError if no "*test*" file was found
            source_file_name = str(source_files[-1])
            
            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0
            
            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix =  "test"
            
            # We create the collection Rest_vs_Collection
            # The collection name is the second to last path component,
            # NOTE(review): this presumably expects the paths to end with
            # os.sep -- verify against caller
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            # Name of the original dataset, i.e. the first "{...}" part
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         test_base_collection_name + "_vs_Rest")
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         "Rest_vs_" + test_base_collection_name)
                key = "test"
                
            target_collection_path = os.sep.join([self.result_directory,
                                                  target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}
            
            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection
                # NOTE(review): "features_sp0" has no trailing underscore
                # here, whereas the train file below is built with
                # "features_sp0_" -- confirm that this file name is intended
                source_test_file_path = os.sep.join([source_test_collection_path,
                                        "data_run0","features_sp0" +
                                        train_set_name_suffix + ".arff"])
                target_test_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            
            elif source_file_name.endswith("pickle"):
                file_ending = "pickle"
                # for pickle files, whole collection directories are handled
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in " \
                                                               "MergeOperation")
            
            # Collect all collections that are merged into the other set
            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue
                
                # Check that all constraints are fulfilled for this pair of
                # input collections
                # NOTE(review): the constraint templates are evaluated with
                # eval(), so they have to come from a trusted source
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue
                
                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__", 
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                # Parameters that differ from the target collection are
                # appended to the target path and added to its parameters.
                # NOTE(review): target_test_file_path and (for pickle)
                # source/target paths were derived from
                # target_collection_path before this change -- confirm
                # that they are meant to keep the old value
                if remaining_params != []:
                    for k,v in remaining_params:
                         target_collection_path += "{%s#%s}" % (k,str(v))
                         target_collection_params[k]=v
                   
                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path, 
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                elif "pickle" == file_ending:
                    source_train_file_path = source_train_collection_path

                else:
                    raise NotImplementedError("File type not supported in " \
                                                              "MergeOperation!")     
                    
                source_train_pathes.append(source_train_file_path)
            
            if "arff" == file_ending:
                # NOTE(review): this is built with the same *key* as
                # target_test_file_path above -- confirm that both are
                # not meant to be different files
                target_train_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            elif "pickle" == file_ending:
                target_train_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                                              "MergeOperation!")     
            
            # Nothing to merge for this collection
            if len(source_train_pathes) == 0:
                continue
            
            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))
            
            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path, 
                                     target_test_file_path,
                                     source_test_collection_name, 
                                     target_collection_name)
                                
                self._merge_arff_files(target_train_file_path, 
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path,input_meta)
            elif "pickle" == file_ending:
                self._copy_pickle_file(source_test_collection_path,
                                       target_collection_path,
                                       train_set_name_suffix)

                self._merge_pickle_files(target_train_file_path, 
                                         source_train_pathes, 
                                         train_set_name_suffix,
                                         target_collection_params)
            else:
                raise NotImplementedError("File type not supported in merge_operation")
            
        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()

Example #33

0

Show file

File: concatenate.py Project: BioinformaticsArchive/pyspace

    def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
        """ Concatenate all datasets in source_dataset_pathes and store
            them in the target dataset

        The first dataset (after sorting the paths) is loaded and the "test"
        data of all other datasets is appended to it, run by run and split
        by split. The first element of every appended dataset gets the flag
        ``specs["new_set"] = True``, all others ``False``. If
        ``self.change_time`` is set, start and end time of the appended
        elements are shifted by the end time of the last element of the
        data collected so far.

        **Parameters**

            :target_dataset_path:
                Path where the concatenated dataset is stored.

            :source_dataset_pathes:
                List of paths of the datasets to concatenate.
                The list is sorted in place.
        """
        # sort the dataset paths (in place)
        source_dataset_pathes.sort()
        # load a first dataset, in which the data of all other datasets is assembled
        target_dataset = BaseDataset.load(source_dataset_pathes[0])

        # Determine author and date
        try:
            author = getpass.getuser()
        except Exception:
            # no user name available, but do not swallow KeyboardInterrupt etc.
            author = "Unknown"
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name (if there is one)
        target_dataset.meta_data.pop("node_chain_file_name", None)
        # Update meta data and store it
        params = target_dataset.meta_data.pop("parameter_setting")
        params["__INPUT_DATASET__"] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
        params["__RESULT_DIRECTORY__"] = self.result_directory
        target_dataset.meta_data.update({"author" : author,
                      "date" : date,
                      "dataset_directory" : target_dataset_path,
                      "train_test" : False,
                      "parameter_setting" : params,
                      "changed_time" : self.change_time,
                      "input_dataset_name" : source_dataset_pathes[0][len(
                                        pySPACE.configuration.storage):]
        })

        # Concatenate data of all other datasets to target dataset
        for source_dataset_path in source_dataset_pathes[1:]:
            source_dataset = BaseDataset.load(source_dataset_path)
            for run in source_dataset.get_run_numbers():
                for split in source_dataset.get_split_numbers():
                    target_data = target_dataset.get_data(run, split, "test")

                    if self.change_time:
                        # ensure sorted target_data
                        # TODO: encode this in meta data?
                        target_data.sort(key=lambda t: t[0].end_time)
                        last_end_time = target_data[-1][0].end_time

                    # Mark the data collected so far as not starting a new
                    # set; stop at the first element that already has a flag
                    for ts, _label in target_data:
                        if ts.specs is None:
                            ts.specs = {"new_set": False}
                        elif "new_set" in ts.specs:
                            break
                        else:
                            ts.specs["new_set"] = False

                    data = source_dataset.get_data(run, split, "test")

                    if self.change_time:
                        # ensure sorted source data
                        # TODO: encode this in meta data?
                        data.sort(key=lambda t: t[0].end_time)
                    # flag the first element of the concatenated data list
                    for i, (ts, _label) in enumerate(data):
                        if ts.specs is None:
                            ts.specs = {"new_set": i == 0}
                        else:
                            ts.specs["new_set"] = (i == 0)
                        if self.change_time:
                            # shift the times behind the data collected so far
                            ts.start_time = last_end_time + ts.start_time
                            ts.end_time = last_end_time + ts.end_time

                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        target_dataset.store(target_dataset_path)

Example #34

0

Show file

    def _get_result_dataset_dir(base_dir, input_dataset_dir, parameter_setting,
                                hide_parameters):
        """ Determines the name of the result directory and creates it

        The name is built from the last path component of
        *input_dataset_dir* and from the parameter setting, i.e.
        ``{input_name}{key1#value1}{key2#value2}...``.

        **Parameters**

            :base_dir:
                Directory below which the result directory is created.

            :input_dataset_dir:
                Path of the input dataset, relative to
                ``pySPACE.configuration.storage``.

            :parameter_setting:
                Mapping from parameter name to value. It is deep-copied,
                so the caller's mapping is left untouched.

            :hide_parameters:
                Container of parameter names that shall not show up in
                the directory name.

        Returns the path of the created result directory.
        """
        # local imports, following the style this function already used
        import errno
        import platform

        # Determine the result_directory name
        # String between key and value changed from ":" to "#",
        # because of problems in Windows and with Windows file servers
        def _get_result_dir_name(parameter_setting,
                                 hide_parameters,
                                 method=None):
            """ internal function to create result dir name in different ways

            With *method* ``None`` the plain parameter values are used,
            with ``"hash"`` the hash of each (whitespace free) value.
            """
            if not method:
                parameter_str = "}{".join(
                    ("%s#%s" % (key, value))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters)
            elif method == "hash":
                parameter_str = "}{".join(
                    ("%s#%s" % (key, hash(str(value).replace(' ', ''))))
                    for key, value in parameter_setting.iteritems()
                    if key not in hide_parameters)
            else:
                # previously this ended in an UnboundLocalError further down
                raise ValueError("Unknown naming method: %r" % (method,))

            # remove characters that are not wanted in a directory name
            for unwanted in ("'", " ", "[", "]", os.sep):
                parameter_str = parameter_str.replace(unwanted, "")
            result_name = "{%s}" % input_name

            if parameter_str != "":
                result_name += "{%s}" % (parameter_str)

            # Determine the path where this result will be stored
            result_dir = base_dir + os.sep + result_name

            if platform.system() == "Windows":
                # the maximum length for a filename on Windows is 255
                name_max = 255
            else:
                name_max = os.pathconf(os.curdir, 'PC_NAME_MAX')

            # If the name is too long (longer than allowed including
            # optional offsets for pyspace result csv naming conventions),
            # create a md5 hash of the result name and use that one
            if len(result_dir) > name_max - 32:
                result_name = "{" + hashlib.md5(
                    result_name).hexdigest() + "}"
                result_dir = base_dir + os.sep + result_name
            return result_dir

        input_name = input_dataset_dir.strip(os.sep).split(os.sep)[-1]
        input_name = input_name.strip("{}")
        # If the input is already the result of an operation,
        # only keep the name of the original dataset
        if input_name.count("}{") > 0:
            input_name_parts = input_name.split("}{")
            input_name = input_name_parts[0]

        # Load the input meta data
        dataset_dir = os.sep.join(
            [pySPACE.configuration.storage, input_dataset_dir])
        dataset_md = BaseDataset.load_meta_data(dataset_dir)

        # We are going to change the parameter_setting and don't want to
        # interfere with later runs so we work on a copy
        parameter_setting = copy.deepcopy(parameter_setting)

        # Ignore pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in parameter_setting:
            parameter_setting.pop("__PREPARE_OPERATION__")

        # Add the input parameters meta data to the given parameter setting
        if "parameter_setting" in dataset_md:
            parameter_setting.update(dataset_md["parameter_setting"])

        # We have to remove ' characters from the parameter value since
        # Weka does ignore them
        # NOTE(review): eval() executes the parameter value as Python code;
        # this is only acceptable as long as specs and meta data are trusted.
        for key, value in parameter_setting.iteritems():
            if isinstance(value, basestring) and value.count("'") > 1:
                parameter_setting[key] = eval(value)

        result_dir = _get_result_dir_name(parameter_setting, hide_parameters)
        try:
            create_directory(result_dir)
        except OSError as e:
            # ENAMETOOLONG is 36 on Linux only (e.g. 63 on macOS/BSD),
            # so compare against the symbolic constant
            if e.errno == errno.ENAMETOOLONG:
                # filename is too long
                result_dir = _get_result_dir_name(parameter_setting,
                                                  hide_parameters, "hash")
            # for any other error the creation is simply tried once more
            create_directory(result_dir)

        return result_dir

Example #35

0

Show file

    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """ A factory method that creates the processes which form an operation
        based on the  information given in the operation specification, *operation_spec*.

        In debug mode this is done in serial. In the other default mode,
        at the moment 4 processes are created in parallel and can be immediately
        executed. So generation of processes and execution are made in parallel.
        This kind of process creation is done independently from the backend.

        For huge parameter spaces this is necessary!
        Otherwise numerous processes are created and corresponding data is loaded
        but the concept of spreading the computation to different processors
        can not really be used, because process creation is blocking only
        one processor and memory space, but nothing more is done,
        till the processes are all created.

        **Parameters**

            :operation_spec:
                Dictionary with the operation specification. Its "type"
                has to be "node_chain". The entries "templates",
                "template_files" and "node_chain" may be changed in place.

            :result_directory:
                Directory, the results of the operation are written to.

            :debug:
                If True, all processes are created before the operation
                object is returned.

            :input_paths:
                Absolute paths of the input datasets.
                An exception is raised, if it is empty.
                (The default list is never modified.)

        Returns the operation object.

        .. todo:: Use :class:`~pySPACE.resources.dataset_defs.dummy.DummyDataset`
                  for empty data, when no input_path is given.
        """
        assert (operation_spec["type"] == "node_chain")

        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)

        ## Use node_chain parameter if no templates are given ##
        if "templates" not in operation_spec:
            if "node_chain" in operation_spec:
                operation_spec["templates"] = [
                    operation_spec.pop("node_chain")
                ]
            else:
                warnings.warn(
                    "Specify parameter 'templates' or 'node_chain' in your operation spec!"
                )
        else:
            if "node_chain" in operation_spec:
                operation_spec.pop("node_chain")
                warnings.warn(
                    "node_chain parameter is ignored. Templates are used.")
            # Replace template file names by the content of the files.
            # This has to happen as well, if an additional node_chain
            # parameter was given (and ignored); before, the file names
            # were kept as templates in this case.
            if isinstance(operation_spec["templates"][0], str):
                operation_spec["template_files"] = \
                    copy.deepcopy(operation_spec["templates"])
                for i, rel_node_chain_file in enumerate(
                        operation_spec["template_files"]):
                    abs_node_chain_file_name = os.sep.join([
                        pySPACE.configuration.spec_dir, "node_chains",
                        rel_node_chain_file
                    ])
                    with open(abs_node_chain_file_name, "r") as read_file:
                        node_chain = read_file.read()
                    operation_spec["templates"][i] = node_chain

        storage = pySPACE.configuration.storage
        if not input_paths:
            raise Exception("No input datasets found in input_path %s in %s!" %
                            (operation_spec["input_path"], storage))

        # Get relative path
        rel_input_paths = [name[len(storage):] for name in input_paths]

        # Determine approximate number of runs
        if "runs" in operation_spec:
            runs = operation_spec["runs"]
        else:
            # use the maximum over the runs of all input datasets
            runs = max(
                BaseDataset.load_meta_data(
                    storage + os.sep + dataset_dir).get('runs', 1)
                for dataset_dir in rel_input_paths)

        # Determine splits (taken from the first input dataset only)
        abs_collection_path = storage + os.sep + rel_input_paths[0]
        splits = BaseDataset.load_meta_data(abs_collection_path).get(
            'splits', 1)

        # Determine how many processes will be created
        number_processes = len(operation_spec["templates"]) * \
                           len(parameter_settings) * len(rel_input_paths) * \
                           runs * splits

        if debug:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec,
                                 parameter_settings, rel_input_paths)
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another thread so that already created processes can be executed in
            # parallel. Therefore a queue is used which size is maximized to
            # guarantee that not too many objects are created (because this costs
            # memory). However, the actual number of 4 is arbitrary and might
            # be changed according to the system at hand.
            processes = processing.Queue(4)
            create_process = \
                    processing.Process(target=cls._createProcesses,
                                       args=(processes, result_directory,
                                             operation_spec, parameter_settings,
                                             rel_input_paths))
            create_process.start()
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)

Example #36

0

Show file

    def __call__(self):
        """ Executes this process on the respective modality

        Every ordered pair of input datasets that fulfills all dataset
        constraints is combined into one "<base1>_vs_<base2>" dataset in the
        result directory: the files of the first dataset become the training
        data and the files of the second dataset become the test data.
        A dataset paired with itself is only linked into the result
        directory, and only if it is already split.
        """
        def place_file(is_arff, origin, destination, base_name, mixed_name):
            # arff files are handed to _copy_arff_file together with the old
            # and the mixed base name, every other file is merely linked
            if is_arff:
                self._copy_arff_file(origin, destination, base_name,
                                     mixed_name)
            else:
                os.symlink(origin, destination)

        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()

        for dataset_dir1 in self.input_datasets:
            for dataset_dir2 in self.input_datasets:
                dataset_name1 = dataset_dir1.split(os.sep)[-2]
                dataset_name2 = dataset_dir2.split(os.sep)[-2]

                # More than one entry in the first run directory means that
                # the input data is already split
                run_dir1 = dataset_dir1 + os.sep + "data_run0"
                splitted = len(glob.glob(run_dir1 + os.sep + "*")) > 1

                # Skip pairs that violate at least one constraint
                constraints_met = all(
                    eval(constraint_template % {
                        'dataset_name1': dataset_name1,
                        'dataset_name2': dataset_name2
                    }) for constraint_template in self.dataset_constraints)
                if not constraints_met:
                    continue

                if dataset_name1 == dataset_name2:
                    # Same dataset on both sides: link it as a whole
                    if splitted:
                        os.symlink(
                            dataset_dir1,
                            self.result_directory + os.sep + dataset_name1)
                    continue

                # Names of the original datasets both inputs are based on
                origin1 = dataset_name1.strip("}{").split("}{")[0]
                origin2 = dataset_name2.strip("}{").split("}{")[0]

                # Name and directory of the mixed target dataset
                mixed = "%s_vs_%s" % (origin1, origin2)
                target_dir = self.result_directory + os.sep \
                    + dataset_name1.replace(origin1, mixed)
                create_directory(target_dir + os.sep + "data_run0")

                if splitted:
                    # For each split, take the train data from dataset 1 and
                    # the test data from dataset 2
                    # TODO: We have $n$ train sets and $n$ test sets, we
                    #       could use all $n*n$ combinations
                    train_pattern = run_dir1 + os.sep + "*_sp*_train.*"
                    for train_source in glob.glob(train_pattern):
                        is_arff = train_source.endswith("arff")

                        train_target = train_source.replace(dataset_dir1,
                                                            target_dir)
                        place_file(is_arff, train_source, train_target,
                                   origin1, mixed)

                        test_source = train_source.replace(
                            dataset_dir1, dataset_dir2).replace(
                                "train.", "test.")
                        test_target = train_target.replace("train.", "test.")
                        place_file(is_arff, test_source, test_target,
                                   origin2, mixed)
                else:
                    # Unsplit input: the test file of dataset 1 becomes the
                    # training set, the one of dataset 2 stays the test set
                    test_pattern = run_dir1 + os.sep + "*_sp*_test.*"
                    for data_source in glob.glob(test_pattern):
                        is_arff = data_source.endswith("arff")

                        train_target = data_source.replace(
                            "test.", "train.").replace(dataset_dir1,
                                                       target_dir)
                        place_file(is_arff, data_source, train_target,
                                   origin1, mixed)

                        test_source = data_source.replace(dataset_dir1,
                                                          dataset_dir2)
                        test_target = train_target.replace("train.", "test.")
                        place_file(is_arff, test_source, test_target,
                                   origin2, mixed)

                # Write metadata.yaml based on the meta data of dataset 1
                output_meta = dict(BaseDataset.load_meta_data(dataset_dir1))
                output_meta.update({
                    'train_test': True,
                    'date': time.strftime("%Y%m%d_%H_%M_%S"),
                    'author': get_author()
                })
                BaseDataset.store_meta_data(target_dir, output_meta)

        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()

Example #37

0

Show file

    def store(self, result_dir, s_format=["pickle", "real"]):
        """ Store the collection in *result_dir*

        **Parameters**

            :result_dir:
                Directory in which one "data_run<run>" sub directory per run
                and the meta data file are written.

            :s_format:
                Either the name of the file format ("pickle", "csv" or
                "arff") or a sequence whose first entry is that name.
                Unsupported formats are reported via the log and replaced
                by "pickle".

                (*optional, default: ["pickle", "real"]*)

        """
        name = "predictions"

        # The format may be given as plain string or as sequence. Only the
        # first entry of a sequence names the file format. The sequence is
        # copied so that the shared default list never ends up in meta data.
        if isinstance(s_format, (list, tuple)):
            file_format = s_format[0] if s_format else None
            storage_format = s_format[:]
        else:
            file_format = s_format
            storage_format = s_format

        if file_format not in ("csv", "arff", "pickle"):
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            file_format = "pickle"
            storage_format = "pickle"

        # Update the meta data
        author = get_author()
        self.update_meta_data({
            "type":
            "prediction_vector",
            "storage_format":
            storage_format,
            "author":
            author,
            "data_pattern":
            "data_run" + os.sep + name + "_sp_tt." + file_format
        })

        for key, prediction_vectors in self.data.items():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if file_format == "pickle":
                # binary mode is required for the binary pickle protocol
                pickle_path = os.path.join(result_path,
                                           name + key_str + ".pickle")
                with open(pickle_path, "wb") as result_file:
                    cPickle.dump(prediction_vectors, result_file,
                                 cPickle.HIGHEST_PROTOCOL)

            elif file_format == "csv":  # Write as Comma Separated Value
                num_predictors = self.meta_data["num_predictors"]
                csv_path = os.path.join(result_path, name + key_str + ".csv")
                with open(csv_path, "w") as result_file:
                    if num_predictors == 1:
                        result_file.write(
                            "Predicted Label, Prediction Score, True Label \n")
                        for pv in prediction_vectors:
                            result_file.write(
                                "%s, %s, %s\n" %
                                (pv[0].label[0], pv[0].prediction[0], pv[1]))
                    else:
                        # we begin by dealing with the header of the csv file
                        base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                        base_result = "%(label)s, %(score)s,"
                        header = "".join(base_header % dict(index=i + 1)
                                         for i in range(num_predictors))
                        result_file.write(header + "True Label\n")

                        # and now we can write each of the prediction
                        # vectors in turn
                        for pv in prediction_vectors:
                            result = "".join(
                                base_result % dict(label=pv[0].label[i],
                                                   score=pv[0].prediction[i])
                                for i in range(num_predictors))
                            result_file.write(result + str(pv[1]) + "\n")
            # NOTE(review): "arff" passes the format check but no data file
            # is written for it here -- confirm whether that is intended.

        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #38

0

Show file

File: merge.py Project: jhuebotter/pyspace

    def _merge_files(self, target_collection_path, source_collection_pathes,
                     train_set_name_suffix, target_collection_params):
        """ Merge all collections in source_collection_pathes and store them in the target collection

        The first source collection is loaded and serves as container; the
        data of all remaining source collections is appended to it run by
        run and split by split.

        **Parameters**

            :target_collection_path:
                Path of the dataset, in which the data of all other datasets
                is assembled.

            :source_collection_pathes:
                Paths of the datasets to be merged.

            :train_set_name_suffix:
                Either 'train' or 'test'. Specifies if datasets are merged for
                training or testing.

            :target_collection_params:
                Dictionary with all the parameters of the target dataset.
                It is modified in place ("__INPUT_DATASET__" and
                "__RESULT_DIRECTORY__" entries).

        """

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name (a missing entry is fine)
        target_collection.meta_data.pop("node_chain_file_name", None)
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author":
            author,
            "date":
            date,
            "dataset_directory":
            target_collection_path,
            "train_test":
            True,
            "parameter_setting":
            target_collection_params,
            "input_dataset_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)

                    if self.set_flag:
                        # mark the data that is already in the target as
                        # "not the start of a new set"
                        for ts, l in target_data:
                            if ts.specs is None:
                                ts.specs = {"new_set": False}
                            elif "new_set" in ts.specs:
                                # stop at the first element that already
                                # carries the flag
                                break
                            else:
                                ts.specs["new_set"] = False

                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)

                    if self.set_flag:
                        for i, (ts, l) in enumerate(data):
                            # flag first element of the concatenated data list
                            if ts.specs is None:
                                ts.specs = {"new_set": i == 0}
                            else:
                                ts.specs["new_set"] = (i == 0)

                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing;
            # iterate over a snapshot because the dictionary is changed
            for key in list(target_collection.data.keys()):
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                key = (key[0], key[1], "train")
                target_collection.data[key] = value
        # we store the data in the same format as before
        target_collection.store(target_collection_path,
                                target_collection.meta_data["storage_format"])

Example #39

0

Show file

File: shuffle.py Project: AlexanderFabisch/pyspace

    def __call__(self):
        """ Executes this process on the respective modality

        For every ordered pair of input datasets that fulfills all dataset
        constraints, a "<base1>_vs_<base2>" dataset is created in the result
        directory which uses the data of the first dataset for training and
        the data of the second dataset for testing. A dataset paired with
        itself is only linked into the result directory (if it is split).
        """
        ############## Prepare benchmarking ##############
        super(ShuffleProcess, self).pre_benchmarking()

        for dataset_dir1 in self.input_datasets:
            # Both values only depend on the first dataset, so they are
            # determined once per outer iteration
            dataset_name1 = dataset_dir1.split(os.sep)[-2]
            # Check if the input data is split
            splitted = len(glob.glob(os.sep.join([dataset_dir1, "data_run0",
                                                  "*"]))) > 1

            for dataset_dir2 in self.input_datasets:
                dataset_name2 = dataset_dir2.split(os.sep)[-2]

                # Check that all constraints are fulfilled for this pair of
                # input datasets
                # NOTE(review): the constraints are executed with eval() and
                # must therefore only come from trusted specifications.
                if not all(eval(constraint_template % {'dataset_name1': dataset_name1,
                                                       'dataset_name2': dataset_name2})
                                    for constraint_template in self.dataset_constraints):
                    continue

                if dataset_name1 == dataset_name2:
                    if splitted:
                        # Link the complete dataset into the result directory
                        os.symlink(dataset_dir1,
                                   os.sep.join([self.result_directory,
                                                dataset_name1]))
                    continue

                # Determine names of the original data sets the input
                # datasets are based on
                base_dataset1 = dataset_name1.strip("}{").split("}{")[0]
                base_dataset2 = dataset_name2.strip("}{").split("}{")[0]

                # Determine target dataset name and create directory
                # for it
                mixed_base_dataset = "%s_vs_%s" % (base_dataset1,
                                                   base_dataset2)
                target_dataset_name = dataset_name1.replace(base_dataset1,
                                                            mixed_base_dataset)

                target_dataset_dir = os.sep.join([self.result_directory,
                                                  target_dataset_name])

                create_directory(os.sep.join([target_dataset_dir, "data_run0"]))

                if splitted:
                    # For each split, copy the train data from dataset 1 and
                    # the test data from dataset 2 to the target dataset
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_train.*"])):
                        # TODO: We have $n$ train sets and $n$ test sets, we
                        #       could use all $n*n$ combinations
                        target_train_file_name = source_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)

                        source_test_file_name = source_test_file_name.replace("train.",
                                                                              "test.")
                        target_test_file_name = target_train_file_name.replace("train.",
                                                                               "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                else:
                    # Use the data set from dataset 1 as training set and
                    # the data set from dataset 2 as test data
                    for source_train_file_name in glob.glob(os.sep.join([dataset_dir1,
                                                                         "data_run0",
                                                                         "*_sp*_test.*"])):
                        target_train_file_name = source_train_file_name.replace("test.",
                                                                                "train.")
                        target_train_file_name = target_train_file_name.replace(dataset_dir1,
                                                                                target_dataset_dir)
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_train_file_name,
                                                 target_train_file_name,
                                                 base_dataset1,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_train_file_name,
                                       target_train_file_name)

                        source_test_file_name = source_train_file_name.replace(dataset_dir1,
                                                                               dataset_dir2)

                        target_test_file_name = target_train_file_name.replace("train.",
                                                                               "test.")
                        if source_train_file_name.endswith("arff"):
                            self._copy_arff_file(source_test_file_name,
                                                 target_test_file_name,
                                                 base_dataset2,
                                                 mixed_base_dataset)
                        else:
                            os.symlink(source_test_file_name,
                                       target_test_file_name)
                # Write metadata.yaml based on input meta data
                input_dataset1_meta = BaseDataset.load_meta_data(dataset_dir1)

                output_dataset_meta = dict(input_dataset1_meta)
                output_dataset_meta['train_test'] = True
                output_dataset_meta['date'] = time.strftime("%Y%m%d_%H_%M_%S")
                try:
                    output_dataset_meta['author'] = pwd.getpwuid(os.getuid())[4]
                except Exception:
                    # Best effort only; KeyboardInterrupt and SystemExit are
                    # deliberately no longer swallowed here
                    self._log("Author could not be resolved.",level=logging.WARNING)
                    output_dataset_meta['author'] = "unknown"
                BaseDataset.store_meta_data(target_dataset_dir,output_dataset_meta)

        ############## Clean up after benchmarking ##############
        super(ShuffleProcess, self).post_benchmarking()

Example #40

0

Show file

File: analyzer_sink.py Project: ahmedadelhassan/pyspace

    def store(self, result_dir, s_format = "BrainVision"):
        """ Store the collection in *result_dir* as .eeg/.vmrk/.vhdr files

        For every entry of the data dictionary one data file (.eeg), one
        marker file (.vmrk) and one header file (.vhdr) are written to the
        sub directory "data_analyzer_run<run>" of *result_dir*.

        **Parameters**

            :result_dir:
                Directory in which the meta data and the run directories
                are stored.

            :s_format:
                Only "BrainVision" is supported. For every other value a
                critical message is logged and nothing is stored.

                (*optional, default: "BrainVision"*)

        """
        self.merged = False
        # Scales up the eeg sample values. The data samples are converted to
        # int16 when saving, so scaling is necessary to maintain the
        # resolution.
        scale = 10.0
        # Keep original file name, depends on the AnalyserSinkNode, see its
        # documentation, or use the default name of this collection.
        keep_src_name = ('eeg_src_file_name' in self.meta_data and
                         self.meta_data['eeg_src_file_name'] is not None)
        if keep_src_name:
            name = self.meta_data['eeg_src_file_name']
        else:
            name = "Analyzer"
        if not s_format == "BrainVision":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                                      "storage_format": s_format,
                                      "author" : author,
                                      "data_pattern": "Multiplexed"})
        # Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)
        # End time of the last accepted window; shared by all data entries
        slices = [0]
        channel_names = []

        for key, time_series in self.data.items():
            # Sort the windows by their start time (stable, like the former
            # cmp based sort)
            time_series.sort(key=lambda window: window[0].start_time)
            # Check for overlapping windows. They are only reported by a
            # warning; the windows themselves stay in the list.
            for window in time_series:
                if window[0].start_time >= slices[-1]:
                    slices.append(window[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!", UserWarning)
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Original name without suffix, default name with run key suffix
            file_stem = name if keep_src_name else name + key_str

            num_ch = 0
            sampling_int = 0

            # The context managers close both files even if writing fails
            with open(os.path.join(result_path, file_stem + ".eeg"), "wb") as result_file_eeg, \
                    open(os.path.join(result_path, file_stem + ".vmrk"), "w") as result_file_mrk:
                # Write Marker header
                result_file_mrk.write(header_mrk % (file_stem))

                result_file_ms = 0

                # Data for padding, created when the first window is seen
                padding = None

                count_mrk = 2

                for ts in time_series:
                    ts0 = ts[0] * scale
                    ts0 = ts0.astype(numpy.int16)

                    # identity check: "== None" on the numpy array would be
                    # evaluated elementwise from the second window on
                    if padding is None:
                        padding = numpy.zeros(len(ts[0].channel_names), dtype='int16')
                        num_ch = len(ts[0].channel_names)
                        channel_names = ts[0].channel_names
                        sampling_int = 1000000/ts[0].sampling_frequency
                    # Write Padding (zeros)
                    while result_file_ms < ts[0].start_time - sampling_int/1000.0:
                        result_file_eeg.write(padding.tostring())
                        result_file_ms += ts[0]._samples_to_ms(1)
                    # Write window
                    ts0.tofile(result_file_eeg)
                    result_file_ms += ts[0].end_time - (ts[0].start_time - sampling_int/1000.0)
                    # Write Marker
                    markers = []

                    if(len(ts[0].marker_name) > 0):
                        # one (name, time) tuple per marker occurrence
                        for mk_name, mk_times in ts[0].marker_name.items():
                            for mk_time in mk_times:
                                markers.append((mk_name, mk_time))
                        markers = sorted(markers, key=lambda tup: tup[1])

                        for mk_name, mk_time in markers:
                            if 'R' in mk_name:
                                event_type = 'Response'
                            elif 'S' in mk_name:
                                event_type = 'Stimulus'
                            else:
                                event_type = 'Label'

                            result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, mk_name, (ts[0].start_time + mk_time)*ts[0].sampling_frequency/1000.0))
                            count_mrk += 1

            # WRITE HEADERFILE
            with open(os.path.join(result_path, file_stem + ".vhdr"), "w") as result_file_hdr:
                result_file_hdr.write(header_hdr % ((file_stem), (file_stem), num_ch, sampling_int))
                # Format: Ch1=Fp1,,0.1,\xB5V
                for i in range(num_ch):
                    result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale))

Example #41

0

Show file

File: node_chain.py Project: pyspace/pyspace

    def consolidate(self, _=None):
        """ Consolidates the results obtained by the single processes into a consistent structure
        of collections that are stored on the file system.

        The consolidation is done in three steps:

          1. For every dataset directory in the result directory (matching
             the pattern "{*"), the meta data is updated with author, date
             and, if there is more than one, the number of runs. Additionally,
             the meta data of the input dataset is copied to
             *input_metadata.yaml* in the dataset directory.

          2. If "results_*" files exist in the result directory, they are
             merged using a *PerformanceResultSummary*.

          3. If step 2 applies and *self.compression* is set, the "{*}"
             directories are zipped to *result_folders.zip* (not for the
             compression mode "delete") and all but one of them are removed.

        **Parameters**

          :_:
              Unused placeholder argument.

              (*optional, default: None*)

        .. note:: Errors during the meta data update are logged and re-raised.
                  Errors during merging and compression are only logged.
        """
        def zip_and_prune_result_folders(allow_zip64):
            """ Zip all "{*}" result folders and delete all but one of them

            Relative paths are written to the archive, so the current working
            directory has to be the result directory when this is called.
            """
            import zipfile
            folder_list = glob.glob(os.path.join(self.result_directory, "{*}"))
            if not folder_list:
                # Nothing to archive or delete. Returning here avoids writing
                # an empty archive and an IndexError from pop() below.
                return
            if not self.compression == "delete":
                zip_options = {"mode": "w", "compression": self.compression}
                if allow_zip64:
                    zip_options["allowZip64"] = True
                save_file = zipfile.ZipFile(
                    self.result_directory + '/result_folders.zip',
                    **zip_options)
                try:
                    # we want to have the zipped file relative to the
                    # result directory
                    for path in folder_list:
                        for node in os.walk(path):
                            rel_path = os.path.relpath(node[0],
                                                       self.result_directory)
                            save_file.write(rel_path)
                            for data in node[2]:
                                save_file.write(os.path.join(rel_path, data))
                finally:
                    # release the file handle even if zipping failed
                    save_file.close()
            # To still have an easy access to the history of the
            # processing, we keep one folder.
            folder_list.pop()
            for path in folder_list:
                shutil.rmtree(path)

        # Consolidate the results
        directory_pattern = os.sep.join([self.result_directory, "{*",])
        dataset_pathes = glob.glob(directory_pattern)

        # For all collections found
        for dataset_path in dataset_pathes:
            try:
                # Load their meta_data
                meta_data = BaseDataset.load_meta_data(dataset_path)

                # Determine author and date
                author = get_author()
                date = time.strftime("%Y%m%d_%H_%M_%S")

                # Update meta data and store it
                meta_data.update({"author": author, "date": date})

                # There can be either run dirs, persistency dirs, or both of them.
                # Check of whichever there are more. If both exist, their numbers
                # are supposed to be equal.
                nr_run_dirs = len(glob.glob(os.path.join(dataset_path, "data_run*")))
                nr_per_dirs = len(glob.glob(os.path.join(dataset_path, "persistency_run*")))
                nr_runs = max(nr_run_dirs, nr_per_dirs)
                if nr_runs > 1:
                    meta_data["runs"] = nr_runs

                # Store the metadata
                BaseDataset.store_meta_data(dataset_path, meta_data)

                # Copy the input dataset specification file to the result
                # directory in order to make later analysis of
                # the results more easy
                # THA: Split the first "/" from the input collection name, because otherwise it will be treated
                # as an absolute path
                input_collection_name = meta_data["input_dataset_name"]
                if input_collection_name.startswith(os.sep):
                    input_collection_name = input_collection_name[1:]
                input_meta_path = os.path.join(pySPACE.configuration.storage, input_collection_name)
                try:
                    input_meta = BaseDataset.load_meta_data(input_meta_path)
                    BaseDataset.store_meta_data(dataset_path, input_meta, file_name="input_metadata.yaml")
                except (IOError, OSError) as e:
                    # str(e) is used instead of e.message, because the latter
                    # is empty for errors created with (errno, strerror)
                    self._log("Error copying the input_metadata.yaml: {error!s}".format(error=e),
                              level=logging.CRITICAL)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error updating the metadata: {error!s}".format(error=e))
                # bare raise keeps the original traceback
                raise

        # If we don't create a feature vector or time series collection,
        # we evaluated our classification using a classification performance sink.
        # The resulting files should be merged to one csv tabular.
        pathlist = glob.glob(os.path.join(self.result_directory,"results_*"))
        if len(pathlist)>0:
            # Do the consolidation the same way as for WekaClassificationOperation
            self._log("Consolidating results ...")
            # We load and store the results once into a PerformanceResultSummary
            # This does the necessary consolidation...
            self._log("Reading intermediate results...")
            try:
                result_collection = PerformanceResultSummary(dataset_dir=self.result_directory)
                self._log("done")
                self._log("Storing result collection")
                result_collection.store(self.result_directory)
                self._log("done")
                PerformanceResultSummary.merge_traces(self.result_directory)
            except Exception as e:
                logging.getLogger("%s" % self).exception("Error merging the result collection: {error!s}".format(
                    error=e))

            if self.compression:
                # Since we get one result summary,
                # we don't need the numerous folders.
                # So we zip them to make the whole folder more easy visible.
                cwd = os.getcwd()
                os.chdir(self.result_directory)
                # If there are to many or to large folders, problems may occur.
                # This case we want to log, try 64 bit mode,
                # and then skip the zipping.
                try:
                    try:
                        zip_and_prune_result_folders(allow_zip64=False)
                    except Exception:
                        self._log("Result files could not be compressed with 32"+
                                  " bit mode, switching to 64 bit mode",
                                  level=logging.CRITICAL)
                        try:
                            zip_and_prune_result_folders(allow_zip64=True)
                        except Exception:
                            self._log("64 bit mode also failed. Please check your files and your code or contact your local programmer!", level=logging.CRITICAL)
                finally:
                    # always return to the previous working directory
                    os.chdir(cwd)

Example #42

0

Show file

File: node_chain.py Project: AlexanderFabisch/pyspace

    def _createProcesses(cls, processes, result_directory, operation_spec,
                         parameter_settings, input_collections):

        storage_format = operation_spec["storage_format"] if "storage_format" \
                         in operation_spec else None

        # Determine whether the node_chain should be stored after data processing
        store_node_chain = operation_spec["store_node_chain"] \
                         if "store_node_chain" in operation_spec else False

        # Determine whether certain parameters should not be remembered
        hide_parameters = [] if "hide_parameters" not in operation_spec \
                                else list(operation_spec["hide_parameters"])
        hide_parameters.append("__INPUT_COLLECTION__")
        hide_parameters.append("__INPUT_DATASET__")
        hide_parameters.append("__RESULT_DIRECTORY__")
        hide_parameters.append("__OUTPUT_BUNDLE__")

        # Create all combinations of collections, runs and splits
        collection_run_split_combinations = []
        for input_dataset_dir in input_collections:
            # Determine number of runs to be conducted for this collection
            abs_collection_path = \
                pySPACE.configuration.storage + os.sep \
                    + input_dataset_dir
            collection_runs = \
                BaseDataset.load_meta_data(abs_collection_path).get('runs', 1)
                # D.get(k[,d]) -> D[k] if k in D, else d.

            if "runs" not in operation_spec:
                requested_runs  = collection_runs
            else:
                requested_runs = operation_spec["runs"]

            assert collection_runs == requested_runs \
                        or collection_runs ==  1, \
                    "Requested %s runs but input collection %s provides "\
                    "data for %s runs." % (requested_runs, input_dataset_dir,
                                           collection_runs)

            for run in range(max(requested_runs, collection_runs)):
                collection_splits = \
                    BaseDataset.load_meta_data(abs_collection_path).get('splits', 1)
                for split in range(collection_splits):
                    collection_run_split_combinations.append((input_dataset_dir, run, split))

        # Shuffle order of dataset-run-split combinations. This should help to
        # avoid that all processes work on the same data which can cause
        # problems due to locking etc.
        random.shuffle(collection_run_split_combinations)

        # For all templates
        for node_chain_spec in operation_spec["templates"]:
            # For all possible parameter instantiations of this template
            for parameter_setting in parameter_settings:
                # For all input collections-run combinations
                for input_dataset_dir, run, split in collection_run_split_combinations:
                    # We are going to change the parameter_setting and don't want to
                    # interfere with later runs so we work on a copy
                    parameter_setting_cp = copy.deepcopy(parameter_setting)

                    # Add input and output path to parameter
                    # setting
                    parameter_setting_cp["__INPUT_DATASET__"] = \
                            input_dataset_dir.split(os.sep)[-2]
                    parameter_setting_cp["__RESULT_DIRECTORY__"] = \
                            result_directory
                    if len(operation_spec["templates"])>1:
                        index = operation_spec["templates"].index(node_chain_spec)
                        parameter_setting_cp["__Template__"]=\
                            operation_spec["template_files"][index]

                    # Load the input meta data
                    dataset_dir = os.sep.join([pySPACE.configuration.storage,
                                               input_dataset_dir])
                    dataset_md = BaseDataset.load_meta_data(dataset_dir)
                    # Add the input parameters meta data to the given parameter setting
                    if "parameter_setting" in dataset_md:
                        dataset_md["parameter_setting"].update(parameter_setting_cp)
                        all_parameters = dataset_md["parameter_setting"]
                    else:
                        all_parameters = parameter_setting_cp

                    def check_constraint(constraint, parameters):
                        for key, value in parameters.iteritems():
                            constraint = constraint.replace(key, str(value))
                        return eval(constraint)

                    if not all(check_constraint(constraint_def,
                                                all_parameters) for \
                               constraint_def in \
                               operation_spec.get('old_parameter_constraints',[])):
                        continue

                    # Determine directory in which the result of this
                    # process should be written
                    result_dataset_directory = \
                        NodeChainOperation._get_result_dataset_dir(result_directory,
                                                                 input_dataset_dir,
                                                                 parameter_setting_cp,
                                                                 hide_parameters)
                    # Create the respective process and put it to the
                    # executing-queue of processes
                    process = NodeChainProcess(node_chain_spec= node_chain_spec,
                                          parameter_setting   = parameter_setting_cp,
                                          rel_dataset_dir  = input_dataset_dir,
                                          run = run, split    = split,
                                          storage_format      = storage_format,
                                          result_dataset_directory = result_dataset_directory,
                                          store_node_chain          = store_node_chain)

                    processes.put(process)

        # give executing process the sign that creation is now finished
        processes.put(False)

Example #43

0

Show file

File: trainer.py Project: jhuebotter/pyspace

    def prepare_training(self,
                         training_files,
                         potentials,
                         operation,
                         nullmarker_stride_ms=None):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.

        **Parameters**

          :training_files:
              A single item or a list of items. It is stored as list in
              *self.training_data*.

          :potentials:
              Dictionary mapping the name of a potential to its
              specification dictionary. Every specification needs the entry
              "configuration" (object with attribute *spec_dir*) and,
              depending on the operation, "node_chain", "prewindowing_flow",
              "postprocess_flow" or "windower_spec_path_train".
              The flow entry of the current operation is replaced in place
              by its path joined with the spec directory.

          :operation:
              One of "train", "prewindowing", "prewindowing_offline" and
              "prewindowed_train".

          :nullmarker_stride_ms:
              Handed to the window stream request in the "prewindowing"
              operation. A warning is logged if it is None.

              (*optional, default: None*)

        **Returns** 0

        **Raises**

          :ValueError: if a flow is requested for an unknown operation
          :Exception: if "prewindowed_train" finds no prewindowed data
        """
        online_logger.info("Preparing Training")
        self.potentials = potentials
        self.operation = operation
        self.nullmarker_stride_ms = nullmarker_stride_ms
        if self.nullmarker_stride_ms is None:
            online_logger.warn(
                'Nullmarker stride interval is %s. You can specify it in your parameter file.'
                % self.nullmarker_stride_ms)
        else:
            online_logger.info('Nullmarker stride interval is set to %s ms ' %
                               self.nullmarker_stride_ms)

        online_logger.info("Creating flows..")
        for key in self.potentials:
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(
                    spec_base, self.potentials[key]["node_chain"])
                online_logger.info("node_chain_spec:" +
                                   self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info("prewindowing_dataflow_spec: " +
                                   self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info("postprocessing_dataflow_spec: " +
                                   self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value(
                "b", False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these processes via one queue per potential
        online_logger.info("Initializing Queues")
        for key in self.potentials:
            self.queue[key] = multiprocessing.Queue()

        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue.
            # The identity check is used, since an equality comparison with
            # None is not reliable for objects that overload "==".
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window is None:
                    break
                yield window

        def build_node_chain(flow_path):
            """create the node chain specified in the YAML file *flow_path*"""
            # The file is closed as soon as the node chain is created.
            # NOTE(review): assumes flow_from_yaml reads the specification
            # completely before returning - TODO confirm
            with open(flow_path) as flow_spec_file:
                return NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain, flow_spec=flow_spec_file)

        # find '{S,s}  8' or '{S,s}  9'
        marker_pattern = re.compile(r"^s\s{0,2}[89]$", re.IGNORECASE)

        # Create the actual data flows
        for key in self.potentials:

            if self.operation == "train":
                flow_path = self.potentials[key]["node_chain"]
                self.node_chains[key] = build_node_chain(flow_path)
                self.node_chains[key][0].set_generator(flow_generator(key))
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info(
                    "file: " + str(self.potentials[key]["prewindowing_flow"]))

                flow_path = self.potentials[key]["prewindowing_flow"]
                self.node_chains[key] = build_node_chain(flow_path)
                self.node_chains[key][0].set_generator(flow_generator(key))
            elif self.operation == "prewindowed_train":
                flow_path = self.potentials[key]["postprocess_flow"]
                self.node_chains[key] = build_node_chain(flow_path)
                replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info(
                        "deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error(
                        "Couldn't find data, please do prewindowing first!")
                    raise Exception(
                        "Couldn't find data, please do prewindowing first!")
                online_logger.info("concatenating prewindowed data from " +
                                   str(prewindowed_sets))

                last_set_index = len(prewindowed_sets) - 1
                for set_index, set_path in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(set_path)
                    data = collection.get_data(0, 0, "train")
                    for sample_index, (sample, label) in enumerate(data):
                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            # (iterate over a copy of the keys, because
                            # entries are deleted in the loop)
                            for marker in list(sample.marker_name.keys()):
                                m = marker_pattern.match(marker)
                                if m is not None:
                                    online_logger.info(
                                        str("remove %s from %d %d" %
                                            (m.group(), set_index,
                                             sample_index)))
                                    del sample.marker_name[m.group()]

                            if set_index == last_set_index and \
                                    sample_index == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" +
                                                   str(set_index) + " " +
                                                   str(sample_index))

                            if set_index == 0 and sample_index == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" +
                                                   str(set_index) + " " +
                                                   str(sample_index))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " +
                                   final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " +
                                   final_collection_path)
                self.prewindowed_data[key] = BaseDataset.load(
                    final_collection_path)
                self.node_chains[key][0].set_input_dataset(
                    self.prewindowed_data[key])
            else:
                raise ValueError("Unknown operation: %s" % self.operation)

            # create window_stream for every potential
            # (only needed for "prewindowing"; the operation is compared
            # with "==", since "in" on a plain string is a substring test)
            if self.operation == "prewindowing":
                # use the spec directory of this potential and not the one
                # left over from the last iteration of the loop above
                window_spec_file = os.path.join(
                    self.potentials[key]["configuration"].spec_dir,
                    "node_chains", "windower",
                    self.potentials[key]["windower_spec_path_train"])

                self.window_stream[key] = \
                        self.stream_manager.request_window_stream(window_spec_file,
                                                              nullmarker_stride_ms = self.nullmarker_stride_ms)

            # NOTE(review): yaml.load without explicit Loader can construct
            # arbitrary objects; only use trusted specification files.
            with open(flow_path) as flow:
                self.node_chain_definitions[key] = yaml.load(flow)

        # TODO: check if the prewindowing flow is still needed when using the stream mode!
        if self.operation == "train":
            online_logger.info("Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except Exception:
                # best effort, e.g., the directory might not exist yet
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials:
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(
                            str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception:
                            # TODO: find a smart solution for this!
                            pass  # dir was probably already deleted..

                all_train_data_path = os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")
                if os.path.exists(all_train_data_path):
                    shutil.rmtree(all_train_data_path)
                    online_logger.info(
                        "deleted concatenated training data for " + key)

        online_logger.info("Training preparations finished")
        return 0

Example #44

0

Show file

File: time_series.py Project: MMKrell/pyspace

    def store(self, result_dir, s_format="pickle"):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        **Parameters**
        
          :result_dir:
              The directory in which the collection will be stored.
              
          :name:
              The prefix of the file names in which the individual data sets are 
              stored. The actual file names are determined by appending suffixes
              that encode run, split, train/test information. 
              
              (*optional, default: "time_series"*)
              
          :s_format:
              The format in which the actual data sets should be stored.
              
              Possible formats are 'pickle', 'text', 'csv' and 'mat' (matlab)
              format. If s_format is a list, the second element further 
              specifies additional options for storing.
              
              - pickle:
                  Standard Python format
                  
              - text:
                  In the text format, all time series objects are concatenated 
                  to a single large table containing only integer values.
                  
              - csv:
                  For the csv format comma separated values are taken as default
                  or a specified Python format string.
                  
              - mat:
                  Scipy's savemat function is used for storing. Thereby the data
                  is stored as 3 dimensional array. Also meta data information,
                  like sampling frequency and channel names are saved.
                  As an additional parameter the orientation of the data arrays 
                  can be given as 'channelXtime' or 'timeXchannel'
              
              .. note:: For the text and MATLAB format, markers could be added 
                        by using a Marker_To_Mux node before
              
              (*optional, default: "pickle"*)

        .. todo:: Put marker to the right time point and also write marker channel.
        
        .. todo:: Shouldn't be 'text' and 'csv' format part of the stream data
                  set?!
        """
        name = "time_series"
        # for some storage procedures we need further specifications
        s_type = None
        if type(s_format) == list:
            # file format is first position
            f_format = s_format[0]
            if len(s_format) > 1:
                s_type = s_format[1]
        else:
            f_format = s_format
        if f_format == "text" and s_type is None:
            s_type = "%i"
        elif f_format == "csv" and s_type == "real":
            s_type = "%.18e"
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "time_series",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep 
                                               + name + "_sp_tt." + f_format})

        # Iterate through splits and runs in this dataset
        for key, time_series in self.data.iteritems():
            # load data, if necessary 
            # (due to the lazy loading, the data might be not loaded already)
            if isinstance(time_series, basestring):
                time_series = self.get_data(key[0], key[1], key[2])
            if self.sort_string is not None:
                time_series.sort(key=eval(self.sort_string))
            # Construct result directory
            result_path = result_dir + os.sep + "data" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
            
            key_str = "_sp%s_%s" % key[1:]
            # Store data depending on the desired format
            if f_format in ["pickle", "cpickle", "cPickle"]:
                result_file = open(os.path.join(result_path,
                                                name+key_str+".pickle"), "w")
                cPickle.dump(time_series, result_file, cPickle.HIGHEST_PROTOCOL)
                result_file.close()
            elif f_format in ["text","csv"]:
                self.update_meta_data({
                    "type": "stream",
                    "marker_column": "marker"})
                result_file = open(os.path.join(result_path,
                                                name + key_str + ".csv"), "w")
                csvwriter = csv.writer(result_file)
                channel_names = copy.deepcopy(time_series[0][0].channel_names)
                if f_format == "csv":
                    channel_names.append("marker")
                csvwriter.writerow(channel_names)
                for (data, key) in time_series:
                    if f_format == "text":
                        numpy.savetxt(result_file, data, delimiter=",", fmt=s_type)
                        if not key is None:
                            result_file.write(str(key))
                            result_file.flush()
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            result_file.write(str(data.marker_name))
                            result_file.flush()
                    else:
                        first_line = True
                        marker = ""
                        if not key is None:
                            marker = str(key)
                        elif data.marker_name is not None \
                                and len(data.marker_name) > 0:
                            marker = str(data.marker_name)
                        for line in data:
                            l = list(line)
                            l.append(marker)
                            csvwriter.writerow(list(l))
                            if first_line:
                                first_line = False
                                marker = ""
                        result_file.flush()
                result_file.close()
            elif f_format in ["matlab", "mat", "MATLAB"]:
                # todo: handle all the other attributes of ts objects!
                import scipy.io
                result_file_name = os.path.join(result_path, 
                                                name + key_str + ".mat")
                # extract a first time series object to get meta data 
                ts1 = time_series[0][0]
                
                # collect all important information in the collection_object
                dataset_dict = {
                    "sampling_frequency": ts1.sampling_frequency,
                    "channel_names": ts1.channel_names}
                
                # we have to extract the data and labels separatly
                if 'channelXtime' in s_format:
                    dataset_dict["data"] = [data.T for data, _ in time_series] 
                else:
                    dataset_dict["data"] = [data for data, _ in time_series]
                dataset_dict["labels"] = [label for _, label in time_series]
                # construct numpy 3d array (e.g., channelXtimeXtrials)
                dataset_dict["data"] = numpy.rollaxis(numpy.array(
                    dataset_dict["data"]), 0, 3)
                
                scipy.io.savemat(result_file_name, mdict=dataset_dict)
            elif f_format in ["bp_eeg"]:

                result_file = open(os.path.join(result_path,
                                                name + key_str + ".eeg"),"a+")
                result_file_mrk = open(os.path.join(result_path,
                                                name + key_str + ".vmrk"),"w")

                result_file_mrk.write("Brain Vision Data Exchange Marker File, "
                                      "Version 1.0\n")
                result_file_mrk.write("; Data stored by pySPACE\n")
                result_file_mrk.write("[Common Infos]\n")
                result_file_mrk.write("Codepage=UTF-8\n")
                result_file_mrk.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_file_mrk.write("\n[Marker Infos]\n")

                markerno = 1
                datapoint = 1
                sf = None
                channel_names = None

                for t in time_series:
                    if sf is None:
                        sf = t[0].sampling_frequency
                    if channel_names is None:
                        channel_names = t[0].get_channel_names()
                    for mrk in t[0].marker_name.keys():
                        for tm in t[0].marker_name[mrk]:
                            result_file_mrk.write(str("Mk%d=Stimulus,%s,%d,1,0\n" %
                                (markerno, mrk, datapoint+(tm*sf/1000.0))))
                            markerno += 1
                    data_ = t[0].astype(numpy.int16)
                    data_.tofile(result_file)
                    datapoint += data_.shape[0]

                result_hdr = open(os.path.join(result_path,
                                                name + key_str + ".vhdr"),"w")

                result_hdr.write("Brain Vision Data Exchange Header "
                                 "File Version 1.0\n")
                result_hdr.write("; Data stored by pySPACE\n\n")
                result_hdr.write("[Common Infos]\n")
                result_hdr.write("Codepage=UTF-8\n")
                result_hdr.write("DataFile=%s\n" %
                                      str(name + key_str + ".eeg"))
                result_hdr.write("MarkerFile=%s\n" %
                                      str(name + key_str + ".vmrk"))
                result_hdr.write("DataFormat=BINARY\n")
                result_hdr.write("DataOrientation=MULTIPLEXED\n")
                result_hdr.write("NumberOfChannels=%d\n" % len(channel_names))
                result_hdr.write("SamplingInterval=%d\n\n" % (1000000/sf))
                result_hdr.write("[Binary Infos]\n")
                result_hdr.write("BinaryFormat=INT_16\n\n")
                result_hdr.write("[Channel Infos]\n")

                # TODO: Add Resolutions to time_series
                # 0 = 0.1 [micro]V,
                # 1 = 0.5 [micro]V,
                # 2 = 10 [micro]V,
                # 3 = 152.6 [micro]V (seems to be unused!)
                resolutions_str = [unicode("0.1,%sV" % unicode(u"\u03BC")),
                   unicode("0.5,%sV" % unicode(u"\u03BC")),
                   unicode("10,%sV" % unicode(u"\u03BC")),
                   unicode("152.6,%sV" % unicode(u"\u03BC"))]
                for i in range(len(channel_names)):
                    result_hdr.write(unicode("Ch%d=%s,,%s\n" %
                        (i+1,channel_names[i],
                        unicode(resolutions_str[0]))).encode('utf-8'))
                result_file.close()
            else:
                NotImplementedError("Using unavailable storage format:%s!"
                                    % f_format)
        self.update_meta_data({
            "channel_names": copy.deepcopy(time_series[0][0].channel_names),
            "sampling_frequency": time_series[0][0].sampling_frequency
        })
        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #45

0

Show file

File: trainer.py Project: AlexanderFabisch/pyspace

    def prepare_training(self, training_files, potentials, operation):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.

        **Parameters**

            :training_files:
                A single training file or a list of training files.
                It is always stored as a list in ``self.training_data``.

            :potentials:
                Dictionary that maps the name of each potential to its
                settings dictionary. The settings are modified in place:
                the flow file entries are replaced by paths below the
                ``spec_dir`` of the respective ``"configuration"`` entry.

            :operation:
                One of ``"train"``, ``"prewindowing"``,
                ``"prewindowing_offline"`` or ``"prewindowed_train"``.

        **Returns**

            0 when all preparations are finished.

        **Raises**

            :ValueError: If *operation* is none of the values listed above
                         and at least one potential is given.
            :Exception:  If *operation* is ``"prewindowed_train"`` and no
                         prewindowed data is found for a potential.
        """
        online_logger.info("Preparing Training")
        self.potentials = potentials
        self.operation = operation
        prewindowing_operations = ("prewindowing", "prewindowing_offline")

        def uses_stream(settings):
            """ Tell whether the settings of a potential request stream mode """
            return "stream" in settings and settings["stream"] == True

        def load_node_chain(flow_path):
            """ Create the node chain specified in the YAML file *flow_path* """
            # The file handle is closed again right after the factory call.
            # NOTE(review): assumes flow_from_yaml parses the file completely
            # before it returns -- confirm.
            with open(flow_path) as flow_file:
                return NodeChainFactory.flow_from_yaml(Flow_Class=NodeChain,
                                                       flow_spec=flow_file)

        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue.
            # Identity test on purpose: '== None' on array-like windows
            # would be evaluated by the window's own comparison operator.
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window is None:
                    break
                yield window

        online_logger.info("Creating flows..")
        for key in self.potentials:
            settings = self.potentials[key]
            spec_base = settings["configuration"].spec_dir
            if self.operation == "train":
                settings["node_chain"] = \
                    os.path.join(spec_base, settings["node_chain"])
                online_logger.info("node_chain_spec:" + settings["node_chain"])

            elif self.operation in prewindowing_operations:
                if uses_stream(settings):
                    flow_name = settings["stream_prewindowing_flow"]
                else:
                    flow_name = settings["prewindowing_flow"]
                settings["prewindowing_flow"] = \
                    os.path.join(spec_base, flow_name)
                online_logger.info("prewindowing_dataflow_spec: " +
                                   settings["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                if uses_stream(settings):
                    flow_name = settings["stream_postprocess_flow"]
                else:
                    flow_name = settings["postprocess_flow"]
                settings["postprocess_flow"] = \
                    os.path.join(spec_base, flow_name)
                online_logger.info("postprocessing_dataflow_spec: " +
                                   settings["postprocess_flow"])

            self.training_active_potential[key] = \
                multiprocessing.Value("b", False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these processes via one queue per potential
        online_logger.info("Initializing Queues")
        for key in self.potentials:
            self.queue[key] = multiprocessing.Queue()

        # Create the actual data flows
        for key in self.potentials:
            settings = self.potentials[key]

            if self.operation == "train":
                flow_path = settings["node_chain"]
                self.node_chains[key] = load_node_chain(flow_path)
                self.node_chains[key][0].set_generator(flow_generator(key))

            elif self.operation in prewindowing_operations:
                flow_path = settings["prewindowing_flow"]
                online_logger.info("loading prewindowing flow..")
                online_logger.info("file: " + str(flow_path))
                self.node_chains[key] = load_node_chain(flow_path)
                self.node_chains[key][0].set_generator(flow_generator(key))

            elif self.operation == "prewindowed_train":
                flow_path = settings["postprocess_flow"]
                self.node_chains[key] = load_node_chain(flow_path)
                stream_mode = uses_stream(settings)
                if stream_mode:
                    # create windower
                    online_logger.info("Creating Windower")
                    online_logger.info(settings["windower_spec_path_train"])
                    # Use the spec directory of *this* potential instead of
                    # the value left over from the last iteration of the
                    # path setup loop above.
                    spec_base = settings["configuration"].spec_dir
                    self.node_chains[key][0].set_windower_spec_file(
                        os.path.join(spec_base, "node_chains", "windower",
                                     settings["windower_spec_path_train"]))

                # start and end markers are only replaced in stream mode
                self.prewindowed_data[key] = \
                    self._concatenate_prewindowed_data(key, stream_mode)
                self.node_chains[key][0].set_input_dataset(
                    self.prewindowed_data[key])

            else:
                raise ValueError("Unknown training operation: %s"
                                 % str(self.operation))

            with open(flow_path) as flow:
                # NOTE: yaml.load can construct arbitrary Python objects,
                # so the flow specs have to come from a trusted source.
                self.node_chain_definitions[key] = yaml.load(flow)

        # TODO: check if the prewindowing flow is still needed
        # when using the stream mode!
        # ('in ("train")' would be a substring test on the string "train")
        if self.operation == "train":
            online_logger.info("Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except OSError:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in prewindowing_operations:
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            # NOTE(review): no stored flows are deleted in this branch, the
            # flow storage is only created if it is missing -- confirm.
            then = datetime.datetime.now() - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials:
                key_directory = \
                    os.path.join(self.prewindowed_data_directory, key)
                found = self.find_files_older_than(then, key_directory)
                if found is not None:
                    for f in found:
                        online_logger.info(
                            str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # best effort: the directory was probably
                            # already deleted, so only report the problem
                            online_logger.info(
                                "could not delete \'%s\': %s" % (f, e))

                merged_data_path = os.path.join(key_directory,
                                                "all_train_data")
                if os.path.exists(merged_data_path):
                    shutil.rmtree(merged_data_path)
                    online_logger.info(
                        "deleted concatenated training data for " + key)

        online_logger.info("Training preparations finished")
        return 0

    def _concatenate_prewindowed_data(self, key, replace_start_and_end_markers):
        """ Merge all prewindowed datasets of potential *key* into one dataset

        The samples of run 0, split 0 ("train") of every dataset found in the
        prewindowed data directory of *key* are appended to a new
        TimeSeriesDataset. It is stored as ``all_train_data`` in the same
        directory (replacing an older one) and loaded again from there.

        If *replace_start_and_end_markers* is true, the markers 'S  8' and
        'S  9' are removed from all samples, and afterwards the very first
        sample gets a new 'S  8' and the very last sample a new 'S  9'.

        Returns the dataset loaded from the stored ``all_train_data``.
        Raises an Exception if no prewindowed dataset is found.
        """
        final_collection = TimeSeriesDataset()
        final_collection_path = os.path.join(self.prewindowed_data_directory,
                                             key, "all_train_data")
        # delete previous training collection
        if os.path.exists(final_collection_path):
            online_logger.info("deleting old training data collection for "
                               + key)
            shutil.rmtree(final_collection_path)

        # load all prewindowed collections and
        # append data to the final collection
        prewindowed_sets = glob.glob(
            os.path.join(self.prewindowed_data_directory, key, "*"))
        if len(prewindowed_sets) == 0:
            message = "Couldn't find data, please do prewindowing first!"
            online_logger.error(message)
            raise Exception(message)
        online_logger.info("concatenating prewindowed data from "
                           + str(prewindowed_sets))

        # matches '{S,s}  8' or '{S,s}  9'; the character class must not
        # contain a comma, otherwise a marker like 'S  ,' would match as well
        marker_pattern = re.compile(r"^s\s{0,2}[89]$", re.IGNORECASE)
        last_set = len(prewindowed_sets) - 1

        for set_index, set_path in enumerate(prewindowed_sets):
            collection = BaseDataset.load(set_path)
            data = collection.get_data(0, 0, "train")
            for sample_index, (sample, label) in enumerate(data):

                if replace_start_and_end_markers:
                    # in case we concatenate multiple 'Window' labeled
                    # sets we have to remove every start- and endmarker;
                    # iterate over a copy of the keys since entries are
                    # deleted inside the loop
                    for marker in list(sample.marker_name.keys()):
                        if marker_pattern.match(marker) is not None:
                            online_logger.info(
                                str("remove %s from %d %d"
                                    % (marker, set_index, sample_index)))
                            del sample.marker_name[marker]

                    if set_index == last_set and \
                            sample_index == len(data) - 1:
                        # insert endmarker
                        sample.marker_name["S  9"] = [0.0]
                        online_logger.info("added endmarker" + str(set_index)
                                           + " " + str(sample_index))

                    if set_index == 0 and sample_index == 0:
                        # insert startmarker
                        sample.marker_name["S  8"] = [0.0]
                        online_logger.info("added startmarker"
                                           + str(set_index)
                                           + " " + str(sample_index))

                final_collection.add_sample(sample, label, True)

        # save final collection (just for debugging)
        os.mkdir(final_collection_path)
        final_collection.store(final_collection_path)
        online_logger.info("stored final collection at "
                           + final_collection_path)

        # load final collection again for training
        online_logger.info("loading data from " + final_collection_path)
        return BaseDataset.load(final_collection_path)

Example #46

0

Show file

File: comp_analysis.py Project: pyspace/test

    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """
        A factory method that creates an Analysis operation based on the
        information given in the operation specification operation_spec.
        If debug is TRUE the creation of the Analysis Processes will not
        be in a separated thread.

        **Parameters**

            :operation_spec:
                Dictionary with the mandatory entries "type" (has to be
                "comp_analysis"), "input_path" (relative to the pySPACE
                storage), "parameters" and "metrics" and the optional
                entries "dep_par" (default: []), "logscale" (default: False)
                and "markertype" (default: 'x').

            :result_directory:
                Passed on to the created processes and the operation.

            :debug:
                If True, all processes are created before this method returns.

            :input_paths:
                Not used by this method.

        Metrics that are not contained in the loaded data are skipped with
        a warning. If no metric remains, the first key of the data is used.
        """
        # Imported at the top of the method: a function-level import makes
        # 'warnings' a local name, so importing it only in the branch for
        # unknown metrics left it unbound when the metric list was empty.
        import warnings

        assert (operation_spec["type"] == "comp_analysis")
        input_path = operation_spec["input_path"]
        summary = BaseDataset.load(
            os.path.join(pySPACE.configuration.storage, input_path))
        data_dict = summary.data

        # Determine the parameters that should be analyzed
        parameters = operation_spec["parameters"]

        # Determine dependent parameters, which don't get extra resolution
        dep_par = operation_spec.get("dep_par", [])

        # Determine the metrics that should be plotted
        metrics = []
        for metric in operation_spec["metrics"]:
            if metric in data_dict:
                metrics.append(metric)
            else:
                warnings.warn('The metric "' + metric +
                              '" is not contained in the results csv file.')
        if len(metrics) == 0:
            warnings.warn(
                'No metric available from spec file, default to first dict entry.'
            )
            metrics.append(list(data_dict.keys())[0])

        # Determine how many processes will be created
        number_parameter_values = [
            len(set(data_dict[param])) for param in parameters
        ]
        number_processes = cls._numberOfProcesses(0,
                                                  number_parameter_values) + 1

        logscale = operation_spec.get('logscale', False)
        markertype = operation_spec.get('markertype', 'x')

        # Arguments of the recursive helper that follow the process queue
        creation_args = (result_directory, data_dict, parameters, dep_par,
                         metrics, logscale, markertype, True)

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, *creation_args)
            return cls(processes, operation_spec, result_directory,
                       number_processes)

        # Create all plot processes by calling a recursive helper method in
        # another process so that already created processes can be executed
        # although creation of processes is not finished yet. Therefore a
        # queue is used whose size is limited to guarantee that not too many
        # objects are created (since this costs memory). However, the actual
        # number of 100 is arbitrary and might be reviewed.
        processes = processing.Queue(100)
        create_process = processing.Process(
            target=cls._createProcesses,
            args=(processes,) + creation_args)
        create_process.start()
        # create and return the comp_analysis operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes, create_process)

Example #47

0

Show file

File: prediction_vector.py Project: Crespo911/pyspace

    def store(self, result_dir, s_format=["pickle", "real"]):
        """ Store the collection in *result_dir*

        For every entry of ``self.data`` one file
        ``data_run<run>/predictions_sp<split>_<train/test>.<format>`` is
        written below *result_dir*. Afterwards the meta data is stored.

        **Parameters**

            :result_dir:
                The directory in which the collection will be stored.

            :s_format:
                Either the name of the file format or a list (or tuple)
                whose first entry is the name of the file format.
                Supported file formats are "pickle" and "csv". For any
                other format an error is logged and "pickle" is used.

                (*optional, default: ["pickle", "real"]*)
        """
        name = "predictions"

        # The file format is the first entry of a list-like s_format.
        # Comparing the whole list with the supported names rejected even
        # the default value, and indexing a plain string produced a
        # one-letter file extension in the data pattern.
        if isinstance(s_format, (list, tuple)):
            file_format = s_format[0]
        else:
            file_format = s_format

        # "arff" is not listed as supported: there is no code that writes
        # prediction vectors as ARFF, so nothing would be stored at all.
        if file_format not in ("csv", "pickle"):
            self._log("Storage format not supported! Using default.",
                      level=logging.ERROR)
            s_format = file_format = "pickle"

        # Update the meta data (after the validation, such that it
        # describes the format that is really used)
        author = get_author()
        self.update_meta_data({"type": "prediction_vector",
                               "storage_format": s_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep
                                               + name + "_sp_tt."
                                               + file_format})

        for key, prediction_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            file_path = os.path.join(result_path,
                                     name + key_str + "." + file_format)

            # Store data depending on the desired format
            if file_format == "pickle":
                # binary mode, since a binary pickle protocol is used
                with open(file_path, "wb") as result_file:
                    cPickle.dump(prediction_vectors, result_file,
                                 cPickle.HIGHEST_PROTOCOL)
                continue

            # Write as Comma Separated Value
            num_predictors = self.meta_data["num_predictors"]
            with open(file_path, "w") as result_file:
                if num_predictors == 1:
                    result_file.write(
                        "Predicted Label, Prediction Score, True Label \n")
                    for pv in prediction_vectors:
                        result_file.write(
                            "%s, %s, %s\n" % (pv[0].label[0],
                                              pv[0].prediction[0], pv[1]))
                else:
                    # we begin by dealing with the header of the csv file
                    base_header = "Predicted %(index)d Label, Prediction %(index)d Score, "
                    base_result = "%(label)s, %(score)s,"
                    header = "".join(base_header % dict(index=i + 1)
                                     for i in range(num_predictors))
                    result_file.write(header + "True Label\n")

                    # and now we can write each of the prediction vectors
                    for pv in prediction_vectors:
                        result = "".join(
                            base_result % dict(label=pv[0].label[i],
                                               score=pv[0].prediction[i])
                            for i in range(num_predictors))
                        result_file.write(result + str(pv[1]) + "\n")

        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #48

0

Show file

    def store(self, result_dir, s_format=["pickle", "real"]):
        """ Stores this collection in the directory *result_dir*.

        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run,
        split, train/test combination.

        The names of the data files start with "features", followed by
        suffixes that encode split and train/test information. The files of
        one run are collected in one directory ``data_run<run>``.

        The method expects the following parameters:
          * *result_dir* The directory in which the collection will be stored
          * *s_format* A list with information about the format in which the
                    actual data sets should be stored. The first entry
                    specifies the file format ("pickle", "arff" or "csv").
                    If it is "arff" the second entry specifies the
                    attribute format.

                    Examples: ["arff", "real"], ["arff", "{0,1}"]

                    .. todo:: Someone could implement the format ["fasta"] for sax features

                    To store the data in comma separated values, use ["csv", "real"].

                    (*optional, default: ["pickle", "real"]*)

        A NotImplementedError is raised when data has to be written in a
        file format that is not one of the three supported ones.

        .. todo:: Adapt storing of csv file to external library instead of
                  doing it manually.

        """
        name = "features"
        file_format = s_format[0]

        def write_samples(result_file, samples, use_array_view):
            """ Write one line with features and class for every sample """
            if len(samples) == 0:
                # nothing to write; the first sample is needed below
                return
            # The format of all features is derived from the first sample
            dtype = samples[0][0].dtype
            if numpy.issubdtype(dtype, numpy.floating):
                feature_format = "%f,"
            elif numpy.issubdtype(dtype, numpy.integer):
                feature_format = "%d,"
            else:
                feature_format = "%s,"
            for features, class_name in samples:
                if use_array_view:
                    features = features.view(numpy.ndarray)
                for feature in features[0]:
                    result_file.write(feature_format % feature)
                result_file.write("%s\n" % str(class_name))

        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            author = "unknown"
            self._log("Author could not be resolved.", level=logging.WARNING)
        self.update_meta_data({
            "type": "feature_vector",
            "storage_format": s_format,
            "author": author,
            "data_pattern":
            "data_run" + os.sep + name + "_sp_tt." + file_format
        })

        # Iterate through splits and runs in this dataset
        for key, feature_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            file_base = os.path.join(result_path, name + key_str)

            # Store data depending on the desired format
            if file_format == "pickle":
                # binary mode, since a binary pickle protocol is used
                with open(file_base + ".pickle", "wb") as result_file:
                    cPickle.dump(feature_vectors, result_file,
                                 cPickle.HIGHEST_PROTOCOL)
            elif file_format == "arff":  # Write as ARFF
                with open(file_base + ".arff", "w") as result_file:
                    # Create the arff file header
                    relation_name = result_dir.split(os.sep)[-1]
                    result_file.write('@relation "%s"\n' % relation_name)
                    # Write the type of all features
                    for feature_name in self.meta_data["feature_names"]:
                        result_file.write('@attribute %s %s\n' %
                                          (feature_name, s_format[1]))
                    class_string = ",".join(
                        sorted(self.meta_data["classes_names"]))
                    result_file.write("@attribute class {%s}\n"
                                      % class_string)
                    result_file.write("@data\n")
                    # Write all given data into the ARFF file
                    write_samples(result_file, feature_vectors, False)
            elif file_format == "csv":  # Write as Comma Separated Value
                with open(file_base + ".csv", "w") as result_file:
                    for feature_name in self.meta_data["feature_names"]:
                        result_file.write('%s,' % (feature_name))
                    result_file.write('\n')
                    write_samples(result_file, feature_vectors, True)
            else:
                raise NotImplementedError(
                    "Using unavailable storage format:%s!" % file_format)

        #Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)

Example #49

0

Show file

File: weka_classification.py Project: pyspace/test

    def __init__(self,
                 dataset_dir,
                 command_template,
                 parametrization,
                 cv_folds,
                 run_number,
                 operation_result_dir):
        """ Prepare the WEKA command for one dataset and parametrization

        **Parameters**

            :dataset_dir:
                Directory of the dataset; it is loaded to read its meta data.

            :command_template:
                Template of the WEKA command with ``%(name)s`` style
                placeholders. They are replaced repeatedly until the
                command does not change anymore.

            :parametrization:
                Dictionary of custom template parameters. The given value
                is available as ``<name>_abbr``. Under ``<name>`` the value
                is replaced by its entry in ``abbreviations.yaml``, if
                there is one.

            :cv_folds:
                Available in the template as ``cv_folds``.

            :run_number:
                Available in the template as ``run_number``.

            :operation_result_dir:
                Directory in which the results of the process are stored.

        A NotImplementedError is raised for datasets without train-test
        separation that consist of one split only.
        """
        super(WEKAClassificationProcess, self).__init__()
        # Load the abbreviations
        abbreviations_path = os.path.join(pySPACE.configuration.spec_dir,
                                          'operations/weka_templates',
                                          'abbreviations.yaml')
        # the context manager closes the file even if parsing fails
        with open(abbreviations_path, 'r') as abbreviations_file:
            # NOTE: yaml.load can construct arbitrary Python objects,
            # so the abbreviations file has to come from a trusted source.
            self.abbreviations = yaml.load(abbreviations_file)
        # Determine the directory in which the process' results
        # are stored
        self.result_directory = operation_result_dir
        # Create collection
        collection = BaseDataset.load(dataset_dir)
        # The parametrization that is independent of the collection type
        # and the specific weka command template that is executed
        collection_name = dataset_dir.strip(os.sep).split(os.sep)[-1]
        self.params = {"collection_name": collection_name,
                       "run_number": run_number,
                       "cv_folds": cv_folds,
                       "weka_class_path": pySPACE.configuration.weka_class_path,
                       "temp_results": self.result_directory,
                       "unique_id": WEKAClassificationProcess.unique_id}
        # Collection dependent parameters
        if not collection.meta_data["train_test"] \
             and collection.meta_data["splits"] == 1:
            raise NotImplementedError(
                "Datasets without train-test separation and with only one "
                "split are not supported.")
        # The pattern of the train and test files generated by crossvalidation
        data_pattern = os.path.join(dataset_dir,
                                    collection.meta_data["data_pattern"])
        # One example arff file in which WEKA can look up relation name etc.
        sample_dataset = data_pattern.replace("_run", "_run0")\
                                     .replace("_sp_", "_sp0_")\
                                     .replace("_tt", "_train")
        self.params.update({"sample_dataset": sample_dataset,
                            "data_pattern": data_pattern})
        # Add custom parameters for the weka command template
        for parameter_name, parameter_value in parametrization.iteritems():
            self.params[parameter_name + "_abbr"] = parameter_value
            # Auto-expand abbreviations
            if parameter_value in self.abbreviations:
                parameter_value = self.abbreviations[parameter_value]
            elif parameter_name == 'classifier':
                import warnings
                warnings.warn("Did not find classifier abbreviation %s. "
                              " Expecting full name." % parameter_value)
            self.params[parameter_name] = parameter_value

        # Build the WEKA command by repeatedly replacing all placeholders in
        # the template, until another replacement does not change it anymore
        # (parameter values may contain placeholders themselves)
        instantiated_template = command_template % self.params
        while instantiated_template != command_template:
            command_template = instantiated_template
            instantiated_template = command_template % self.params
        self.weka_command = instantiated_template

        self.handler_class = None

        WEKAClassificationProcess.unique_id += 1

Example #50

0

Show file

File: analyzer_sink.py Project: BioinformaticsArchive/pyspace

    def store(self, result_dir, s_format="BrainVision"):
        # Keep original file name, depends on the AnalyserSinkNode, see it's documentation.
        if self.meta_data.has_key("eeg_src_file_name") and self.meta_data["eeg_src_file_name"] is None:
            name = self.meta_data["eeg_src_file_name"]
        # or use default name from this collection
        else:
            name = "Analyzer"
        if not s_format == "BrainVision":
            self._log("The format %s is not supported!" % s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except:
            author = "unknown"
            self._log("Author could not be resolved.", level=logging.WARNING)
        self.update_meta_data(
            {
                "type": "only output of individual nodes stored",
                "storage_format": s_format,
                "author": author,
                "data_pattern": "Multiplexed",
            }
        )
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
        # self._log("EEG data file %s" % self.collection.data_file)
        slices = []
        slices.append(0)
        channel_names = []

        for key, time_series in self.data.iteritems():
            # Sort the Times-Series Array
            def cmp_start(a, b):
                return cmp(a[0].start_time, b[0].start_time)

            time_series.sort(cmp_start)
            # Check for overlapping Windows and remove them if existent
            i = 0
            while i < len(time_series):
                ts = time_series[i]
                # print ts[0].start_time, ts[0].end_time
                # print len(time_series)
                if ts[0].start_time >= slices[-1]:
                    slices.append(ts[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!", UserWarning)
                i = i + 1
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Keep original name
            if self.meta_data.has_key("eeg_src_file_name") and self.meta_data["eeg_src_file_name"] != None:
                result_file_eeg = open(os.path.join(result_path, name + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + ".vmrk"), "w")
            # or use default name from this collection
            else:
                result_file_eeg = open(os.path.join(result_path, name + key_str + ".eeg"), "wb")
                result_file_mrk = open(os.path.join(result_path, name + key_str + ".vmrk"), "w")

            # Write Marker header
            if self.meta_data.has_key("eeg_src_file_name") and self.meta_data["eeg_src_file_name"] != None:
                result_file_mrk.write(header_mrk % (name))
            else:
                result_file_mrk.write(header_mrk % (name + key_str))

            result_file_ms = 0

            # Data for padding
            padding = None

            count_mrk = 2
            num_ch = 0
            sampling_int = 0

            for ts in time_series:
                if padding == None:
                    padding = numpy.zeros(len(ts[0].channel_names), dtype="int16")
                    num_ch = len(ts[0].channel_names)
                    channel_names = ts[0].channel_names
                    sampling_int = 1000000 / ts[0].sampling_frequency
                    # print "writing %d channels.." % len(ts[0].channel_names)
                # Write Padding (zeros)
                while result_file_ms < ts[0].start_time:
                    result_file_eeg.write(padding.tostring())
                    result_file_ms += ts[0]._samples_to_ms(1)
                # Write window
                ts[0].tofile(result_file_eeg)
                result_file_ms += ts[0].end_time - ts[0].start_time
                # Write Marker
                result_file_mrk.write(
                    "Mk%d=Label,%s,%d,1,0\n" % (count_mrk, ts[1], ts[0]._ms_to_samples(ts[0].start_time))
                )
                count_mrk += 1
            # WRITE HEADERFILE
            # Keep original name
            if self.meta_data.has_key("eeg_src_file_name") and self.meta_data["eeg_src_file_name"] != None:
                result_file_hdr = open(os.path.join(result_path, name + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name), (name), num_ch, sampling_int))
            # or use default name from this collection
            else:
                result_file_hdr = open(os.path.join(result_path, name + key_str + ".vhdr"), "w")
                result_file_hdr.write(header_hdr % ((name + key_str), (name + key_str), num_ch, sampling_int))
            # Format: Ch1=Fp1,,0.1,\xB5V
            for i in range(num_ch):
                result_file_hdr.write("Ch%d=%s,,0.1,\xB5V\n" % (i + 1, channel_names[i]))

            result_file_hdr.close()
            result_file_eeg.close()
            result_file_mrk.close()

Example #51

0

Show file

File: analyzer_sink.py Project: Hansa064/pyspace

    def store(self, result_dir, s_format = "bp_eeg"):
        """ Store the collected windows as BrainVision files in *result_dir*

        For every entry of ``self.data`` one ``.eeg`` (binary int16 samples),
        one ``.vmrk`` (marker) and one ``.vhdr`` (header) file is written to
        the sub directory ``data_analyzer_run<run>`` of *result_dir*.
        If the meta data contains a non-None ``eeg_src_file_name``, this name
        is used as file name; otherwise ``Analyzer_sp<split>_<train/test>``
        is used.

        **Parameters**

          :result_dir: The directory in which the data is stored

          :s_format:   The storage format. Only "bp_eeg" is supported;
                       for any other value a critical message is logged and
                       nothing is stored.

                       (*optional, default: "bp_eeg"*)

        .. note:: Overlapping windows only trigger a warning,
                  they are still written to the file.
        """
        self.merged = False
        # Samples are cast to int16 before they are written. They are scaled
        # up by this factor first (and the header announces a resolution of
        # 1/scale) such that one decimal place survives the conversion.
        scale = 10.0
        # Keep original file name, depends on the AnalyserSinkNode,
        # see its documentation. Otherwise a default name is used.
        src_file_name = self.meta_data.get('eeg_src_file_name')
        keep_src_name = src_file_name is not None
        name = src_file_name if keep_src_name else "Analyzer"
        if s_format != "bp_eeg":
            self._log("The format %s is not supported!"%s_format, level=logging.CRITICAL)
            return
        # Update the meta data
        author = get_author()
        self.update_meta_data({"type": "only output of individual nodes stored",
                               "storage_format": s_format,
                               "author" : author,
                               "data_pattern": "Multiplexed"})
        # Store meta data
        BaseDataset.store_meta_data(result_dir, self.meta_data)
        # End times of the accepted windows; shared by all entries of self.data
        slices = [0]
        # Not reset per entry: an entry without windows keeps num_ch == 0 and
        # therefore writes no channel lines
        channel_names = []

        for key, time_series in self.data.iteritems():
            # Sort the windows by start time (in place)
            time_series.sort(key=lambda entry: entry[0].start_time)
            # Check for overlapping windows (they are reported, not removed)
            for entry in time_series:
                if entry[0].start_time >= slices[-1]:
                    slices.append(entry[0].end_time)
                else:
                    warnings.warn("Ignoring at least one overlapping window!", UserWarning)
            # STORE ACTUAL EEG DATA AND WRITE MARKERFILE
            result_path = result_dir + os.sep + "data_analyzer" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)

            key_str = "_sp%s_%s" % key[1:]
            # Keep original name or use default name from this collection
            base_name = name if keep_src_name else name + key_str

            # Time (in ms) up to which the data file has been filled
            result_file_ms = 0
            # Data for padding (one sample of zeros), created with first window
            padding = None
            # Marker numbering starts at 2
            # (presumably Mk1 is part of header_mrk -- TODO confirm)
            count_mrk = 2
            num_ch = 0
            sampling_int = 0

            with open(os.path.join(result_path, base_name + ".eeg"), "wb") as result_file_eeg:
                with open(os.path.join(result_path, base_name + ".vmrk"), "w") as result_file_mrk:
                    # Write Marker header
                    result_file_mrk.write(header_mrk % base_name)

                    for entry in time_series:
                        window = entry[0]
                        scaled_window = (window * scale).astype(numpy.int16)

                        # Identity check on purpose: comparing a numpy array
                        # with "== None" is evaluated element-wise
                        if padding is None:
                            padding = numpy.zeros(len(window.channel_names), dtype='int16')
                            num_ch = len(window.channel_names)
                            channel_names = window.channel_names
                            sampling_int = 1000000/window.sampling_frequency
                        # Write Padding (zeros) up to one sampling interval
                        # before the start of the window
                        while result_file_ms < window.start_time - sampling_int/1000.0:
                            result_file_eeg.write(padding.tostring())
                            result_file_ms += window._samples_to_ms(1)
                        # Write window
                        scaled_window.tofile(result_file_eeg)
                        result_file_ms += window.end_time - (window.start_time - sampling_int/1000.0)
                        # Write Marker
                        if not window.marker_name:
                            continue
                        # Flatten {marker name: [times]} to (name, time) pairs
                        # and order them by time
                        markers = [(mk_name, mk_time)
                                   for mk_name, mk_times in window.marker_name.items()
                                   for mk_time in mk_times]
                        markers.sort(key=lambda tup: tup[1])

                        for mk_name, mk_time in markers:
                            if 'R' in mk_name:
                                event_type = 'Response'
                            elif 'S' in mk_name:
                                event_type = 'Stimulus'
                            else:
                                event_type = 'Label'

                            result_file_mrk.write("Mk%d=%s,%s,%d,1,0\n" % (count_mrk, event_type, mk_name, (window.start_time + mk_time)*window.sampling_frequency/1000.0))
                            count_mrk += 1

            # WRITE HEADERFILE
            with open(os.path.join(result_path, base_name + ".vhdr"), "w") as result_file_hdr:
                result_file_hdr.write(header_hdr % (base_name, base_name, num_ch, sampling_int))
                # Format: Ch1=Fp1,,0.1,\xB5V
                for i in range(num_ch):
                    result_file_hdr.write("Ch%d=%s,,%.2f,\xB5V\n" % (i+1,channel_names[i], 1./scale))

Example #52

0

Show file

File: feature_vector.py Project: BioinformaticsArchive/pyspace

    def store(self, result_dir, s_format = ["pickle", "real"]):
        """ Stores this collection in the directory *result_dir*.
        
        In contrast to *dump* this method stores the collection
        not in a single file but as a whole directory structure with meta
        information etc. The data sets are stored separately for each run, 
        split, train/test combination.
        
        The file names start with the prefix "features", followed by
        suffixes that encode split and train/test information. The run is
        encoded in the name of the sub directory (``data_run<run>``).
        
        The method expects the following parameters:
          * *result_dir* The directory in which the collection will be stored
          * *s_format* A list with information about the format in which the 
                    actual data sets should be stored. The first entry specifies
                    the file format. If it is "arff" the second entry specifies the
                    attribute format. A plain string is interpreted as file
                    format with attribute format "real". An unsupported
                    file format is replaced by "pickle" and an error is logged.
                    
                    Examples: ["arff", "real"], ["arff", "{0,1}"]
                    
                    .. todo:: Someone could implement the format ["fasta"] for sax features
                    
                    To store the data in comma separated values, use ["csv", "real"].
                    
                    (*optional, default: ["pickle", "real"]*)

        .. todo:: Adapt storing of csv file to external library instead of
                  doing it manually.

        """
        name = "features"

        def feature_format_for(vectors):
            """ Return the format string for one feature value of *vectors* """
            dtype = vectors[0][0].dtype
            if numpy.issubdtype(dtype, numpy.floating):
                return "%f,"
            if numpy.issubdtype(dtype, numpy.integer):
                return "%d,"
            # strings and everything else
            return "%s,"

        # Split the format specification *before* it is used for the meta
        # data, such that the file extension in "data_pattern" is correct
        # for list and for string specifications
        stored_format = s_format
        is_sequence = isinstance(s_format, (list, tuple))
        if is_sequence:
            s_type = s_format[1] if len(s_format) > 1 else "real"
            s_format = s_format[0]
        else:
            s_type = "real"
            
        if s_format not in ("csv", "arff", "pickle"):
            self._log("Storage format not supported! Using default.", 
                      level=logging.ERROR)
            s_format = "pickle"
            # The meta data has to describe what is really written
            stored_format = [s_format, s_type] if is_sequence else s_format

        # Update the meta data
        try:
            author = pwd.getpwuid(os.getuid())[4]
        except Exception:
            author = "unknown"
            self._log("Author could not be resolved.",level=logging.WARNING)
        self.update_meta_data({"type": "feature_vector",
                               "storage_format": stored_format,
                               "author": author,
                               "data_pattern": "data_run" + os.sep 
                                                 + name + "_sp_tt." + s_format})
        
        # Iterate through splits and runs in this dataset
        for key, feature_vectors in self.data.iteritems():
            # Construct result directory
            result_path = result_dir + os.sep + "data" \
                            + "_run%s" % key[0]
            if not os.path.exists(result_path):
                os.mkdir(result_path)
                
            key_str = "_sp%s_%s" % key[1:]
            # The file extension equals the name of the format
            file_path = os.path.join(result_path,
                                     name + key_str + "." + s_format)
            # Store data depending on the desired format
            if s_format == "pickle":
                # Binary mode is required for the binary pickle protocols
                with open(file_path, "wb") as result_file:
                    cPickle.dump(feature_vectors, result_file,
                                 cPickle.HIGHEST_PROTOCOL)
            elif s_format == "arff": # Write as ARFF
                with open(file_path, "w") as result_file:
                    # Create the arff file header
                    relation_name = result_dir.split(os.sep)[-1]
                    result_file.write('@relation "%s"\n' % relation_name)
                    # Write the type of all features
                    for feature_name in self.meta_data["feature_names"]:
                        result_file.write("@attribute %s %s\n" % (feature_name,  s_type))
                    classString = ",".join(sorted(self.meta_data["classes_names"]))

                    result_file.write("@attribute class {%s}\n" % classString)
                    
                    result_file.write("@data\n")
                    # Write all given data into the ARFF file
                    # (nothing but the header for an empty data set)
                    if feature_vectors:
                        feature_format = feature_format_for(feature_vectors)
                        for features, class_name in feature_vectors:
                            for feature in features[0]:
                                result_file.write(feature_format % feature)
                            result_file.write("%s\n" % str(class_name))
            elif s_format == "csv": # Write as Comma Separated Value
                with open(file_path, "w") as result_file:
                    for feature_name in self.meta_data["feature_names"]:
                        result_file.write("%s," % (feature_name))
                    result_file.write("\n")
                    if feature_vectors:
                        feature_format = feature_format_for(feature_vectors)
                        for features, class_name in feature_vectors:
                            f = features.view(numpy.ndarray)
                            for feature in f[0]:
                                result_file.write(feature_format % feature)
                            result_file.write("%s\n" % str(class_name))

        #Store meta data
        BaseDataset.store_meta_data(result_dir,self.meta_data)

Example #53

0

Show file

    def consolidate(self):
        """
        Consolidates the results obtained by the single WEKA filter
        processes into a consistent summary of datasets that is stored on
        the file system.

        For every sub directory of the result directory whose name starts
        with "{", the relation name of all train/test arff files is adjusted,
        the meta data of the input collection is copied and updated
        (feature names, author, date, ...) and the command template is
        stored. For other sub directories only a warning is logged.
        Finally, the specification of the operation is written to
        ``source_operation.yaml`` in the result directory.
        
        .. todo:: Some of the contents of this method should go into the
                  :class:`~pySPACE.resources.dataset_defs.feature_vector.FeatureVectorDataset`
        """

        # Iterate over all collections and store the collection meta data etc.
        for entries in os.listdir(self.result_directory):
            fullpath = os.path.join(self.result_directory, entries)
            # For each collection
            if not os.path.isdir(fullpath):
                continue
            if not entries.startswith("{"):
                # training and test arff need the same relation name
                # otherwise Weka can't relate it to each other; the collection
                # name and the parameters in {}{}-optic must be the relation 
                # name for further processing    
                self._log("WARNING: Collection name doesn't begin with '{'. Further processing may be collapse!", level= logging.WARNING)
                continue

            # Split the collection name "{a}{b}..." into the name of the
            # input collection and the trailing parameters, which are needed
            # to adjust the relation name
            name_parts = entries.strip("}{").split("}{")
            if self.num_parameters > 0:
                parameter_postfix = \
                    "{" + "}{".join(name_parts[-self.num_parameters:]) + "}"
                input_collection_name = \
                    "{" + "}{".join(name_parts[:-self.num_parameters]) + "}"
            else:
                parameter_postfix = ""
                input_collection_name = entries

            # Remains empty if the collection contains no train arff file;
            # otherwise the features of the last processed file are used
            feature_names = []
            # Postprocessing of the arff files of this collection
            for train_arff_file in glob.glob(fullpath + os.sep + "data_run*" 
                                             + os.sep + "*train.arff"):
                # Adjust the relation name of the train file
                with open(train_arff_file, 'r') as arff_file:
                    content = arff_file.readlines()
                # We strip everything after the last "}" (including the line
                # break) and terminate the new relation line again
                endindex = content[0].rfind("}")
                content[0] = content[0][:endindex+1] + parameter_postfix + "'\n"
                with open(train_arff_file, 'w') as arff_file:
                    arff_file.writelines(content)
                # Use relation name of train data for test data
                test_arff_file = train_arff_file.replace("train.arff", "test.arff") 
                with open(test_arff_file, 'r') as arff_file:
                    test_content = arff_file.readlines()
                test_content[0] = content[0]
                with open(test_arff_file, 'w') as arff_file:
                    arff_file.writelines(test_content)
            
                # Check which features are contained in the arff file
                feature_names = []
                for line in content:
                    if line.startswith("@attribute"):
                        attribute = line.split()[1]
                        # Compare by value; the class attribute is no feature
                        if attribute != "class":
                            feature_names.append(attribute)

            # Store the collection meta data etc.
            input_collection_path = os.path.join(self.operation_spec["input_path"],
                                                 input_collection_name)

            input_collection_meta = BaseDataset.load_meta_data(
                                    pySPACE.configuration.storage
                                    + os.sep
                                    + input_collection_path)
            # Store the input collection
            BaseDataset.store_meta_data(fullpath, input_collection_meta,
                                        file_name="input_metadata.yaml")
            # Adjust collection metadata for the new collection
            input_collection_meta["feature_names"] = feature_names
            input_collection_meta["num_features"] = len(feature_names)
            input_collection_meta["author"] = get_author()
            input_collection_meta["date"] = time.strftime("%Y%m%d")
            input_collection_meta["input_collection_name"] = input_collection_name
            # Write the collection meta information into the folder
            BaseDataset.store_meta_data(fullpath,input_collection_meta)
            # Store the command_template
            with open(os.path.join(fullpath, "command_template"),
                      'w') as command_template_file:
                command_template_file.write(self.command_template)
        # Write the specification of this operation
        # to the result directory in order to make later 
        # analysis of results more easy
        with open(os.path.join(self.result_directory,
                               "source_operation.yaml"),
                  'w') as source_operation_file:
            yaml.dump(self.operation_spec, source_operation_file)