Example #1
    def _merge_pickle_files(self, target_collection_path,
                            source_collection_pathes, train_set_name_suffix,
                            target_collection_params):
        """ Merge all collections in source_collection_pathes and store them \
            in the target collection"""

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name
        try:
            target_collection.meta_data.pop("node_chain_file_name")
        except:
            pass
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author":
            author,
            "date":
            date,
            "dataset_directory":
            target_collection_path,
            "train_test":
            True,
            "parameter_setting":
            target_collection_params,
            "input_collection_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)
                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing
            for key in target_collection.data.keys():
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                key = (key[0], key[1], "train")
                target_collection.data[key] = value

        target_collection.store(target_collection_path)
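
The loop near the end of this example relabels every (run, split, "test") key tuple as training data before storing. A minimal, self-contained sketch of that relabeling idiom on a plain dictionary (hypothetical keys and values, no pySPACE dependency):

    # Hypothetical stand-in for target_collection.data: keys are
    # (run, split, "train"/"test") tuples, values are lists of samples.
    data = {(0, 0, "test"): ["sample_a"], (1, 0, "test"): ["sample_b"]}
    for key in list(data.keys()):      # copy the keys; the dict is mutated below
        assert key[2] == "test"
        data[(key[0], key[1], "train")] = data.pop(key)
    print(sorted(data.keys()))         # [(0, 0, 'train'), (1, 0, 'train')]
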
Example #2
 def _merge_pickle_files(self, target_collection_path, source_collection_pathes,
                               train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes and store them \
         in the target collection"""
     
     # load a first collection, in which the data of all other collections 
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     try:
         author = pwd.getpwuid(os.getuid())[4]
     except:
         author = "unknown"
         self._log("Author could not be resolved.",level=logging.WARNING)
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name
     try:
         target_collection.meta_data.pop("node_chain_file_name")
     except:
         pass
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author, 
             "date" : date, 
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })
   
     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 data = source_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 target_data = target_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)
                 
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if not self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in target_collection.data.keys():
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             key = (key[0],key[1],"train")
             target_collection.data[key] = value
                 
     target_collection.store(target_collection_path)
Example #3
 def _createProcesses(cls, processes, result_directory, operation_spec, 
             parameter_settings, input_collections, command_template):     
  
     # For each combination of classifier, input-collection and
     # run number, create one WEKA_process
     for dataset_dir in input_collections:
         collection = BaseDataset.load(dataset_dir)
         # Determine the number of iterations and splits to be used
         iterations = collection.meta_data["runs"]
         splits = collection.meta_data["splits"] 
         if "runs" in operation_spec:
             assert(iterations in [1, operation_spec["runs"]])
             iterations = operation_spec["runs"]
         if "cv_folds" in operation_spec:
             assert(splits in [1, operation_spec["cv_folds"]])
             splits = operation_spec["cv_folds"]          
         
         for parametrization in parameter_settings: 
             for run_number in range(iterations):
                 process = WEKAClassificationProcess(dataset_dir,
                                                     command_template,
                                                     parametrization,
                                                     splits,
                                                     run_number,
                                                     result_directory)
                 processes.put(process)
      # signal the executing process that creation is now finished
     processes.put(False)
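
The processes.put(False) call at the end acts as a sentinel: the consuming backend keeps pulling processes from the queue until it sees False and then knows that creation has finished. A small, self-contained sketch of that producer/consumer handshake using the standard-library multiprocessing module (the original code uses the older processing package; names and items here are illustrative):

    import multiprocessing

    def produce(queue):
        """Enqueue a few work items, then a False sentinel to signal completion."""
        for item in ["job-1", "job-2", "job-3"]:
            queue.put(item)
        queue.put(False)

    if __name__ == "__main__":
        queue = multiprocessing.Queue()
        producer = multiprocessing.Process(target=produce, args=(queue,))
        producer.start()
        while True:
            job = queue.get()
            if job is False:           # sentinel: no further work will arrive
                break
            print("executing %s" % job)
        producer.join()
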
Example #4
    def __init__(self, dataset_dir, command_template, parametrization,
                 run_number, split_number, operation_result_dir,
                 hide_parameters = []):
        
        super(WEKAFilterProcess, self).__init__()
        
        # Determine the directory in which the process' results
        # are stored
        result_collection_name = dataset_dir.split(os.sep)[-2]
        for parameter_name, parameter_value in parametrization.iteritems():
            # If this is a parameter that should not be hidden, then we have to
            # encode it in the result collection name 
            if not parameter_name in hide_parameters:
                result_collection_name += "{__%s__:%s}" % (parameter_name.upper(),
                                                           parameter_value)
                                                                     
        self.result_directory = os.path.join(operation_result_dir,
                                             result_collection_name)
        
        # Create directory for intermediate results if it does not exist yet
        create_directory(self.result_directory 
                              + os.sep + "data_run%s" % run_number)
                
        # Create collection
        collection = BaseDataset.load(dataset_dir)
        
        # The parametrization that is independent of the collection type 
        # and the specific weka command template that is executed
        self.params = {"dataset_name": dataset_dir.replace('/','_'),
                       "dataset_dir": dataset_dir,
                       "run_number": run_number,
                       "split_number": split_number,
                       "weka_class_path": pySPACE.configuration.weka_class_path,
                       "temp_results": self.result_directory}

        # Load the abbreviations
        abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               'operations/weka_templates',
                                               'abbreviations.yaml'), 'r')
        self.abbreviations = yaml.load(abbreviations_file)
        # Add custom parameters for the weka command template
        for parameter_name, parameter_value in parametrization.iteritems():
            # Auto-expand abbreviations
            if parameter_value in self.abbreviations:
                parameter_value = self.abbreviations[parameter_value]
            self.params[parameter_name] = parameter_value
            
        # Build the WEKA command by repeatedly replacing all placeholders in 
        # the template 
        while True:
            instantiated_template = command_template % self.params
            if instantiated_template == command_template:
                # All placeholders have been replaced
                self.weka_command = instantiated_template
                break
            else:
                # We have to continue since we are not converged
                command_template = instantiated_template
        
        self.handler_class = None
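
The while-loop at the end repeatedly applies "%"-substitution until the command string stops changing, so parameter values that themselves contain %(name)s placeholders are also resolved. A stripped-down sketch of that fixed-point substitution with a hypothetical template and parameters:

    # Hypothetical parameters: "classifier" expands to a value that contains
    # another placeholder, so a single substitution pass is not enough.
    params = {"classifier": "weka.classifiers.trees.J48 -t %(dataset)s",
              "dataset": "iris.arff"}
    command_template = "java %(classifier)s"
    while True:
        instantiated = command_template % params
        if instantiated == command_template:   # fixed point: nothing left to expand
            break
        command_template = instantiated
    print(command_template)    # java weka.classifiers.trees.J48 -t iris.arff
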
Example #5
 def _copy_file(self, source_collection_path, target_collection_path,
                train_set_name_suffix):
     """ Copy a dataset to a new destination 
     
     **Parameters**
     
         :source_collection_path:
             The path to the dataset that has to be copied.
             
         :target_collection_path:
             The path to where the dataset should be copied.
             
         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if the target dataset is
             handled as training or testing data.
     """
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as
     # training data
     if self.reverse and "test" == train_set_name_suffix:
         # exchange the "test" in key tuple to "train" before storing
         for key in source_collection.data.keys():
             assert ("test" == key[2])
             value = source_collection.data.pop(key)
             key = (key[0], key[1], "train")
             source_collection.data[key] = value
     # we store the data in the same format as before
     source_collection.store(target_collection_path,
                             source_collection.meta_data["storage_format"])
Example #6
 def _copy_file(self, source_collection_path, target_collection_path,
                train_set_name_suffix):
     """ Copy a dataset to a new destination 
     
     **Parameters**
     
         :source_collection_path:
             The path to the dataset that has to be copied.
             
         :target_collection_path:
             The path to where the dataset should be copied.
             
         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if the target dataset is
             handled as training or testing data.
     """ 
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in source_collection.data.keys():
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             key = (key[0],key[1],"train")
             source_collection.data[key] = value
     # we store the data in the same format as before
     source_collection.store(target_collection_path, 
         source_collection.meta_data["storage_format"])
Example #7
    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """
        A factory method that creates an WEKA operation based on the 
        information given in the operation specification operation_spec
        """
        assert(operation_spec["type"] == "weka_classification")
        # Determine all parameter combinations that should be tested
        parameter_settings = cls._get_parameter_space(operation_spec)
        
        # Read the command template from a file
        template_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               "operations",
                                               "weka_templates",
                                               operation_spec["template"]),
                             'r')
        command_template = template_file.read()
        template_file.close() 

        # number of processes
        if "runs" in operation_spec:
            number_processes = len(input_paths) * len(parameter_settings) * \
                           operation_spec["runs"]
        else: # approximate the number of processes 
            runs = []
            for dataset_dir in input_paths:
                collection = BaseDataset.load(dataset_dir)
                runs.append(collection.meta_data["runs"])
            runs = max(runs)
            number_processes = len(input_paths) * len(parameter_settings) * \
                               runs
        
        if debug == True:
            # To better debug creation of processes we don't limit the queue 
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, operation_spec, 
                                 parameter_settings, input_paths,
                                 command_template)
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory, 
                       number_processes)
        else:
            # Create all processes by calling a recursive helper method in
            # another thread so that already created processes can be executed in
            # parallel. Therefore a queue is used whose size is limited to
            # guarantee that not too many objects are created (because this costs
            # memory). However, the actual limit of 100 is arbitrary and might
            # be reviewed.
            processes = processing.Queue(100)
            create_process = processing.Process(target=cls._createProcesses,
                             args=( processes, result_directory, operation_spec, 
                                    parameter_settings, input_paths,
                                    command_template))
            create_process.start()            
            # create and return the weka operation object
            return cls(processes, operation_spec, result_directory, 
                       number_processes, create_process)        
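
In the non-debug branch above, the queue is bounded (processing.Queue(100)), so the creating process blocks on put() once 100 unconsumed processes are waiting; this caps memory use while creation and execution run in parallel. A minimal illustration of that backpressure idea using standard-library threads and a bounded queue (the bound of 2 and the job names are arbitrary):

    import threading
    try:
        import queue            # Python 3
    except ImportError:
        import Queue as queue   # Python 2

    jobs = queue.Queue(maxsize=2)       # small bound so the producer blocks early

    def create_jobs():
        for i in range(5):
            jobs.put("process-%d" % i)  # blocks while the queue is full
        jobs.put(None)                  # sentinel: creation finished

    threading.Thread(target=create_jobs).start()
    while True:
        job = jobs.get()
        if job is None:
            break
        print("running %s" % job)
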
Example #8
    def __call__(self):
        """ Executes this process on the respective modality """
        # Restore configuration
        pySPACE.configuration = self.configuration

        # reduce log_level for processing a second time and
        # set communication possibility for nodes to backend
        pySPACE.configuration.min_log_level = self.min_log_level
        pySPACE.configuration.logging_com = self.handler_args
        pySPACE.configuration.backend_com = self.backend_com

        ############## Prepare benchmarking ##############
        super(NodeChainProcess, self).pre_benchmarking()

        # Load the data and check that it can be processed
        # Note: This cannot be done in the object's constructor since in
        # that case the whole input would need to be pickled
        # when doing the remote call
        abs_dataset_dir = os.sep.join([self.storage,
                                          self.rel_dataset_dir])

        input_collection = BaseDataset.load(abs_dataset_dir)

        # We have to remember parameters used for generating this specific
        # input dataset
        if 'parameter_setting' in input_collection.meta_data.keys():
            # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
            for k, v in input_collection.meta_data['parameter_setting'].items():
                if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                    self.parameter_setting[k] = v

        NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                       input_collection)

        ############## Do the actual benchmarking ##############

        self._log("Start benchmarking run %s of node_chain %s on dataset %s"
                                % (self.run,
                                   self.node_chain_spec,
                                   self.rel_dataset_dir))


        # Do the actual benchmarking for this collection/node_chain combination
        try:
            result_collection = \
                self.node_chain.benchmark(input_collection = input_collection,
                                         run = self.run,
                                         persistency_directory = self.persistency_dir,
                                         store_node_chain = self.store_node_chain)
        except Exception, exception:
            # Send Exception to Logger
            import traceback
            print traceback.format_exc()
            self._log(traceback.format_exc(), level = logging.ERROR)
            raise
Example #9
    def __call__(self):
        """ Executes this process on the respective modality """
        # Restore configuration
        pySPACE.configuration = self.configuration

        # reduce log_level for processing a second time and
        # set communication possibility for nodes to backend
        pySPACE.configuration.min_log_level = self.min_log_level
        pySPACE.configuration.logging_com = self.handler_args
        pySPACE.configuration.backend_com = self.backend_com

        ############## Prepare benchmarking ##############
        super(NodeChainProcess, self).pre_benchmarking()

        # Load the data and check that it can be processed
        # Note: This cannot be done in the object's constructor since in
        # that case the whole input would need to be pickled
        # when doing the remote call
        abs_dataset_dir = os.sep.join([self.storage,
                                       self.rel_dataset_dir])

        input_collection = BaseDataset.load(abs_dataset_dir)

        # We have to remember parameters used for generating this specific
        # input dataset
        if 'parameter_setting' in input_collection.meta_data.keys():
            # but not __INPUT_DATASET__ and __RESULT_DIRECTORY__
            for k, v in input_collection.meta_data['parameter_setting'].items():
                if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
                    self.parameter_setting[k] = v

        NodeChainProcess._check_node_chain_dataset_consistency(self.node_chain,
                                                               input_collection)

        ############## Do the actual benchmarking ##############

        self._log("Start benchmarking run %s of node_chain %s on dataset %s"
                                % (self.run,
                                   self.node_chain_spec,
                                   self.rel_dataset_dir))


        # Do the actual benchmarking for this collection/node_chain combination
        try:
            result_collection = \
                self.node_chain.benchmark(
                    input_collection=input_collection,
                    run=self.run,
                    persistency_directory=self.persistency_dir,
                    store_node_chain=self.store_node_chain)
        except Exception, exception:
            # Send Exception to Logger
            import traceback
            self._log(traceback.format_exc(), level=logging.ERROR)
            raise
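
The loop over 'parameter_setting' copies the parameters that were used to generate the input dataset while skipping the two bookkeeping keys. A tiny stand-alone sketch of that filtering step (the meta data shown is hypothetical):

    # Hypothetical meta data of an input dataset.
    meta_data = {"parameter_setting": {"__INPUT_DATASET__": "set_a",
                                       "__RESULT_DIRECTORY__": "/tmp/results",
                                       "lower_cutoff": 0.4}}
    parameter_setting = {}
    for k, v in meta_data.get("parameter_setting", {}).items():
        if k not in ["__INPUT_DATASET__", "__RESULT_DIRECTORY__"]:
            parameter_setting[k] = v
    print(parameter_setting)            # {'lower_cutoff': 0.4}
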
Example #10
    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """
        A factory method that creates a statistic operation based on the
        information given in the operation specification operation_spec.
        If debug is True, the creation of the statistic processes will not
        be in a separate thread.
        """
        assert (operation_spec["type"] == "statistic")
        input_path = operation_spec["input_path"]
        tabular = BaseDataset.load(
            os.path.join(pySPACE.configuration.storage, input_path)).data

        if operation_spec.has_key("filter"):
            conditions = csv_analysis.empty_dict(tabular)
            for key, l in operation_spec["filter"].items():
                conditions[key].extend(l)
            tabular = csv_analysis.strip_dict(tabular, conditions)
        metric = operation_spec.get("metric", "Balanced_accuracy")
        parameter = operation_spec.get("parameter", "__Dataset__")
        rel_par = operation_spec.get("related_parameters",
                                     ["__Dataset__", "Key_Run", "Key_Fold"])
        average = operation_spec.get("average", None)

        if average in rel_par:
            rel_par.remove(average)
        if metric in rel_par:
            rel_par.remove(metric)
        if parameter in rel_par:
            rel_par.remove(parameter)

        reduced_tabular = cls.reduce_tabular(tabular, rel_par, metric,
                                             parameter, average)
        number_processes = 1
        processes = processing.Queue()
        cls._createProcesses(processes, result_directory, reduced_tabular)

        import shutil
        shutil.copy2(
            os.path.join(pySPACE.configuration.storage, input_path,
                         "results.csv"),
            os.path.join(result_directory, "results.csv"))
        shutil.copy2(
            os.path.join(pySPACE.configuration.storage, input_path,
                         "metadata.yaml"),
            os.path.join(result_directory, "metadata.yaml"))
        # create and return the shuffle operation object
        return cls(processes, operation_spec, result_directory,
                   number_processes)
Example #11
    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """ A factory method that creates an Analysis operation based on the 
        information given in the operation specification operation_spec
        """
        assert (operation_spec["type"] == "analysis")
        input_path = operation_spec["input_path"]
        summary = BaseDataset.load(
            os.path.join(pySPACE.configuration.storage, input_path))
        data_dict = summary.data

        # Determine the parameters that should be analyzed
        parameters = operation_spec["parameters"]

        # Determine the metrics that should be plotted
        metrics = operation_spec["metrics"]

        # Determine how many processes will be created
        number_parameter_values = [
            len(set(data_dict[param])) for param in parameters
        ]
        number_processes = cls._numberOfProcesses(0,
                                                  number_parameter_values) + 1

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict,
                                 parameters, metrics, True)
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all plot processes by calling a recursive helper method in
            # another thread so that already created processes can be executed
            # although creation of processes is not finished yet. Therefore a queue
            # is used whose size is limited to guarantee that not too many objects
            # are created (since this costs memory). However, the actual limit of
            # 100 is arbitrary and might be changed according to the system at hand.
            processes = processing.Queue(100)
            create_process = processing.Process(
                target=cls._createProcesses,
                args=(processes, result_directory, data_dict, parameters,
                      metrics, True))
            create_process.start()
            # create and return the operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)
Example #12
 def _copy_pickle_file(self, source_collection_path, target_collection_path,
                       train_set_name_suffix):
     
     source_collection = BaseDataset.load(source_collection_path)
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in source_collection.data.keys():
             assert("test" == key[2])
             value = source_collection.data.pop(key)
             key = (key[0],key[1],"train")
             source_collection.data[key] = value
     source_collection.store(target_collection_path)
Example #13
    def _copy_pickle_file(self, source_collection_path, target_collection_path,
                          train_set_name_suffix):

        source_collection = BaseDataset.load(source_collection_path)
        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing
            for key in source_collection.data.keys():
                assert ("test" == key[2])
                value = source_collection.data.pop(key)
                key = (key[0], key[1], "train")
                source_collection.data[key] = value
        source_collection.store(target_collection_path)
Example #14
    def test_time_series_storing(self):

        if os.path.exists('tmp') is False:
            os.makedirs('tmp')

        source = SimpleTimeSeriesSourceNode()
        sink = TimeSeriesSinkNode()
        sink.register_input_node(source)
        sink.set_run_number(0)
        sink.process_current_split()
        result_collection = sink.get_result_dataset()
        result_collection.store('tmp')
        #sink.store_results("test_time_series_storing.tmp")

        reloaded_collection = BaseDataset.load('tmp')

        reloader = TimeSeriesSourceNode()
        reloader.set_input_dataset(reloaded_collection)
        #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")

        orig_data = list(source.request_data_for_testing())
        restored_data = list(reloader.request_data_for_testing())

        # Check that the two lists have the same length
        self.assertEqual(
            len(orig_data), len(restored_data),
            "Numbers of time series before storing and after reloading are not equal!"
        )

        # Check that there is a one-to-one correspondence
        for orig_datapoint, orig_label in orig_data:
            found = False
            for restored_datapoint, restored_label in restored_data:
                found |= (orig_datapoint.view(numpy.ndarray) == restored_datapoint.view(numpy.ndarray)).all() \
                            and (orig_label == restored_label)
                if found: break
            self.assert_(
                found,
                "One of the original time series cannot not be found after reloading"
            )

        shutil.rmtree('tmp')  # Cleaning up...
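
This unit test is a store/reload round trip: it writes the result dataset to disk, loads it back, and checks that every original time series can still be found. A generic sketch of the same round-trip pattern using only pickle and a temporary directory (illustrative only; it does not use the pySPACE dataset classes):

    import os
    import pickle
    import shutil
    import tempfile

    orig_data = [([1.0, 2.0], "Target"), ([3.0, 4.0], "Standard")]

    tmp_dir = tempfile.mkdtemp()
    path = os.path.join(tmp_dir, "data.pickle")
    with open(path, "wb") as f:          # store
        pickle.dump(orig_data, f)
    with open(path, "rb") as f:          # reload
        restored_data = pickle.load(f)

    assert len(orig_data) == len(restored_data)
    for sample, label in orig_data:      # every original sample must be found again
        assert (sample, label) in restored_data

    shutil.rmtree(tmp_dir)               # cleaning up, as in the test above
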
Example #15
    def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
        """ A factory method that creates an Analysis operation based on the 
        information given in the operation specification operation_spec
        """
        assert(operation_spec["type"] == "analysis")
        input_path = operation_spec["input_path"]
        summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                      input_path))
        data_dict = summary.data

        # Determine the parameters that should be analyzed
        parameters = operation_spec["parameters"]
        
        # Determine the metrics that should be plotted
        metrics = operation_spec["metrics"]
        
        # Determine how many processes will be created
        number_parameter_values = [len(set(data_dict[param])) for param in parameters]
        number_processes = cls._numberOfProcesses(0, number_parameter_values)+1
        
        if debug == True:
            # To better debug creation of processes we don't limit the queue 
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict, parameters, 
                                   metrics, True)
            return cls( processes, operation_spec, result_directory, number_processes)
        else:
            # Create all plot processes by calling a recursive helper method in
            # another thread so that already created processes can be executed
            # although creation of processes is not finished yet. Therefore a queue
            # is used whose size is limited to guarantee that not too many objects
            # are created (since this costs memory). However, the actual limit of
            # 100 is arbitrary and might be changed according to the system at hand.
            processes = processing.Queue(100)
            create_process = processing.Process(target=cls._createProcesses,
                             args=( processes, result_directory, data_dict, 
                                    parameters, metrics, True))
            create_process.start()
            # create and return the operation object
            return cls( processes, operation_spec, result_directory, number_processes, create_process)        
Example #16
 def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
     """
     A factory method that creates a statistic operation based on the
     information given in the operation specification operation_spec.
      If debug is True, the creation of the statistic processes will not
      be in a separate thread.
     """
     assert(operation_spec["type"] == "statistic")
     input_path = operation_spec["input_path"]
     tabular = BaseDataset.load(os.path.join(pySPACE.configuration.storage, input_path)).data
     
     if operation_spec.has_key("filter"):
         conditions= csv_analysis.empty_dict(tabular)
         for key,l in operation_spec["filter"].items():
             conditions[key].extend(l)
         tabular = csv_analysis.strip_dict(tabular,conditions)
     metric = operation_spec.get("metric","Balanced_accuracy")
     parameter = operation_spec.get("parameter","__Dataset__")
     rel_par = operation_spec.get("related_parameters",["__Dataset__", "Key_Run", "Key_Fold"])
     average = operation_spec.get("average",None)
     
     if average in rel_par:
         rel_par.remove(average)
     if metric in rel_par:
         rel_par.remove(metric)
     if parameter in rel_par:
         rel_par.remove(parameter)
         
     reduced_tabular=cls.reduce_tabular(tabular,rel_par,metric,parameter,average)
     number_processes = 1
     processes = processing.Queue()
     cls._createProcesses(processes, result_directory, reduced_tabular)
     
     import shutil
     shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,"results.csv"), os.path.join(result_directory,"results.csv"))
     shutil.copy2(os.path.join(pySPACE.configuration.storage, input_path,"metadata.yaml"), os.path.join(result_directory,"metadata.yaml"))
     # create and return the shuffle operation object
     return cls(processes, operation_spec, result_directory, number_processes)
Example #17
    def test_time_series_storing(self):

        if os.path.exists('tmp') is False :
            os.makedirs('tmp')
        
        source = SimpleTimeSeriesSourceNode()
        sink = TimeSeriesSinkNode()
        sink.register_input_node(source)
        sink.set_run_number(0)
        sink.process_current_split()
        result_collection = sink.get_result_dataset()
        result_collection.store('tmp')
        #sink.store_results("test_time_series_storing.tmp")
        
        reloaded_collection = BaseDataset.load('tmp')
        
        reloader = TimeSeriesSourceNode()
        reloader.set_input_dataset(reloaded_collection)
        #set_permanent_attributes(time_series_file = "test_time_series_storing.tmp")
        
        orig_data = list(source.request_data_for_testing()) 
        restored_data = list(reloader.request_data_for_testing())
        
        # Check that the two lists have the same length
        self.assertEqual(len(orig_data), len(restored_data),
                         "Numbers of time series before storing and after reloading are not equal!")
        
        # Check that there is a one-to-one correspondence
        for orig_datapoint, orig_label in orig_data:
            found = False
            for restored_datapoint, restored_label in restored_data:
                found |= (orig_datapoint.view(numpy.ndarray) == restored_datapoint.view(numpy.ndarray)).all() \
                            and (orig_label == restored_label)
                if found: break
            self.assert_(found, 
                         "One of the original time series cannot not be found after reloading")
        
        shutil.rmtree('tmp') # Cleaning up... 
Example #18
    def prepare_training(self,
                         training_files,
                         potentials,
                         operation,
                         nullmarker_stride_ms=None):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info("Preparing Training")
        self.potentials = potentials
        self.operation = operation
        self.nullmarker_stride_ms = nullmarker_stride_ms
        if self.nullmarker_stride_ms == None:
            online_logger.warn(
                'Nullmarker stride interval is %s. You can specify it in your parameter file.'
                % self.nullmarker_stride_ms)
        else:
            online_logger.info('Nullmarker stride interval is set to %s ms ' %
                               self.nullmarker_stride_ms)

        online_logger.info("Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(
                    spec_base, self.potentials[key]["node_chain"])
                online_logger.info("node_chain_spec:" +
                                   self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info("prewindowing_dataflow_spec: " +
                                   self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info("postprocessing_dataflow_spec: " +
                                   self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value(
                "b", False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple training files are given
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes; we send the time series
        # windows to these processes via queues
        online_logger.info("Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()

        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window == None: break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info(
                    "file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info(
                        "deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error(
                        "Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " +
                                   str(prewindowed_sets))

                for s, d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d, (sample, label) in enumerate(data):
                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match("^s\s{0,2}[8,9]{1}$", k,
                                             re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(
                                        str("remove %s from %d %d" %
                                            (m.group(), s, d)))
                                    del (sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) +
                                                   " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" +
                                                   str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " +
                                   final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " +
                                   final_collection_path)
                self.prewindowed_data[key] = BaseDataset.load(
                    final_collection_path)
                self.node_chains[key][0].set_input_dataset(
                    self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            # create window_stream for every potential

            if self.operation in ("prewindowing"):
                window_spec_file = os.path.join(
                    spec_base, "node_chains", "windower",
                    self.potentials[key]["windower_spec_path_train"])

                self.window_stream[key] = \
                        self.stream_manager.request_window_stream(window_spec_file,
                                                              nullmarker_stride_ms = self.nullmarker_stride_ms)
            elif self.operation in ("prewindowing_offline"):
                pass
            elif self.operation in ("train"):
                pass

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed when using the stream mode!
        if self.operation in ("train"):
            online_logger.info("Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(
                            str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass  # dir was probably already deleted..

                if os.path.exists(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data")):
                    shutil.rmtree(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data"))
                    online_logger.info(
                        "deleted concatenated training data for " + key)

        online_logger.info("Training preparations finished")
        return 0
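
The nested flow_generator above adapts a queue into a generator, so a node chain can iterate over incoming windows until a None sentinel arrives. A self-contained sketch of that adapter with the standard-library queue module (names and items are illustrative):

    try:
        import queue            # Python 3
    except ImportError:
        import Queue as queue   # Python 2

    def window_generator(q):
        """Yield items from the queue until a None sentinel is received."""
        while True:
            window = q.get(block=True, timeout=None)
            if window is None:
                break
            yield window

    q = queue.Queue()
    for item in ("window-1", "window-2", None):
        q.put(item)
    print(list(window_generator(q)))    # ['window-1', 'window-2']
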
Example #19
 def _merge_files(self, target_collection_path, source_collection_pathes,
                  train_set_name_suffix, target_collection_params):
     """ Merge all collections in source_collection_pathes and store them \
         in the target collection
         
     **Parameters**
     
         :target_collection_path:
             Path of the dataset, in which the data of all other datasets
             is assembled.
             
         :source_collection_pathes:
             Paths of the datasets to be merged.
             
         :train_set_name_suffix:
             Either 'train' or 'test'. Specifies if datasets are merged for
             training or testing.
             
         :target_collection_params:
             Dictionary with all the parameters of the target dataset.
             
     """
     
     # load a first collection, in which the data of all other collections 
     # is assembled
     target_collection = BaseDataset.load(source_collection_pathes[0])
     author = get_author()
     date = time.strftime("%Y%m%d_%H_%M_%S")
     # Delete node_chain file name
     try:
         target_collection.meta_data.pop("node_chain_file_name")
     except:
         pass
     # Update meta data and store it
     k = "test" if self.reverse else "train"
     target_collection_params["__INPUT_DATASET__"][k] = \
              [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
     target_collection_params["__RESULT_DIRECTORY__"] = self.result_directory
     target_collection.meta_data.update({
             "author" : author, 
             "date" : date, 
             "dataset_directory" : target_collection_path,
             "train_test" : True,
             "parameter_setting" : target_collection_params,
             "input_collection_name" : source_collection_pathes[0][len(
                                     pySPACE.configuration.storage):]
     })
   
     # merge data of all other collections to target collection
     for source_collection_path in source_collection_pathes[1:]:
         source_collection = BaseDataset.load(source_collection_path)
         for run in source_collection.get_run_numbers():
             for split in source_collection.get_split_numbers():
                 target_data = target_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 
                 if self.set_flag:
                     for ts, l in target_data:
                         if ts.specs == None:
                             ts.specs = {"new_set": False}
                         elif ts.specs.has_key("new_set"):
                             break
                         else:
                             ts.specs["new_set"]= False
                 
                 data = source_collection.get_data(run, split, 
                                                       train_set_name_suffix)
                 
                 if self.set_flag:
                     for i, (ts, l) in enumerate(data):
                         # flag first element of the concatenated data list
                         if ts.specs == None:
                             ts.specs = {"new_set": i==0}
                         else:
                             ts.specs["new_set"] = (i==0)
                 
                 # actual data is stored in a list that has to be extended
                 target_data.extend(data)
                 
     # if only test data was given, the "Rest_vs" collection is stored as 
     # training data
     if not self.reverse and "test" == train_set_name_suffix: 
         # exchange the "test" in key tuple to "train" before storing
         for key in target_collection.data.keys():
             assert("test" == key[2])
             value = target_collection.data.pop(key)
             key = (key[0],key[1],"train")
             target_collection.data[key] = value
     # we store the data in the same format as before
     target_collection.store(target_collection_path, 
         target_collection.meta_data["storage_format"])
Example #20
    def _merge_files(self, target_collection_path, source_collection_pathes,
                     train_set_name_suffix, target_collection_params):
        """ Merge all collections in source_collection_pathes and store them \
            in the target collection
            
        **Parameters**
        
            :target_collection_path:
                Path of the dataset, in which the data of all other datasets
                is assembled.
                
            :source_collection_pathes:
                Paths of the datasets to be merged.
                
            :train_set_name_suffix:
                Either 'train' or 'test'. Specifies if datasets are merged for
                training or testing.
                
            :target_collection_params:
                Dictionary with all the parameters of the target dataset.
                
        """

        # load a first collection, in which the data of all other collections
        # is assembled
        target_collection = BaseDataset.load(source_collection_pathes[0])
        author = get_author()
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name
        try:
            target_collection.meta_data.pop("node_chain_file_name")
        except:
            pass
        # Update meta data and store it
        k = "test" if self.reverse else "train"
        target_collection_params["__INPUT_DATASET__"][k] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_collection_pathes]
        target_collection_params[
            "__RESULT_DIRECTORY__"] = self.result_directory
        target_collection.meta_data.update({
            "author":
            author,
            "date":
            date,
            "dataset_directory":
            target_collection_path,
            "train_test":
            True,
            "parameter_setting":
            target_collection_params,
            "input_dataset_name":
            source_collection_pathes[0][len(pySPACE.configuration.storage):]
        })

        # merge data of all other collections to target collection
        for source_collection_path in source_collection_pathes[1:]:
            source_collection = BaseDataset.load(source_collection_path)
            for run in source_collection.get_run_numbers():
                for split in source_collection.get_split_numbers():
                    target_data = target_collection.get_data(
                        run, split, train_set_name_suffix)

                    if self.set_flag:
                        for ts, l in target_data:
                            if ts.specs == None:
                                ts.specs = {"new_set": False}
                            elif ts.specs.has_key("new_set"):
                                break
                            else:
                                ts.specs["new_set"] = False

                    data = source_collection.get_data(run, split,
                                                      train_set_name_suffix)

                    if self.set_flag:
                        for i, (ts, l) in enumerate(data):
                            # flag first element of the concatenated data list
                            if ts.specs == None:
                                ts.specs = {"new_set": i == 0}
                            else:
                                ts.specs["new_set"] = (i == 0)

                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)

        # if only test data was given, the "Rest_vs" collection is stored as
        # training data
        if not self.reverse and "test" == train_set_name_suffix:
            # exchange the "test" in key tuple to "train" before storing
            for key in target_collection.data.keys():
                assert ("test" == key[2])
                value = target_collection.data.pop(key)
                key = (key[0], key[1], "train")
                target_collection.data[key] = value
        # we store the data in the same format as before
        target_collection.store(target_collection_path,
                                target_collection.meta_data["storage_format"])
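
The set_flag handling above marks set boundaries during concatenation: samples already in the target keep new_set = False (unless flagged earlier), and only the first appended sample gets new_set = True, so downstream nodes can tell where a new recording starts. A compact sketch of that flagging with plain dictionaries standing in for the specs attribute of each time series (hypothetical data):

    # Each dict plays the role of ts.specs for one (time series, label) pair.
    target_specs = [{"new_set": True}, {}]     # data already in the target dataset
    source_specs = [{}, {}, {}]                # data about to be appended

    for specs in target_specs:                 # existing data: default to False
        specs.setdefault("new_set", False)
    for i, specs in enumerate(source_specs):   # appended data: flag only the first
        specs["new_set"] = (i == 0)

    target_specs.extend(source_specs)
    print([s["new_set"] for s in target_specs])   # [True, False, True, False, False]
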
Example #21
 def create(cls, operation_spec, result_directory, debug=False, input_paths=[]):
     """
     A factory method that creates an Analysis operation based on the
     information given in the operation specification operation_spec.
      If debug is True, the creation of the Analysis Processes will not
      be in a separate thread.
     """
     assert(operation_spec["type"] == "comp_analysis")
     input_path = operation_spec["input_path"]
     summary = BaseDataset.load(os.path.join(pySPACE.configuration.storage,
                                   input_path))
     data_dict = summary.data
     ## Done
     
     # Determine the parameters that should be analyzed
     parameters = operation_spec["parameters"]
     
     # Determine dependent parameters, which don't get extra resolution
     try:
         dep_par = operation_spec["dep_par"]
     except KeyError:
         dep_par=[]
     
     # Determine the metrics that should be plotted
     spec_metrics = operation_spec["metrics"]
     metrics=[]
     for metric in spec_metrics:
         if data_dict.has_key(metric):
             metrics.append(metric)
         else:
             import warnings
             warnings.warn('The metric "' + metric + '" is not contained in the results csv file.')
     if len(metrics)==0:
         warnings.warn('No metric available from spec file, default to first dict entry.')
         metrics.append(data_dict.keys()[0])
         
     # Determine how many processes will be created
     number_parameter_values = [len(set(data_dict[param])) for param in parameters]
     number_processes = cls._numberOfProcesses(0, number_parameter_values)+1
     
     logscale = False
     if operation_spec.has_key('logscale'):
         logscale = operation_spec['logscale']
     
     markertype='x'
     if operation_spec.has_key('markertype'):
         markertype = operation_spec['markertype']
     
     if debug == True:
         # To better debug creation of processes we don't limit the queue 
         # and create all processes before executing them
         processes = processing.Queue()
         cls._createProcesses(processes, result_directory, data_dict, parameters, 
                                dep_par, metrics, logscale, markertype, True)
         return cls( processes, operation_spec, result_directory, number_processes)
     else:
          # Create all plot processes by calling a recursive helper method in
          # another thread so that already created processes can be executed
          # although creation of processes is not finished yet. Therefore a queue
          # is used whose size is limited to guarantee that not too many objects
          # are created (since this costs memory). However, the actual limit of
          # 100 is arbitrary and might be reviewed.
         processes = processing.Queue(100)
         create_process = processing.Process(target=cls._createProcesses,
                          args=( processes, result_directory, data_dict, 
                                 parameters, dep_par, metrics, logscale, markertype, True))
         create_process.start()
         # create and return the comp_analysis operation object
         return cls( processes, operation_spec, result_directory, 
                                number_processes, create_process)
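
The non-debug branch above is a bounded producer/consumer pattern: process objects are created in a second OS process and pushed into a size-limited queue, so execution can start while creation is still running and memory stays bounded. A rough sketch of that pattern with the standard multiprocessing module (the sentinel convention and the work items are illustrative, not taken from pySPACE):

import multiprocessing  # the snippet's "processing" module is its predecessor

def produce(queue, n):
    """Create work items one by one; put() blocks while the queue is full."""
    for i in range(n):
        queue.put("process-%d" % i)
    queue.put(None)                          # sentinel: creation finished

if __name__ == "__main__":
    queue = multiprocessing.Queue(100)       # bounded, as in the snippet
    producer = multiprocessing.Process(target=produce, args=(queue, 1000))
    producer.start()
    consumed = 0
    while True:
        item = queue.get()
        if item is None:
            break
        consumed += 1                        # "execute" the item here
    producer.join()
    print(consumed)                          # 1000
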
Example #22
    def _merge_pickle_files(self, target_dataset_path, source_dataset_pathes):
        """ Concatenate all datasets in source_dataset_pathes and store 
            them in the target dataset"""
        # sort the dataset 
        source_dataset_pathes.sort()
        # load a first dataset, in which the data of all other datasets is assembled
        target_dataset = BaseDataset.load(source_dataset_pathes[0])
        
        # Determine author and date
        try:
            author = getpass.getuser()
        except : 
            author = "Unknown"
        date = time.strftime("%Y%m%d_%H_%M_%S")
        # Delete node_chain file name
        try:
            target_dataset.meta_data.pop("node_chain_file_name")
        except:
            pass
        # Update meta data and store it
        params = target_dataset.meta_data.pop("parameter_setting")
        params["__INPUT_DATASET__"] = \
                 [s_c_p.split(os.sep)[-2] for s_c_p in source_dataset_pathes]
        params["__RESULT_DIRECTORY__"] = self.result_directory
        target_dataset.meta_data.update({"author" : author, 
                      "date" : date, 
                      "dataset_directory" : target_dataset_path,
                      "train_test" : False,
                      "parameter_setting" : params,
                      "changed_time" : self.change_time,
                      "input_dataset_name" : source_dataset_pathes[0][len(
                                        pySPACE.configuration.storage):]
        })
    
        # Concatenate data of all other datasets to target dataset
        for source_dataset_path in source_dataset_pathes[1:]:
            source_dataset = BaseDataset.load(source_dataset_path)
            for run in source_dataset.get_run_numbers():
                for split in source_dataset.get_split_numbers():
                    target_data = target_dataset.get_data(run, split, "test")

                    if self.change_time:
                        # ensure sorted target_data 
                        # TODO: encode this in meta data?  
                        target_data.sort(key=lambda t: t[0].end_time)
                        last_end_time = target_data[-1][0].end_time

                    for ts, l in target_data:
                        if ts.specs == None:
                            ts.specs = {"new_set": False}
                        elif ts.specs.has_key("new_set"):
                            break
                        else:
                            ts.specs["new_set"]= False

                    data = source_dataset.get_data(run, split, "test")

                    if self.change_time:                    
                        # ensure sorted target_data 
                        # TODO: encode this in meta data?
                        data.sort(key=lambda t: t[0].end_time)
                    # flag the first element of the concatenated data list
                    for i, (ts, l) in enumerate(data):
                        if ts.specs == None:
                            ts.specs = {"new_set": i==0}
                        else:
                            ts.specs["new_set"] = (i==0)
                        if self.change_time:
                            ts.start_time = last_end_time + ts.start_time
                            ts.end_time = last_end_time + ts.end_time
                            
                    # actual data is stored in a list that has to be extended
                    target_data.extend(data)
                
        target_dataset.store(target_dataset_path)
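
A minimal sketch of the time-shifted concatenation performed above, assuming each element is a (window, label) pair where the window only needs start_time, end_time and a specs dict; the Window class is an illustrative stand-in for the pySPACE time series objects:

class Window(object):
    """Illustrative stand-in; only the fields touched by the merge are modeled."""
    def __init__(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time
        self.specs = None

def concatenate(target_data, data):
    """Append data to target_data, shifting times past the last end time and
    flagging the first appended element with specs["new_set"] = True."""
    last_end_time = max(w.end_time for w, _ in target_data)
    for w, _ in target_data:
        w.specs = w.specs or {"new_set": False}
    for i, (w, _) in enumerate(data):
        w.specs = {"new_set": i == 0}
        w.start_time += last_end_time
        w.end_time += last_end_time
    target_data.extend(data)
    return target_data

merged = concatenate([(Window(0.0, 1.0), "a"), (Window(1.0, 2.0), "b")],
                     [(Window(0.0, 1.0), "c")])
print([(w.start_time, w.end_time, w.specs["new_set"]) for w, _ in merged])
# [(0.0, 1.0, False), (1.0, 2.0, False), (2.0, 3.0, True)]
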
Example #23
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()

        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check that the input data is not split,
            # i.e. there is only a single test file in the source directory
            source_files = glob.glob(
                os.sep.join(
                    [source_test_collection_path, "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert (not splitted)
            source_file_name = str(source_files[-1])

            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0

            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix = "test"

            # We create the collection Rest_vs_Collection
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    test_base_collection_name + "_vs_" + self.name_pattern)
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                    test_base_collection_name,
                    self.name_pattern + "_vs_" + test_base_collection_name)
                key = "test"

            target_collection_path = os.sep.join(
                [self.result_directory, target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}

            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection
                source_test_file_path = os.sep.join([
                    source_test_collection_path, "data_run0",
                    "features_sp0" + train_set_name_suffix + ".arff"
                ])
                target_test_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])

            else:
                file_ending = source_file_name.split(".")[-1]
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path

            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue

                # Check that all constraints are fulfilled for this pair of
                # input collections
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue

                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__",
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                if remaining_params != []:
                    for k, v in remaining_params:
                        target_collection_path += "{%s#%s}" % (k, str(v))
                        target_collection_params[k] = v

                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path,
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                else:
                    source_train_file_path = source_train_collection_path

                source_train_pathes.append(source_train_file_path)

            if "arff" == file_ending:
                target_train_file_path = os.sep.join([
                    target_collection_path, "data_run0",
                    "features_sp0_" + key + ".arff"
                ])
            else:
                target_train_file_path = target_collection_path

            if len(source_train_pathes) == 0:
                continue

            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))

            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path,
                                     target_test_file_path,
                                     source_test_collection_name,
                                     target_collection_name)

                self._merge_arff_files(target_train_file_path,
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(
                    source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path, input_meta)
            else:
                self._copy_file(source_test_collection_path,
                                target_collection_path, train_set_name_suffix)

                self._merge_files(target_train_file_path, source_train_pathes,
                                  train_set_name_suffix,
                                  target_collection_params)

        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()
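
The constraint check above %-formats each template with the two collection names and then eval()s the result. A small sketch with made-up templates (the real ones come from the operation spec, not from pySPACE itself):

collection_constraints = [
    '"%(source_train_collection_name)s" != "%(source_test_collection_name)s"',
    '"%(source_train_collection_name)s".startswith("Subject")',
]

def pair_allowed(train_name, test_name, constraints):
    """Return True only if every eval'd constraint holds for this pair."""
    names = {"source_train_collection_name": train_name,
             "source_test_collection_name": test_name}
    return all(eval(template % names) for template in constraints)

print(pair_allowed("Subject1", "Subject2", collection_constraints))  # True
print(pair_allowed("Noise", "Subject2", collection_constraints))     # False
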
Example #24
    def __init__(self,
                 dataset_dir,
                 command_template,
                 parametrization,
                 cv_folds,
                 run_number,
                 operation_result_dir):
        super(WEKAClassificationProcess, self).__init__()
        # Load the abbreviations
        abbreviations_file = open(os.path.join(pySPACE.configuration.spec_dir,
                                               'operations/weka_templates',
                                               'abbreviations.yaml'), 'r')
        self.abbreviations = yaml.load(abbreviations_file)
        abbreviations_file.close()
        # Determine the directory in which the process' results
        # are stored
        self.result_directory = operation_result_dir
        # Create collection
        collection = BaseDataset.load(dataset_dir)
        # The parametrization that is independent of the collection type
        # and the specific weka command template that is executed
        self.params = {"collection_name": dataset_dir.strip(os.sep).split(os.sep)[-1],
                       "run_number": run_number,
                       "cv_folds": cv_folds,
                       "weka_class_path": pySPACE.configuration.weka_class_path,
                       "temp_results": self.result_directory,
                       "unique_id": WEKAClassificationProcess.unique_id}
        # Collection dependent parameters
        if not collection.meta_data["train_test"] \
             and collection.meta_data["splits"] == 1:
            raise NotImplementedError()
        else:
            # The pattern of the train and test files generated by crossvalidation
            data_pattern =  os.path.join(dataset_dir,
                                         collection.meta_data["data_pattern"])
            # One example arff file in which WEKA can look up the relation name etc.
            sample_dataset =  data_pattern.replace("_run", "_run0")\
                                          .replace("_sp_","_sp0_")\
                                          .replace("_tt","_train")
            self.params.update({"sample_dataset": sample_dataset,
                                "data_pattern": data_pattern})
        # Add custom parameters for the weka command template
        for parameter_name, parameter_value in parametrization.iteritems():
            self.params[parameter_name + "_abbr"] = parameter_value
            # Auto-expand abbreviations
            if parameter_value in self.abbreviations:
                parameter_value = self.abbreviations[parameter_value]
            elif parameter_name == 'classifier':
                import warnings
                warnings.warn("Did not find classifier abbreviation %s. "
                              " Expecting full name." % parameter_value)
            self.params[parameter_name] = parameter_value

        # Build the WEKA command by repeatedly replacing all placeholders in 
        # the template 
        while True:
            instantiated_template = command_template % self.params
            if instantiated_template == command_template:
                # All placeholders have been replaced
                self.weka_command = instantiated_template
                break
            else:
                # We have to continue since we are not converged
                command_template = instantiated_template
        
        self.handler_class = None
        
        WEKAClassificationProcess.unique_id += 1
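
The while loop at the end expands the command template to a fixed point, so placeholders introduced by substituted values get resolved in later passes. A compact sketch with invented parameters (the real template and keys come from the WEKA template spec files):

def expand(command_template, params):
    """Repeatedly %-substitute until the string stops changing, so that
    placeholders contained in substituted values are resolved as well."""
    while True:
        instantiated = command_template % params
        if instantiated == command_template:
            return instantiated
        command_template = instantiated

params = {"weka_class_path": "/opt/weka/weka.jar",
          "classifier": "weka.classifiers.functions.SMO",
          "command": "java -cp %(weka_class_path)s %(classifier)s"}
print(expand("%(command)s -x 10", params))
# java -cp /opt/weka/weka.jar weka.classifiers.functions.SMO -x 10
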
Example #25
    def prepare_training(self, training_files, potentials, operation):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info( "Preparing Training")
        self.potentials = potentials
        self.operation = operation

        online_logger.info( "Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"])
                online_logger.info( "node_chain_spec:" + self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"])
                else:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info( "prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"])
                else:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info( "postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value("b",False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes; we send the time series
        # windows to these processes via one queue per potential
        online_logger.info( "Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()


        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block = True, timeout = None)
                if window == None: break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                         flow_spec = file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                             flow_spec = file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                                     flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    # create windower
                    online_logger.info( "Creating Windower")
                    online_logger.info(self.potentials[key]["windower_spec_path_train"])
                    self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                    replace_start_and_end_markers = True
                else:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info("deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error("Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets))

                for s,d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d,(sample,label) in enumerate(data):

                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match("^s\s{0,2}[8,9]{1}$", k, re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(str("remove %s from %d %d" % (m.group(), s, d)))
                                    del(sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) + " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" + str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " + final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " + final_collection_path)
                self.prewindowed_data[key] =  BaseDataset.load(final_collection_path)
                self.node_chains[key][0].set_input_dataset(self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed
        # when using the stream mode!
        if self.operation in ("train",):
            online_logger.info( "Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass # dir was probably already deleted..

                if os.path.exists(os.path.join(self.prewindowed_data_directory, key, "all_train_data")):
                    shutil.rmtree(os.path.join(self.prewindowed_data_directory, key, "all_train_data"))
                    online_logger.info("deleted concatenated training data for " + key)


        online_logger.info( "Training preparations finished")
        return 0
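
flow_generator above turns a multiprocessing queue into a generator that ends on a None sentinel, which is how the training windows reach the node chains. A self-contained sketch of the same idea (the window strings stand in for time series objects):

import multiprocessing

def flow_generator(queue):
    """Yield items from the queue until the None sentinel arrives."""
    while True:
        window = queue.get(block=True, timeout=None)
        if window is None:
            break
        yield window

if __name__ == "__main__":
    queue = multiprocessing.Queue()
    for item in ["window_0", "window_1", None]:
        queue.put(item)
    print(list(flow_generator(queue)))   # ['window_0', 'window_1']
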
Example #26
    def create(cls,
               operation_spec,
               result_directory,
               debug=False,
               input_paths=[]):
        """
        A factory method that creates an Analysis operation based on the
        information given in the operation specification operation_spec.
        If debug is TRUE, the creation of the Analysis Processes will not
        happen in a separate thread.
        """
        assert (operation_spec["type"] == "comp_analysis")
        input_path = operation_spec["input_path"]
        summary = BaseDataset.load(
            os.path.join(pySPACE.configuration.storage, input_path))
        data_dict = summary.data
        ## Done

        # Determine the parameters that should be analyzed
        parameters = operation_spec["parameters"]

        # Determine dependent parameters, which don't get extra resolution
        try:
            dep_par = operation_spec["dep_par"]
        except KeyError:
            dep_par = []

        # Determine the metrics that should be plotted
        import warnings
        spec_metrics = operation_spec["metrics"]
        metrics = []
        for metric in spec_metrics:
            if data_dict.has_key(metric):
                metrics.append(metric)
            else:
                import warnings
                warnings.warn('The metric "' + metric +
                              '" is not contained in the results csv file.')
        if len(metrics) == 0:
            warnings.warn(
                'No metric available from spec file, default to first dict entry.'
            )
            metrics.append(data_dict.keys()[0])

        # Determine how many processes will be created
        number_parameter_values = [
            len(set(data_dict[param])) for param in parameters
        ]
        number_processes = cls._numberOfProcesses(0,
                                                  number_parameter_values) + 1

        logscale = False
        if operation_spec.has_key('logscale'):
            logscale = operation_spec['logscale']

        markertype = 'x'
        if operation_spec.has_key('markertype'):
            markertype = operation_spec['markertype']

        if debug == True:
            # To better debug creation of processes we don't limit the queue
            # and create all processes before executing them
            processes = processing.Queue()
            cls._createProcesses(processes, result_directory, data_dict,
                                 parameters, dep_par, metrics, logscale,
                                 markertype, True)
            return cls(processes, operation_spec, result_directory,
                       number_processes)
        else:
            # Create all plot processes by calling a recursive helper method in
            # another thread so that already created processes can be executed
            # even though process creation is not yet finished. Therefore a queue
            # with a limited size is used to guarantee that not too many objects
            # are created (since this costs memory). However, the actual limit
            # of 100 is arbitrary and might be reviewed.
            processes = processing.Queue(100)
            create_process = processing.Process(
                target=cls._createProcesses,
                args=(processes, result_directory, data_dict, parameters,
                      dep_par, metrics, logscale, markertype, True))
            create_process.start()
            # create and return the comp_analysis operation object
            return cls(processes, operation_spec, result_directory,
                       number_processes, create_process)
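
The metric selection above silently drops metrics that are missing from the result data and falls back to the first available column. A stand-alone sketch of that filtering (column names are invented):

import warnings

def select_metrics(requested, available_columns):
    """Keep only metrics present in the results, warn about the rest and
    fall back to the first available column if nothing matches."""
    metrics = [m for m in requested if m in available_columns]
    for m in requested:
        if m not in available_columns:
            warnings.warn('The metric "%s" is not contained in the results.' % m)
    if not metrics:
        warnings.warn("No requested metric available, defaulting to the first column.")
        metrics.append(list(available_columns)[0])
    return metrics

print(select_metrics(["Balanced_accuracy", "AUC"],
                     ["Balanced_accuracy", "F_measure", "Time"]))
# ['Balanced_accuracy']
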
Example #28
    def __call__(self):
        """ Executes this process on the respective modality """
        ############## Prepare benchmarking ##############
        super(MergeProcess, self).pre_benchmarking()
        
        # For all input collections
        for source_test_collection_path in self.input_collections:
            # Check that the input data is not split,
            # i.e. there is only a single test file in the source directory
            source_files = glob.glob(os.sep.join([source_test_collection_path,
                                                  "data_run0", "*test*"]))
            splitted = len(source_files) > 1
            assert(not splitted)
            source_file_name = str(source_files[-1])
            
            # check if train sets are also present
            train_data_present = len(glob.glob(os.sep.join(
                                 [source_test_collection_path,"data_run0",\
                                  "*train*"]))) > 0
            
            # if training data is present -> use train and test sets separately
            if train_data_present:
                train_set_name_suffix = "train"
            else:
                train_set_name_suffix =  "test"
            
            # We create the collection Rest_vs_Collection
            source_test_collection_name = \
                                   source_test_collection_path.split(os.sep)[-2]
            test_base_collection_name = \
                          source_test_collection_name.strip("}{").split("}{")[0]
            if self.reverse:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         test_base_collection_name + "_vs_Rest")
                key = "train"
            else:
                target_collection_name = source_test_collection_name.replace(
                                         test_base_collection_name,
                                         "Rest_vs_" + test_base_collection_name)
                key = "test"
                
            target_collection_path = os.sep.join([self.result_directory,
                                                  target_collection_name])
            # determine the parameter_settings of the test collection
            test_collection = BaseDataset.load(source_test_collection_path)
            target_collection_params = \
                                 test_collection.meta_data["parameter_setting"]
            target_collection_params["__INPUT_DATASET__"] = \
                                           {key: source_test_collection_name}
            
            if source_file_name.endswith("arff"):
                file_ending = "arff"
                # Copy arff file from input collection to target collection
                source_test_file_path = os.sep.join([source_test_collection_path,
                                        "data_run0","features_sp0" +
                                        train_set_name_suffix + ".arff"])
                target_test_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            
            elif source_file_name.endswith("pickle"):
                file_ending = "pickle"
                source_test_file_path = source_test_collection_path
                target_test_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in " \
                                                               "MergeOperation")
            
            source_train_pathes = []
            for source_train_collection_path in self.input_collections:
                source_train_collection_name = \
                                  source_train_collection_path.split(os.sep)[-2]
                # We must not use data originating from the same input
                # collection both in train and test files
                if source_test_collection_name == source_train_collection_name:
                    continue
                
                # Check that all constraints are fulfilled for this pair of
                # input collections
                if not all(eval(constraint_template % \
                  {'source_train_collection_name': source_train_collection_name,
                   'source_test_collection_name': source_test_collection_name})
                        for constraint_template in self.collection_constraints):
                    continue
                
                # check if all parameters are stored in the target path
                source_collection = \
                                BaseDataset.load(source_train_collection_path)
                source_collection_params = \
                            source_collection.meta_data["parameter_setting"]
                remaining_params = \
                          [param for param in source_collection_params.items() \
                            if param not in target_collection_params.items() and \
                               param[0] not in ["__INPUT_DATASET__", 
                               "__RESULT_DIRECTORY__", "__OUTPUT_BUNDLE__",
                               "__INPUT_COLLECTION__" ]] # for old data
                if remaining_params != []:
                    for k, v in remaining_params:
                        target_collection_path += "{%s#%s}" % (k, str(v))
                        target_collection_params[k] = v
                   
                if "arff" == file_ending:
                    source_train_file_path = \
                                      os.sep.join([source_train_collection_path, 
                                                "data_run0", "features_sp0_" + \
                                               train_set_name_suffix + ".arff"])
                elif "pickle" == file_ending:
                    source_train_file_path = source_train_collection_path

                else:
                    raise NotImplementedError("File type not supported in " \
                                                              "MergeOperation!")     
                    
                source_train_pathes.append(source_train_file_path)
            
            if "arff" == file_ending:
                target_train_file_path = os.sep.join([target_collection_path,
                                       "data_run0","features_sp0_"+key+".arff"])
            elif "pickle" == file_ending:
                target_train_file_path = target_collection_path
            else:
                raise NotImplementedError("File type not supported in "
                                                              "MergeOperation!")     
            
            if len(source_train_pathes) == 0:
                continue
            
            create_directory(os.sep.join([target_collection_path,
                                          "data_run0"]))
            
            if "arff" == file_ending:
                self._copy_arff_file(source_test_file_path, 
                                     target_test_file_path,
                                     source_test_collection_name, 
                                     target_collection_name)
                                
                self._merge_arff_files(target_train_file_path, 
                                       source_train_pathes,
                                       target_collection_name)
                # Copy metadata.yaml
                # TODO: Adapt to new collection
                input_meta = BaseDataset.load_meta_data(source_test_collection_path)
                BaseDataset.store_meta_data(target_collection_path,input_meta)
            elif "pickle" == file_ending:
                self._copy_pickle_file(source_test_collection_path,
                                       target_collection_path,
                                       train_set_name_suffix)

                self._merge_pickle_files(target_train_file_path, 
                                         source_train_pathes, 
                                         train_set_name_suffix,
                                         target_collection_params)
            else:
                raise NotImplementedError("File type not supported in merge_operation")
            
        ############## Clean up after benchmarking ##############
        super(MergeProcess, self).post_benchmarking()
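
The remaining_params handling above encodes every parameter that the source collection adds on top of the target collection directly into the target path as "{key#value}" segments. A small sketch of that naming scheme (the parameter names are invented; only the special __...__ keys are taken from the snippet):

def extend_with_remaining_params(target_path, target_params, source_params):
    """Append a "{key#value}" segment for every source parameter that the
    target collection does not share yet, and record it in target_params."""
    ignored = ["__INPUT_DATASET__", "__RESULT_DIRECTORY__",
               "__OUTPUT_BUNDLE__", "__INPUT_COLLECTION__"]
    remaining = [item for item in source_params.items()
                 if item not in target_params.items() and item[0] not in ignored]
    for key, value in remaining:
        target_path += "{%s#%s}" % (key, str(value))
        target_params[key] = value
    return target_path, target_params

path, params = extend_with_remaining_params(
    "storage/Rest_vs_Set1", {"__lower_cutoff__": 0.1}, {"__upper_cutoff__": 4.0})
print(path)    # storage/Rest_vs_Set1{__upper_cutoff__#4.0}
print(params)  # {'__lower_cutoff__': 0.1, '__upper_cutoff__': 4.0}
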