Example #1
    def node_from_yaml(layer_spec):
        """ Load the specs and initialize the layer nodes """
        assert("parameters" in layer_spec
               and "class_labels" in layer_spec["parameters"]
               and "node" in layer_spec["parameters"]),\
                   "Node requires specification of a node and classification labels!"
        scheme = layer_spec["parameters"].pop("scheme", "1vs1")
        # Create all nodes that are packed together in this layer
        layer_nodes = []
        node_spec = layer_spec["parameters"]["node"][0]
        classes = layer_spec["parameters"]["class_labels"]
        if scheme == '1vR':
            for label in layer_spec["parameters"]["class_labels"]:
                node_obj = BaseNode.node_from_yaml(
                    NodeChainFactory.instantiate(node_spec, {"LABEL": label}))
                layer_nodes.append(node_obj)
        else:
            n = len(classes)
            for i in range(n - 1):
                for j in range(i + 1, n):
                    replace_dict = {"LABEL1": classes[i], "LABEL2": classes[j]}
                    node_obj = BaseNode.node_from_yaml(
                        NodeChainFactory.instantiate(node_spec, replace_dict))
                    layer_nodes.append(node_obj)
        layer_spec["parameters"].pop("node")
        layer_spec["parameters"].pop("class_labels")
        # Create the node object
        node_obj = MultiClassLayerNode(nodes=layer_nodes,
                                       **layer_spec["parameters"])

        return node_obj
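
The "1vs1" branch above builds one classifier node per unordered pair of class labels by letting NodeChainFactory.instantiate fill the LABEL1/LABEL2 placeholders in the node template. A stand-alone sketch of that expansion in plain Python (the node name and parameter string are made up for illustration; this is not pySPACE code):

    from itertools import combinations

    def expand_pairwise(node_template, class_labels):
        """Substitute the LABEL1/LABEL2 placeholders for every unordered class pair."""
        specs = []
        for label1, label2 in combinations(class_labels, 2):
            spec = {key: (value.replace("LABEL1", label1).replace("LABEL2", label2)
                          if isinstance(value, str) else value)
                    for key, value in node_template.items()}
            specs.append(spec)
        return specs

    # hypothetical template and labels, purely to show the expansion
    template = {"node": "2SVM", "parameters": "class_labels: [LABEL1, LABEL2]"}
    print(expand_pairwise(template, ["Target", "Standard", "Artifact"]))
    # -> three specs: (Target, Standard), (Target, Artifact), (Standard, Artifact)
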
Example #2
    def node_from_yaml(layer_spec):
        """ Load the specs and initialize the layer nodes """
        assert("parameters" in layer_spec
               and "class_labels" in layer_spec["parameters"]
               and "node" in layer_spec["parameters"]),\
                   "Node requires specification of a node and classification labels!"
        scheme = layer_spec["parameters"].pop("scheme","1vs1")
        # Create all nodes that are packed together in this layer
        layer_nodes = []
        node_spec = layer_spec["parameters"]["node"][0]
        classes = layer_spec["parameters"]["class_labels"]
        if scheme=='1vR':
            for label in layer_spec["parameters"]["class_labels"]:
                node_obj = BaseNode.node_from_yaml(NodeChainFactory.instantiate(node_spec,{"LABEL":label}))
                layer_nodes.append(node_obj)
        else:
            n=len(classes)
            for i in range(n-1):
                for j in range(i+1,n):
                    replace_dict = {"LABEL1":classes[i],"LABEL2":classes[j]}
                    node_obj = BaseNode.node_from_yaml(NodeChainFactory.instantiate(node_spec,replace_dict))
                    layer_nodes.append(node_obj)
        layer_spec["parameters"].pop("node")
        layer_spec["parameters"].pop("class_labels")
        # Create the node object
        node_obj = MultiClassLayerNode(nodes = layer_nodes,**layer_spec["parameters"])

        return node_obj
Example #3
    def __init__(
        self,
        node_chain_spec,
        parameter_setting,
        rel_dataset_dir,
        run,
        split,
        storage_format,
        result_dataset_directory,
        store_node_chain=False,
        hide_parameters=[],
    ):

        super(NodeChainProcess, self).__init__()

        self.node_chain_spec = node_chain_spec
        self.parameter_setting = parameter_setting
        self.rel_dataset_dir = rel_dataset_dir
        self.storage = pySPACE.configuration.storage
        self.run = run
        self.storage_format = storage_format
        self.result_dataset_directory = result_dataset_directory
        self.persistency_dir = os.sep.join([result_dataset_directory, "persistency_run%s" % run])
        create_directory(self.persistency_dir)
        self.store_node_chain = store_node_chain
        self.hide_parameters = hide_parameters

        # reduce_log_level for process creation
        try:
            console_log_level = (
                eval(pySPACE.configuration.console_log_level)
                if hasattr(pySPACE.configuration, "console_log_level")
                else logging.WARNING
            )
        except (AttributeError, NameError):
            console_log_level = logging.WARNING
        try:
            file_log_level = (
                eval(pySPACE.configuration.file_log_level)
                if hasattr(pySPACE.configuration, "file_log_level")
                else logging.INFO
            )
        except (AttributeError, NameError):
            file_log_level = logging.INFO

        self.min_log_level = min(console_log_level, file_log_level)
        pySPACE.configuration.min_log_level = self.min_log_level
        # Replace parameters in spec file
        #        self.node_chain_spec = replace_parameters_and_convert(
        #            self.node_chain_spec, self.parameter_setting)
        self.node_chain_spec = replace_parameters2(self.node_chain_spec, self.parameter_setting)
        # Create node chain
        self.node_chain = NodeChainFactory.flow_from_yaml(Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)

        for node in self.node_chain:
            node.current_split = split
        # Remove pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in self.parameter_setting:
            self.parameter_setting = copy.deepcopy(self.parameter_setting)
            self.parameter_setting.pop("__PREPARE_OPERATION__")
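
Both log-level lookups above eval() the configured value (a string such as "logging.WARNING"). A sketch of an eval-free alternative based on getattr; the accepted input forms are an assumption for illustration, not pySPACE's documented behaviour:

    import logging

    def resolve_log_level(value, default=logging.WARNING):
        """Map a configured level ("WARNING", "logging.INFO", or an int) to a numeric level."""
        if isinstance(value, int):
            return value
        name = str(value).rsplit(".", 1)[-1].upper()
        level = getattr(logging, name, default)
        return level if isinstance(level, int) else default

    assert resolve_log_level("logging.INFO") == logging.INFO
    assert resolve_log_level("garbage") == logging.WARNING
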
Example #4
    def test_dataflow_from_yaml(self):
        simpleYAMLInput = """
-
    node : Time_Series_Source
-
    node : Detrending
    parameters : 
        detrend_method : "eval(__import__('pylab').detrend_mean)"
- 
    node : Subsampling
    parameters : 
        target_frequency : 100.0
- 
    node : CSP
    parameters : 
         retained_channels : 4
""" 

        flow = NodeChainFactory.flow_from_yaml(NodeChain,
                                               simpleYAMLInput)
        self.assert_(isinstance(flow, NodeChain) and len(flow) == 4)
        self.assert_(isinstance(flow[0], TimeSeriesSourceNode) and
                     isinstance(flow[1], DetrendingNode) and
                     isinstance(flow[2], SubsamplingNode) and
                     isinstance(flow[3], CSPNode))
        self.assert_(flow[1].detrend_method == pylab.detrend_mean)
        self.assert_(flow[2].target_frequency == 100.0)
        self.assert_(flow[3].retained_channels == 4)
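
The spec handed to flow_from_yaml is ordinary YAML: a sequence of mappings, each with a node name and an optional parameters mapping. A quick structural check of the same spec with plain PyYAML (assumes PyYAML is installed; no pySPACE required):

    import yaml

    spec = """
    - node: Time_Series_Source
    - node: Detrending
      parameters:
        detrend_method: "eval(__import__('pylab').detrend_mean)"
    - node: Subsampling
      parameters:
        target_frequency: 100.0
    - node: CSP
      parameters:
        retained_channels: 4
    """

    chain = yaml.safe_load(spec)
    assert [entry["node"] for entry in chain] == [
        "Time_Series_Source", "Detrending", "Subsampling", "CSP"]
    assert chain[2]["parameters"]["target_frequency"] == 100.0
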
Example #5
    def test_dataflow_from_yaml(self):
        simpleYAMLInput = """
-
    node : Time_Series_Source
-
    node : Detrending
    parameters : 
        detrend_method : "eval(__import__('pylab').detrend_mean)"
- 
    node : Subsampling
    parameters : 
        target_frequency : 100.0
- 
    node : CSP
    parameters : 
         retained_channels : 4
"""

        flow = NodeChainFactory.flow_from_yaml(NodeChain, simpleYAMLInput)
        self.assert_(isinstance(flow, NodeChain) and len(flow) == 4)
        self.assert_(
            isinstance(flow[0], TimeSeriesSourceNode)
            and isinstance(flow[1], DetrendingNode)
            and isinstance(flow[2], SubsamplingNode)
            and isinstance(flow[3], CSPNode))
        self.assert_(flow[1].detrend_method == pylab.detrend_mean)
        self.assert_(flow[2].target_frequency == 100.0)
        self.assert_(flow[3].retained_channels == 4)
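
assert_ is the long-deprecated unittest alias of assertTrue; the same checks read more precisely with assertEqual and assertIsInstance. A self-contained sketch of that idiom on stand-in data (not the actual pySPACE flow):

    import unittest

    class AssertionStyleSketch(unittest.TestCase):
        def test_modern_assertions(self):
            flow = ["source", "detrend", "subsample", "csp"]  # stand-in for the NodeChain
            self.assertEqual(len(flow), 4)
            self.assertIsInstance(flow[0], str)
            self.assertEqual(flow[2], "subsample")

    if __name__ == "__main__":
        unittest.main()
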
Example #6
    def __init__(self,
                 node_chain_spec,
                 parameter_setting,
                 rel_dataset_dir,
                 run,
                 split,
                 storage_format,
                 result_dataset_directory,
                 store_node_chain=False,
                 hide_parameters=[]):

        super(NodeChainProcess, self).__init__()

        self.node_chain_spec = node_chain_spec
        self.parameter_setting = parameter_setting
        self.rel_dataset_dir = rel_dataset_dir
        self.storage = pySPACE.configuration.storage
        self.run = run
        self.storage_format = storage_format
        self.result_dataset_directory = result_dataset_directory
        self.persistency_dir = os.sep.join(
            [result_dataset_directory,
             "persistency_run%s" % run])
        create_directory(self.persistency_dir)
        self.store_node_chain = store_node_chain
        self.hide_parameters = hide_parameters

        # reduce_log_level for process creation
        try:
            console_log_level = eval(pySPACE.configuration.console_log_level) \
                if hasattr(pySPACE.configuration, "console_log_level") \
                else logging.WARNING
        except (AttributeError, NameError):
            console_log_level = logging.WARNING
        try:
            file_log_level = eval(pySPACE.configuration.file_log_level) \
                if hasattr(pySPACE.configuration, "file_log_level") \
                else logging.INFO
        except (AttributeError, NameError):
            file_log_level = logging.INFO

        self.min_log_level = min(console_log_level, file_log_level)
        pySPACE.configuration.min_log_level = self.min_log_level
        # Replace parameters in spec file
        #        self.node_chain_spec = replace_parameters_and_convert(
        #            self.node_chain_spec, self.parameter_setting)
        self.node_chain_spec = replace_parameters2(self.node_chain_spec,
                                                   self.parameter_setting)
        # Create node chain
        self.node_chain = NodeChainFactory.flow_from_yaml(
            Flow_Class=BenchmarkNodeChain, flow_spec=self.node_chain_spec)

        for node in self.node_chain:
            node.current_split = split
        # Remove pseudo parameter "__PREPARE_OPERATION__"
        if "__PREPARE_OPERATION__" in self.parameter_setting:
            self.parameter_setting = copy.deepcopy(self.parameter_setting)
            self.parameter_setting.pop("__PREPARE_OPERATION__")
Example #7
    def prepare_adaptation(self, adaptation_files, datasets):
        """ Prepares the threshold adaptation.
        """

        online_logger.info("Preparing Adaptation")
        online_logger.info("adaptation files:" + str(adaptation_files))
        for key in self.datasets.keys():
            if "threshold_adaptation_flow" in self.datasets[key]:
                spec_base = self.datasets[key]["configuration"].spec_dir
                self.datasets[key]["threshold_adaptation_flow"] = os.path.join(
                    spec_base, self.datasets[key]["threshold_adaptation_flow"])
                online_logger.info(
                    "windower_spec_path:" +
                    self.datasets[key]["windower_spec_threshold_adaptation"])
                online_logger.info(
                    "dataflow_spec_" + key + ":" +
                    self.datasets[key]["threshold_adaptation_flow"])
                self.adaptation_active_potential[key] = multiprocessing.Value(
                    'b', False)

        # start the eeg server
        # check if multiple datasets are given for adaptation
        if hasattr(adaptation_files, '__iter__'):
            self.adaptation_data = adaptation_files
            online_logger.debug("Using multiple data sets:" +
                                str(self.adaptation_data))
        else:
            self.adaptation_data = [adaptation_files]

        # Adaptation is done in separate threads, we send the time series
        # windows to these threads via two queues
        online_logger.info("Initializing Queues")
        for key in self.datasets.keys():
            self.queue[key] = multiprocessing.Queue()
        online_logger.info("Creating flows")

        def flow_generator(key):
            """create a generator to yield all the windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window is None:
                    break
                yield window

        # Create the actual data flows for S1 vs P3 discrimination
        # and S1 vs LRP discrimination
        for key in self.datasets.keys():
            if "threshold_adaptation_flow" in self.datasets[key]:
                self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(
                        self.datasets[key]["threshold_adaptation_flow"]))
                self.aBRI_flow[key][0].set_generator(flow_generator(key))

        online_logger.info("threshold adaptation preparations finished")
        return 0
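
flow_generator is the piece that couples the online system to a node chain: a multiprocessing.Queue feeds windows in, and a None item serves as the end-of-stream sentinel. A self-contained sketch of just this pattern:

    import multiprocessing

    def queue_generator(queue):
        """Yield items from the queue until a None sentinel arrives."""
        while True:
            item = queue.get(block=True, timeout=None)
            if item is None:
                break
            yield item

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        for window in ("window-1", "window-2", "window-3"):
            q.put(window)
        q.put(None)                      # end-of-stream marker
        print(list(queue_generator(q)))  # ['window-1', 'window-2', 'window-3']
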
Example #8
    def prepare_adaptation(self, adaptation_files, datasets, nullmarker_stride_ms = None):
        """ Prepares the threshold adaptation.
        """

        online_logger.info( "Preparing Adaptation")
        online_logger.info( "adaptation files:" + str(adaptation_files))
        
        self.nullmarker_stride_ms = nullmarker_stride_ms
        if self.nullmarker_stride_ms is None:
            online_logger.warn( 'Nullmarker stride interval is %s. You can specify it in your parameter file.' % self.nullmarker_stride_ms)
        else:
            online_logger.info( 'Nullmarker stride interval is set to %s ms' % self.nullmarker_stride_ms)
        
        for key in self.datasets.keys():
            if "threshold_adaptation_flow" in self.datasets[key]:
                spec_base = self.datasets[key]["configuration"].spec_dir
                self.datasets[key]["threshold_adaptation_flow"] = os.path.join(spec_base, self.datasets[key]["threshold_adaptation_flow"])
                online_logger.info( "windower_spec_path:" + self.datasets[key]["windower_spec_threshold_adaptation"])
                online_logger.info( "dataflow_spec_" + key + ":" + self.datasets[key]["threshold_adaptation_flow"])
                self.adaptation_active_potential[key] = multiprocessing.Value('b',False)

        # start the eeg server
        # check if multiple datasets are given for adaptation
        if hasattr(adaptation_files,'__iter__'):
            self.adaptation_data = adaptation_files
            online_logger.debug("Using multiple data sets:" + str(self.adaptation_data))
        else:
            self.adaptation_data = [adaptation_files]


        # Adaptation is done in separate threads, we send the time series
        # windows to these threads via two queues
        online_logger.info( "Initializing Queues")
        for key in self.datasets.keys():
            self.queue[key] = multiprocessing.Queue()
        online_logger.info( "Creating flows")

        def flow_generator(key):
            """create a generator to yield all the windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block = True, timeout = None)
                if window is None:
                    break
                yield window

        # Create the actual data flows for S1 vs P3 discrimination
        # and S1 vs LRP discrimination
        for key in self.datasets.keys():
            if "threshold_adaptation_flow" in self.datasets[key]:
                self.aBRI_flow[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                         flow_spec = file(self.datasets[key]["threshold_adaptation_flow"]))
                self.aBRI_flow[key][0].set_generator(flow_generator(key))

        online_logger.info( "threshold adaptation preparations finished")
        return 0
Example #9
    def prepare_training(self,
                         training_files,
                         potentials,
                         operation,
                         nullmarker_stride_ms=None):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info("Preparing Training")
        self.potentials = potentials
        self.operation = operation
        self.nullmarker_stride_ms = nullmarker_stride_ms
        if self.nullmarker_stride_ms is None:
            online_logger.warn(
                'Nullmarker stride interval is %s. You can specify it in your parameter file.'
                % self.nullmarker_stride_ms)
        else:
            online_logger.info('Nullmarker stride interval is set to %s ms ' %
                               self.nullmarker_stride_ms)

        online_logger.info("Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(
                    spec_base, self.potentials[key]["node_chain"])
                online_logger.info("node_chain_spec:" +
                                   self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info("prewindowing_dataflow_spec: " +
                                   self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info("postprocessing_dataflow_spec: " +
                                   self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value(
                "b", False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these threads via two queues
        online_logger.info("Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()

        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window is None:
                    break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info(
                    "file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info(
                        "deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error(
                        "Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " +
                                   str(prewindowed_sets))

                for s, d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d, (sample, label) in enumerate(data):
                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match(r"^s\s{0,2}[89]$", k,
                                             re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(
                                        str("remove %s from %d %d" %
                                            (m.group(), s, d)))
                                    del (sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) +
                                                   " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" +
                                                   str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " +
                                   final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " +
                                   final_collection_path)
                self.prewindowed_data[key] = BaseDataset.load(
                    final_collection_path)
                self.node_chains[key][0].set_input_dataset(
                    self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            # create window_stream for every potential

            if self.operation == "prewindowing":
                window_spec_file = os.path.join(
                    spec_base, "node_chains", "windower",
                    self.potentials[key]["windower_spec_path_train"])

                self.window_stream[key] = \
                    self.stream_manager.request_window_stream(
                        window_spec_file,
                        nullmarker_stride_ms=self.nullmarker_stride_ms)
            elif self.operation == "prewindowing_offline":
                pass
            elif self.operation == "train":
                pass

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed when using the stream mode!
        if self.operation == "train":
            online_logger.info("Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(
                            str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass  # dir was probably already deleted..

                if os.path.exists(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data")):
                    shutil.rmtree(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data"))
                    online_logger.info(
                        "deleted concatenated training data for " + key)

        online_logger.info("Training preparations finished")
        return 0
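
The prewindowing cleanup above calls self.find_files_older_than(then, directory), which is not part of this excerpt. A hedged guess at such a helper, matching the call site (None for a missing directory, otherwise entries modified before the threshold); the implementation and the example path in the usage comment are assumptions, not the real pySPACE helper:

    import datetime
    import os

    def find_files_older_than(threshold, directory):
        """Return entries directly under directory last modified before threshold
        (a datetime), or None if the directory does not exist. Sketch only."""
        if not os.path.isdir(directory):
            return None
        old_entries = []
        for name in os.listdir(directory):
            path = os.path.join(directory, name)
            mtime = datetime.datetime.fromtimestamp(os.path.getmtime(path))
            if mtime < threshold:
                old_entries.append(path)
        return old_entries

    # usage mirroring the cleanup above (path is made up):
    # then = datetime.datetime.now() - datetime.timedelta(hours=12)
    # for f in find_files_older_than(then, "prewindowed_data/P3") or []:
    #     print("would delete", f)
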
Example #10
    def _stop_training(self):
        """ Do the optimization step and define final parameter choice
        
        This is the main method of this node!
        
        .. todo:: Allow also parallelization over nominal_ranges! 
        """
        self._log("Starting optimization Process.")
        self.runs = [10 * self.run_number + run for run in range(self.runs)]
        original_flow_template = copy.copy(self.flow_template)
        # Fill in validation parameters in the template
        self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
            original_flow_template, self.validation_parameter_settings)
        if self.nom_rng is None:
            self.prepare_optimization()
            self.best_parametrization, self.best_performance = \
                self.get_best_parametrization()
            self.performance_dict[self.p2key(self.best_parametrization)] = \
                (self.best_performance, self.best_parametrization)
        else:
            nom_grid = self.search_grid(self.nom_rng)
            iterations = 0
            search_history = []
            # copy flow_template since we have to instantiate for every nom_par
            flow_template = copy.copy(self.flow_template)
            for nom_par in nom_grid:
                # for getting the best parameterization,
                # the class attribute flow_template must be overwritten
                self.flow_template = \
                    NodeChainFactory.replace_parameters_in_node_chain(
                        flow_template, nom_par)
                self.prepare_optimization()
                parametrization, performance = self.get_best_parametrization()
                self.performance_dict[self.p2key(nom_par)] = (performance,
                                                              parametrization)
                iterations += self.iterations
                search_history.append((nom_par, self.search_history))
                # reinitialize optimization parameters
                self.re_init()
            # reconstructing the overwritten flow for further usage
            self.flow_template = flow_template
            self.iterations = iterations
            self.search_history = sorted(
                search_history, key=lambda t: t[1][-1]["best_performance"])
            best_key = max(sorted(self.performance_dict.items()),
                           key=lambda t: t[1])[0]
            self.best_performance, self.best_parametrization = \
                self.performance_dict[best_key]
            self.best_parametrization.update(dict(best_key))
        # when best parameter dict is calculated, this has to be logged
        # or saved and the chosen parameter is used for training on the
        # whole data set, independent of the chosen algorithm
        self._log("Using parameterization %s with optimal performance %s for " \
                  "metric %s." % (self.best_parametrization,
                                  self.best_performance, self.metric))
        # Fill in the final parameters in the flow template
        self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
            original_flow_template, self.final_training_parameter_settings)
        best_flow_template = self.flow_template
        best_flow_template[1] = {'node': 'All_Train_Splitter'}
        #delete last node
        best_flow_template.pop(-1)
        self.flow = self.generate_subflow(best_flow_template, 
                                          self.best_parametrization, NodeChain)
        self.flow[-1].set_run_number(self.run_number)
        self.flow[0].set_generator(self.train_instances)
        self.flow.train()
        self._log("Training of optimal flow finished")

        # delete training instances that would be stored to disk if this node
        # is saved
        del self.train_instances
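
This variant fills the whole template with NodeChainFactory.replace_parameters_in_node_chain, while Example #11 below maps NodeChainFactory.instantiate over the nodes one at a time. A pure-Python stand-in illustrating how the two relate; the placeholder syntax and node names are made up, and the equivalence is a reading of the two excerpts rather than a documented contract:

    def instantiate(template, parametrization):
        """Illustrative stand-in: substitute placeholders in string values."""
        filled = {}
        for key, value in template.items():
            if isinstance(value, str):
                for placeholder, replacement in parametrization.items():
                    value = value.replace(placeholder, str(replacement))
            filled[key] = value
        return filled

    def replace_parameters_in_node_chain(flow_template, parametrization):
        """Chain-wide replacement as the per-node substitution mapped over the list."""
        return [instantiate(node, parametrization) for node in flow_template]

    flow_template = [{"node": "All_Train_Splitter"},
                     {"node": "Classifier", "parameters": "complexity: __C__"}]
    print(replace_parameters_in_node_chain(flow_template, {"__C__": 0.1}))
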
Example #11
    def _stop_training(self):
        """ Do the optimization step and define final parameter choice
        
        This is the main method of this node!
        
        .. todo:: Allow also parallelization over nominal_ranges! 
        """
        self._log("Starting optimization Process.")
        self.runs = [10 * self.run_number + run for run in range(self.runs)]
        original_flow_template = copy.copy(self.flow_template)
        # Fill in validation parameters in the template
        if self.validation_parameter_settings != {}:
            self.flow_template = [
                NodeChainFactory.instantiate(
                    template=node,
                    parametrization=self.validation_parameter_settings)
                for node in original_flow_template
            ]
        if self.nom_rng is None:
            self.prepare_optimization()
            self.best_parametrization, self.best_performance = \
                                                 self.get_best_parametrization()
            self.performance_dict[self.p2key(self.best_parametrization)] = \
                              (self.best_performance, self.best_parametrization)
        else:
            nom_grid = self.search_grid(self.nom_rng)
            iterations = 0
            search_history = []
            # copy flow_template since we have to instantiate for every nom_par
            flow_template = copy.copy(self.flow_template)
            for nom_par in nom_grid:
                # for getting the best parameterization,
                # the class attribute flow_template must be overwritten
                self.flow_template = [
                    NodeChainFactory.instantiate(template=node,
                                                 parametrization=nom_par)
                    for node in flow_template
                ]
                self.prepare_optimization()
                parametrization, performance = self.get_best_parametrization()
                self.performance_dict[self.p2key(nom_par)] = (performance,
                                                              parametrization)
                iterations += self.iterations
                search_history.append((nom_par, self.search_history))
                # reinitialize optimization parameters
                self.re_init()
            # reconstructing the overwritten flow for further usage
            self.flow_template = flow_template
            self.iterations = iterations
            self.search_history = sorted(
                search_history, key=lambda t: t[1][-1]["best_performance"])
            best_key = max(sorted(self.performance_dict.items()),
                           key=lambda t: t[1])[0]
            self.best_performance, self.best_parametrization = \
                                                 self.performance_dict[best_key]
            self.best_parametrization.update(dict(best_key))
        # when best parameter dict is calculated, this has to be logged
        # or saved and the chosen parameter is used for training on the
        # whole data set, independent of the chosen algorithm
        self._log("Using parameterization %s with optimal performance %s for " \
                  "metric %s." % (self.best_parametrization,
                                            self.best_performance, self.metric))
        # Fill in the final parameters in the flow template
        if self.final_training_parameter_settings != {}:
            self.flow_template = [
                NodeChainFactory.instantiate(
                    template=node,
                    parametrization=self.final_training_parameter_settings)
                for node in original_flow_template
            ]
        else:
            self.flow_template = original_flow_template
        best_flow_template = self.flow_template
        best_flow_template[1] = {'node': 'All_Train_Splitter'}
        #delete last node
        best_flow_template.pop(-1)
        self.flow = self.generate_subflow(best_flow_template,
                                          self.best_parametrization, NodeChain)
        self.flow[-1].set_run_number(self.run_number)
        self.flow[0].set_generator(self.train_instances)
        self.flow.train()
        self._log("Training of optimal flow finished")

        # delete training instances that would be stored to disk if this node
        # is saved
        del self.train_instances
Example #12
    def prepare_training(self, training_files, potentials, operation):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info( "Preparing Training")
        self.potentials = potentials
        self.operation = operation

        online_logger.info( "Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"])
                online_logger.info( "node_chain_spec:" + self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                if self.potentials[key].get("stream", False) == True:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"])
                else:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info( "prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                if self.potentials[key].get("stream", False) == True:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"])
                else:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info( "postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value("b",False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these threads via two queues
        online_logger.info( "Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()


        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block = True, timeout = None)
                if window is None:
                    break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                         flow_spec = file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                             flow_spec = file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                if self.potentials[key].get("stream", False) == True:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                                     flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    # create windower
                    online_logger.info( "Creating Windower")
                    online_logger.info(self.potentials[key]["windower_spec_path_train"])
                    self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                    replace_start_and_end_markers = True
                else:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info("deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error("Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets))

                for s,d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d,(sample,label) in enumerate(data):

                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match(r"^s\s{0,2}[89]$", k, re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(str("remove %s from %d %d" % (m.group(), s, d)))
                                    del(sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) + " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" + str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " + final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " + final_collection_path)
                self.prewindowed_data[key] =  BaseDataset.load(final_collection_path)
                self.node_chains[key][0].set_input_dataset(self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed
        # when using the stream mode!
        if self.operation == "train":
            online_logger.info( "Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass # dir was probably already deleted..

                if os.path.exists(os.path.join(self.prewindowed_data_directory, key, "all_train_data")):
                    shutil.rmtree(os.path.join(self.prewindowed_data_directory, key, "all_train_data"))
                    online_logger.info("deleted concatenated training data for " + key)


        online_logger.info( "Training preparations finished")
        return 0
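
Several of the examples read the flow definition back with yaml.load(flow). On current PyYAML (5.1 warns without an explicit Loader, 6.0 requires one), yaml.safe_load is the usual drop-in for plain specs like these node chains. A minimal sketch:

    import io
    import yaml

    flow = io.StringIO("- node: Time_Series_Source\n"
                       "- node: CSP\n"
                       "  parameters:\n"
                       "    retained_channels: 4\n")
    node_chain_definition = yaml.safe_load(flow)
    # explicit equivalent: yaml.load(flow, Loader=yaml.SafeLoader)
    print(node_chain_definition)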