Example #1
 def get_result_dataset(self):
     """ Return the result """
     # Merges all timeseries inside the collection if merge flag is set to true
     if self.merge:
         merged_time_series = self.merge_time_series(
             self.time_series_collection)
         self.time_series_collection = None
         self.time_series_collection = \
                   TimeSeriesDataset(sort_string=self.sort_string)
         self.time_series_collection.add_sample(merged_time_series,
                                                label='Window',
                                                train=False)
     return self.time_series_collection
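This get_result_dataset variant shows the sink's merge pattern: when the merge flag is set, the accumulated collection is discarded and replaced by a fresh TimeSeriesDataset holding exactly one merged sample labeled 'Window'. Below is a minimal sketch of that control flow which runs without pySPACE installed; FakeDataset is a hypothetical stand-in for TimeSeriesDataset and the merge function is a toy list concatenation, not the real merge_time_series.

class FakeDataset(object):
    """Hypothetical stand-in for TimeSeriesDataset: just records samples."""
    def __init__(self, sort_string=None):
        self.sort_string = sort_string
        self.samples = []

    def add_sample(self, data, label, train):
        self.samples.append((data, label, train))


def get_result_dataset(collection, merge, merge_fn, sort_string=None):
    """Mirror of the sink logic above: optionally collapse everything into one sample."""
    if merge:
        merged = merge_fn(collection)
        collection = FakeDataset(sort_string=sort_string)
        collection.add_sample(merged, label='Window', train=False)
    return collection


# Toy usage: three "windows" merged by simple list concatenation.
windows = FakeDataset()
for w in ([1, 2], [3, 4], [5, 6]):
    windows.add_sample(w, label='Standard', train=False)
result = get_result_dataset(
    windows, merge=True,
    merge_fn=lambda c: sum((d for d, _, _ in c.samples), []))
print(result.samples)   # [([1, 2, 3, 4, 5, 6], 'Window', False)]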
Example #2
    def generate_data_set(self):
        """ Generate a dataset using the given generators """

        self.dataset = TimeSeriesDataset()

        # generate a set of dummy labels to know which class is used later
        label_sequence = numpy.hstack(
            (numpy.ones(self.ir_items), numpy.zeros(self.nir_items)))

        if self.shuffle:
            random.shuffle(label_sequence)

        ts_generator = TestTimeSeriesGenerator()

        current_item = 0  # count produced data objects for drift
        for label in label_sequence:
            if label == 1:
                #generate a data item using the ir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names),
                        time_points=self.time_points,
                        function=self.ir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.ir_drift_vector
                self.dataset.add_sample(data_item, self.ir_label, False)

            else:
                #generate a data item using the nir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names),
                        time_points=self.time_points,
                        function=self.nir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.nir_drift_vector
                self.dataset.add_sample(data_item, self.nir_label, False)

            current_item += 1. / (self.ir_items + self.nir_items)
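The drift bookkeeping above is just a linear ramp: current_item starts at 0 and grows by 1/(ir_items + nir_items) after each generated item, so item i is shifted by i/N times the corresponding drift vector. A small numpy sketch of only that scaling (the generator calls themselves are left out):

import numpy

ir_items, nir_items = 3, 2
n_items = ir_items + nir_items
time_points, num_channels = 4, 2
drift_vector = numpy.ones((time_points, num_channels))

current_item = 0.0
factors = []
for _ in range(n_items):
    factors.append(current_item)      # factor applied to the current item
    current_item += 1. / n_items
print(factors)                        # approximately [0.0, 0.2, 0.4, 0.6, 0.8]

# Closed form of the same ramp; the drift added to item i is factors[i] * drift_vector.
print(numpy.allclose(factors, numpy.arange(n_items) / float(n_items)))   # True
print((factors[-1] * drift_vector).shape)                                # (4, 2)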
Example #3
    def process_current_split(self):
        """ 
        Compute the results of this sink node for the current split of the data
        into train and test data
        """
        index = 0
        # Compute the time series for the data used for training
        for time_series, label in self.input_node.request_data_for_training(
                False):
            # Do lazy initialization of the class
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)

            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=True,
                    split=self.current_split,
                    run=self.run_number)
            index += 1

        # Compute the time series for the data used for testing
        index = 0
        for time_series, label in self.input_node.request_data_for_testing():
            # Do lazy initialization of the class
            # (maybe there were no training examples)
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)

            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=False,
                    split=self.current_split,
                    run=self.run_number)
            index += 1
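Note how the loops above keep incrementing index for every window delivered by the input node but only call add_sample while index is below max_num_stored_objects; any remaining windows are consumed and dropped. A plain-Python sketch of that capping, with a list standing in for the dataset:

def collect_capped(samples, max_num_stored_objects):
    """Store at most max_num_stored_objects items, but consume the whole stream."""
    stored = []
    index = 0
    for sample in samples:
        if index < max_num_stored_objects:
            stored.append(sample)
        index += 1
    return stored, index

stored, seen = collect_capped(range(10), max_num_stored_objects=3)
print(stored, seen)   # [0, 1, 2] 10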
Example #4
 def get_result_dataset(self):
     """ Return the result """
     # Merges all timeseries inside the collection if merge flag is set to true
     if self.merge:
         merged_time_series = self.merge_time_series(self.time_series_collection)
         self.time_series_collection = None
         self.time_series_collection = \
                   TimeSeriesDataset(sort_string=self.sort_string)
         self.time_series_collection.add_sample(merged_time_series,
                                              label = 'Window',
                                              train = False)
     return self.time_series_collection
Example #5
 def process_current_split(self):
     """ 
     Compute the results of this sink node for the current split of the data
     into train and test data
     """
     index = 0
     # Compute the time series for the data used for training
     for time_series, label in self.input_node.request_data_for_training(False):
         # Do lazy initialization of the class 
         if self.time_series_collection == None:
             self.time_series_collection = \
                         TimeSeriesDataset(sort_string=self.sort_string)
         
         if index < self.max_num_stored_objects:
             # Add sample
             self.time_series_collection.add_sample(time_series,
                                                    label = label,
                                                    train = True,
                                                    split = self.current_split,
                                                    run = self.run_number)
         index += 1
         
     # Compute the time series for the data used for testing
     index = 0
     for time_series, label in self.input_node.request_data_for_testing():
         # Do lazy initialization of the class 
         # (maybe there were no training examples)
         if self.time_series_collection == None:
             self.time_series_collection = \
                         TimeSeriesDataset(sort_string=self.sort_string)
         
         if index < self.max_num_stored_objects:
             # Add sample
             self.time_series_collection.add_sample(time_series,
                                                label = label,
                                                train = False,
                                                split = self.current_split,
                                                run = self.run_number)
         index += 1
Example #6
    def generate_data_set(self):
        """ Generate a dataset using the given generators """
        
        self.dataset = TimeSeriesDataset()
        
        # generate a set of dummy labels to know which class is used later
        label_sequence = numpy.hstack((numpy.ones(self.ir_items),numpy.zeros(self.nir_items)))
        
        if self.shuffle:
            random.shuffle(label_sequence)
            
        ts_generator = TestTimeSeriesGenerator()
        
        current_item = 0 # count produced data objects for drift
        for label in label_sequence:
            if label == 1:
                #generate a data item using the ir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names), 
                        time_points=self.time_points, 
                        function=self.ir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item*self.ir_drift_vector
                self.dataset.add_sample(data_item,self.ir_label,False)
                
            else:
                #generate a data item using the nir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names), 
                        time_points=self.time_points, 
                        function=self.nir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item*self.nir_drift_vector
                self.dataset.add_sample(data_item,self.nir_label,False)

            current_item += 1./(self.ir_items+self.nir_items)
Example #7
class DataGenerationTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ Generate data of two classes for testing
    
    This node can generate data according to the specifications
    of two different DataGenerators. 
    
    It generates objects of the type TimeSeries
    
    **Parameters**
        :ir_generator:
            A generator of type DataGenerator for data items of the 
            information relevant class.
            If it is specified in a node chain, it should be given as a
            string.
            
            (*optional, default: "Adder([Sine(),GaussianNoise()])"*)
            
        :nir_generator:
            A generator of type DataGenerator for data items of the 
            non-information-relevant class.
            If it is specified in a node chain, it should be given as a
            string.
            
            (*optional, default: "GaussianNoise()"*)
            
        :ir_items:
            Number of items that should be generated for the ir class.
            
            (*optional, default: 100*)
            
        :nir_items:
            Number of items that should be generated for the non ir class.
            
            (*optional, default: 100*)
            
        :channel_names:
            List of strings for the channel names. 
            Determines also the number of generated
            channels.
            
            (*optional*)

        :num_channels:
            Number of channels. Unused, if channel_names is set.
            
            (*optional, default: 16*)
            
        :ir_label:
            The label for the ir_class.
            
            (*optional, default: 'Target'*)
            
        :nir_label:
            The label for the nir_class.
            
            (*optional, default: 'Standard'*)
            
        :shuffle:
            If the data items for the two classes are shuffled.
            
            (*optional, default: True*)
            
        :time_points:
            Number of points per channel in a generated TimeSeries
            object. 
            
            (*optional, default: 100*)
            
         :sampling_frequency:
             Sampling rate of the generated data.
             Important for sines etc.
             
             A generated time series object has a
             temporal length of time_points/sampling_frequency
         
             (*optional, default: 1000*)
             
          :ir_drift_vector:
              Drift of the ir class data.
              Specify a vector (numpy array) of shape (time_points,num_channels)
              and a linear drift in this direction will be added to the
              generated data:
              [0 * ir_drift_vector]                       added to first sample,
              [1/(ir_items+nir_items) * ir_drift_vector]  to the second sample
              [...]                                       and so on, until
              [1 * ir_drift_vector]                       added to last sample.
              
              In a node chain specification, the drift vector can, e.g.,
              be given like this:
              ir_drift_vector : "eval(__import__('numpy').asarray([[1,1],[2,2]]))"
              
              (*optional, default: None*)
            
          :nir_drift_vector:
              Drift of the nir class data. See ir_drift_vector.
              
              (*optional, default: None*)              
              
    **Exemplary Call**
    
    .. code-block:: yaml
    
        -
            node : Data_Generation_Source
            parameters :
                ir_generator : "Adder([SineGenerator(),GaussianNoiseGenerator()])"
                nir_generator : "GaussianNoiseGenerator()"
                
    
    :Author: Hendrik Woehrle
    :Created: 201/07/27
    """
    
    def __init__(self, 
                 ir_generator="Adder([Sine(),GaussianNoise()])",
                 nir_generator="GaussianNoise()",
                 ir_items=100,
                 nir_items=100,
                 ir_drift_vector=None,
                 nir_drift_vector=None,
                 channel_names=None,
                 num_channels=16,
                 ir_label='Target',
                 nir_label='Standard',
                 time_points=100,
                 sampling_frequency=1000,
                 shuffle=True,
                 **kwargs):
        super(DataGenerationTimeSeriesSourceNode, self).__init__(**kwargs)
        
        if type(ir_generator) == str:
            ir_generator = eval(ir_generator)
        
        if type(nir_generator) == str:
            nir_generator = eval(nir_generator)

        ir_generator.sampling_frequency = sampling_frequency
        nir_generator.sampling_frequency = sampling_frequency
        
        run_number = 0
        
        dataset = None

        if not channel_names is None:
            num_channels = len(channel_names)
        else:
            channel_names = []
            for i in xrange(num_channels):
                channel_names.append(str(i))
        
        # Translate drift "None" to zero-vector 
        if ir_drift_vector is None:
            ir_drift_vector = numpy.zeros((time_points,num_channels))
        if nir_drift_vector is None:
            nir_drift_vector = numpy.zeros((time_points,num_channels))
        
        self.set_permanent_attributes(dataset=dataset,
                                      ir_generator=ir_generator,
                                      nir_generator=nir_generator,
                                      ir_items=ir_items,
                                      nir_items=nir_items,
                                      channel_names=channel_names,
                                      num_channels=num_channels,
                                      ir_label=ir_label,
                                      nir_label=nir_label,
                                      time_points=time_points,
                                      sampling_frequency=sampling_frequency,
                                      shuffle=shuffle,
                                      run_number=run_number,
                                      data_for_testing=None,
                                      data_for_training=None,
                                      ir_drift_vector=ir_drift_vector,
                                      nir_drift_vector=nir_drift_vector)
        
        self.generate_data_set()
        
    def set_input_dataset(self, dataset):
        """ Instead of using a given dataset, a new one is generated """
        self.generate_data_set()
        
        
    def generate_data_set(self):
        """ Generate a dataset using the given generators """
        
        self.dataset = TimeSeriesDataset()
        
        # generate a set of dummy labels to know which class is used later
        label_sequence = numpy.hstack((numpy.ones(self.ir_items),numpy.zeros(self.nir_items)))
        
        if self.shuffle:
            random.shuffle(label_sequence)
            
        ts_generator = TestTimeSeriesGenerator()
        
        current_item = 0 # count produced data objects for drift
        for label in label_sequence:
            if label == 1:
                #generate a data item using the ir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names), 
                        time_points=self.time_points, 
                        function=self.ir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item*self.ir_drift_vector
                self.dataset.add_sample(data_item,self.ir_label,False)
                
            else:
                #generate a data item using the nir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names), 
                        time_points=self.time_points, 
                        function=self.nir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item*self.nir_drift_vector
                self.dataset.add_sample(data_item,self.nir_label,False)

            current_item += 1./(self.ir_items+self.nir_items)
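In the constructor above the two generators may arrive as strings (as they would from a YAML node chain) and are turned into objects with eval before the sampling frequency is stamped onto them. The sketch below reproduces just that mechanism with hypothetical stand-in classes for Sine, GaussianNoise and Adder; the real classes (of type DataGenerator, per the docstring) live elsewhere in pySPACE and are not defined here.

class Sine(object):
    """Hypothetical stand-in for a sine DataGenerator."""
    pass


class GaussianNoise(object):
    """Hypothetical stand-in for a Gaussian noise DataGenerator."""
    pass


class Adder(object):
    """Hypothetical stand-in that sums the output of several generators."""
    def __init__(self, generators):
        self.generators = generators


ir_generator = "Adder([Sine(),GaussianNoise()])"    # default value from __init__
if type(ir_generator) == str:
    ir_generator = eval(ir_generator)               # same conversion the node performs

ir_generator.sampling_frequency = 1000
print(type(ir_generator).__name__, ir_generator.sampling_frequency)    # Adder 1000

Since eval executes arbitrary code, this conversion only makes sense for trusted specification files.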
Example #8
    def prepare_training(self,
                         training_files,
                         potentials,
                         operation,
                         nullmarker_stride_ms=None):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info("Preparing Training")
        self.potentials = potentials
        self.operation = operation
        self.nullmarker_stride_ms = nullmarker_stride_ms
        if self.nullmarker_stride_ms == None:
            online_logger.warn(
                'Nullmarker stride interval is %s. You can specify it in your parameter file.'
                % self.nullmarker_stride_ms)
        else:
            online_logger.info('Nullmarker stride interval is set to %s ms ' %
                               self.nullmarker_stride_ms)

        online_logger.info("Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(
                    spec_base, self.potentials[key]["node_chain"])
                online_logger.info("node_chain_spec:" +
                                   self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                self.potentials[key]["prewindowing_flow"] = os.path.join(
                    spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info("prewindowing_dataflow_spec: " +
                                   self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                self.potentials[key]["postprocess_flow"] = os.path.join(
                    spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info("postprocessing_dataflow_spec: " +
                                   self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value(
                "b", False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these threads via two queues
        online_logger.info("Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()

        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block=True, timeout=None)
                if window == None: break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info(
                    "file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(
                    Flow_Class=NodeChain,
                    flow_spec=file(self.potentials[key]["postprocess_flow"]))
                replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(
                    self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info(
                        "deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error(
                        "Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " +
                                   str(prewindowed_sets))

                for s, d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d, (sample, label) in enumerate(data):
                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match("^s\s{0,2}[8,9]{1}$", k,
                                             re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(
                                        str("remove %s from %d %d" %
                                            (m.group(), s, d)))
                                    del (sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) +
                                                   " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" +
                                                   str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " +
                                   final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " +
                                   final_collection_path)
                self.prewindowed_data[key] = BaseDataset.load(
                    final_collection_path)
                self.node_chains[key][0].set_input_dataset(
                    self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            # create window_stream for every potential

            if self.operation in ("prewindowing"):
                window_spec_file = os.path.join(
                    spec_base, "node_chains", "windower",
                    self.potentials[key]["windower_spec_path_train"])

                self.window_stream[key] = \
                        self.stream_manager.request_window_stream(window_spec_file,
                                                              nullmarker_stride_ms = self.nullmarker_stride_ms)
            elif self.operation in ("prewindowing_offline"):
                pass
            elif self.operation in ("train"):
                pass

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed when using the stream mode!
        if self.operation in ("train"):
            online_logger.info("Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(
                            str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass  # dir was probably already deleted..

                if os.path.exists(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data")):
                    shutil.rmtree(
                        os.path.join(self.prewindowed_data_directory, key,
                                     "all_train_data"))
                    online_logger.info(
                        "deleted concatenated training data for " + key)

        online_logger.info("Training preparations finished")
        return 0
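One detail worth isolating from prepare_training is the queue-fed generator: each potential gets a multiprocessing.Queue, and flow_generator blocks on queue.get until a None sentinel signals the end of the window stream. The standalone sketch below exercises that handshake with plain strings in a single process, so it runs without pySPACE:

import multiprocessing

def flow_generator(queue):
    """Yield items from the queue until a None sentinel is received."""
    while True:
        window = queue.get(block=True, timeout=None)
        if window is None:
            break
        yield window

queue = multiprocessing.Queue()
for window in ("win-0", "win-1", "win-2"):
    queue.put(window)
queue.put(None)                      # sentinel: no more windows

print(list(flow_generator(queue)))   # ['win-0', 'win-1', 'win-2']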
Example #9
class TimeSeriesSinkNode(BaseNode):
    """ Collect all :mod:`time series objects <pySPACE.resources.data_types.time_series>` in a :mod:`collection <pySPACE.resources.dataset_defs.time_series>`
    
    **Parameters**
    
      :sort_string: 
          A lambda function string that is passed to the TimeSeriesDataset and
          evaluated before the data is stored.
                    
          (*optional, default: None*)
          
      :max_num_stored_objects:
          Maximum number of stored time series objects. Can be used if only a part
          of a dataset should be exported, e.g. for size purposes in debugging.
          Applies to train and test set separately.
          
          (*optional, default: numpy.inf*)
    
      :merge:
         Can be set to True if the user wants to get one time series containing
         the entire input data.
         
         (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        - 
            node: Time_Series_Sink

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/28    
    :LastChange: 2011/04/13 Anett Seeland ([email protected])        
    """
    def __init__(self, sort_string=None, merge=False, **kwargs):
        super(TimeSeriesSinkNode, self).__init__(**kwargs)

        self.set_permanent_attributes(
            sort_string=sort_string,
            merge=merge,
            # This will be created lazily
            time_series_collection=None,
            max_num_stored_objects=numpy.inf)

    def reset(self):
        """
        Reset the state of the object to the clean state it had after its
        initialization
        """
        # We have to create a temporary reference since we remove
        # the self.permanent_state reference in the next step by overwriting
        # self.__dict__
        tmp = self.permanent_state
        # TODO: just a hack to get it working quickly...
        tmp["time_series_collection"] = self.time_series_collection
        self.__dict__ = copy.copy(tmp)
        self.permanent_state = tmp

    def is_trainable(self):
        """ Returns whether this node is trainable. """
        # Though this node is not really trainable, it returns true in order
        # to get trained. The reason is that during this training phase,
        # it stores all time windows along with their class label
        return True

    def _get_train_set(self, use_test_data):
        """ Returns the data that can be used for training """
        # We take data that is provided by the input node for training
        # NOTE: This might involve training of the preceding nodes
        train_set = self.input_node.request_data_for_training(use_test_data)

        # Add the data provided by the input node for testing to the
        # training set
        # NOTE: This node is not really learning but creating a labeled set
        #       of time windows. Because of that it must take all
        #       data for training (even when use_test_data is False)
        train_set = itertools.chain(train_set,
                                    self.input_node.request_data_for_testing())
        return train_set

    def is_supervised(self):
        """ Returns whether this node requires supervised training """
        return True

    def _execute(self, data):
        # We simply pass the given data on to the next node
        return data

    def _train(self, data, label):
        # We simply pass the given data on to the next node
        return (data, label)

    def process_current_split(self):
        """ 
        Compute the results of this sink node for the current split of the data
        into train and test data
        """
        index = 0
        # Compute the time series for the data used for training
        for time_series, label in self.input_node.request_data_for_training(
                False):
            # Do lazy initialization of the class
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)

            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=True,
                    split=self.current_split,
                    run=self.run_number)
            index += 1

        # Compute the time series for the data used for testing
        index = 0
        for time_series, label in self.input_node.request_data_for_testing():
            # Do lazy initialization of the class
            # (maybe there were no training examples)
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)

            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(
                    time_series,
                    label=label,
                    train=False,
                    split=self.current_split,
                    run=self.run_number)
            index += 1

    def merge_time_series(self, input_collection):
        """ Merges all timeseries of the input_collection to one big timeseries """
        # Retrieve the time series from the input_collection
        input_timeseries = input_collection.get_data(0, 0, 'test')
        # Get the data from the first timeseries
        output_data = input_timeseries[0][0].get_data()
        # Change the endtime of the first timeseries to the one of the last
        # timeseries inside the input_collection
        input_timeseries[0][0].end_time = input_timeseries[-1][0].end_time
        # For all the remaining timeseries
        for ts in input_timeseries[1:]:
            # Concatenate the data...
            output_data = numpy.vstack((output_data, ts[0].get_data()))
            # ... and add the marker to the first timeseries
            if (len(ts[0].marker_name) > 0):
                for k in ts[0].marker_name:
                    if (not input_timeseries[0][0].marker_name.has_key(k)):
                        input_timeseries[0][0].marker_name[k] = []
                    for time in ts[0].marker_name[k]:
                        input_timeseries[0][0].marker_name[k].append(
                            time + ts[0].start_time +
                            input_timeseries[0][0].start_time)
        # Use the meta information from the first timeseries e.g. marker start/end_time
        # and create a new timeseries with the concatenated data
        merged_time_series = TimeSeries.replace_data(input_timeseries[0][0],
                                                     output_data)
        # Change the name of the merged_time_series
        merged_time_series.name = "%s, length %d ms, %s" % (merged_time_series.name.split(',')[0], \
                                                            (len(merged_time_series)*1000.0)/merged_time_series.sampling_frequency,\
                                                            merged_time_series.name.split(',')[-1])

        return merged_time_series

    def get_result_dataset(self):
        """ Return the result """
        # Merges all timeseries inside the collection if merge flag is set to true
        if self.merge:
            merged_time_series = self.merge_time_series(
                self.time_series_collection)
            self.time_series_collection = None
            self.time_series_collection = \
                      TimeSeriesDataset(sort_string=self.sort_string)
            self.time_series_collection.add_sample(merged_time_series,
                                                   label='Window',
                                                   train=False)
        return self.time_series_collection
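merge_time_series above does two things: it stacks the raw data of all test windows with numpy.vstack, and it rebases every marker of a later window onto the first window by adding that window's start_time (plus the first window's start_time) to the marker timestamp. A plain-numpy sketch of both steps, with hypothetical dicts standing in for TimeSeries objects and made-up start times and marker names:

import numpy

# Hypothetical windows: 3 time points x 2 channels each, start times in ms,
# and marker_name dicts mapping marker labels to times relative to the window.
first = {"data": numpy.arange(6).reshape(3, 2), "start_time": 0.0,
         "marker_name": {"S  8": [0.0]}}
second = {"data": numpy.arange(6, 12).reshape(3, 2), "start_time": 300.0,
          "marker_name": {"R 16": [50.0]}}

# Concatenate the raw data along the time axis, as numpy.vstack does above.
output_data = numpy.vstack((first["data"], second["data"]))
print(output_data.shape)   # (6, 2)

# Move the later window's markers onto the first window, using the same
# offset arithmetic as the method above.
for k, times in second["marker_name"].items():
    first["marker_name"].setdefault(k, [])
    for t in times:
        first["marker_name"][k].append(t + second["start_time"] + first["start_time"])
print(first["marker_name"])   # {'S  8': [0.0], 'R 16': [350.0]}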
Example #10
class TimeSeriesSinkNode(BaseNode):
    """ Collect all :mod:`time series objects <pySPACE.resources.data_types.time_series>` in a :mod:`collection <pySPACE.resources.dataset_defs.time_series>`
    
    **Parameters**
    
      :sort_string: 
          A lambda function string that is passed to the TimeSeriesDataset and
          evaluated before the data is stored.
                    
          (*optional, default: None*)
          
      :max_num_stored_objects:
          Maximum number of stored time series objects. Can be used if only a part
          of a dataset should be exported, e.g. for size purposes in debugging.
          Applies to train and test set separately.
          
          (*optional, default: numpy.inf*)
    
      :merge:
         Can be set to True if the user wants to get one time series containing
         the entire input data.
         
         (*optional, default: False*)

    **Exemplary Call**

    .. code-block:: yaml

        - 
            node: Time_Series_Sink

    :Author: Jan Hendrik Metzen ([email protected])
    :Created: 2008/11/28    
    :LastChange: 2011/04/13 Anett Seeland ([email protected])        
    """
    input_types = ["TimeSeries"]

    def __init__(self, sort_string=None, merge = False, **kwargs):
        super(TimeSeriesSinkNode, self).__init__(**kwargs)
        
        self.set_permanent_attributes(sort_string=sort_string,
                                      merge = merge,
                                      # This will be created lazily
                                      time_series_collection = None,
                                      max_num_stored_objects = numpy.inf) 
    
    def reset(self):
        """
        Reset the state of the object to the clean state it had after its
        initialization
        """
        # We have to create a temporary reference since we remove 
        # the self.permanent_state reference in the next step by overwriting
        # self.__dict__
        tmp = self.permanent_state
        # TODO: just a hack to get it working quickly...
        tmp["time_series_collection"] = self.time_series_collection 
        self.__dict__ = copy.copy(tmp)
        self.permanent_state = tmp
    
    def is_trainable(self):
        """ Returns whether this node is trainable. """
        # Though this node is not really trainable, it returns true in order
        # to get trained. The reason is that during this training phase, 
        # it stores all time windows along with their class label
        return True
    
    def _get_train_set(self, use_test_data):
        """ Returns the data that can be used for training """
        # We take data that is provided by the input node for training
        # NOTE: This might involve training of the preceding nodes
        train_set = self.input_node.request_data_for_training(use_test_data)
        
        # Add the data provided by the input node for testing to the
        # training set
        # NOTE: This node is not really learning but creating a labeled set
        #       of time windows. Because of that it must take all
        #       data for training (even when use_test_data is False) 
        train_set = itertools.chain(train_set,
                                    self.input_node.request_data_for_testing())
        return train_set
    
    def is_supervised(self):
        """ Returns whether this node requires supervised training """
        return True
    
    def _train(self, data, label):
        # We do nothing
        pass
        
    def process_current_split(self):
        """ 
        Compute the results of this sink node for the current split of the data
        into train and test data
        """
        index = 0
        # Compute the time series for the data used for training
        for time_series, label in self.input_node.request_data_for_training(False):
            # Do lazy initialization of the class 
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)
            
            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(time_series,
                                                       label = label,
                                                       train = True,
                                                       split = self.current_split,
                                                       run = self.run_number)
            index += 1
            
        # Compute the time series for the data used for testing
        index = 0
        for time_series, label in self.input_node.request_data_for_testing():
            # Do lazy initialization of the class 
            # (maybe there were no training examples)
            if self.time_series_collection == None:
                self.time_series_collection = \
                            TimeSeriesDataset(sort_string=self.sort_string)
            
            if index < self.max_num_stored_objects:
                # Add sample
                self.time_series_collection.add_sample(time_series,
                                                   label = label,
                                                   train = False,
                                                   split = self.current_split,
                                                   run = self.run_number)
            index += 1

    
    def merge_time_series(self, input_collection):
        """ Merges all timeseries of the input_collection to one big timeseries """
        # Retrieve the time series from the input_collection
        input_timeseries = input_collection.get_data(0,0,'test')
        # Get the data from the first timeseries
        output_data = input_timeseries[0][0]
        skiped_range = output_data.start_time

        # Change the endtime of the first timeseries to the one of the last
        # timeseries inside the input_collection
        input_timeseries[0][0].end_time = input_timeseries[-1][0].end_time
        # For all the remaining timeseries

        for ts in input_timeseries[1:]:
            # Concatenate the data...
            output_data = numpy.vstack((output_data,ts[0]))
            # ... and add the marker to the first timeseries
            if(len(ts[0].marker_name) > 0):
                for k in ts[0].marker_name:
                    if(not input_timeseries[0][0].marker_name.has_key(k)):
                        input_timeseries[0][0].marker_name[k] = []
                    for time in ts[0].marker_name[k]:
                        input_timeseries[0][0].marker_name[k].append(time+ts[0].start_time - skiped_range)
        # Use the meta information from the first timeseries e.g. marker start/end_time
        # and create a new timeseries with the concatenated data
        merged_time_series = TimeSeries.replace_data(input_timeseries[0][0],output_data)
        # Change the name of the merged_time_series
        merged_time_series.name = "%s, length %d ms, %s" % (merged_time_series.name.split(',')[0], \
                                                            (len(merged_time_series)*1000.0)/merged_time_series.sampling_frequency,\
                                                            merged_time_series.name.split(',')[-1])
        
        return merged_time_series

        
    def get_result_dataset(self):
        """ Return the result """
        # Merges all timeseries inside the collection if merge flag is set to true
        if self.merge:
            merged_time_series = self.merge_time_series(self.time_series_collection)
            self.time_series_collection = None
            self.time_series_collection = \
                      TimeSeriesDataset(sort_string=self.sort_string)
            self.time_series_collection.add_sample(merged_time_series,
                                                 label = 'Window',
                                                 train = False)
        return self.time_series_collection
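The renaming at the end of both merge_time_series variants rewrites the middle field of the name with the merged length in milliseconds, computed as number_of_samples * 1000 / sampling_frequency. The same arithmetic and string formatting in isolation, with made-up values:

# Hypothetical values: 1200 concatenated samples at 1000 Hz, name fields invented.
n_samples = 1200
sampling_frequency = 1000.0
old_name = "Window, length 100 ms, Standard"

length_ms = (n_samples * 1000.0) / sampling_frequency
new_name = "%s, length %d ms, %s" % (old_name.split(',')[0],
                                     length_ms,
                                     old_name.split(',')[-1])
print(new_name)   # "Window, length 1200 ms,  Standard" (last field keeps its leading space)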
Example #11
    def prepare_training(self, training_files, potentials, operation):
        """ Prepares pyspace live for training.

        Prepares everything for training of pyspace live,
        i.e. creates flows based on the dataflow specs
        and configures them.
        """
        online_logger.info( "Preparing Training")
        self.potentials = potentials
        self.operation = operation

        online_logger.info( "Creating flows..")
        for key in self.potentials.keys():
            spec_base = self.potentials[key]["configuration"].spec_dir
            if self.operation == "train":
                self.potentials[key]["node_chain"] = os.path.join(spec_base, self.potentials[key]["node_chain"])
                online_logger.info( "node_chain_spec:" + self.potentials[key]["node_chain"])

            elif self.operation in ("prewindowing", "prewindowing_offline"):
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["stream_prewindowing_flow"])
                else:
                    self.potentials[key]["prewindowing_flow"] = os.path.join(spec_base, self.potentials[key]["prewindowing_flow"])
                online_logger.info( "prewindowing_dataflow_spec: " + self.potentials[key]["prewindowing_flow"])

            elif self.operation == "prewindowed_train":
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["stream_postprocess_flow"])
                else:
                    self.potentials[key]["postprocess_flow"] = os.path.join(spec_base, self.potentials[key]["postprocess_flow"])
                online_logger.info( "postprocessing_dataflow_spec: " + self.potentials[key]["postprocess_flow"])

            self.training_active_potential[key] = multiprocessing.Value("b",False)

        online_logger.info("Path variables set for NodeChains")

        # check if multiple potentials are given for training
        if isinstance(training_files, list):
            self.training_data = training_files
        else:
            self.training_data = [training_files]

        # Training is done in separate processes, we send the time series
        # windows to these threads via two queues
        online_logger.info( "Initializing Queues")
        for key in self.potentials.keys():
            self.queue[key] = multiprocessing.Queue()


        def flow_generator(key):
            """create a generator to yield all the abri flow windows"""
            # Yield all windows until a None item is found in the queue
            while True:
                window = self.queue[key].get(block = True, timeout = None)
                if window == None: break
                yield window

        # Create the actual data flows
        for key in self.potentials.keys():

            if self.operation == "train":
                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                         flow_spec = file(self.potentials[key]["node_chain"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["node_chain"])
            elif self.operation in ("prewindowing", "prewindowing_offline"):
                online_logger.info("loading prewindowing flow..")
                online_logger.info("file: " + str(self.potentials[key]["prewindowing_flow"]))

                self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                             flow_spec = file(self.potentials[key]["prewindowing_flow"]))
                self.node_chains[key][0].set_generator(flow_generator(key))
                flow = open(self.potentials[key]["prewindowing_flow"])
            elif self.operation == "prewindowed_train":
                if self.potentials[key].has_key("stream") and self.potentials[key]["stream"] == True:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain,
                                                                     flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    # create windower
                    online_logger.info( "Creating Windower")
                    online_logger.info(self.potentials[key]["windower_spec_path_train"])
                    self.node_chains[key][0].set_windower_spec_file(os.path.join(spec_base, "node_chains", "windower", self.potentials[key]["windower_spec_path_train"]))
                    replace_start_and_end_markers = True
                else:
                    self.node_chains[key] = NodeChainFactory.flow_from_yaml(Flow_Class = NodeChain, flow_spec = file(self.potentials[key]["postprocess_flow"]))
                    replace_start_and_end_markers = False

                final_collection = TimeSeriesDataset()
                final_collection_path = os.path.join(self.prewindowed_data_directory, key, "all_train_data")
                # delete previous training collection
                if os.path.exists(final_collection_path):
                    online_logger.info("deleting old training data collection for " + key)
                    shutil.rmtree(final_collection_path)

                # load all prewindowed collections and
                # append data to the final collection
                prewindowed_sets = \
                    glob.glob(os.path.join(self.prewindowed_data_directory, key, "*"))
                if len(prewindowed_sets) == 0:
                    online_logger.error("Couldn't find data, please do prewindowing first!")
                    raise Exception
                online_logger.info("concatenating prewindowed data from " + str(prewindowed_sets))

                for s,d in enumerate(prewindowed_sets):
                    collection = BaseDataset.load(d)
                    data = collection.get_data(0, 0, "train")
                    for d,(sample,label) in enumerate(data):

                        if replace_start_and_end_markers:
                            # in case we concatenate multiple 'Window' labeled
                            # sets we have to remove every start- and endmarker
                            for k in sample.marker_name.keys():
                                # find '{S,s}  8' or '{S,s}  9'
                                m = re.match("^s\s{0,2}[8,9]{1}$", k, re.IGNORECASE)
                                if m is not None:
                                    online_logger.info(str("remove %s from %d %d" % (m.group(), s, d)))
                                    del(sample.marker_name[m.group()])

                            if s == len(prewindowed_sets)-1 and \
                                d == len(data)-1:
                                # insert endmarker
                                sample.marker_name["S  9"] = [0.0]
                                online_logger.info("added endmarker" + str(s) + " " + str(d))

                            if s == 0 and d == 0:
                                # insert startmarker
                                sample.marker_name["S  8"] = [0.0]
                                online_logger.info("added startmarker" + str(s) + " " + str(d))

                        final_collection.add_sample(sample, label, True)

                # save final collection (just for debugging)
                os.mkdir(final_collection_path)
                final_collection.store(final_collection_path)

                online_logger.info("stored final collection at " + final_collection_path)

                # load final collection again for training
                online_logger.info("loading data from " + final_collection_path)
                self.prewindowed_data[key] =  BaseDataset.load(final_collection_path)
                self.node_chains[key][0].set_input_dataset(self.prewindowed_data[key])

                flow = open(self.potentials[key]["postprocess_flow"])

            self.node_chain_definitions[key] = yaml.load(flow)
            flow.close()

        # TODO: check if the prewindowing flow is still needed
        # when using the stream mode!
        if self.operation in ("train"):
            online_logger.info( "Removing old flows...")
            try:
                shutil.rmtree(self.flow_storage)
            except:
                online_logger.info("Could not delete flow storage directory")
            os.mkdir(self.flow_storage)
        elif self.operation in ("prewindowing", "prewindowing_offline"):
            # follow this policy:
            # - delete prewindowed data older than 12 hours
            # - always delete trained/stored flows
            now = datetime.datetime.now()
            then = now - datetime.timedelta(hours=12)

            if not os.path.exists(self.prewindowed_data_directory):
                os.mkdir(self.prewindowed_data_directory)
            if not os.path.exists(self.flow_storage):
                os.mkdir(self.flow_storage)

            for key in self.potentials.keys():
                found = self.find_files_older_than(then, \
                        os.path.join(self.prewindowed_data_directory, key))
                if found is not None:
                    for f in found:
                        online_logger.info(str("recursively deleting files in \'%s\'" % f))
                        try:
                            shutil.rmtree(os.path.abspath(f))
                        except Exception as e:
                            # TODO: find a smart solution for this!
                            pass # dir was probably already deleted..

                if os.path.exists(os.path.join(self.prewindowed_data_directory, key, "all_train_data")):
                    shutil.rmtree(os.path.join(self.prewindowed_data_directory, key, "all_train_data"))
                    online_logger.info("deleted concatenated training data for " + key)


        online_logger.info( "Training preparations finished")
        return 0
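When prewindowed sets are concatenated in the prewindowed_train branch above, every per-set start marker ('S  8') and end marker ('S  9') is stripped with the regular expression ^s\s{0,2}[8,9]{1}$ (case-insensitive) before a single start and end marker is re-inserted. A short check of which marker names that pattern actually matches:

import re

pattern = r"^s\s{0,2}[8,9]{1}$"
candidates = ["S  8", "S 9", "s8", "S  10", "R  8", "S   8"]
for name in candidates:
    m = re.match(pattern, name, re.IGNORECASE)
    print("%-8r -> %s" % (name, "match" if m else "no match"))
# 'S  8', 'S 9' and 's8' match; 'S  10', 'R  8' and 'S   8' (three spaces) do not.

Note that the character class [8,9] also matches a literal comma, a common regex quirk that is apparently harmless for real marker names.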
Example #12
class DataGenerationTimeSeriesSourceNode(TimeSeriesSourceNode):
    """ Generate data of two classes for testing
    
    This node can generate data according to the specifications
    of two different DataGenerators. 
    
    It generates objects of the type TimeSeries
    
    **Parameters**
        :ir_generator:
            A generator of type DataGenerator for data items of the 
            information relevant class.
            If it is specified in a node chain, it should be given as a
            string.
            
            (*optional, default: "Adder([Sine(),GaussianNoise()])"*)
            
        :nir_generator:
            A generator of type DataGenerator for data items of the 
            non-information-relevant class.
            If it is specified in a node chain, it should be given as a
            string.
            
            (*optional, default: "GaussianNoise()"*)
            
        :ir_items:
            Number of items that should be generated for the ir class.
            
            (*optional, default: 100*)
            
        :nir_items:
            Number of items that should be generated for the non ir class.
            
            (*optional, default: 100*)
            
        :channel_names:
            List of strings for the channel names. 
            Determines also the number of generated
            channels.
            
            (*optional*)

        :num_channels:
            Number of channels. Unused, if channel_names is set.
            
            (*optional, default: 16*)
            
        :ir_label:
            The label for the ir_class.
            
            (*optional, default: 'Target'*)
            
        :nir_label:
            The label for the nir_class.
            
            (*optional, default: 'Standard'*)
            
        :shuffle:
            If the data items for the two classes are shuffled.
            
            (*optional, default: True*)
            
        :time_points:
            Number of points per channel in a generated TimeSeries
            object. 
            
            (*optional, default: 100*)
            
         :sampling_frequency:
             Sampling rate of the generated data.
             Important for sines etc.
             
             A generated time series object has a
             temporal length of time_points/sampling_frequency
         
             (*optional, default: 1000*)
             
          :ir_drift_vector:
              Drift of the ir class data.
              Specify a vector (numpy array) of shape (time_points,num_channels)
              and a linear drift in this direction will be added to the
              generated data:
              [0 * ir_drift_vector]                       added to first sample,
              [1/(ir_items+nir_items) * ir_drift_vector]  to the second sample
              [...]                                       and so on, until
              [1 * ir_drift_vector]                       added to last sample.
              
              In a node chain specification, the drift vector can, e.g.,
              be given like this:
              ir_drift_vector : "eval(__import__('numpy').asarray([[1,1],[2,2]]))"
              
              (*optional, default: None*)
            
          :nir_drift_vector:
              Drift of the nir class data. See ir_drift_vector.
              
              (*optional, default: None*)              
              
    **Exemplary Call**
    
    .. code-block:: yaml
    
        -
            node : Data_Generation_Source
            parameters :
                ir_generator : "Adder([SineGenerator(),GaussianNoiseGenerator()])"
                nir_generator : "GaussianNoiseGenerator()"
                
    
    :Author: Hendrik Woehrle
    :Created: 201/07/27
    """
    def __init__(self,
                 ir_generator="Adder([Sine(),GaussianNoise()])",
                 nir_generator="GaussianNoise()",
                 ir_items=100,
                 nir_items=100,
                 ir_drift_vector=None,
                 nir_drift_vector=None,
                 channel_names=None,
                 num_channels=16,
                 ir_label='Target',
                 nir_label='Standard',
                 time_points=100,
                 sampling_frequency=1000,
                 shuffle=True,
                 **kwargs):
        super(DataGenerationTimeSeriesSourceNode, self).__init__(**kwargs)

        if type(ir_generator) == str:
            ir_generator = eval(ir_generator)

        if type(nir_generator) == str:
            nir_generator = eval(nir_generator)

        ir_generator.sampling_frequency = sampling_frequency
        nir_generator.sampling_frequency = sampling_frequency

        run_number = 0

        dataset = None

        if not channel_names is None:
            num_channels = len(channel_names)
        else:
            channel_names = []
            for i in xrange(num_channels):
                channel_names.append(str(i))

        # Translate drift "None" to zero-vector
        if ir_drift_vector is None:
            ir_drift_vector = numpy.zeros((time_points, num_channels))
        if nir_drift_vector is None:
            nir_drift_vector = numpy.zeros((time_points, num_channels))

        self.set_permanent_attributes(dataset=dataset,
                                      ir_generator=ir_generator,
                                      nir_generator=nir_generator,
                                      ir_items=ir_items,
                                      nir_items=nir_items,
                                      channel_names=channel_names,
                                      num_channels=num_channels,
                                      ir_label=ir_label,
                                      nir_label=nir_label,
                                      time_points=time_points,
                                      sampling_frequency=sampling_frequency,
                                      shuffle=shuffle,
                                      run_number=run_number,
                                      data_for_testing=None,
                                      data_for_training=None,
                                      ir_drift_vector=ir_drift_vector,
                                      nir_drift_vector=nir_drift_vector)

        self.generate_data_set()

    def set_input_dataset(self, dataset):
        """ Instead of using a given dataset, a new one is generated """
        self.generate_data_set()

    def generate_data_set(self):
        """ Generate a dataset using the given generators """

        self.dataset = TimeSeriesDataset()

        # generate a set of dummy labels to know which class is used later
        label_sequence = numpy.hstack(
            (numpy.ones(self.ir_items), numpy.zeros(self.nir_items)))

        if self.shuffle:
            random.shuffle(label_sequence)

        ts_generator = TestTimeSeriesGenerator()

        current_item = 0  # count produced data objects for drift
        for label in label_sequence:
            if label == 1:
                #generate a data item using the ir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names),
                        time_points=self.time_points,
                        function=self.ir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.ir_drift_vector
                self.dataset.add_sample(data_item, self.ir_label, False)

            else:
                #generate a data item using the nir_generator
                data_item = \
                    ts_generator.generate_test_data(
                        channels=len(self.channel_names),
                        time_points=self.time_points,
                        function=self.nir_generator,
                        sampling_frequency=self.sampling_frequency,
                        channel_order=True,
                        channel_names=self.channel_names,
                        dtype=numpy.float)
                # Drift:
                data_item = data_item + current_item * self.nir_drift_vector
                self.dataset.add_sample(data_item, self.nir_label, False)

            current_item += 1. / (self.ir_items + self.nir_items)