コード例 #1
0
    def request_data_for_training(self, use_test_data):
        """ Return the time windows that can be used for training of subsequent nodes.

        If ``use_test_data`` is True, the test data is returned instead,
        since no additional data was dedicated for training in that case.
        """
        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        # (membership test directly on the dict, not on .keys())
        if key in self.dataset.data:
            self._log("Accessing input dataset's training prediction vectors.")
            self.data_for_training = MemoizeGenerator(
                iter(self.dataset.get_data(*key)),
                caching=self.caching)
        else:
            # This node provides no data explicitly dedicated for training:
            # hand out an immediately exhausted iterator instead.
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
コード例 #2
0
    def request_data_for_training(self, use_test_data):
        """ Return data for training of subsequent nodes.

        Trains this node (``train_sweep``) on first request, then maps the
        ``self.external_training_set`` pairs through ``self.execute`` and
        memoizes the result so later calls replay the same sequence.
        """
        assert self.input_node is not None

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Lazily map the external training set through this node and
            # encapsulate it in an object that memoizes its outputs and
            # provides a "fresh" method that replays the same sequence.
            # (Generator expression replaces the Python-2-only
            # itertools.imap with a tuple-unpacking lambda, PEP 3113.)
            train_data_generator = ((self.execute(data), label)
                                    for (data, label)
                                    in self.external_training_set)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
コード例 #3
0
    def request_data_for_testing(self):
        """ Return the data that can be used for testing of subsequent nodes.

        The dataset's test windows for the current run/split are read once
        and memoized; later calls replay the same sequence.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #4
0
    def request_data_for_testing(self):
        """ Return the data for testing of subsequent nodes.

        On the first request the test windower specification is loaded, the
        input node's test data is windowed, and the resulting stream is
        memoized; later calls replay the same sequence.
        """
        if self.data_for_testing is None:
            # set window definition for test phase windower file
            self.window_definition = \
                Windower._load_window_spec(self.windower_spec_file,
                                           self.local_window_conf)
            test_data = list(self.input_node.request_data_for_testing())

            # create stream of windows
            self.window_stream(test_data)

            # Create a generator that emits the windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in self.marker_windower)

            self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator (single exit replaces the
        # previously duplicated return in both branches)
        return self.data_for_testing.fresh()
コード例 #5
0
    def request_data_for_testing(self):
        """ Return the data that can be used for testing of subsequent nodes.

        On the first call, the window definitions are handed over to the
        dataset and a memoized generator over the resulting test windows is
        created; subsequent calls just replay that generator.
        """
        self._log("Requesting test data...")
        if self.data_for_testing is None:
            self._log("Start streaming.")

            self.dataset.set_window_defs(
                window_definition=self.window_definition,
                nullmarker_stride_ms=self.nullmarker_stride_ms,
                no_overlap=self.no_overlap,
                data_consistency_check=self.data_consistency_check)

            # A dataset holding a single run serves all runs (key 0);
            # otherwise pick the data of the current run number.
            if self.dataset.meta_data["runs"] > 1:
                data_key = (self.run_number, self.current_split, "test")
            else:
                data_key = (0, self.current_split, "test")

            # Generator that emits the (sample, label) windows
            window_generator = ((sample, label) for (sample, label)
                                in self.dataset.get_data(*data_key))

            self.data_for_testing = MemoizeGenerator(window_generator,
                                                     caching=self.caching)

        # Hand out a fresh copy of the memoized generator
        return self.data_for_testing.fresh()
コード例 #6
0
    def process(self):
        """ Process all data that is provided by the input node.

        Returns a generator that yields the data after being processed by
        this node (input pairs are executed, then windowed by a
        MarkerWindower fed through a TimeSeriesClient).
        """
        assert self.input_node is not None, "No input node specified!"
        # Assert that this node has already been trained
        assert (not self.is_trainable() or
                self.get_remaining_train_phase() == 0), "Node not trained!"

        # Lazily run every (data, label) pair of the input node through
        # this node.  (Generator expression replaces the Python-2-only
        # itertools.imap with a tuple-unpacking lambda, PEP 3113.)
        data_generator = ((self.execute(data), label)
                          for (data, label) in self.input_node.process())

        self.client = TimeSeriesClient(ts_stream=data_generator)

        self.client.connect()
        self.marker_windower = MarkerWindower(
            data_client=self.client,
            windowdefs=self.window_definition,
            stridems=self.nullmarker_stride_ms)

        # NOTE(review): a constructor call never returns None, so this
        # branch looks unreachable -- kept for behavioral compatibility.
        if self.marker_windower is None:
            self.window_stream()

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)

        self.data_for_testing = MemoizeGenerator(test_data_generator)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #7
0
    def request_data_for_testing(self):
        """ Returns the data that can be used for testing of subsequent nodes

        The principle of obtaining the testing data are the same as the principles
        used in obtaining the training data set. The only difference here is that,
        in the case in which there is no testing data available, we allow for the
        training data to be used as testing data.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:
            self._log("Accessing input dataset's test feature vector windows.")
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "test")
            else:
                key = (0, self.current_split, "test")

            test_data_generator = iter(self.dataset.get_data(*key))

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #8
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes of the node chain

        A call to this method might involve training of the node chain up this
        node. If use_test_data is true, all available data is used for
        training, otherwise only the data that is explicitly for training.
        """
        assert self.input_node is not None

        self._log("Data for training is requested.", level=logging.DEBUG)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)
            # Lazily print every training pair and memoize the outputs so
            # that "fresh" can replay the same sequence.
            # (Generator expression replaces the Python-2-only
            # itertools.imap with a tuple-unpacking lambda, PEP 3113,
            # which also removes the need for the NodeMetaclass bug fix
            # mentioned in the original.)
            train_data_generator = (
                self.print_data(data, label)
                for (data, label)
                in self.input_node.request_data_for_training(use_test_data))
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
コード例 #9
0
    def request_data_for_testing(self):
        """ Returns data for testing of subsequent nodes of the node chain

        A call to this node might involve evaluating the whole node chain
        up to this node.
        """
        assert self.input_node is not None

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # If we haven't computed the data for testing yet
        if self.data_for_testing is None:
            # Assert that this node has already been trained
            assert (not self.is_trainable()
                    or self.get_remaining_train_phase() == 0)
            # Lazily print every test pair and memoize the outputs so that
            # "fresh" can replay the same sequence.
            # (Generator expression replaces the Python-2-only
            # itertools.imap with a tuple-unpacking lambda, PEP 3113.)
            self._log("Producing data for testing.", level=logging.DEBUG)
            test_data_generator = (
                self.print_data(data, label)
                for (data, label)
                in self.input_node.request_data_for_testing())
            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #10
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        This method streams training data and sends it to the subsequent nodes.
        If one looks at the tutorial related to building new nodes (available in
        the tutorial section), one can see exactly where the ``request_data``
        methods are put to use.

        The following example is one that was extracted from the
        :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`

        which should (in theory at least) be implementable for all types of data.
        """
        if not use_test_data:
            # If the input dataset consists only of one single run,
            # we use this as input for all runs to be conducted (i.e. we
            # rely on later randomization of the order). Otherwise
            # we use the data for this run number
            if self.dataset.meta_data["runs"] > 1:
                key = (self.run_number, self.current_split, "train")
            else:
                key = (0, self.current_split, "train")
            # Check if there is training data for the current split and run
            # (membership test directly on the dict, not on .keys())
            if key in self.dataset.data:
                self._log(
                    "Accessing input dataset's training feature vector windows."
                )
                self.data_for_training = MemoizeGenerator(
                    iter(self.dataset.get_data(*key)),
                    caching=self.caching)
            else:
                # This node provides no data explicitly dedicated for
                # training: hand out an immediately exhausted iterator.
                self._log("No training data available.")
                self.data_for_training = MemoizeGenerator(
                    iter([]), caching=self.caching)
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training
            return self.request_data_for_testing()

        # Return a fresh copy of the generator
        return self.data_for_training.fresh()
コード例 #11
0
File: transfer_splitter.py  Project: pyspace/test
 def request_data_for_testing(self):
     """ Return the test data of the current split.

     The split is created lazily on the first request; the test instances
     are selected by the pre-computed split indices.
     """
     # Create split lazily when required
     if self.split_indices_test is None:
         self._create_split()

     # Create test data generator
     self.data_for_testing = MemoizeGenerator(
           self.data[i] for i in self.split_indices_test[self.current_split])

     return self.data_for_testing.fresh()
コード例 #12
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data that can be used for training of subsequent nodes

        The training windower specification is (re)loaded on every call;
        the windowed training data itself is computed once and memoized.
        When ``use_test_data`` is True the test data is returned instead,
        as no additional data was dedicated for training.
        """

        # set window definition for train phase windower file
        # (done unconditionally, as in the original, so the attribute is
        # refreshed even when cached data is returned)
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file_train,
                                       self.local_window_conf)

        self._log("Requesting train data...")
        if self.data_for_training is not None:
            return self.data_for_training.fresh()

        if use_test_data:
            # Return the test data as there is no additional data that
            # was dedicated for training
            self.data_for_training = self.request_data_for_testing()
            return self.data_for_training

        # Get training data (with labels)
        train_data = list(
            self.input_node.request_data_for_training(
                use_test_data=use_test_data))
        # If there is no training data, hand out an immediately
        # exhausted generator
        if not train_data:
            self.data_for_training = MemoizeGenerator(iter([]),
                                                      caching=True)
            return self.data_for_training.fresh()
        # create stream of windows
        self.window_stream(train_data)

        # Create a generator that emits the windows
        train_data_generator = ((sample, label)
                                for (sample, label) in self.marker_windower)
        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=True)
        return self.data_for_training.fresh()
コード例 #13
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The train split is created lazily on the first request.
        """
        # Create split lazily when required
        if self.train_data is None:
            self._create_split()

        # Create training data generator over the training instances
        # (iter() replaces the redundant identity generator expression)
        self.data_for_training = MemoizeGenerator(iter(self.train_data))

        return self.data_for_training.fresh()
コード例 #14
0
    def request_data_for_testing(self):
        """ Returns the data for testing of subsequent nodes

        The cross-validation splits are created lazily on the first request.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # Only that data can be used for testing which is explicitly
        # specified for this purpose by the current cv-split
        self.data_for_testing = MemoizeGenerator(
            self.data[i] for i in self.split_indices[self.current_split])

        return self.data_for_testing.fresh()
コード例 #15
0
    def request_data_for_training(self, use_test_data):
        """ Returns the data for training of subsequent nodes

        The cross-validation splits are created lazily on the first request.
        """
        # Create cv-splits lazily when required
        if self.split_indices is None:
            self._create_splits()

        # All data can be used for training which is not explicitly
        # specified for testing by the current cv-split
        self.data_for_training = MemoizeGenerator(
            self.data[i] for i in range(len(self.data))
            if i not in self.split_indices[self.current_split])

        return self.data_for_training.fresh()
コード例 #16
0
    def request_data_for_testing(self):
        """ Return the data that can be used for testing of subsequent nodes.

        Random test data is generated once on the first request and
        memoized; later calls replay the same sequence.
        """
        # If we haven't read the data for testing yet
        if self.data_for_testing is None:

            generated_data = self.generate_random_data()

            # Create a generator that re-emits the (sample, label) windows
            test_data_generator = ((sample, label)
                                   for (sample, label) in generated_data)

            self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                     caching=True)

        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #17
0
    def request_data_for_testing(self):
        """ Return the data that can be used for testing of subsequent nodes.

        On the first request, 23 dummy ``(TimeSeries, label)`` pairs are
        generated (labels drawn randomly from "A"/"B") and memoized;
        later calls replay the same sequence.
        """

        # Build the dummy test data lazily on the first request
        if self.data_for_testing is None:
            self.time_series = []
            for index in range(23):
                series = TimeSeries(input_array=numpy.ones((2, 2)) * index,
                                    channel_names=["X", "Y"],
                                    sampling_frequency=2)
                self.time_series.append((series, random.choice(["A", "B"])))
            # Generator that emits the (sample, label) pairs
            pair_generator = ((sample, label)
                              for (sample, label) in self.time_series)

            self.data_for_testing = MemoizeGenerator(pair_generator,
                                                     caching=True)
        # Hand out a fresh copy of the memoized generator
        return self.data_for_testing.fresh()
コード例 #18
0
    def request_data_for_testing(self):
        """ Return data for testing of subsequent nodes.

        The instances delivered by the input node are grouped by label,
        rebalanced via ``balance_instances``, run through this node and
        memoized; later calls replay the same sequence.
        """
        assert self.input_node is not None

        self._log("Data for testing is requested.", level=logging.DEBUG)

        # Compute the test data only once
        if self.data_for_testing is None:
            # This node must already be fully trained before testing
            assert (not self.is_trainable()
                    or self.get_remaining_train_phase() == 0)

            # Group the available instances by their label
            instances_by_label = defaultdict(list)

            for instance, label in self.input_node.request_data_for_testing():
                instances_by_label[label].append(instance)

            retained_instances = self.balance_instances(instances_by_label)

            # Lazily run the retained instances through this node and
            # memoize the outputs so that "fresh" can replay the
            # same sequence
            self._log("Producing data for testing.", level=logging.DEBUG)
            result_generator = ((self.execute(data), label)
                                for (data, label) in retained_instances)

            self.data_for_testing = MemoizeGenerator(result_generator,
                                                     caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_testing.fresh()
コード例 #19
0
    def request_data_for_training(self, use_test_data):
        """ Returns data for training of subsequent nodes
        
        .. note::
              This method works differently in InstanceSelectionNode
              than in other nodes: Only *percentage_selected* of the available
              data are returned.
        """
        assert self.input_node is not None
        # Clamp an out-of-range percentage to 100
        if self.train_percentage_selected > 100:
            self._log("Train percentage of %f reduced to 100." %
                      self.train_percentage_selected,
                      level=logging.ERROR)
            self.train_percentage_selected = 100
        self._log("Data for training is requested.", level=logging.DEBUG)

        # With 100 percent there is nothing to select: defer to the base
        # class implementation
        if self.train_percentage_selected == 100:
            return super(InstanceSelectionNode, self).request_data_for_training(
                use_test_data)

        # If we haven't computed the data for training yet
        if self.data_for_training is None:
            self._log("Producing data for training.", level=logging.DEBUG)
            # Train this node
            self.train_sweep(use_test_data)

            # Divide available instances according to label
            all_instances = defaultdict(list)
            for instance, label in self.input_node.request_data_for_training(
                    use_test_data):
                all_instances[label].append(instance)

            self._log("Keeping only %s percent of training data" %
                      self.train_percentage_selected,
                      level=logging.DEBUG)
            # Deterministic shuffling, seeded with the run number for
            # reproducibility across processes
            r = random.Random(self.run_number)
            # Retain only *percentage_selected* percent of the data
            retained_instances = []

            # (items() replaces the Python-2-only iteritems())
            for label, instances in all_instances.items():
                # enable random choice of samples
                r.shuffle(instances)
                # Reduce every class unless a specific class to reduce is
                # configured and this is a different class.  (At this point
                # train_percentage_selected < 100 is guaranteed, so the
                # former "== 100" sub-condition was unreachable and the
                # three-way branch collapses to this single test.)
                if self.reduce_class and self.reduce_class != label:
                    end_index = len(instances)
                else:
                    end_index = int(round(len(instances) *
                                          self.train_percentage_selected /
                                          100))

                retained_instances.extend(zip(instances[0:end_index],
                                              [label] * end_index))
            # mix up samples between the different labels
            r.shuffle(retained_instances)
            # Lazily run the retained instances through this node and
            # memoize the outputs so that "fresh" can replay the
            # same sequence
            train_data_generator = ((self.execute(data), label)
                                    for (data, label) in retained_instances)

            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=self.caching)

        self._log("Data for training finished", level=logging.DEBUG)
        # Return a fresh copy of the generator
        return self.data_for_training.fresh()