def request_data_for_training(self, use_test_data):
    """ Returns the time windows that can be used for training of subsequent nodes """
    # TODO: Is all this really necessary?
    if not use_test_data:
        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        if key in self.dataset.data.keys():
            self._log("Accessing input dataset's training prediction vectors.")
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Returns an iterator that iterates over an empty sequence
            # (i.e. an iterator that is immediately exhausted), since
            # this node does not provide any data that is explicitly
            # dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(
                (x for x in [].__iter__()), caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()

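# Illustrative sketch (not the pySPACE implementation): the MemoizeGenerator
# used throughout these methods is assumed, from its usage here, to wrap a
# generator, optionally cache the items it has yielded, and hand out
# replayable copies via fresh(). A minimal wrapper with that interface could
# look like this; the class name and internals are hypothetical.
class SimpleMemoizeGenerator(object):
    """ Caches the output of a generator so it can be replayed via fresh() """
    def __init__(self, generator, caching=False):
        self.generator = generator
        self.caching = caching
        self.cache = []  # items yielded so far (only used when caching)

    def fresh(self):
        """ Returns a generator that yields the same sequence again """
        if not self.caching:
            # Without caching the wrapped generator can only be consumed once
            return self.generator
        return self._replay()

    def _replay(self):
        # First serve the items that were already cached ...
        for item in self.cache:
            yield item
        # ... then keep consuming the wrapped generator and cache new items
        for item in self.generator:
            self.cache.append(item)
            yield item
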
def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document
    """
    assert (self.input_node is not None)

    self._log("Data for training is requested.", level=logging.DEBUG)

    # If we haven't computed the data for training yet
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence
        train_data_generator = \
            itertools.imap(lambda (data, label): (self.execute(data), label),
                           self.external_training_set)

        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()

def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Accessing input dataset's test feature vector windows.")
        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")

        test_data_generator = self.dataset.get_data(*key).__iter__()

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    .. todo:: to document
    """
    if self.data_for_testing is None:
        # set window definition for test phase windower file
        self.window_definition = \
            Windower._load_window_spec(self.windower_spec_file,
                                       self.local_window_conf)

        test_data = list(self.input_node.request_data_for_testing())

        # create stream of windows
        self.window_stream(test_data)

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.marker_windower)
        self.data_for_testing = MemoizeGenerator(test_data_generator)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    self._log("Requesting test data...")
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Start streaming.")

        self.dataset.set_window_defs(
            window_definition=self.window_definition,
            nullmarker_stride_ms=self.nullmarker_stride_ms,
            no_overlap=self.no_overlap,
            data_consistency_check=self.data_consistency_check)

        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.dataset.get_data(*key))

        self.data_for_testing = \
            MemoizeGenerator(test_data_generator, caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def process(self):
    """ Processes all data that is provided by the input node

    Returns a generator that yields the data after being processed by this node.
    """
    assert (self.input_node is not None), "No input node specified!"
    # Assert that this node has already been trained
    assert (not self.is_trainable() or
            self.get_remaining_train_phase() == 0), "Node not trained!"

    data_generator = \
        itertools.imap(lambda (data, label): (self.execute(data), label),
                       self.input_node.process())

    self.client = TimeSeriesClient(ts_stream=data_generator)
    self.client.connect()
    self.marker_windower = MarkerWindower(
        data_client=self.client,
        windowdefs=self.window_definition,
        stridems=self.nullmarker_stride_ms)

    if self.marker_windower is None:
        self.window_stream()

    # Create a generator that emits the windows
    test_data_generator = ((sample, label)
                           for (sample, label) in self.marker_windower)
    self.data_for_testing = MemoizeGenerator(test_data_generator)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    The testing data is obtained following the same principle as the
    training data. The only difference is that, if no testing data is
    available, the training data may be used as testing data.
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self._log("Accessing input dataset's test feature vector windows.")
        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "test")
        else:
            key = (0, self.current_split, "test")

        test_data_generator = self.dataset.get_data(*key).__iter__()

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes of the node chain

    A call to this method might involve training of the node chain up to
    this node. If use_test_data is true, all available data is used for
    training, otherwise only the data that is explicitly dedicated for
    training.
    """
    assert (self.input_node is not None)

    self._log("Data for training is requested.", level=logging.DEBUG)

    # If we haven't computed the data for training yet
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence
        # This line crashes without the NodeMetaclass bug fix
        train_data_generator = \
            itertools.imap(lambda (data, label): self.print_data(data, label),
                           self.input_node.request_data_for_training(
                               use_test_data))

        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()

def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes of the node chain

    A call to this method might involve evaluating the whole node chain
    up to this node.
    """
    assert (self.input_node is not None)

    self._log("Data for testing is requested.", level=logging.DEBUG)

    # If we haven't computed the data for testing yet
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert (not self.is_trainable() or
                self.get_remaining_train_phase() == 0)

        # Compute a generator that yields the test data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that'll
        # yield the same sequence
        self._log("Producing data for testing.", level=logging.DEBUG)
        test_data_generator = \
            itertools.imap(lambda (data, label): self.print_data(data, label),
                           self.input_node.request_data_for_testing())

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    This method streams training data and sends it to the subsequent nodes.
    The tutorial on building new nodes (available in the tutorial section)
    shows exactly where the ``request_data`` methods are put to use.
    The following example was extracted from the
    :mod:`~pySPACE.missions.nodes.source.feature_vector_source.FeatureVectorSourceNode`
    and should (in theory at least) be implementable for all types of data.
    """
    if not use_test_data:
        # If the input dataset consists only of one single run,
        # we use this as input for all runs to be conducted (i.e. we
        # rely on later randomization of the order). Otherwise
        # we use the data for this run number
        if self.dataset.meta_data["runs"] > 1:
            key = (self.run_number, self.current_split, "train")
        else:
            key = (0, self.current_split, "train")
        # Check if there is training data for the current split and run
        if key in self.dataset.data.keys():
            self._log(
                "Accessing input dataset's training feature vector windows.")
            self.data_for_training = MemoizeGenerator(
                self.dataset.get_data(*key).__iter__(),
                caching=self.caching)
        else:
            # Returns an iterator that iterates over an empty sequence
            # (i.e. an iterator that is immediately exhausted), since
            # this node does not provide any data that is explicitly
            # dedicated for training
            self._log("No training data available.")
            self.data_for_training = MemoizeGenerator(
                (x for x in [].__iter__()), caching=self.caching)
    else:
        # Return the test data as there is no additional data that
        # was dedicated for training
        return self.request_data_for_testing()

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()

def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes """
    # Create split lazily when required
    if self.split_indices_test is None:
        self._create_split()

    # Create test data generator
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices_test[self.current_split])

    return self.data_for_testing.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns the data that can be used for training of subsequent nodes

    .. todo:: to document
    """
    # set window definition for train phase windower file
    self.window_definition = \
        Windower._load_window_spec(self.windower_spec_file_train,
                                   self.local_window_conf)

    self._log("Requesting train data...")
    if self.data_for_training is None:
        if not use_test_data:
            # Get training data (with labels)
            train_data = \
                list(self.input_node.request_data_for_training(
                    use_test_data=use_test_data))

            # If the training data is an empty list
            if train_data == []:
                self.data_for_training = MemoizeGenerator(
                    (x for x in [].__iter__()), caching=True)
                return self.data_for_training.fresh()

            # create stream of windows
            self.window_stream(train_data)

            # Create a generator that emits the windows
            train_data_generator = ((sample, label)
                                    for (sample, label) in self.marker_windower)
            self.data_for_training = MemoizeGenerator(train_data_generator,
                                                      caching=True)
            return self.data_for_training.fresh()
        else:
            # Return the test data as there is no additional data that
            # was dedicated for training
            self.data_for_training = self.request_data_for_testing()
            return self.data_for_training
    else:
        return self.data_for_training.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    .. todo:: to document
    """
    # Create split lazily when required
    if self.train_data is None:
        self._create_split()

    # Create training data generator
    self.data_for_training = \
        MemoizeGenerator(instance for instance in self.train_data)

    return self.data_for_training.fresh()

def request_data_for_testing(self):
    """ Returns the data for testing of subsequent nodes

    .. todo:: to document
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()

    # Only data that is explicitly reserved for testing by the
    # current cv-split can be used for testing
    self.data_for_testing = MemoizeGenerator(
        self.data[i] for i in self.split_indices[self.current_split])

    return self.data_for_testing.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns the data for training of subsequent nodes

    .. todo:: to document
    """
    # Create cv-splits lazily when required
    if self.split_indices is None:
        self._create_splits()

    # All data that is not explicitly reserved for testing by the
    # current cv-split can be used for training
    self.data_for_training = MemoizeGenerator(
        self.data[i] for i in range(len(self.data))
        if i not in self.split_indices[self.current_split])

    return self.data_for_training.fresh()

def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        generated_data = self.generate_random_data()

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in generated_data)

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=True)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_testing(self):
    """ Returns the data that can be used for testing of subsequent nodes

    .. todo:: to document
    """
    # If we haven't read the data for testing yet
    if self.data_for_testing is None:
        self.time_series = [(TimeSeries(input_array=numpy.ones((2, 2)) * i,
                                        channel_names=["X", "Y"],
                                        sampling_frequency=2),
                             random.choice(["A", "B"]))
                            for i in range(23)]

        # Create a generator that emits the windows
        test_data_generator = ((sample, label)
                               for (sample, label) in self.time_series)

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=True)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_testing(self):
    """ Returns data for testing of subsequent nodes

    .. todo:: to document
    """
    assert (self.input_node is not None)

    self._log("Data for testing is requested.", level=logging.DEBUG)

    # If we haven't computed the data for testing yet
    if self.data_for_testing is None:
        # Assert that this node has already been trained
        assert (not self.is_trainable() or
                self.get_remaining_train_phase() == 0)

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_testing():
            all_instances[label].append(instance)

        retained_instances = self.balance_instances(all_instances)

        # Compute a generator that yields the test data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that will
        # yield the same sequence
        self._log("Producing data for testing.", level=logging.DEBUG)
        test_data_generator = ((self.execute(data), label)
                               for (data, label) in retained_instances)

        self.data_for_testing = MemoizeGenerator(test_data_generator,
                                                 caching=self.caching)
        self._log("Data for testing finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_testing.fresh()

def request_data_for_training(self, use_test_data):
    """ Returns data for training of subsequent nodes

    .. todo:: to document

    .. note:: This method works differently in InstanceSelectionNode than
              in other nodes: only *percentage_selected* percent of the
              available data is returned.
    """
    assert (self.input_node is not None)

    if self.train_percentage_selected > 100:
        self._log("Train percentage of %f reduced to 100."
                  % self.train_percentage_selected, level=logging.ERROR)
        self.train_percentage_selected = 100

    self._log("Data for training is requested.", level=logging.DEBUG)

    if self.train_percentage_selected == 100:
        return super(InstanceSelectionNode, self).request_data_for_training(
            use_test_data)

    # If we haven't computed the data for training yet
    if self.data_for_training is None:
        self._log("Producing data for training.", level=logging.DEBUG)
        # Train this node
        self.train_sweep(use_test_data)

        # Divide available instances according to label
        all_instances = defaultdict(list)
        for instance, label in self.input_node.request_data_for_training(
                use_test_data):
            all_instances[label].append(instance)

        self._log("Keeping only %s percent of training data"
                  % self.train_percentage_selected, level=logging.DEBUG)

        r = random.Random(self.run_number)

        # Retain only *percentage_selected* percent of the data
        retained_instances = []
        for label, instances in all_instances.iteritems():
            # enable random choice of samples
            r.shuffle(instances)
            if not self.reduce_class or \
                    self.train_percentage_selected == 100:
                end_index = int(round(len(instances) *
                                      self.train_percentage_selected / 100))
            elif not (self.reduce_class == label):
                end_index = len(instances)
            else:
                # self.reduce_class == label --> reduction needed
                end_index = int(round(len(instances) *
                                      self.train_percentage_selected / 100))

            retained_instances.extend(zip(instances[0:end_index],
                                          [label] * end_index))
        # mix up samples between the different labels
        r.shuffle(retained_instances)

        # Compute a generator that yields the train data and
        # encapsulate it in an object that memoizes its outputs and
        # provides a "fresh" method that returns a new generator that will
        # yield the same sequence
        train_data_generator = ((self.execute(data), label)
                                for (data, label) in retained_instances)

        self.data_for_training = MemoizeGenerator(train_data_generator,
                                                  caching=self.caching)
        self._log("Data for training finished", level=logging.DEBUG)

    # Return a fresh copy of the generator
    return self.data_for_training.fresh()
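
# Worked example (hypothetical numbers) for the retention arithmetic above:
# with train_percentage_selected = 25, a class holding 40 shuffled instances,
# and reduce_class set to that class's label, the cut-off becomes
#     end_index = int(round(40 * 25 / 100))   # == 10
# so 10 instances of the reduced class are kept, while every class whose
# label differs from reduce_class keeps all of its instances.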