def _apply_fun(self, blob_generator, fun):
    """Apply `fun` once per pipeline over a shared view of the input stream.

    Each pipeline runs in its own worker thread and hands back a generator;
    the per-pipeline outputs are consumed in lockstep and horizontally
    stacked into a single output Blob per input element.

    Args:
        blob_generator: iterable of input Blobs, shared by all pipelines
            through a ThreadsafeIter.
        fun: callable ``fun(pipeline, iterator) -> generator of Blobs``.

    Yields:
        Blob with the raveled, hstack-ed data of all pipelines and the meta
        of the first pipeline's output.

    Raises:
        Exception: if the pipelines emit differing element counts or UUIDs.
    """
    threadsafe_generator = ThreadsafeIter(blob_generator, len(self.pipelines))
    # Context manager guarantees the executor is shut down even if the
    # consumer abandons this generator early.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=len(self.pipelines)) as executor:
        # Start one worker per pipeline; each future resolves to that
        # pipeline's output generator.
        futures = [executor.submit(fun, pipeline, threadsafe_generator)
                   for pipeline in self.pipelines]
        generators = [future.result()
                      for future in concurrent.futures.as_completed(futures)]
        while True:
            # Pull the next element from every pipeline. StopIteration must
            # be caught here: letting it escape inside a generator body
            # becomes a RuntimeError (PEP 479).
            out_blobs = []
            for gen in generators:
                try:
                    out_blobs.append(next(gen))
                except StopIteration:
                    pass  # this pipeline is exhausted
            if not out_blobs:
                break  # all pipelines are done — normal termination
            blob_uuids = [blob.meta.uuid for blob in out_blobs]
            # All pipelines must still be alive and must agree on which
            # input element (UUID) they just processed.
            if (len(out_blobs) != len(self.pipelines)
                    or blob_uuids.count(blob_uuids[0]) != len(blob_uuids)):
                logging.error("Number of elements changed within "
                              "ParallelAlgorithm pipelines. "
                              "This is not allowed!")
                raise Exception("Error")
            b = Blob()
            b.data = hstack([blob.data.ravel() for blob in out_blobs])
            b.meta = out_blobs[0].meta
            yield b
    logging.info("Finished training in ParallelAlgorithm")
def _train(self, blob_generator):
    """Collect all input features, fit ``self.model``, and yield scores.

    The whole input stream is materialized first. On the first blob the
    storage format (scipy CSR vs. dense ndarray) is chosen automatically by
    comparing memory footprints; the choice is cached in ``self.use_sparse``.

    Args:
        blob_generator: iterable of Blobs; ``blob.data`` is raveled to a 1-D
            feature vector, ``blob.meta.label`` is the training label.

    Yields:
        Blob per input element with the model's decision-function value as
        data and the original meta.

    Raises:
        Exception: if the feature vectors do not all have the same length.
    """
    # First, collect all elements of the input.
    data = []
    labels = []
    metas = []
    for blob in blob_generator:
        if self.use_sparse is None:
            # Decide once, on the first blob, whether CSR storage is
            # actually smaller than the dense representation.
            sparse_vec = scipy.sparse.csr_matrix(blob.data.ravel())
            sparse_memory_req = (sparse_vec.data.nbytes
                                 + sparse_vec.indptr.nbytes
                                 + sparse_vec.indices.nbytes)
            self.use_sparse = sparse_memory_req < blob.data.nbytes
            logging.debug('Using sparse format for collecting features: %s',
                          self.use_sparse)
            logging.debug('Blob data needs %i', blob.data.nbytes)
            logging.debug('%i with sparse vs %i with dense',
                          sparse_memory_req, blob.data.nbytes)
        if self.use_sparse:
            data.append(scipy.sparse.csr_matrix(blob.data.ravel()))
        else:
            data.append(blob.data.ravel())
        labels.append(blob.meta.label)
        metas.append(blob.meta)

    # Stack data to a matrix explicitly here, as both fit and predict
    # would do this stacking otherwise.
    try:
        if self.use_sparse:
            data = scipy.sparse.vstack(data)
            data = data.astype(np.float64)
        else:
            data = np.array(data, dtype=np.float64)
    except ValueError as err:
        logging.error(
            "Length of all feature vectors need to be the same for Classificator training."
        )
        # Chain the original ValueError so the root cause stays visible.
        raise Exception(
            "Length of all feature vectors need to be the same for "
            "Classificator training.") from err

    logging.warning(
        'Training the model with feature dim %i, this might take a while',
        data.shape[1])
    self.model.fit(data, labels)
    logging.warning('Finished')

    # Re-emit one blob per input, carrying the model's score.
    for (d, m) in zip(self.model.decision_function(data), metas):
        b = Blob()
        b.data = d
        b.meta = m
        yield b
def _train(self, blob_generator):
    """Template: gather the whole input at once, process it, re-emit blobs.

    Demonstrates the collect-then-yield pattern: keep every meta so each
    output blob can be paired with its input again.
    """
    collected = []
    for blob in blob_generator:
        # Remember feature vector, label, and meta for every element.
        collected.append((blob.data.ravel(), blob.meta.label, blob.meta))
    features = [entry[0] for entry in collected]
    metas = [entry[2] for entry in collected]
    stacked = vstack(features)
    # process `stacked` here
    # ...
    # Create the generator consumed by the next layer.
    for feature, meta in zip(features, metas):
        out = Blob()
        out.data = feature
        out.meta = meta
        yield out
def _train(self, blob_generator):
    """Collect all bias-augmented inputs, fit the SVM, and yield predictions.

    Args:
        blob_generator: iterable of Blobs; ``blob.data`` is raveled,
            extended with a bias term via ``self._add_bias``, and
            ``blob.meta.label`` provides the training label.

    Yields:
        Blob per input element with the SVM prediction as data and the
        original meta.

    Raises:
        Exception: if the input vectors do not all have the same size.
    """
    # First, collect all elements of the input.
    data = []
    labels = []
    metas = []
    for blob in blob_generator:
        data.append(self._add_bias(blob.data.ravel()))
        labels.append(blob.meta.label)
        metas.append(blob.meta)

    try:
        data = vstack(data)
    except ValueError as err:
        logging.error(
            "Size of all input data needs to be the same for SVM training."
        )
        # Chain the original ValueError so the root cause stays visible.
        raise Exception(
            "Size of all input data needs to be the same for SVM training."
        ) from err

    self.svm_model.fit(data, labels)

    # Re-emit one blob per input, carrying the model's prediction.
    for (d, m) in zip(self.svm_model.predict(data), metas):
        b = Blob()
        b.data = d
        b.meta = m
        yield b