def setUpClass(cls):
    """Prepare data and build the Incremental-wrapped model under test."""
    cls = cls._prep_data(cls)

    cls.mod = Incremental(
        StreamingRFC(n_estimators_per_chunk=1,
                     max_n_estimators=39,
                     verbose=1))

    # Set expected number of estimators.
    # This should be set manually depending on data.
    cls.expected_n_estimators = 10

    # Set helper values
    super().setUpClass()
def setUpClass(cls):
    """Prepare data and build a parallel, unbounded Incremental model to test."""
    cls = cls._prep_data(cls)

    cls.mod = Incremental(
        StreamingRFC(n_estimators_per_chunk=20,
                     n_jobs=-1,
                     max_n_estimators=np.inf,
                     verbose=1))

    # Set expected number of estimators
    cls.expected_n_estimators = 200

    # Set helper values
    super().setUpClass()
def setUpClass(cls):
    """Set up model to test on an sklearn blobs dataset."""
    # NOTE(review): n_samples here (1000) differs from the int(2e4) rows
    # generated below -- confirm which value downstream helpers rely on.
    cls.n_samples = 1000

    cls.x, cls.y = sklearn.datasets.make_blobs(n_samples=int(2e4),
                                               random_state=0,
                                               n_features=40,
                                               centers=2,
                                               cluster_std=100)

    cls.mod = StreamingRFC(n_estimators_per_chunk=1,
                           max_n_estimators=39)

    # Set expected number of estimators
    cls.expected_n_estimators = 39

    # Set helper values
    super().setUpClass()
def setUpClass(cls):
    """Set up model to test with sample-proportion feeding (no dask)."""
    # Sub-sample feeding configuration: 10 partial fits, each on 10% of the data.
    cls.spf_n_fits = 10
    cls.spf_sample_prop = 0.1
    cls.dask_feeding = False
    cls.n_estimators_per_sample = 10

    cls.mod = StreamingRFC(verbose=1,
                           n_estimators_per_chunk=cls.n_estimators_per_sample,
                           max_n_estimators=np.inf,
                           dask_feeding=cls.dask_feeding,
                           spf_sample_prop=cls.spf_sample_prop,
                           spf_n_fits=cls.spf_n_fits)

    super().setUpClass()
def setUpClass(cls):
    """Prepare data and build randomized-search wrappers for both model types."""
    cls._prep_data(cls)

    n_iter = 3

    # Streaming model searches twice as many candidates as the plain RFC.
    cls.srfc_grid = RandomizedSearchCV(
        StreamingRFC(n_jobs=2, verbose=1),
        param_distributions=SRFCGRID,
        scoring='roc_auc',
        n_iter=n_iter * 2,
        verbose=2,
        n_jobs=3,
        cv=4)

    cls.rfc_grid = RandomizedSearchCV(
        RandomForestClassifier(n_jobs=2),
        param_distributions=RFCGRID,
        scoring='roc_auc',
        n_iter=n_iter,
        verbose=2,
        n_jobs=3,
        cv=4)
def setUpClass(cls):
    """Set up model to test on a large dask blobs dataset.

    Builds a chunked dask dataset and a StreamingRFC adding one estimator
    per chunk, considering all features at each split.
    """
    # NOTE(review): n_samples here (1000) does not match the 2e5 rows actually
    # generated below -- confirm which value downstream helpers rely on.
    cls.n_samples = 1000

    # n_samples/chunks are counts and must be integral; cast the float
    # literals explicitly, matching the sklearn-based setup elsewhere in
    # this file (which uses int(2e4)).
    cls.x, cls.y = dask_ml.datasets.make_blobs(n_samples=int(2e5),
                                               chunks=int(1e4),
                                               random_state=0,
                                               n_features=40,
                                               centers=2,
                                               cluster_std=100)

    cls.mod = StreamingRFC(n_estimators_per_chunk=1,
                           max_features=cls.x.shape[1],
                           max_n_estimators=np.inf)

    # Set expected number of estimators
    cls.expected_n_estimators = 100

    # Set helper values
    super().setUpClass()
def run_on_blobs():
    """Fit an Incremental-wrapped StreamingRFC on a large dask blobs dataset."""
    # n_samples/chunks are counts and must be integral, so cast the float
    # literals explicitly.
    x, y = dask_ml.datasets.make_blobs(n_samples=int(1e8),
                                       chunks=int(1e5),
                                       random_state=0,
                                       centers=3)
    x = dd.dataframe.from_array(x)
    y = dd.dataframe.from_array(y)

    print(f"Rows: {x.shape[0].compute()}")

    ests_per_chunk = 4
    srfc = Incremental(StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
                                    max_n_estimators=np.inf,
                                    verbose=1,
                                    n_jobs=4))

    # Classes must be supplied up front for incremental (partial) fitting.
    srfc.fit(x, y, classes=y.unique().compute())
def _generate_comparable_models(self,
                                srfc_n_estimators_per_chunk: int,
                                srfc_n_partial_fit_calls: int,
                                srfc_sample_prop: float,
                                n_jobs: int = 4):
    """
    Set values for streaming models and different set ups.

    Create two comparable rfcs designed to see equivalent numbers of rows.
    Two RFCs are required to compare to different settings. One should see
    the equivalent of all the data once, the other should see more.

    This should cover the following srfc model combinations:
      - "Manual" feeding (using .partial_fit):
          - Sequential: Will see all the data once (sample size is
            n / n_partial_fit_calls) per estimator
          - Random: Will see sample_prop * n * n_partial_fit per estimator
      - "Auto" feeding
          - spf: (dask_feeding==False). Will see sample_prop * n * n_partial_fit
            per estimator.
          - dask: (dask_feeding=True). Will see all the data once (sample size
            is n / n_partial_fit_calls) per estimator (?) Need to verify this.

    :param srfc_n_estimators_per_chunk: Number of estimators per chunk.
    :param srfc_n_partial_fit_calls: Number of calls to partial fit. Either
        used manual-sequential or manual-random, or supplied to fit to
        handle. In the case of manual-sequential, the size of the sample is
        set dynamically to split the data into this number of chunks (all
        data is seen once).
    :param srfc_sample_prop: The proportion of data to sample in when feeding
        .partial_fit with manual-random or by using .fit.
    :return:
    """
    self.srfc_n_estimators_per_chunk = srfc_n_estimators_per_chunk
    self.srfc_n_partial_fit_calls = srfc_n_partial_fit_calls
    self.srfc_sample_prop = srfc_sample_prop

    # Number of estimators for RFC: set so overall it will see an equal
    # number of rows to the srfc using spf.
    self.rfc_n_estimators = int(self.srfc_n_estimators_per_chunk
                                * self.srfc_n_partial_fit_calls
                                * self.srfc_sample_prop)
    # Another RFC that will see the same number of rows as the models that
    # see the data once.
    self.rfc_once_n_estimators = int(self.srfc_n_estimators_per_chunk
                                     * self.srfc_n_partial_fit_calls)

    self.rfc = RandomForestClassifier(n_estimators=self.rfc_n_estimators,
                                      n_jobs=n_jobs)
    self.rfc_once = RandomForestClassifier(
        n_estimators=self.rfc_once_n_estimators,
        n_jobs=n_jobs)

    # "Manual-sequential" and "manual-random" srfc.
    # Parameters are the same and object is cloned before fitting.
    self.srfc = StreamingRFC(
        n_estimators_per_chunk=self.srfc_n_estimators_per_chunk,
        n_jobs=n_jobs)

    # "Auto-spf" srfc
    self.srfc_spf = StreamingRFC(
        dask_feeding=False,
        n_estimators_per_chunk=self.srfc_n_estimators_per_chunk,
        spf_n_fits=self.srfc_n_partial_fit_calls,
        spf_sample_prop=self.srfc_sample_prop,
        n_jobs=n_jobs)

    # "Auto-dask" srfc
    self.srfc_dask = StreamingRFC(
        dask_feeding=True,
        n_estimators_per_chunk=self.srfc_n_estimators_per_chunk,
        n_jobs=n_jobs)
def setUp(self):
    """Create a fresh unbounded one-estimator-per-chunk model for each test."""
    self.mod = StreamingRFC(n_estimators_per_chunk=1,
                            max_n_estimators=np.inf,
                            verbose=2)