コード例 #1
0
    def setUpClass(cls):
        """Prepare the data and an Incremental-wrapped StreamingRFC to test."""
        # The value returned by _prep_data replaces ``cls``; every attribute
        # below (and the super() call) operates on that returned object.
        cls = cls._prep_data(cls)

        forest = StreamingRFC(n_estimators_per_chunk=1,
                              max_n_estimators=39,
                              verbose=1)
        cls.mod = Incremental(forest)

        # Expected number of estimators after fitting.
        # Data dependent, so this value has to be maintained by hand.
        cls.expected_n_estimators = 10

        # Let the parent class set its helper values.
        super().setUpClass()
コード例 #2
0
    def setUpClass(cls):
        """Prepare the data and an Incremental-wrapped StreamingRFC to test."""
        # The value returned by _prep_data replaces ``cls``; every attribute
        # below (and the super() call) operates on that returned object.
        cls = cls._prep_data(cls)

        # No cap on the total number of estimators (np.inf).
        forest = StreamingRFC(n_estimators_per_chunk=20,
                              n_jobs=-1,
                              max_n_estimators=np.inf,
                              verbose=1)
        cls.mod = Incremental(forest)

        # Expected number of estimators after fitting.
        cls.expected_n_estimators = 200

        # Let the parent class set its helper values.
        super().setUpClass()
コード例 #3
0
    def setUpClass(cls):
        """Create a blobs dataset and a bare StreamingRFC to test."""
        cls.n_samples = 1000

        # Settings for the synthetic two-centre dataset.
        blob_options = dict(n_samples=int(2e4),
                            random_state=0,
                            n_features=40,
                            centers=2,
                            cluster_std=100)
        cls.x, cls.y = sklearn.datasets.make_blobs(**blob_options)

        cls.mod = StreamingRFC(n_estimators_per_chunk=1, max_n_estimators=39)

        # Expected number of estimators after fitting.
        cls.expected_n_estimators = 39

        # Let the parent class set its helper values.
        super().setUpClass()
コード例 #4
0
    def setUpClass(cls):
        """Store the feeding settings on the class and build the model from them."""
        # Settings are kept as class attributes and then passed to the model.
        cls.spf_n_fits = 10
        cls.spf_sample_prop = 0.1
        cls.dask_feeding = False
        cls.n_estimators_per_sample = 10

        model_options = dict(verbose=1,
                             n_estimators_per_chunk=cls.n_estimators_per_sample,
                             max_n_estimators=np.inf,
                             dask_feeding=cls.dask_feeding,
                             spf_sample_prop=cls.spf_sample_prop,
                             spf_n_fits=cls.spf_n_fits)
        cls.mod = StreamingRFC(**model_options)

        super().setUpClass()
コード例 #5
0
    def setUpClass(cls):
        """Prepare data and two randomized searches: StreamingRFC and RandomForestClassifier."""
        cls._prep_data(cls)

        n_iter = 3

        # Settings common to both searches.
        shared_search_options = dict(scoring='roc_auc',
                                     verbose=2,
                                     n_jobs=3,
                                     cv=4)

        # The streaming search samples twice as many candidates as the RFC search.
        cls.srfc_grid = RandomizedSearchCV(StreamingRFC(n_jobs=2, verbose=1),
                                           param_distributions=SRFCGRID,
                                           n_iter=n_iter * 2,
                                           **shared_search_options)

        cls.rfc_grid = RandomizedSearchCV(RandomForestClassifier(n_jobs=2),
                                          param_distributions=RFCGRID,
                                          n_iter=n_iter,
                                          **shared_search_options)
コード例 #6
0
    def setUpClass(cls):
        """Set up a chunked dask blobs dataset and the StreamingRFC to test.

        The sample count and chunk size are passed to ``make_blobs`` as
        explicit ints; float literals such as ``2e5`` do not match the
        integer sizes the function documents.
        """
        cls.n_samples = 1000
        cls.x, cls.y = dask_ml.datasets.make_blobs(n_samples=int(2e5),
                                                   chunks=int(1e4),
                                                   random_state=0,
                                                   n_features=40,
                                                   centers=2,
                                                   cluster_std=100)

        # max_features is set to the number of columns in x; no cap on the
        # total number of estimators (np.inf).
        cls.mod = StreamingRFC(n_estimators_per_chunk=1,
                               max_features=cls.x.shape[1],
                               max_n_estimators=np.inf)

        # Set expected number of estimators
        cls.expected_n_estimators = 100

        # Set helper values
        super().setUpClass()
コード例 #7
0
def run_on_blobs():
    """Fit an Incremental-wrapped StreamingRFC on a synthetic blobs dataset.

    Generates a three-centre dask blobs dataset, converts features and labels
    to dask dataframes, prints the row count and fits the model.
    """
    # Sizes are passed as explicit ints rather than float literals (1e8, 1e5),
    # matching the integer sizes make_blobs documents.
    x, y = dask_ml.datasets.make_blobs(n_samples=int(1e8),
                                       chunks=int(1e5),
                                       random_state=0,
                                       centers=3)

    x = dd.dataframe.from_array(x)
    y = dd.dataframe.from_array(y)

    print(f"Rows: {x.shape[0].compute()}")

    ests_per_chunk = 4

    srfc = Incremental(StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
                                    max_n_estimators=np.inf,
                                    verbose=1,
                                    n_jobs=4))
    # The class labels are computed up front and handed to fit.
    srfc.fit(x, y,
             classes=y.unique().compute())
コード例 #8
0
    def _generate_comparable_models(self,
                                    srfc_n_estimators_per_chunk: int,
                                    srfc_n_partial_fit_calls: int,
                                    srfc_sample_prop: float,
                                    n_jobs: int = 4):
        """
        Store the streaming settings and build the models that are compared against each other.

        Two RandomForestClassifiers are created so that each streaming set up has a reference model that sees
        an equivalent number of rows. One reference sees the equivalent of all the data once, the other sees
        more. Between them they are intended to cover these srfc combinations:

        - "Manual" feeding (using .partial_fit):
          - Sequential: sees all the data once (sample size is n / n_partial_fit_calls) per estimator.
          - Random: sees sample_prop * n * n_partial_fit per estimator.
        - "Auto" feeding:
          - spf (dask_feeding=False): sees sample_prop * n * n_partial_fit per estimator.
          - dask (dask_feeding=True): sees all the data once (sample size is n / n_partial_fit_calls)
                   per estimator (?) - this still needs to be verified.

        The models are stored on self as rfc, rfc_once, srfc, srfc_spf and srfc_dask.

        :param srfc_n_estimators_per_chunk: Number of estimators per chunk.
        :param srfc_n_partial_fit_calls: Number of calls to partial fit. Either used for manual-sequential or
                                         manual-random feeding, or supplied to fit to handle. For
                                         manual-sequential, the sample size is set dynamically to split the data
                                         into this number of chunks (all data is seen once).
        :param srfc_sample_prop: The proportion of data to sample when feeding .partial_fit with manual-random,
                                 or when using .fit.
        :param n_jobs: n_jobs value passed to every model created here.
        :return: None.
        """
        self.srfc_n_estimators_per_chunk = srfc_n_estimators_per_chunk
        self.srfc_n_partial_fit_calls = srfc_n_partial_fit_calls
        self.srfc_sample_prop = srfc_sample_prop

        # Estimators * partial fit calls; basis for both reference forests.
        estimators_times_calls = (srfc_n_estimators_per_chunk
                                  * srfc_n_partial_fit_calls)

        # RFC sized to see, overall, the same number of rows as the srfc using spf.
        self.rfc_n_estimators = int(estimators_times_calls * srfc_sample_prop)

        # RFC sized to see the same number of rows as the models that see the data once.
        self.rfc_once_n_estimators = int(estimators_times_calls)

        self.rfc = RandomForestClassifier(
            n_estimators=self.rfc_n_estimators,
            n_jobs=n_jobs)

        self.rfc_once = RandomForestClassifier(
            n_estimators=self.rfc_once_n_estimators,
            n_jobs=n_jobs)

        # One srfc serves both "manual-sequential" and "manual-random":
        # the parameters are the same and the object is cloned before fitting.
        self.srfc = StreamingRFC(
            n_estimators_per_chunk=srfc_n_estimators_per_chunk,
            n_jobs=n_jobs)

        # "Auto-spf" srfc
        self.srfc_spf = StreamingRFC(
            dask_feeding=False,
            n_estimators_per_chunk=srfc_n_estimators_per_chunk,
            spf_n_fits=srfc_n_partial_fit_calls,
            spf_sample_prop=srfc_sample_prop,
            n_jobs=n_jobs)

        # "Auto-dask" srfc
        self.srfc_dask = StreamingRFC(
            dask_feeding=True,
            n_estimators_per_chunk=srfc_n_estimators_per_chunk,
            n_jobs=n_jobs)
コード例 #9
0
 def setUp(self):
     """Create a fresh StreamingRFC for each test."""
     # No cap on the total number of estimators (np.inf).
     model_options = dict(n_estimators_per_chunk=1,
                          max_n_estimators=np.inf,
                          verbose=2)
     self.mod = StreamingRFC(**model_options)