コード例 #1
0
 def __init__(self, X, y, dataset_name, X_val=None, dont_persist_results=False):
     self.trials = []
     self.trials_store = TrialsRecorder(dataset_name)
     self.X = X
     self.y = y
     self.X_val = X_val
     self.dont_persist_results = dont_persist_results
コード例 #2
0
class ParameterSweep:
    def __init__(self, X, y, dataset_name, X_val=None, dont_persist_results=False):
        self.trials = []
        self.trials_store = TrialsRecorder(dataset_name)
        self.X = X
        self.y = y
        self.X_val = X_val
        self.dont_persist_results = dont_persist_results

    @staticmethod
    def getParamsGenerator(params):
        params_keys = list(params)
        for params_values in itertools.product(*params.values()):
            yield dict(zip(params_keys, params_values))

    @staticmethod
    def getRandomParams(params):
        arr = list(ParameterSweep.getParamsGenerator(params))
        random.shuffle(arr)
        return arr

    def run_configuration(self, Clf, params, scorer, splits, use_proba=False, save_y_oof=False):
        scores = []
        t0 = time()

        clf = Clf(**params)

        if save_y_oof:
            y_oof = np.zeros(len(self.y))
        if self.X_val is not None:
            y_vals = []

        for i, (train_idx, test_idx) in enumerate(splits):
            t_i = time()
            X_train, X_test, y_train, y_test = self.X[train_idx], self.X[test_idx], self.y[train_idx], self.y[test_idx]

            clf.fit(X_train, y_train)

            y_pred = clf.predict_proba(X_test)[:,1] if use_proba else clf.predict(X_test)

            if save_y_oof:
                y_oof[list(test_idx)] = y_pred
            if self.X_val is not None:
                y_vals.append(clf.predict_proba(self.X_val)[:,1] if use_proba else clf.predict(self.X_val))


            scores.append(scorer(y_test, y_pred))
            logger.debug(" iter {0}: {1:2.2} sec \t score: {2:.4},  running: {3:.4}".format(i, time()-t_i, scores[-1], np.mean(scores)))

        stats = {"score": np.mean(scores), "clf": Clf.__name__, "params": params, "num_folds": len(splits),
                 "train_size": X_train.shape[0], "test_size": X_test.shape[0], "sample_size": X_train.shape[0] + X_test.shape[0]}

        if save_y_oof:
            stats["y_oof"] = y_oof
        if self.X_val is not None:
            stats["y_val"] = np.mean(y_vals, axis=0) if use_proba else mode(y_vals).mode

        logger.info(" time: {0:.2f} sec,  score: {1:.4f} +/- {2:.4f}, stats: {3}".format(time()-t0, np.mean(scores), np.std(scores), stats))

        self.trials.append(stats)

        if not self.dont_persist_results:
            self.trials_store.saveTrial(stats)

        return stats

    def run_configuration_sweep(self, Clf, param_values, scorer, splits, search_plan, use_proba=False, save_y_oof=False, num_samples=10):
        params_samples = []
        if search_plan == "grid":
            params_samples = list(ParameterSweep.getParamsGenerator(param_values))
        if search_plan == "random":
            params_samples = ParameterSweep.getRandomParams(param_values)[:num_samples]

        stats = []
        for params in params_samples:
            stats.append(self.run_configuration(Clf, params, scorer, splits, use_proba, save_y_oof))

        return stats

    def get_trials(self, query=None):
        return [x for x in self.trials_store.getAllTrials(query)]

    def clear_all_trials(self):
        self.trials_store.clear_all_trials()