def gen_var_result_scores(clf, X_train, X_test, y_train=None, n_rounds=100,
                          verbose=False, fit=True):
    """Return per-sample variable importances on the test data.

    For every variable (column), the column of ``X_test`` is repeatedly
    replaced by values drawn at random (with replacement) from the same
    column of ``X_train``. The importance of the variable for a sample is
    the absolute difference between the score observed on the untouched
    sample and the mean score obtained over the resampling rounds.

    Parameters
    ----------
    clf : classifier
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the ``predict_proba`` output is used as the score.
    X_train : numpy array
        Training data; indexed as ``X_train[:, var]`` to draw replacement
        values.
    X_test : numpy array
        Data to explain. It is copied before being modified, so the
        caller's array is left untouched.
    y_train : array-like, optional
        Training targets, only passed to ``clf.fit`` when ``fit`` is true.
    n_rounds : int, default = 100
        Number of resampling rounds per variable. Must be at least 1.
    verbose : bool, default = False
        Print training time and show a progress bar.
    fit : bool, default = True
        Fit ``clf`` on ``(X_train, y_train)`` before scoring.

    Returns
    -------
    result_scores : numpy array with the shape of ``X_test``
        ``result_scores[i, var]`` is the importance of variable ``var`` for
        test sample ``i``.

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1 (the mean over zero rounds would
        otherwise silently yield NaN scores).
    """
    if n_rounds < 1:
        raise ValueError('n_rounds must be at least 1, got %r' % (n_rounds,))

    if fit:
        if verbose:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Training model...')
            sys.stdout.flush()
            t0 = time.time()
        clf.fit(X_train, y_train)
        if verbose:
            t1 = time.time()
            print('Training took %.2f seconds' % (t1 - t0))

    real_scores = clf.predict_proba(X_test)[:, 1]
    n_samples = X_test.shape[0]
    n_vars = X_train.shape[1]
    result_scores = np.zeros(X_test.shape)

    if verbose:
        pb = ProgressBar()
        progress = 0
        total_steps = float(n_rounds * n_vars)

    for var in range(n_vars):
        single_var_scores = np.zeros([n_samples, n_rounds])
        # Fresh copy per variable so only column ``var`` is ever perturbed.
        X_test_mod = np.copy(X_test)
        for j in range(n_rounds):
            if verbose:
                progress += 1
                pb.update_progress(progress / total_steps)
            X_test_mod[:, var] = np.random.choice(
                X_train[:, var], n_samples, replace=True)
            single_var_scores[:, j] = clf.predict_proba(X_test_mod)[:, 1]
        result_scores[:, var] = np.abs(
            real_scores - np.mean(single_var_scores, axis=1))
    return result_scores
def partial_plot(clf, X_, x_name, labels, n_points=100, lims=None,
                 n_samp=1000, categorical=False):
    """Compute a one-feature partial-dependence curve on random subsamples.

    For every grid value, column ``x_name`` of a copy of ``X_`` is set to
    that value, ``n_samp`` rows are drawn at random (with replacement) and
    the mean of ``clf.predict_proba(...)[1]`` over those rows is recorded.

    Parameters
    ----------
    clf : classifier
        Object whose ``predict_proba(values, labels=...)`` is called;
        element ``[1]`` of its result is averaged.
        NOTE(review): presumably the class-1 probabilities -- confirm
        against the classifier's return layout.
    X_ : DataFrame-like
        Must support ``.copy()``, ``X[x_name]`` get/set and ``.values``
        (presumably a pandas DataFrame). The input is not modified.
    x_name : column key
        Feature to vary.
    labels : numpy array
        Indexed with the sampled row indices and forwarded to
        ``predict_proba``.
    n_points : int, default = 100
        Number of grid values (see ``categorical``).
    lims : (min, max) pair, optional
        Range of the grid. Defaults to the observed min/max of the column.
    n_samp : int, default = 1000
        Number of rows sampled per grid value to reduce evaluation time.
    categorical : bool, default = False
        If true the grid alternates between the two limits,
        ``int(n_points / 2)`` times, instead of being a linear ramp.

    Returns
    -------
    x : numpy array
        Grid values that were evaluated.
    p : numpy array
        Mean prediction for each grid value.
    """
    X = X_.copy()
    N = len(X)

    # ``is None`` rather than ``== None``: the latter is element-wise (and
    # therefore ambiguous in a boolean context) when lims is a numpy array.
    if lims is None:
        x_min = X[x_name].min()
        x_max = X[x_name].max()
    else:
        x_min, x_max = lims[0], lims[1]

    if categorical:
        x = np.array([x_min, x_max] * int(n_points / 2.))
    else:
        x = np.linspace(x_min, x_max, n_points)

    # Float denominator based on the real grid length: avoids integer
    # division on Python 2 and stays correct when the categorical grid has
    # fewer than n_points entries.
    n_steps = float(len(x))

    p = []
    pb = ProgressBar()
    for i, x_i in enumerate(x):
        X[x_name] = x_i  # scalar is broadcast over the whole column
        # Sub-sample to reduce the time needed to evaluate the model.
        _idx = np.random.randint(N, size=n_samp)
        p.append(
            clf.predict_proba(X.values[_idx], labels=labels[_idx])[1].mean(0))
        pb.update_progress((i + 1) / n_steps)
    return x, np.array(p)
def partial_dependence(clf, data, cols, percentiles=(5, 95), cluster=True,
                       n_clusters=100, n_points=100, logit=False,
                       mesh_grid_output=False, show_progress=True,
                       verbose=False):
    """Partial dependence of ``cols``.

    Partial dependence plots show the marginal effects of one or more
    features on the model output.

    Parameters
    ----------
    clf : sklearn classifier
        A fitted classifier object exposing ``predict_proba``; the second
        column of its output is averaged.
    data : numpy array
        Array containing columns as features and rows as examples. It is
        copied, so the caller's array is never modified. Non-floating data
        is converted to float so grid values are not truncated.
    cols : list of ints (or a single int)
        Features to compute partial dependence over.
    percentiles : (lower, upper) tuple, default = (5, 95)
        Range for ``cols`` to be computed over, given in percentiles of
        their distributions.
    cluster : bool, default = True
        If true, data is clustered using K-means and the cluster centers
        are used to compute partial dependence rather than the data. This
        dramatically reduces computation time at little cost in accuracy.
    n_clusters : int, default = 100
        Number of K-means clusters to compute.
    n_points : int, default = 100
        Number of points of the linear ramp built per feature. The last
        point (the upper percentile itself) is dropped, so each feature is
        evaluated at ``n_points - 1`` values. If the number of features is
        2 or more this number should be reduced.
    logit : bool, default = False
        If true, perform a logit transformation on the model outputs.
    mesh_grid_output : bool, default = False
        If true, output is given as numpy mesh grids rather than flat
        arrays. This option is useful for 2D and 3D plots.
    show_progress : bool, default = True
        Show a progress bar.
    verbose : bool, default = False
        Output level, forwarded to K-means.

    Returns
    -------
    res : list of numpy arrays
        ``res[:-1]`` contains the feature values evaluated (one array per
        entry of ``cols``) while ``res[-1]`` is the partial dependence. By
        default the arrays are 1D; if ``mesh_grid_output=True`` the list
        contains mesh grids instead.
    """
    if np.isscalar(cols):
        cols = [cols]

    X = np.array(data)  # copy; the caller's data must stay untouched
    if not np.issubdtype(X.dtype, np.floating):
        # Grid values are written into X below; an integer array would
        # silently truncate them.
        X = X.astype(float)

    Z_mins = np.percentile(X[:, cols], percentiles[0], axis=0)
    Z_maxs = np.percentile(X[:, cols], percentiles[1], axis=0)
    grids = [
        np.linspace(z_jmin, z_jmax, n_points)[:-1]
        for z_jmin, z_jmax in zip(Z_mins, Z_maxs)
    ]
    # list(...): meshgrid returns a tuple on recent NumPy versions and the
    # result is concatenated with a list at the end.
    Z_mesh = list(np.meshgrid(*grids))
    # np.stack needs a real sequence; generators are rejected by NumPy.
    Z = np.stack([z_j.ravel() for z_j in Z_mesh], axis=-1)

    if cluster:
        X = KMeans(n_clusters, verbose=int(verbose)).fit(X).cluster_centers_

    N_Z = len(Z)
    # Only build the progress bar when it is going to be displayed.
    pb = ProgressBar() if show_progress else None

    p = []
    for i, z_i in enumerate(Z):
        X[:, cols] = z_i  # broadcast the grid point over every row
        probs = clf.predict_proba(X).T[1]
        if logit:
            probs = np.log(probs / (1 - probs))
        p.append(probs.mean(0))
        if show_progress:
            # Float division so the fraction is meaningful on Python 2 too.
            pb.update_progress((i + 1) / float(N_Z))

    Y = np.array(p)
    if mesh_grid_output:
        Y_res = Y.reshape(Z_mesh[0].shape)
        Z_res = Z_mesh
    else:
        Z_res = [Z_i.flatten() for Z_i in Z_mesh]
        Y_res = Y
    return Z_res + [Y_res]