Example #1
import sys
import time

import numpy as np

# `ProgressBar` is assumed to be a local utility exposing an
# update_progress(fraction) method; it is not a standard-library class.


def gen_var_result_scores(clf, X_train, X_test, y_train=None,
                          n_rounds=100, verbose=False, fit=True):
    """
    Given classifier and training data, return variable importances on the
    test data on a per-sample basis. UPDATED.
    
    Result scores represent the difference between the observed score and
    the mean score obtained if the specific variable is resampled from the
    training data randomly.
    """
    if fit:
        if verbose:
            print('Training model...')
            sys.stdout.flush()
            t0 = time.time()
        clf.fit(X_train, y_train)
        if verbose:
            t1 = time.time()
            print('Training took %.2f seconds' % (t1 - t0))
    real_scores = clf.predict_proba(X_test)[:, 1]
    result_scores = np.zeros(X_test.shape)
    if verbose:
        pb = ProgressBar()
        progress = 0
    for var in range(X_train.shape[1]):
        single_var_scores = np.zeros([X_test.shape[0], n_rounds])
        X_test_mod = np.copy(X_test)
        for j in range(n_rounds):
            if verbose:
                progress += 1
                pb.update_progress(progress / float(n_rounds * X_train.shape[1]))
            # Resample the current variable's values from the training data.
            X_test_mod[:, var] = np.random.choice(X_train[:, var],
                                                  X_test.shape[0],
                                                  replace=True)
            single_var_scores[:, j] = clf.predict_proba(X_test_mod)[:, 1]
        result_scores[:, var] = np.abs(
            real_scores - np.mean(single_var_scores, axis=1))
    return result_scores
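
A minimal usage sketch (the synthetic data and RandomForestClassifier below are illustrative assumptions, not part of the original example):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = RandomForestClassifier(n_estimators=50, random_state=0)
scores = gen_var_result_scores(clf, X_train, X_test, y_train=y_train,
                               n_rounds=20)
# scores has shape (n_test_samples, n_features); larger values mean the
# sample's predicted probability is more sensitive to that feature.
print(scores.shape)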
Example #2
import numpy as np
from sklearn.cluster import KMeans

# As above, `ProgressBar` is assumed to be a local utility with an
# update_progress(fraction) method.


def partial_plot(clf,
                 X_,
                 x_name,
                 labels,
                 n_points=100,
                 lims=None,
                 n_samp=1000,
                 categorical=False):
    X = X_.copy()
    N = len(X)
    if lims is None:
        x_min = X[x_name].min()
        x_max = X[x_name].max()
    else:
        x_min = lims[0]
        x_max = lims[1]
    if categorical:
        x = np.array([x_min, x_max] * int(n_points / 2.))
    else:
        x = np.linspace(x_min, x_max, n_points)
    p = []
    pb = ProgressBar()
    for i, x_i in enumerate(x):
        X[x_name] = [x_i] * N
        _idx = np.random.randint(
            N, size=n_samp)  # subsample to reduce evaluation time
        p.append(
            clf.predict_proba(X.values[_idx], labels=labels[_idx])[1].mean(0))
        pb.update_progress(i / n_points)
    return x, np.array(p)
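
# A hypothetical call pattern for partial_plot (it assumes a classifier whose
# predict_proba accepts the `labels` keyword used above, and that X_ is a
# pandas DataFrame containing column x_name):
#   x, p = partial_plot(clf, X_df, 'age', labels=y_labels)
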
def partial_dependence(clf,
                       data,
                       cols,
                       percentiles=(5, 95),
                       cluster=True,
                       n_clusters=100,
                       n_points=100,
                       logit=False,
                       mesh_grid_output=False,
                       show_progress=True,
                       verbose=False):
    """Partial dependence of ``cols``.
    
    Partial dependence plots show the marginal effects of one or more features on 
    the model output. 
    
    
    Parameters
    ----------
    
    clf : Sklean classifier class,
        A fitted sklearn classifier object
        
    data : Numpy array of data,
        Numpy array containing columns as features and 
        rows as examples
        
    cols : List of ints, 
        Features to comput partial dependence over. 
    
    percentiles : (max, min) range tuple, default = (5, 95),
        Range for cols to be computed over, given in percentiles
        of distributions. 
        
    cluster : Boolean, default = True, 
        If true data is clustered using K-means. Cluster centers are
        used to compute partial dependence rather than data. This 
        dramatically reduces computation time at little cost in 
        accuracy. 
        
    n_clusters : int, default = 100,
        Number of K-means clusters to compute. 
        
    n_points : int, default = 100, 
        Number of grid points used to calcualte partial dependence per feature. 
        If number of featues is 2 or more this number should be reduced. 
        
    logit : Boolean, default = False, 
        If true perform logit transformation on model outputs. 
        
    mesh_grid_output : Boolean, default = False, 
        If true ouput is given as numpy mesh grid rather than
        flat arrays. This option is useful for 2D and 3D plots.
        
    show_progress : Boolean, default = True, 
        Show progress bar. 
        
    verbose : Boolean, default = False, 
        output level. 
    
    
    
    Returns
    -------
    
    res : List of numpy arrays,
        This list contains the feature values evaluated at along with 
        partial dependence results. res[:-1] contains cols points while res[-1]
        is the partial dependence. By default arrays are 1D however if 
        grid_mesh_output = True the returned list will contain meshgirds.
        
     """
    X = data.copy()

    Z_mins = np.percentile(X[:, cols], percentiles[0], axis=0)
    Z_maxs = np.percentile(X[:, cols], percentiles[1], axis=0)

    # Build the evaluation grid over the selected columns (the linspace
    # endpoint is dropped, so each axis has n_points - 1 values).
    Z_mesh = np.meshgrid(*[
        np.linspace(z_jmin, z_jmax, n_points)[:-1]
        for z_jmin, z_jmax in zip(Z_mins, Z_maxs)
    ])
    Z = np.stack([z_j.flatten() for z_j in Z_mesh], axis=-1)

    if cluster:
        # Replace the data with K-means cluster centers to cut the number
        # of rows the classifier must score.
        X = KMeans(n_clusters, verbose=int(verbose)).fit(X).cluster_centers_

    N, N_Z = len(X), len(Z)

    pb = ProgressBar()
    p = []
    for i, z_i in enumerate(Z):
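        # Fix the selected columns to the current grid point for every row.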
        X[:, cols] = np.tile(z_i, (N, 1))

        probs = clf.predict_proba(X).T[1]
        if logit:
            probs = np.log(probs / (1 - probs))
        p.append(probs.mean(0))

        if show_progress:
            pb.update_progress(i / N_Z)

    Y = np.array(p)

    if mesh_grid_output:
        Y_res = Y.reshape(Z_mesh[0].shape)
        Z_res = Z_mesh
    else:
        Z_res = [Z_i.flatten() for Z_i in Z_mesh]
        Y_res = Y

    res = Z_res + [Y_res]

    return res
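
A minimal usage sketch for partial_dependence (the classifier and synthetic data are illustrative assumptions, not part of the original; show_progress=False avoids the ProgressBar dependency):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=2000, n_features=6, random_state=0)
clf = GradientBoostingClassifier(random_state=0).fit(X, y)

# 1D partial dependence of feature 0.
x0, pd0 = partial_dependence(clf, X, cols=[0], n_points=50,
                             n_clusters=50, show_progress=False)

# 2D partial dependence of features 0 and 1, returned as meshgrids,
# e.g. for a contour plot.
x0m, x1m, pd2d = partial_dependence(clf, X, cols=[0, 1], n_points=20,
                                    n_clusters=50, mesh_grid_output=True,
                                    show_progress=False)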