Code Example #1
def plot_violin_groups(data_pop_list,
                       labels=None,
                       title="",
                       xlabel="",
                       ylabel="",
                       filename=None,
                       dpi=150,
                       orientation='vertical',
                       samefig=False,
                       xlim=None,
                       ylim=None,
                       palette=None):
    # Create our own figure unless the caller asks us to draw into the current one
    if not samefig:
        fig = P.figure()
    else:
        fig = P.gcf()

    if labels is None:
        labels = ["pop %d" % i for i in range(len(data_pop_list))]

    # Must assemble the data into a single 1D vector of all populations with repeating labels
    data = DD.DataDict()
    for array, label in zip(data_pop_list, labels):
        data.append('label', [label] * len(array))
        data.append('value', array)

    if "vert" in orientation.lower():
        x = data['label']
        y = data['value']
    else:
        x = data['value']
        y = data['label']

    SB.violinplot(x=x, y=y, palette=palette)

    if title is not None: P.title(title)
    if xlabel is not None: P.xlabel(xlabel)
    if ylabel is not None: P.ylabel(ylabel)
    if xlim is not None: P.xlim(xlim)
    if ylim is not None: P.ylim(ylim)

    mlib.plot.markplot()

    if "vert" in orientation.lower():
        mlib.plot.disable_axis_offset(x_axis=False)
    else:
        mlib.plot.disable_axis_offset(y_axis=False)

    if filename is not None: P.savefig(filename, dpi=dpi)

    return fig
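
A minimal usage sketch (the snippet assumes the project's module-level aliases: P for pylab/pyplot, SB for seaborn, DD for the project's DataDict module, and mlib's plotting helpers; the group names and output filename below are illustrative only):

import numpy as N

N.random.seed(0)
group_a = N.random.normal(0.0, 1.0, size=500)
group_b = N.random.normal(1.5, 0.7, size=500)

fig = plot_violin_groups([group_a, group_b],
                         labels=["control", "treatment"],
                         title="Violin comparison of two populations",
                         ylabel="value",
                         filename="violins.png")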
Code Example #2
File: cluster.py  Project: JPLMLIA/MLIB
def DBSCAN(indata, eps=0.3, min_samples=10, algorithm='auto', leaf_size=30):
    from sklearn.cluster import DBSCAN as dbscan_clustering

    # Handle incoming dictionary
    if isinstance(indata, dict):
        indata = DD.DataDict(indata).as_array()
    # Handle incoming DataDict
    elif isinstance(indata, DD.DataDict):
        indata = indata.as_array()
    # Handle incoming list of arrays
    elif isinstance(indata, list):
        indata = N.array(indata).T

    # Perform clustering
    dbfit = dbscan_clustering(eps=eps,
                              min_samples=min_samples,
                              algorithm=algorithm,
                              leaf_size=leaf_size).fit(indata)
    # Extract mask for core points and cluster labels from object
    mask_corepoints = N.zeros_like(dbfit.labels_, dtype=bool)
    mask_corepoints[dbfit.core_sample_indices_] = True
    cluster_labels = dbfit.labels_

    return cluster_labels, mask_corepoints
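
A minimal usage sketch on two synthetic 2D blobs (illustrative data only; N is numpy, as elsewhere in the project):

import numpy as N

N.random.seed(0)
blob_a = N.random.normal(0.0, 0.1, size=(100, 2))
blob_b = N.random.normal(1.0, 0.1, size=(100, 2))
points = N.vstack([blob_a, blob_b])

cluster_labels, mask_corepoints = DBSCAN(points, eps=0.3, min_samples=10)

# sklearn labels noise points as -1, so exclude it when counting clusters
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(n_clusters, mask_corepoints.sum())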
Code Example #3
File: regression.py  Project: JPLMLIA/MLIB
def mvlr_lasso(X, Y, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, precompute=True, max_iter=-1,
               positive=False, selection='random', random_state=None, return_rms=True):
    """ Performs an iterative sparse linear fit. Impractically slow for any decent data size, alas.

    Args:
        X     : A 1 or 2D array containing the independent vectors
        Y     : A 1D array containing the values to regress
        alpha : regularization term, 0 = no regularization (normal MVLR), 1 = sparsity enforcement, higher increases strictness
        fit_intercept: Permit a non-zero intercept, useful if data isn't zero-centered beforehand
        normalize    : Normalize data before fitting
        copy_X       : Make a copy of X to protect against modification, but runs more slowly and takes more memory
        precompute   : Precompute Gram matrix to speed up calculations
        max_iter     : Maximum number of iterations to permit, -1 does not limit
        positive     : Forces coefficients to be positive combinations
        selection    : 'random' updates coefficients in random order and usually converges faster than the default 'cyclic' order
        random_state : Random seed to use, None means use current state
        return_rms   : Predict the training data and measure training rms (slower)

    Returns:
        coefficients: Linear weights on each independent variable
        intercept   : Fitted intercept term
        rms         : Training RMS error (only returned when return_rms is True)
        y_predict   : Prediction on the training data (only returned when return_rms is True)

    >>> X = N.array([[0,1,2,3,4,5,6,7,8,9],[1,0,1,0,1,0,1,0,1,0],[1,0,0,1,0,0,1,0,0,1]]).astype(N.float64).T

    >>> X.shape
    (10, 3)

    >>> X
    array([[ 0.,  1.,  1.],
           [ 1.,  0.,  0.],
           [ 2.,  1.,  0.],
           [ 3.,  0.,  1.],
           [ 4.,  1.,  0.],
           [ 5.,  0.,  0.],
           [ 6.,  1.,  1.],
           [ 7.,  0.,  0.],
           [ 8.,  1.,  0.],
           [ 9.,  0.,  1.]])

    >>> Y = N.array( 2*X[:,0] + 5*X[:,1] + 10 )
    >>> Y
    array([ 15.,  12.,  19.,  16.,  23.,  20.,  27.,  24.,  31.,  28.])

    No sparsity enforcement

    >>> N.random.seed(0)
    >>> coefficients, intercept, rms, Y_pred = mvlr_lasso(X, Y, alpha = 0.01, fit_intercept = True, copy_X = True, return_rms = True, positive = False)
    >>> coefficients[0]
    1.9974444...
    >>> coefficients[1]
    4.9556657...
    >>> coefficients[2]
    -0.0

    >>> print('{:.10f}'.format(intercept))
    10.0336672510

    >>> print('{:.10f}'.format(rms))
    0.0221045772

    >>> Y_pred
    array([ 14.989333  ,  12.03111167,  18.98422183,  16.0260005 ,
            22.97911067,  20.02088933,  26.9739995 ,  24.01577817,
            30.96888833,  28.010667  ])

    Strong Sparsity enforcement

    >>> N.random.seed(0)
    >>> coefficients, intercept, rms, Y_pred = mvlr_lasso(X, Y, alpha = 2.0, fit_intercept = True, copy_X = True, return_rms = True, positive = False)
    >>> coefficients
    array([ 1.60606061,  0.        ,  0.        ])

    >>> intercept
    14.272727272727273

    >>> rms
    2.5584085962673258

    >>> Y_pred
    array([ 14.27272727,  15.87878788,  17.48484848,  19.09090909,
            20.6969697 ,  22.3030303 ,  23.90909091,  25.51515152,
            27.12121212,  28.72727273])

    """

    Y = N.array(Y).squeeze()

    from sklearn.linear_model import Lasso

    # Ensure train_feature_data is an Ndarray (samples, feats)

    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    # Enforce X, Y match dimension
    if X.shape[0] != len(Y):
        raise Exception('Must provide a regression target for each sample; dimension mismatch between X %s and Y %s' %
                        (str(X.shape), str(len(Y))))

    # Perform fit
    regressor = Lasso(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X,
                      precompute=precompute,
                      max_iter=max_iter, positive=positive, selection=selection, random_state=random_state)

    regressor.fit(X, Y)

    coefficients = regressor.coef_
    intercept = regressor.intercept_

    # How well did we do?
    if return_rms:
        Y_predict = regressor.predict(X)
        rms = NUM.rms(Y_predict - Y)
        return coefficients, intercept, rms, Y_predict
    else:
        return coefficients, intercept
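
The doctests above exercise the full return_rms path; when only the fit itself is needed, a minimal sketch (reusing the doctest data) looks like this:

import numpy as N

X = N.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
             [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
             [1, 0, 0, 1, 0, 0, 1, 0, 0, 1]]).astype(N.float64).T
Y = 2 * X[:, 0] + 5 * X[:, 1] + 10

coefficients, intercept = mvlr_lasso(X, Y, alpha=0.01, return_rms=False)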
Code Example #4
def QDA_train(X,
              Y,
              reg_param=0.0,
              priors=None,
              store_covariance=False,
              tol=0.0001):
    """ Trains a quadratic discriminant analysis classifier from sklearn and returns the classifier. Extremely fast, even with 200k x 39 feats
    Args:
        X           : 2D Narray (numsamps x numfeats) feature data
        Y           : 1D iterable of class labels
        reg_param   : Regularization parameter (float, 0-1). Reg term becomes (1-reg_param)*sigma + reg_param*N.eye(n_feat)
        priors      : Prior probability of classes. Narray(num_classes)
        store_covariance: If True, compute and store the per-class covariance matrices on the classifier
        tol         : Stopping condition, threshold used for rank estimation in SVD solver

    Returns:
        trained_classifier

    # >>> X = [[0, 0], [1, 1]]
    # >>> Y = [0, 1]
    # >>> classifier = QDA_train(X, Y)

    # >>> print(classifier.predict([[0, 0]]))
    # [0]

    # >>> print(classifier.predict([[1, 1]]))
    # [1]

    # >>> print(classifier.predict([[-0.8, -1],[0.8,1]]))
    # [0 1]

    Test 200k example
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = N.linspace(0,3.99999,numsamp).astype(int)
    >>> classifier = QDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([3])

    Test 200k, binary classification
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = (N.linspace(0,.9999999,numsamp) + 0.5).astype(int)
    >>> classifier = QDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([1])

    """

    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    # Ensure X is an Ndarray (samples, feats)

    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    # Perform fit to training data
    QDA = QuadraticDiscriminantAnalysis(reg_param=reg_param,
                                        priors=priors,
                                        store_covariance=store_covariance,
                                        tol=tol)

    QDA.fit(X, Y)

    return QDA
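
The small doctests above are commented out; a minimal usage sketch on a toy two-class problem (illustrative data only) would be:

import numpy as N

N.random.seed(0)
class_a = N.random.normal(0.0, 1.0, size=(100, 2))
class_b = N.random.normal(3.0, 1.0, size=(100, 2))
X = N.vstack([class_a, class_b])
Y = N.array([0] * 100 + [1] * 100)

classifier = QDA_train(X, Y, reg_param=0.1)
print(classifier.predict([[0.0, 0.0], [3.0, 3.0]]))   # expected: [0 1]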
Code Example #5
def LDA_train(X,
              Y,
              solver='svd',
              shrinkage=None,
              priors=None,
              n_components=None,
              store_covariance=False,
              tol=0.0001):
    """ Trains an LDA classifier from sklearn and returns the classifier. Extremely fast, even with 200k x 39 feats
    Args:
        X           : 2D Narray (numsamps x numfeats) feature data
        Y           : 1D iterable of class labels
        solver      : Solver type to use: 'svd', 'lsqr','eigen'
        shrinkage   : A number 0-1 that varies between empirical covariance matrix (0) and diagonal matrix of individual variances (1).
                      Used for cases when number of samples < number_features, where empirical covariance is unreliable.
                      Ignored for svd, default None, 'auto' uses Ledoit-Wolf lemma, otherwise float between 0-1
        priors      : Prior probability of classes. Narray(num_classes)
        n_components: Number of components (output planes) for dimensionality reduction (< n_classes -1)
                      Typically this is 1 to use direct LDA best-separating plane, but can be used like PCA to return top N separating planes
        store_covariance: If True, compute and store the per-class covariance matrices on the classifier
        tol         : Stopping condition, threshold used for rank estimation in SVD solver

    Returns:
        trained_classifier

    # >>> X = [[0, 0], [1, 1]]
    # >>> Y = [0, 1]
    # >>> classifier = LDA_train(X, Y)

    # >>> print(classifier.predict([[0, 0]]))
    # [0]

    # >>> print(classifier.predict([[1, 1]]))
    # [1]

    # >>> print(classifier.predict([[-0.8, -1],[0.8,1]]))
    # [0 1]

    Test 200k example
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = N.linspace(0,4,numsamp).astype(int)
    >>> classifier = LDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([3])

    Get out the hyperplane definitions
    >>> separating_slopes = classifier.coef_.squeeze()
    >>> separating_means  = classifier.means_

    >>> separating_slopes.shape
    (5, 3)

    >>> separating_slopes
    array([[  2.70440523e-02,  -3.80858158e-02,   2.41489887e-02],
           [ -2.53418850e-02,   2.79816369e-03,   6.21784910e-03],
           [ -7.77643702e-03,   1.32056000e-02,  -3.35055363e-02],
           [  6.01276895e-03,   2.21513897e-02,   3.08449021e-03],
           [  3.08105498e+00,  -3.44472837e+00,   2.71349840e+00]])

    >>> separating_means.shape
    (5, 3)

    >>> separating_means
    array([[ 0.50161415,  0.4972585 ,  0.50199664],
           [ 0.49726039,  0.50065976,  0.50050506],
           [ 0.49872434,  0.50152527,  0.49718801],
           [ 0.49987251,  0.50227821,  0.5002453 ],
           [ 0.75526211,  0.21382493,  0.72614352]])


    Test 200k, binary classification
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = (N.linspace(0,.9999999,numsamp) + 0.5).astype(int)
    >>> classifier = LDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([1])

    Get out the hyperplane definitions
    >>> separating_slopes     = classifier.coef_.squeeze()
    >>> separating_intercepts = classifier.intercept_

    Get the class means
    >>> class_means  = classifier.means_

    >>> separating_slopes.shape
    (3,)

    >>> separating_slopes
    array([-0.00170067,  0.03528707, -0.03036636])

    >>> separating_intercepts.shape
    (1,)

    >>> separating_intercepts
    array([-0.00162669])

    >>> class_means.shape
    (2, 3)

    >>> class_means
    array([[ 0.49943727,  0.49895913,  0.50125085],
           [ 0.49930098,  0.50189886,  0.49871892]])

    """

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # Ensure X is an Ndarray (samples, feats)

    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    # output: coefs (n_features,) or (n_classes, n_features)
    # output: intercept (n_features,)
    # output: covariance matrix
    # output: explained_variance_ratio (n_components,) The percentage of variance explained by each selected component. Only for eigen solver.
    # output: means (n_classes, n_features)
    # output: priors (n_classes,)
    # output: scalings (rank, n_classes-1), scaling of features in the space spanned by the class centroids
    # output: xbar(n_features,) overall mean
    # output: classes (n_classes) Unique class labels

    # Perform fit to training data
    LDA = LinearDiscriminantAnalysis(solver=solver,
                                     shrinkage=shrinkage,
                                     priors=priors,
                                     n_components=n_components,
                                     store_covariance=store_covariance,
                                     tol=tol)

    LDA.fit(X, Y)

    return LDA
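
A minimal usage sketch with a non-default solver and automatic shrinkage (helpful when samples are scarce relative to the number of features); the data are illustrative only:

import numpy as N

N.random.seed(0)
class_a = N.random.normal(0.0, 1.0, size=(20, 5))
class_b = N.random.normal(2.0, 1.0, size=(20, 5))
X = N.vstack([class_a, class_b])
Y = N.array([0] * 20 + [1] * 20)

classifier = LDA_train(X, Y, solver='lsqr', shrinkage='auto')
separating_slopes = classifier.coef_.squeeze()
print(classifier.predict([[0.0] * 5, [2.0] * 5]))   # expected: [0 1]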