def plot_violin_groups(data_pop_list, labels=None, title="", xlabel="", ylabel="", filename=None,
                       dpi=150, orientation='vertical', samefig=False, xlim=None, ylim=None, palette=None):
    """ Plots one violin per population on shared axes.

    Args:
        data_pop_list: list of 1D arrays, one per population
        labels       : population names for the group axis; defaults to "pop 0", "pop 1", ...
        orientation  : 'vertical' or 'horizontal' violins
        samefig      : draw into the current figure instead of creating a new one
        filename     : if provided, save the figure to this path at the given dpi

    Returns:
        fig: the figure drawn into
    """

    # Create our own figure unless user requests we don't
    if not samefig:
        fig = P.figure()
    else:
        # Draw into (and return) the current figure so the return value is always defined
        fig = P.gcf()

    if labels is None:
        labels = ["pop %d" % i for i in range(len(data_pop_list))]

    # Must assemble the data into a single 1D vector of all populations with repeating labels
    data = DD.DataDict()
    for array, label in zip(data_pop_list, labels):
        data.append('label', [label, ] * len(array))
        data.append('value', array)

    if "vert" in orientation.lower():
        x = data['label']
        y = data['value']
    else:
        x = data['value']
        y = data['label']

    SB.violinplot(x=x, y=y, palette=palette)

    if title is not None:
        P.title(title)
    if xlabel is not None:
        P.xlabel(xlabel)
    if ylabel is not None:
        P.ylabel(ylabel)
    if xlim is not None:
        P.xlim(xlim)
    if ylim is not None:
        P.ylim(ylim)

    mlib.plot.markplot()

    if "vert" in orientation.lower():
        mlib.plot.disable_axis_offset(x_axis=False)
    else:
        mlib.plot.disable_axis_offset(y_axis=False)

    if filename is not None:
        P.savefig(filename, dpi=dpi)

    return fig
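# Usage sketch for plot_violin_groups (illustrative only, not part of the library API):
# compares two synthetic populations side by side. The population names and the output
# filename 'violin_demo.png' are hypothetical; assumes numpy is imported as N at module
# level, as elsewhere in this module.
def _example_plot_violin_groups():
    N.random.seed(0)
    pop_a = N.random.normal(0.0, 1.0, 500)   # hypothetical "control" population
    pop_b = N.random.normal(1.0, 2.0, 500)   # hypothetical "treatment" population
    return plot_violin_groups([pop_a, pop_b],
                              labels=['control', 'treatment'],
                              title='Synthetic comparison',
                              ylabel='value',
                              filename='violin_demo.png')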
def DBSCAN(indata, eps=0.3, min_samples=10, algorithm='auto', leaf_size=30):
    """ Clusters indata with sklearn's density-based DBSCAN algorithm.

    Args:
        indata     : 2D Narray (numsamps x numfeats), dict, DataDict, or list of feature arrays
        eps        : Maximum distance between two samples for them to be considered neighbors
        min_samples: Number of neighbors required for a point to qualify as a core point
        algorithm  : Nearest-neighbor search method: 'auto', 'ball_tree', 'kd_tree', or 'brute'
        leaf_size  : Leaf size passed to the tree-based neighbor searches

    Returns:
        cluster_labels : 1D Narray of cluster labels per sample; -1 marks noise points
        mask_corepoints: 1D boolean Narray, True for core points
    """

    from sklearn.cluster import DBSCAN as dbscan_clustering

    # Handle incoming dictionary
    if isinstance(indata, dict):
        indata = DD.DataDict(indata).as_array()
    # Handle incoming DataDict
    elif isinstance(indata, DD.DataDict):
        indata = indata.as_array()
    # Handle incoming list of arrays
    elif isinstance(indata, list):
        indata = N.array(indata).T

    # Perform clustering
    dbfit = dbscan_clustering(eps=eps, min_samples=min_samples, algorithm=algorithm,
                              leaf_size=leaf_size).fit(indata)

    # Extract mask for core points and cluster labels from object
    mask_corepoints = N.zeros_like(dbfit.labels_, dtype=bool)
    mask_corepoints[dbfit.core_sample_indices_] = True
    cluster_labels = dbfit.labels_

    return cluster_labels, mask_corepoints
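# Usage sketch for DBSCAN (illustrative only): two well-separated synthetic blobs plus
# two far-away outliers; the outliers should come back labeled -1 (noise). Assumes
# numpy is imported as N at module level.
def _example_DBSCAN():
    N.random.seed(0)
    blob_a = N.random.normal(0.0, 0.05, (100, 2))
    blob_b = N.random.normal(1.0, 0.05, (100, 2))
    points = N.vstack([blob_a, blob_b, [[5.0, 5.0], [-5.0, -5.0]]])
    labels, core_mask = DBSCAN(points, eps=0.3, min_samples=10)
    # labels: 0 and 1 for the two blobs, -1 for the two isolated outliers
    # core_mask: True for points with at least min_samples neighbors within eps
    return labels, core_mask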
def mvlr_lasso(X, Y, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, precompute=True,
               max_iter=-1, positive=False, selection='random', random_state=None, return_rms=True):
    """ Performs an iterative sparse linear fit. Impractically slow for any decent data size, alas.

    Args:
        X            : A 1D or 2D array containing the independent vectors
        Y            : A 1D array containing the values to regress
        alpha        : Regularization term: 0 = no regularization (ordinary MVLR), 1 = sparsity
                       enforcement, and higher values increase strictness
        fit_intercept: Permit a non-zero intercept, useful if data isn't zero-centered beforehand
        normalize    : Normalize data before fitting
        copy_X       : Make a copy of X to protect against modification, at the cost of speed and memory
        precompute   : Precompute the Gram matrix to speed up calculations
        max_iter     : Maximum number of iterations to permit; -1 does not limit
        positive     : Force coefficients to be positive
        selection    : 'random' coefficient updates typically converge much faster than 'cyclic' ones
        random_state : Random seed to use; None means use current state
        return_rms   : Predict the training data and measure training rms (slower)

    Returns:
        coefficients: Linear weights on each independent variable
        intercept   : Fitted intercept term
        rms         : If requested via return_rms, the training rms
        Y_predict   : If requested via return_rms, the prediction on the training data

    >>> X = N.array([[0,1,2,3,4,5,6,7,8,9],[1,0,1,0,1,0,1,0,1,0],[1,0,0,1,0,0,1,0,0,1]]).astype(N.float64).T
    >>> X.shape
    (10, 3)
    >>> X
    array([[ 0.,  1.,  1.],
           [ 1.,  0.,  0.],
           [ 2.,  1.,  0.],
           [ 3.,  0.,  1.],
           [ 4.,  1.,  0.],
           [ 5.,  0.,  0.],
           [ 6.,  1.,  1.],
           [ 7.,  0.,  0.],
           [ 8.,  1.,  0.],
           [ 9.,  0.,  1.]])
    >>> Y = N.array( 2*X[:,0] + 5*X[:,1] + 10 )
    >>> Y
    array([ 15.,  12.,  19.,  16.,  23.,  20.,  27.,  24.,  31.,  28.])

    No sparsity enforcement
    >>> N.random.seed(0)
    >>> coefficients, intercept, rms, Y_pred = mvlr_lasso(X, Y, alpha = 0.01, fit_intercept = True, copy_X = True, return_rms = True, positive = False)
    >>> coefficients[0]
    1.9974444...
    >>> coefficients[1]
    4.9556657...
    >>> coefficients[2]
    -0.0
    >>> print('{:.10f}'.format(intercept))
    10.0336672510
    >>> print('{:.10f}'.format(rms))
    0.0221045772
    >>> Y_pred
    array([ 14.989333  ,  12.03111167,  18.98422183,  16.0260005 ,
            22.97911067,  20.02088933,  26.9739995 ,  24.01577817,
            30.96888833,  28.010667  ])

    Strong sparsity enforcement
    >>> N.random.seed(0)
    >>> coefficients, intercept, rms, Y_pred = mvlr_lasso(X, Y, alpha = 2.0, fit_intercept = True, copy_X = True, return_rms = True, positive = False)
    >>> coefficients
    array([ 1.60606061,  0.        ,  0.        ])
    >>> intercept
    14.272727272727273
    >>> rms
    2.5584085962673258
    >>> Y_pred
    array([ 14.27272727,  15.87878788,  17.48484848,  19.09090909,
            20.6969697 ,  22.3030303 ,  23.90909091,  25.51515152,
            27.12121212,  28.72727273])
    """

    from sklearn.linear_model import Lasso

    # Ensure X is an Ndarray (samples, feats)
    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    Y = N.array(Y).squeeze()

    # Enforce X, Y match dimension
    if X.shape[0] != len(Y):
        raise Exception('Must provide a regression target for each sample; dimension mismatch between X %s & Y %s' %
                        (str(X.shape), str(len(Y))))

    # Perform fit
    regressor = Lasso(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X,
                      precompute=precompute, max_iter=max_iter, positive=positive, selection=selection,
                      random_state=random_state)
    regressor.fit(X, Y)

    coefficients = regressor.coef_
    intercept = regressor.intercept_

    # How well did we do?
    if return_rms:
        Y_predict = regressor.predict(X)
        rms = NUM.rms(Y_predict - Y)
        return coefficients, intercept, rms, Y_predict
    else:
        return coefficients, intercept
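# Usage sketch for mvlr_lasso (illustrative only): shows that the returned coefficients
# and intercept reproduce the regressor's prediction via a plain dot product. An explicit
# max_iter is passed here as a cautious assumption, since some sklearn versions reject
# the unlimited (-1) default.
def _example_mvlr_lasso():
    N.random.seed(0)
    X = N.random.random((100, 3))
    Y = 2 * X[:, 0] + 5 * X[:, 1] + 10
    coefficients, intercept, rms, Y_pred = mvlr_lasso(X, Y, alpha=0.01, max_iter=1000)
    # Manual prediction on new data: y = X_new . coefficients + intercept
    X_new = N.random.random((5, 3))
    Y_new = N.dot(X_new, coefficients) + intercept
    return Y_new, rms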
def QDA_train(X, Y, reg_param=0.0, priors=None, store_covariance=False, tol=0.0001):
    """ Trains a quadratic discriminant analysis classifier from sklearn and returns the classifier.
    Extremely fast, even with 200k x 39 feats.

    Args:
        X               : 2D Narray (numsamps x numfeats) feature data
        Y               : 1D iterable of class labels
        reg_param       : Regularization parameter (float, 0-1). Regularizes the per-class covariance
                          estimates as (1-reg_param)*sigma + reg_param*N.eye(n_feat)
        priors          : Prior probabilities of the classes. Narray(num_classes)
        store_covariance: Compute and store the class covariance matrices
        tol             : Absolute threshold used for rank estimation in the SVD solver

    Returns:
        trained_classifier

    # >>> X = [[0, 0], [1, 1]]
    # >>> Y = [0, 1]
    # >>> classifier = QDA_train(X, Y)
    # >>> print(classifier.predict([[0, 0]]))
    # [0]
    # >>> print(classifier.predict([[1, 1]]))
    # [1]
    # >>> print(classifier.predict([[-0.8, -1],[0.8,1]]))
    # [0 1]

    Test 200k example
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = N.linspace(0,3.99999,numsamp).astype(int)
    >>> classifier = QDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([3])

    Test 200k, binary classification
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = (N.linspace(0,.9999999,numsamp) + 0.5).astype(int)
    >>> classifier = QDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([1])
    """

    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    # Ensure X is an Ndarray (samples, feats)
    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    # Perform fit to training data; pass the user's arguments through rather than hard-coded defaults
    QDA = QuadraticDiscriminantAnalysis(reg_param=reg_param, priors=priors,
                                        store_covariance=store_covariance, tol=tol)
    QDA.fit(X, Y)

    return QDA
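# Usage sketch for QDA_train (illustrative only): beyond hard class predictions, the
# returned sklearn classifier exposes predict_proba for class posterior probabilities.
# The synthetic two-class data here is hypothetical; assumes numpy as N.
def _example_QDA_train():
    N.random.seed(0)
    X = N.vstack([N.random.normal(0, 1, (500, 2)),
                  N.random.normal(3, 1, (500, 2))])
    Y = N.array([0] * 500 + [1] * 500)
    classifier = QDA_train(X, Y)
    probabilities = classifier.predict_proba([[1.5, 1.5]])  # shape (1, n_classes)
    return probabilities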
def LDA_train(X, Y, solver='svd', shrinkage=None, priors=None, n_components=None,
              store_covariance=False, tol=0.0001):
    """ Trains an LDA classifier from sklearn and returns the classifier.
    Extremely fast, even with 200k x 39 feats.

    Args:
        X           : 2D Narray (numsamps x numfeats) feature data
        Y           : 1D iterable of class labels
        solver      : Solver type to use: 'svd', 'lsqr', or 'eigen'
        shrinkage   : A number 0-1 that interpolates between the empirical covariance matrix (0) and a
                      diagonal matrix of individual variances (1). Useful when num_samples < num_features,
                      where the empirical covariance is unreliable. Ignored for the 'svd' solver.
                      Default None (no shrinkage); 'auto' uses the Ledoit-Wolf lemma; otherwise a float 0-1.
        priors      : Prior probabilities of the classes. Narray(num_classes)
        n_components: Number of components (output planes) for dimensionality reduction (<= n_classes - 1).
                      Typically 1 to use the single best-separating LDA plane, but can be used like PCA
                      to return the top N separating planes
        store_covariance: Compute and store the class covariance matrix
        tol         : Stopping condition, threshold used for rank estimation in the SVD solver

    Returns:
        trained_classifier

    # >>> X = [[0, 0], [1, 1]]
    # >>> Y = [0, 1]
    # >>> classifier = LDA_train(X, Y)
    # >>> print(classifier.predict([[0, 0]]))
    # [0]
    # >>> print(classifier.predict([[1, 1]]))
    # [1]
    # >>> print(classifier.predict([[-0.8, -1],[0.8,1]]))
    # [0 1]

    Test 200k example
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = N.linspace(0,4,numsamp).astype(int)
    >>> classifier = LDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([3])

    Get out the hyperplane definitions
    >>> separating_slopes = classifier.coef_.squeeze()
    >>> separating_means = classifier.means_
    >>> separating_slopes.shape
    (5, 3)
    >>> separating_slopes
    array([[  2.70440523e-02,  -3.80858158e-02,   2.41489887e-02],
           [ -2.53418850e-02,   2.79816369e-03,   6.21784910e-03],
           [ -7.77643702e-03,   1.32056000e-02,  -3.35055363e-02],
           [  6.01276895e-03,   2.21513897e-02,   3.08449021e-03],
           [  3.08105498e+00,  -3.44472837e+00,   2.71349840e+00]])
    >>> separating_means.shape
    (5, 3)
    >>> separating_means
    array([[ 0.50161415,  0.4972585 ,  0.50199664],
           [ 0.49726039,  0.50065976,  0.50050506],
           [ 0.49872434,  0.50152527,  0.49718801],
           [ 0.49987251,  0.50227821,  0.5002453 ],
           [ 0.75526211,  0.21382493,  0.72614352]])

    Test 200k, binary classification
    >>> numsamp = 200000
    >>> numfeat = 3
    >>> N.random.seed(0)
    >>> X = N.random.random((numsamp, numfeat))
    >>> Y = (N.linspace(0,.9999999,numsamp) + 0.5).astype(int)
    >>> classifier = LDA_train(X, Y)
    >>> N.random.seed(0)
    >>> classifier.predict(N.random.random(numfeat).reshape(1, -1))
    array([1])

    Get out the hyperplane definitions
    >>> separating_slopes = classifier.coef_.squeeze()
    >>> separating_intercepts = classifier.intercept_

    Get the class means
    >>> class_means = classifier.means_
    >>> separating_slopes.shape
    (3,)
    >>> separating_slopes
    array([-0.00170067,  0.03528707, -0.03036636])
    >>> separating_intercepts.shape
    (1,)
    >>> separating_intercepts
    array([-0.00162669])
    >>> class_means.shape
    (2, 3)
    >>> class_means
    array([[ 0.49943727,  0.49895913,  0.50125085],
           [ 0.49930098,  0.50189886,  0.49871892]])
    """

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    # Ensure X is an Ndarray (samples, feats)
    # Handle incoming dictionary
    if isinstance(X, dict):
        X = DD.DataDict(X).as_array()
    # Handle incoming DataDict
    elif isinstance(X, DD.DataDict):
        X = X.as_array()
    # Handle incoming list of arrays
    elif isinstance(X, list):
        X = N.array(X).T

    # Useful attributes on the fitted classifier:
    # output: coef_ (n_features,) or (n_classes, n_features)
    # output: intercept_ (n_classes,)
    # output: covariance_ matrix (if store_covariance)
    # output: explained_variance_ratio_ (n_components,) the percentage of variance explained by each selected component; only for the eigen solver
    # output: means_ (n_classes, n_features)
    # output: priors_ (n_classes,)
    # output: scalings_ (rank, n_classes-1), scaling of features in the space spanned by the class centroids
    # output: xbar_ (n_features,) overall mean
    # output: classes_ (n_classes,) unique class labels

    # Perform fit to training data; pass the user's arguments through rather than hard-coded defaults
    LDA = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage, priors=priors,
                                     n_components=n_components, store_covariance=store_covariance, tol=tol)
    LDA.fit(X, Y)

    return LDA
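# Usage sketch for LDA_train (illustrative only): using the trained classifier as a
# supervised dimensionality reducer via sklearn's standard transform method, which the
# default 'svd' solver supports. The synthetic labeling rule is hypothetical.
def _example_LDA_train():
    N.random.seed(0)
    X = N.random.random((1000, 5))
    Y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
    classifier = LDA_train(X, Y, n_components=1)
    # Project onto the single best-separating axis; result has shape (1000, 1)
    X_projected = classifier.transform(X)
    return X_projected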