Example #1
def factor_analysis(embedding_dataframe, fraction, n_components):
  """Projects the embeddings by factor analysis using negative controls.

  It would be interesting to explore factor analysis because it is a natural way
  to extract important latent features from the data, and PCA is actually a
  special case of factor analysis. When the variances of the error term in
  factor analysis are identical and go towards zero, the posterior estimate of
  the latent variables becomes exactly PCA.

  TVN is essentially PCA without dimension reduction. Compared with TVN, the
  drawback of factor analysis is that it requires specifying the number of
  latent variables. As an ad-hoc approach, I would suggest specifying it as the
  number of unique treatments.

  Args:
    embedding_dataframe: Pandas dataframe of the embeddings with each row as a
      sample.
    fraction: Fraction of negative control samples used to estimate parameters
      in factor analysis.
    n_components: Number of latent variables. If -1, specify n_components as
      the number of unique treatments.

  Returns:
     A Pandas dataframe with a reduced number of dimensions.
  """
  # specify the number of latent variables as the number of unique treatments,
  # excluding the negative control
  if n_components == -1:
    n_components = embedding_dataframe.reset_index()[[
        metadata.COMPOUND, metadata.CONCENTRATION
    ]].drop_duplicates().shape[0] - 1
  factor_analysis_object = decomposition.FactorAnalysis(
      n_components=n_components)
  factor_analysis_object.fit(
      get_negative_controls(embedding_dataframe).sample(frac=fraction,
                                                        axis=0).values)
  return pd.DataFrame(
      data=factor_analysis_object.transform(embedding_dataframe.values),
      index=embedding_dataframe.index)
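A hedged usage sketch for the function above. The `metadata.COMPOUND` / `metadata.CONCENTRATION` constants and the `get_negative_controls` helper belong to the surrounding project and are only assumed here; the embeddings are random placeholders.

import numpy as np
import pandas as pd

# Hypothetical data: 'EMPTY' stands in for the negative-control compound.
compounds = ['EMPTY', 'drugA', 'drugB']
rows = [(c, d) for c in compounds for d in (0.0, 1.0) for _ in range(20)]
index = pd.MultiIndex.from_tuples(
    rows, names=[metadata.COMPOUND, metadata.CONCENTRATION])
embeddings = pd.DataFrame(np.random.randn(len(rows), 32), index=index)

# n_components=-1 resolves to the number of unique treatments minus one (5 here).
projected = factor_analysis(embeddings, fraction=0.5, n_components=-1)
print(projected.shape)  # (120, 5)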
Example #2
def myFA(X, label_refine, label, n_components, max_iter=2):
    n = X.shape[0]
    if len(label_refine) != n:
        label_refine = [0] * n
        label = ['no ground truth']
        print('No ground truth provided in this dataset')
    label_refine = np.asarray(label_refine)  # allow boolean masking below

    estimator = decomposition.FactorAnalysis(n_components=n_components,
                                             max_iter=max_iter)
    t0 = time()
    X_fa = estimator.fit_transform(X)
    t1 = time()
    plt.figure(figsize=(30, 10))
    plt.suptitle(
        "Factor Analysis on a dataset with %i accepted experiments, each with %i covariates.\nClasses: %s"
        % (X.shape[0], X.shape[1], label),
        fontsize=24)

    k = len(label)

    for i in [1, 2]:
        plt.subplot(1, 2, i)
        plt.title("Factor Analysis (%.2g sec)" % (t1 - t0))
        for j, lab in zip(np.arange(k), label):
            plt.scatter(
                X_fa[label_refine == j, np.mod(i, 2)],
                X_fa[label_refine == j, np.mod(i, 2) + 1],
                cmap=plt.cm.Spectral,
                label=lab)
        plt.xlabel("Component %i" % (np.mod(i, 2) + 1), fontsize=14)
        plt.ylabel("Component %i" % (np.mod(i, 2) + 2), fontsize=14)
        plt.legend(loc=1)
        plt.axis()

    plt.show()
    components = estimator.components_

    return X_fa, components
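A brief, illustrative call on synthetic data (my own construction); it assumes the same module-level imports the function uses (numpy as np, matplotlib.pyplot as plt, time.time, sklearn.decomposition).

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(loc=0.0, size=(50, 6)),
               rng.normal(loc=3.0, size=(50, 6))])
label_refine = np.repeat([0, 1], 50)   # integer class id per sample
label = ['class 0', 'class 1']         # display name per class

X_fa, components = myFA(X, label_refine, label, n_components=3)
print(X_fa.shape, components.shape)    # (100, 3) and (3, 6)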
Example #3
            match = np.in1d(catalog["cluster_id"], selected_cluster_ids)
            K_trials = np.arange(1, np.max([2 * K_true, 10]))

            # Standard GMM
            K_best, converged, metrics = utils.converged_mixture_model(
                X[match], mixture.GaussianMixture, K_trials, **gmm_kwds)

            results.append((K_true, K_best, converged))

            running_delta_gmm += abs(K_true - K_best)

            print("Stantard GMM", results[-1], running_delta_gmm)

            # Now factor analysis.
            model = decomposition.FactorAnalysis(n_components=1)
            model = model.fit(X[match])

            # Now run a GMM on the transformed X data.
            X_transformed = model.transform(X[match])
            K_best, converged, metrics = utils.converged_mixture_model(
                X_transformed, mixture.GaussianMixture, K_trials)

            slf_results.append((K_true, K_best, converged))
            running_delta_slf += abs(K_true - K_best)

            print("SLF + GMM   ", slf_results[-1], running_delta_slf)

            if K_true > 5:
                # stop early for debugging (the original `raise a` raised a NameError)
                raise RuntimeError("K_true > 5")
Example #4
                                      batch_size=3,
                                      random_state=rng), True),
    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_components=15,
                                               alpha=0.1,
                                               n_iter=50,
                                               batch_size=3,
                                               random_state=rng), True),
    ('Cluster centers - MiniBatchKMeans',
     MiniBatchKMeans(n_clusters=n_components,
                     tol=1e-3,
                     batch_size=20,
                     max_iter=50,
                     random_state=rng), True),
    ('Factor Analysis components - FA',
     decomposition.FactorAnalysis(n_components=n_components,
                                  max_iter=20), True),
]

# #############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

# #############################################################################
# Do the estimation and plot it

for name, estimator, center in estimators:
    print("Extracting the top %d %s..." % (n_components, name))
    t0 = time()
    data = faces
    if center:
Example #5
data_lle.shape[1]
data_lle.shape[0]
err_lle

""" t-SNE """

tsne = manifold.TSNE(n_components=3)
data_tsne = tsne.fit_transform(d5)
data_tsne.shape[1]
tsne1 = data_tsne[:, 0]
tsne2 = data_tsne[:, 1]
tsne3 = data_tsne[:, 2]

""" Factor Analysis """

fa = decomposition.FactorAnalysis()
data_fa = fa.fit_transform(d5)
fa.components_[:,0]
fa.components_[:,1]

fa1 = data_fa[:,0]
fa2 = data_fa[:,1]
fa3 = data_fa[:,2]

fa.get_precision()
fa.loglike_

data_fa.shape[0]
data_fa.shape[1]

"""Non Negative Matrix Factorization """
Example #6
import numpy as np
import pandas as pd
from sklearn import decomposition as dc
from scipy.stats import zscore
import matplotlib.pyplot as plt
c = pd.read_excel("Pan11_1_1.xlsx", usecols=np.arange(1, 7))
c = c.values.astype(float)
d = zscore(c)  # standardize the data (z-scores)
r = np.corrcoef(d.T)  # correlation coefficient matrix
f = pd.ExcelWriter('Pan11_1_2.xlsx')
pd.DataFrame(r).to_excel(f)
f.save()
val, vec = np.linalg.eig(r)
cs = np.cumsum(val)  # cumulative sum of the eigenvalues
print("Eigenvalues:", val, "\nCumulative sum:", cs)
fa = dc.FactorAnalysis(n_components=2)  # build the model
fa.fit(d)  # fit the factor model
print("Loading matrix:\n", fa.components_)
print("Specific (noise) variances:\n", fa.noise_variance_)
dd = fa.fit_transform(d)  # compute the factor scores
w = val[:2] / sum(val[:2])  # weights of the two factors
df = np.dot(dd, w)  # overall factor score for each object
tf = np.sum(c, axis=1)  # raw total score for each object
# Build a DataFrame whose columns are factor-1 score, factor-2 score,
# overall factor score, raw total score, and object index
pdf = pd.DataFrame(np.c_[dd, df, tf, np.arange(1, 53)],
                   columns=['f1', 'f2', 'yf', 'tf', 'xh'])
spdf1 = pdf.sort_values(by='yf', ascending=False)  # sort by overall factor score, descending
spdf2 = pdf.sort_values(by='tf', ascending=False)  # sort by raw total score, descending
print("Ranking results:\n", spdf1, '\n', spdf2)
s = ['A' + str(i) for i in range(1, 53)]
plt.rc('font', family='SimHei')
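A small follow-up sketch (not part of the original script) showing how the loading matrix and specific variances printed above relate to the model covariance; it reuses the fitted `fa`, the z-scored data `d`, and the correlation matrix `r`.

# FA models the covariance of the standardized data as
# loadings.T @ loadings + diag(specific variances); sklearn exposes exactly
# this reconstruction via get_covariance().
model_cov = fa.components_.T @ fa.components_ + np.diag(fa.noise_variance_)
print(np.allclose(model_cov, fa.get_covariance()))  # True
# With only two factors this is an approximation of the correlation matrix r:
print(np.abs(model_cov - r).max())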
Example #7
X1[:,2]=100*R[:,2]  # v3 = 100*r3

print('cov(data):')
print(np.cov(X1.T))
print(' ')

#PCA:
pca=decomposition.PCA(n_components=2)
pca.fit(X1)
pca_comp = pca.components_
print('PCA:')
print(pca_comp)
print(' ')

#FA:
fa=decomposition.FactorAnalysis(n_components=2, max_iter=200)
fa.fit(X1)
fa_comp = fa.components_
print('Factor Analysis:')
print(fa_comp)
print(' ')


#%%############## PROBLEM 3.D ##########################

R2 = np.random.normal(size=(10**5, 3))

X2 = np.zeros((10**5, 3))
X2[:,0] = R2[:,0]       # v1 = r1
X2[:,1] = 20*R2[:,1]    # v2 = 20*r2
X2[:,2] = 200*R2[:,2]   # v3 = 200*r3
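The truncated problem above contrasts PCA and FA on coordinates with very different noise scales. A minimal, self-contained sketch of that contrast (my own synthetic construction, not the original assignment data):

import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
n = 5000
z = rng.normal(size=(n, 1))                         # one shared latent factor
W = np.array([[2.0, 2.0, 0.0]])                     # it loads on the first two coordinates
noise = rng.normal(size=(n, 3)) * [1.0, 1.0, 10.0]  # third coordinate is just loud noise
X = z @ W + noise

pca = decomposition.PCA(n_components=1).fit(X)
fa = decomposition.FactorAnalysis(n_components=1, max_iter=200).fit(X)

print('PCA component:', pca.components_)     # tends to follow the high-variance noisy coordinate
print('FA loadings:  ', fa.components_)      # roughly proportional to [2, 2, 0]
print('FA noise var: ', fa.noise_variance_)  # picks up the per-coordinate noise scales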
Example #8
def main():
    progname = os.path.basename(sys.argv[0])
    usage = """prog [options] <input stack> <output basis> [reprojections]
This too provides a variety of dimensionality reduction methods. This new version
uses scikit.learn, which provides a greater variety of algorithms, but must load 
all data into memory. If working with a large file, you may want to consider using
--step to operate on a limited subset of the data.

If specified, [reprojections] will contain projections of the full input stack
(ignoring --step) into the basis subspace represented as a single image. This 
obviates the need for e2basis.py, and permits use of nonlinear decompositions.

---
Performs multivariate statistical analysis on a stack of images. Writes
a set of Eigenimages which can be used as a basis set for reducing
the dimensionality of a data set (noise reduction). Typically this
basis set is then used to reproject the data (e2basis.py) and
classify the data based on the projected vectors. If the
output file supports arbitrary metadata (like HDF), Eigenvalues
are stored in the 'eigval' parameter in each image.

Note: The mean value is subtracted from each image prior to MSA
calculation. The mean image is stored as the first image in the output
file, though it is not part of the orthonormal basis when
handled this way."""

    parser = EMArgumentParser(usage=usage, version=EMANVERSION)

    parser.add_argument(
        "--mode",
        type=str,
        help="Mode should be one of: pca, sparsepca, fastica, factan, lda, nmf",
        default="pca")
    parser.add_argument(
        "--nomean",
        action="store_true",
        help="Suppress writing the average image as the first output image",
        default=False)
    parser.add_argument(
        "--nomeansub",
        action="store_true",
        help=
        "Suppress subtracting the mean from each input image, also implies --nomean",
        default=False)
    parser.add_argument("--nbasis",
                        "-n",
                        type=int,
                        help="Number of basis images to generate.",
                        default=20)
    parser.add_argument(
        "--maskfile",
        "-M",
        type=str,
        help=
        "File containing a mask defining the pixels to include in the Eigenimages"
    )
    parser.add_argument(
        "--projin",
        type=str,
        default=None,
        help=
        "When generating subspace projections, use this file instead of the input used for the MSA"
    )
    parser.add_argument(
        "--normproj",
        action="store_true",
        help=
        "When generating subspace projections, normalize each projection vector to unit length",
        default=False)
    parser.add_argument(
        "--mask",
        type=int,
        help=
        "Mask radius, negative values imply ny/2+1+mask, --mask=0 disables, --maskfile overrides",
        default=0)
    parser.add_argument(
        "--simmx",
        type=str,
        help=
        "Will use transformations from simmx on each particle prior to analysis"
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help=
        "Perform a careful normalization of input images before MSA. Otherwise normalization is not modified until after mean subtraction.",
        default=False)
    parser.add_argument(
        "--step",
        type=str,
        default="0,1",
        help=
        "Specify <init>,<step>[,last]. Processes only a subset of the input data. For example, 0,2 would process only the even numbered particles"
    )
    parser.add_argument(
        "--ppid",
        type=int,
        help="Set the PID of the parent process, used for cross platform PPID",
        default=-1)
    parser.add_argument(
        "--verbose",
        "-v",
        dest="verbose",
        action="store",
        metavar="n",
        type=int,
        default=0,
        help=
        "verbose level [0-9], higher number means higher level of verboseness")

    #parser.add_argument("--gui",action="store_true",help="Start the GUI for interactive boxing",default=False)
    #parser.add_argument("--boxsize","-B",type=int,help="Box size in pixels",default=-1)
    #parser.add_argument("--dbin","-D",type=str,help="Filename to read an existing box database from",default=None)

    (options, args) = parser.parse_args()
    if len(args) < 2: parser.error("Input and output filenames required")

    logid = E2init(sys.argv, options.ppid)

    if options.verbose > 0: print("Beginning MSA")

    # Number of images in the input file
    nfile = EMUtil.get_image_count(args[0])

    try:
        step = [int(i) for i in options.step.split(",")]
        if len(step) == 1: step = (0, step[0], nfile)
        elif len(step) == 2: step.append(nfile)
        elif len(step) == 3:
            if step[2] <= 0:
                step[2] += nfile  # undocumented negative final value permitted
        else:
            raise Exception
    except:
        print("Invalid --step specification")
        sys.exit(1)

    # setup mask image
    if options.maskfile:
        mask = EMData(options.maskfile, 0)
        if mask["mean_nonzero"] != 1.0:
            print("ERROR: maskfile must be a binary mask (1/0 only)")
            sys.exit(1)
    else:
        # default is no masking
        mask = EMData(args[0], 0)
        mask.to_one()
        # negative values handled by mask.sharp
        if options.mask != 0:
            mask.process_inplace("mask.sharp", {"outer_radius": options.mask})

    # Memory usage warning >2G raw data
    n = (step[2] - step[0]) // step[1]
    nval = int(mask["square_sum"])
    #	print(args[0],n,nval)
    if options.verbose or n * nval > 500000000:
        print("Estimated memory usage (mb): ", n * nval * 4 / 2**20)

    # Read all image data into numpy array
    if options.simmx: data = simmx_get(args[0], options.simmx, mask, step)
    else: data = normal_get(args[0], mask, step)

    if options.normalize:
        for i in range(len(data)):
            data[i] /= np.linalg.norm(data[i])

    # first output image is the mean of the input vectors, which has been subtracted from each vector
    try:
        os.unlink(args[1])
    except:
        pass
    mean = np.mean(data, 0)
    if not options.nomeansub:
        for i in range(len(data)):
            data[i] -= mean
    #from_numpy(mean).process("misc.mask.pack",{"mask":mask,"unpack":1}).write_image(args[1],0)

    shift = 0
    # This is where the actual action takes place!
    if options.mode == "pca":
        msa = skdc.PCA(n_components=options.nbasis)
        #		print(data.shape)
        msa.fit(data)
    elif options.mode == "factan":
        msa = skdc.FactorAnalysis(n_components=options.nbasis)
        msa.fit(data)
    elif options.mode == "sparsepca":
        msa = skdc.SparsePCA(n_components=options.nbasis)
        #		print(data.shape)
        msa.fit(data)
    elif options.mode == "fastica":
        msa = skdc.FastICA(n_components=options.nbasis,
                           algorithm="parallel",
                           max_iter=500,
                           tol=0.001)
        msa.fit(data)
    elif options.mode == "lda":
        shift = max(-data.min() + data.std() * 0.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to ensure that they will be positive as well
        if options.projin:
            nfile2 = EMUtil.get_image_count(options.projin)
            pmin = 0
            pstd = 0
            pmean = 0
            pn = 0
            for i in range(0, nfile2,
                           nfile2 // 256):  # read a scattering of images
                tmp = EMData(options.projin, i)
                pmin = min(pmin, tmp["minimum"])
                pstd = max(pstd, tmp["sigma_nonzero"])
                pmean += tmp["mean"]
                pn += 1
            pmean /= pn
            shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean)
            shift = max(shift, shiftp)

        data += shift
        msa = skdc.LatentDirichletAllocation(n_components=options.nbasis,
                                             learning_method="online",
                                             verbose=1)
        msa.fit(data)
    elif options.mode == "nmf":
        shift = max(-data.min() + data.std() * 1.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to ensure that they will be positive as well
        if options.projin:
            nfile2 = EMUtil.get_image_count(options.projin)
            pmin = 0
            pstd = 0
            pmean = 0
            pn = 0
            for i in range(0, nfile2,
                           nfile2 // 256):  # read a scattering of images
                tmp = EMData(options.projin, i)
                pmin = min(pmin, tmp["minimum"])
                pstd = max(pstd, tmp["sigma_nonzero"])
                pmean += tmp["mean"]
                pn += 1
            pmean /= pn
            shiftp = max(pmin + pstd * 0.5, pstd * 4.0 - pmean)
            shift = max(shift, shiftp)

        data += shift
        msa = skdc.NMF(n_components=options.nbasis, init="nndsvd")
        msa.fit(data)

    # write mean
    if not options.nomean and not options.nomeansub:
        mn = from_numpy(mean).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        mn["eigval"] = 0  # we add this artifically to the mean image, both to mark it, and to make some other code requiring it work. It isn't meaningful as a value, obviously
        mn.write_image(args[1], 0)


#	print(msa.components_.shape)
#	c=from_numpy(msa.components_.copy()).write_image("z.hdf",0)

    if options.verbose > 0: print("MSA complete")

    # write other basis vectors
    if options.nomean or options.nomeansub: offset = 0
    else: offset = 1
    for i, v in enumerate(msa.components_):
        im = from_numpy(v.copy()).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        if options.mode == "pca":
            im["eigval"] = float(msa.singular_values_[i])
            im["explvarfrac"] = float(msa.explained_variance_ratio_[i])
            if options.verbose:
                print("Explained variance: ", im["explvarfrac"],
                      "\tSingular Value: ", im["eigval"])
        elif options.mode == "fastica":
            if im["sigma"] > 0:
                im.mult(1.0 / im["sigma"]
                        )  # fastica seems to produce very small vector lengths
        im.write_image(args[1], i + offset)

    # if requested we use the model to generate reprojections of the full set of input images
    # into the new subspace. This permits use of nonlinear algorithms (the components_ output
    # is not directly usable)
    if len(args) > 2:
        try:
            os.unlink(args[2])
        except:
            pass

        if options.projin != None:
            images = options.projin
            nfile2 = EMUtil.get_image_count(images)
            step2 = [0, 1, nfile2]
        else:
            nfile2 = nfile
            step2 = step
            images = args[0]

        if options.verbose: print("Reprojecting input data into subspace")
        chunksize = min(max(2, 250000000 // nval),
                        step2[2])  # limiting memory usage for this step to ~2G
        out = EMData(
            options.nbasis, step2[2]
        )  # we hold the full set of reprojections in memory, though
        start = 0
        while (start < step2[2]):
            stept = [start, 1, min(step2[2], start + chunksize)]
            if options.verbose: print(stept)

            # read a chunk of data
            if options.simmx:
                chunk = simmx_get(images, options.simmx, mask, stept)
            else:
                chunk = normal_get(images, mask, stept)
            if shift != 0:
                chunk += shift  # for methods requiring positivity
                if chunk.min() <= 0:
                    print(
                        "ERROR: Results invalid, negative values. Shifting to prevent crash. Chunk ",
                        stept, " has mean=", chunk.mean(), "std=", chunk.std(),
                        "min=", chunk.min())
                    chunk += -chunk.min()

            proj = msa.transform(chunk)  # into subspace
            if options.normproj:
                for i in range(len(proj)):
                    proj[i] /= np.linalg.norm(proj[i])
            im = from_numpy(proj.copy())
            out.insert_clip(im, (0, start, 0))
            start += chunksize

        # write results
        out.write_image(args[2], 0)

    E2end(logid)
    if options.mode not in ("pca", "sparsepca", "fastica"):
        print(
            "WARNING: While projection vectors are reliable, use of modes other than PCA or ICA may involve nonlinearities, meaning the 'Eigenimages' may not be interpretable in the usual way."
        )
Example #9
def Factor_scatter(X, labels):
    x = decomposition.FactorAnalysis(n_components=2).fit_transform(X)
    scatter(x, labels)
Example #10
def plot_faces_decomposition():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    n_row, n_col = 2, 3
    n_components = n_row * n_col
    image_shape = (64, 64)
    rng = RandomState(0)

    # #############################################################################
    # Load faces data
    faces, _ = fetch_olivetti_faces(return_X_y=True, shuffle=True,
                                    random_state=rng)
    n_samples, n_features = faces.shape

    # global centering
    faces_centered = faces - faces.mean(axis=0)

    # local centering
    faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

    print("Dataset consists of %d faces" % n_samples)

    def plot_gallery(title, images, n_col=n_col, n_row=n_row, cmap=plt.cm.gray):
        plt.figure(figsize=(2. * n_col, 2.26 * n_row))
        plt.suptitle(title, size=16)
        for i, comp in enumerate(images):
            plt.subplot(n_row, n_col, i + 1)
            vmax = max(comp.max(), -comp.min())
            plt.imshow(comp.reshape(image_shape), cmap=cmap,
                       interpolation='nearest',
                       vmin=-vmax, vmax=vmax)
            plt.xticks(())
            plt.yticks(())
        plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)

    # #############################################################################
    # List of the different estimators, whether to center and transpose the
    # problem, and whether the transformer uses the clustering API.
    estimators = [
        ('Eigenfaces - PCA using randomized SVD',
         decomposition.PCA(n_components=n_components, svd_solver='randomized',
                           whiten=True),
         True),

        ('Non-negative components - NMF',
         decomposition.NMF(n_components=n_components, init='nndsvda', tol=5e-3),
         False),

        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True),
         True),

        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                          n_iter=100, batch_size=3,
                                          random_state=rng),
         True),

        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   random_state=rng),
         True),

        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                         max_iter=50, random_state=rng),
         True),

        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components, max_iter=20),
         True),
    ]

    # #############################################################################
    # Plot a sample of the input data

    plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

    # #############################################################################
    # Do the estimation and plot it

    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces
        if center:
            data = faces_centered
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)
        if hasattr(estimator, 'cluster_centers_'):
            components_ = estimator.cluster_centers_
        else:
            components_ = estimator.components_

        # Plot an image representing the pixelwise variance provided by the
        # estimator, e.g. its noise_variance_ attribute. The Eigenfaces estimator,
        # via the PCA decomposition, also provides a scalar noise_variance_
        # (the mean of pixelwise variance) that cannot be displayed as an image
        # so we skip it.
        if (hasattr(estimator, 'noise_variance_') and
                estimator.noise_variance_.ndim > 0):  # Skip the Eigenfaces case
            plot_gallery("Pixelwise variance",
                         estimator.noise_variance_.reshape(1, -1), n_col=1,
                         n_row=1)
        plot_gallery('%s - Train time %.1fs' % (name, train_time),
                     components_[:n_components])

    plt.show()

    # #############################################################################
    # Various positivity constraints applied to dictionary learning.
    estimators = [
        ('Dictionary learning',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   random_state=rng),
         True),
        ('Dictionary learning - positive dictionary',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   random_state=rng,
                                                   positive_dict=True),
         True),
        ('Dictionary learning - positive code',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   fit_algorithm='cd',
                                                   random_state=rng,
                                                   positive_code=True),
         True),
        ('Dictionary learning - positive dictionary & code',
         decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1,
                                                   n_iter=50, batch_size=3,
                                                   fit_algorithm='cd',
                                                   random_state=rng,
                                                   positive_dict=True,
                                                   positive_code=True),
         True),
    ]

    # #############################################################################
    # Plot a sample of the input data

    plot_gallery("First centered Olivetti faces", faces_centered[:n_components],
                 cmap=plt.cm.RdBu)

    # #############################################################################
    # Do the estimation and plot it

    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces
        if center:
            data = faces_centered
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)
        components_ = estimator.components_
        plot_gallery(name, components_[:n_components], cmap=plt.cm.RdBu)

    plt.show()
Example #11
methods['LLE'] = LLE(method='standard')
methods['LTSA'] = LLE(method='ltsa')
methods['Hessian LLE'] = LLE(method='hessian')
methods['Modified LLE'] = LLE(method='modified')
methods['Isomap'] = manifold.Isomap(n_neighbors, n_components)
methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1)
methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
                                           n_neighbors=n_neighbors)
methods['t-SNE'] = manifold.TSNE(n_components=n_components,
                                 init='pca',
                                 random_state=0)
# Set-up linear methods
methods['PCA'] = decomposition.PCA(n_components)
methods['ICA'] = decomposition.FastICA(n_components)
#methods['NMF'] = decomposition.NMF(n_components) #Negative vals
methods['Factor Analysis'] = decomposition.FactorAnalysis(n_components)
#methods['LDA'] = LinearDiscriminantAnalysis(n_components) # Supervised method, requires class labels
methods['Kernel PCA (rbf)'] = decomposition.KernelPCA(n_components,
                                                      kernel="rbf")
methods['Kernel PCA (poly)'] = decomposition.KernelPCA(n_components,
                                                       kernel="poly")
methods['Kernel PCA (sigmoid)'] = decomposition.KernelPCA(n_components,
                                                          kernel="sigmoid")
methods['Kernel PCA (cosine)'] = decomposition.KernelPCA(n_components,
                                                         kernel="cosine")

###########################
## PLOTS
###########################

# S-Curve
Example #12
def FA_method(X):
    fa = decomposition.FactorAnalysis()
    # return the transformed data; the original discarded the result
    return fa.fit_transform(X)
Example #13
        return np.zeros((X.shape[0], 2)), y, metrics.empty_pq_metrics()

    if len(X_new.shape) != 2 or X_new.shape[1] != 2:
        print('----------------------------------------------------')
        print("Error running %s: Projection did not return 2 columns: " % id_run, X_new.shape)
        print('----------------------------------------------------')
        return np.zeros((X.shape[0], 2)), y, metrics.empty_pq_metrics()

    return X_new, y, metrics.eval_pq_metrics(X=X_new, y=y, elapsed_time=elapsed_time, id_run=id_run, dataset_name=dataset_name, output_dir=output_dir)


all_projections = dict()

all_projections['AE']      = (ae.AutoencoderProjection(), {'n_components': [2], 'model_size': [ae.ModelSize.SMALL, ae.ModelSize.MEDIUM, ae.ModelSize.LARGE]})
all_projections['DM']      = (tapkee.DiffusionMaps(), {'t': [2, 5, 10], 'width': [1.0, 5.0, 10.0], 'verbose': [False]})
all_projections['FA']      = (decomposition.FactorAnalysis(), {'n_components': [2], 'max_iter': [1000, 2000], 'random_state': [42]})
all_projections['FICA']    = (decomposition.FastICA(), {'n_components': [2], 'fun': ['logcosh', 'exp'], 'max_iter': [200, 400], 'random_state': [42]})
all_projections['FMAP']    = (vp.Fastmap(), {'verbose': [False], 'dissimilarity_type': ['euclidean']})
all_projections['FMVU']    = (drtoolbox.FastMVU(), {'k': [8, 12, 15], 'verbose': [False]})
all_projections['GDA']     = (drtoolbox.GDA(), {'kernel': ['gauss', 'linear'], 'verbose': [False]})
all_projections['GPLVM']   = (drtoolbox.GPLVM(), {'sigma': [0.5, 1.0, 2.0], 'verbose': [False]})
all_projections['GRP']     = (random_projection.GaussianRandomProjection(), {'n_components': [2], 'random_state': [42]})
all_projections['HLLE']    = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['hessian'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['IDMAP']   = (vp.IDMAP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean']})
all_projections['IPCA']    = (decomposition.IncrementalPCA(), {'n_components': [2]})
all_projections['ISO']     = (manifold.Isomap(), {'n_components': [2], 'n_neighbors': [3, 5, 7], 'eigen_solver': ['dense']})
all_projections['KPCAPol'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'degree': [2, 3, 5], 'kernel': ['poly'], 'max_iter': [None], 'random_state': [42]})
all_projections['KPCARbf'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'kernel': ['rbf'], 'max_iter': [None], 'random_state': [42]})
all_projections['KPCASig'] = (decomposition.KernelPCA(), {'n_components': [2], 'gamma': [None] + [0.05, 0.05, 0.5], 'degree': [3], 'kernel': ['sigmoid'], 'max_iter': [None], 'random_state': [42]})
all_projections['LAMP']    = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']})
all_projections['LE']      = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]})
Example #14
    def generate_FA_matrices(self,
                             training_task_entry,
                             plot=False,
                             hdf=None,
                             dec=None,
                             bin_spk=None):

        import utils.fa_decomp as pa
        if bin_spk is None:
            if training_task_entry is not None:
                from db import dbfunctions as dbfn
                te = dbfn.TaskEntry(training_task_entry)
                hdf = te.hdf
                dec = te.decoder

            bin_spk, targ_pos, targ_ix, z, zz = self.extract_trials_all(
                hdf, dec)

        #Zscore is in time x neurons
        zscore_X, mu = self.zscore_spks(bin_spk)

        # #Find optimal number of factors:
        LL, psv = pa.find_k_FA(zscore_X, iters=3, max_k=10, plot=False)

        #Np.nanmean:
        nan_ix = np.isnan(LL)
        samp = np.sum(nan_ix == False, axis=0)
        ll = np.nansum(LL, axis=0)
        LL_new = np.divide(ll, samp)

        num_factors = 1 + (np.argmax(LL_new))
        print('optimal LL factors: ', num_factors)

        FA = skdecomp.FactorAnalysis(n_components=num_factors)

        #Samples x features:
        FA.fit(zscore_X)

        #FA matrices:
        U = np.mat(FA.components_).T
        i = np.diag_indices(U.shape[0])
        Psi = np.mat(np.zeros((U.shape[0], U.shape[0])))
        Psi[i] = FA.noise_variance_
        A = U * U.T
        B = np.linalg.inv(U * U.T + Psi)
        mu_vect = np.array([mu[0, :]]).T  #Size = N x 1
        sharL = A * B

        #Calculate shared / priv scaling:
        bin_spk_tran = bin_spk.T
        mu_mat = np.tile(np.array([mu[0, :]]).T, (1, bin_spk_tran.shape[1]))
        demn = bin_spk_tran - mu_mat
        shared_bin_spk = (sharL * demn)
        priv_bin_spk = bin_spk_tran - mu_mat - shared_bin_spk

        #Scaling:
        eps = 1e-15
        x_var = np.var(np.mat(bin_spk_tran), axis=1) + eps
        pr_var = np.var(priv_bin_spk, axis=1) + eps
        sh_var = np.var(shared_bin_spk, axis=1) + eps

        priv_scalar = np.sqrt(np.divide(x_var, pr_var))
        shared_scalar = np.sqrt(np.divide(x_var, sh_var))

        if plot:
            tmp = np.diag(U.T * U)
            plt.plot(np.arange(1, num_factors + 1),
                     np.cumsum(tmp) / np.sum(tmp), '.-')
            plt.plot([0, num_factors + 1], [.9, .9], '-')

        #Get main shared space:
        u, s, v = np.linalg.svd(A)
        s_red = np.zeros_like(s)
        s_hd = np.zeros_like(s)

        ix = np.nonzero(np.cumsum(s**2) / float(np.sum(s**2)) > .90)[0]
        if len(ix) > 0:
            n_dim_main_shared = ix[0] + 1
        else:
            n_dim_main_shared = len(s)
        if n_dim_main_shared < 2:
            n_dim_main_shared = 2
        print "main shared: n_dim: ", n_dim_main_shared, np.cumsum(s) / float(
            np.sum(s))
        s_red[:n_dim_main_shared] = s[:n_dim_main_shared]
        s_hd[n_dim_main_shared:] = s[n_dim_main_shared:]

        main_shared_A = u * np.diag(s_red) * v
        hd_shared_A = u * np.diag(s_hd) * v
        main_shared_B = np.linalg.inv(main_shared_A + hd_shared_A + Psi)

        uut_psi_inv = main_shared_B.copy()
        u_svd = u[:, :n_dim_main_shared]

        main_sharL = main_shared_A * main_shared_B

        main_shar = main_sharL * demn
        main_shar_var = np.var(main_shar, axis=1) + eps
        main_shar_scal = np.sqrt(np.divide(x_var, main_shar_var))

        main_priv = demn - main_shar
        main_priv_var = np.var(main_priv, axis=1) + eps
        main_priv_scal = np.sqrt(np.divide(x_var, main_priv_var))

        # #Get PCA decomposition:
        #LL, ax = pa.FA_all_targ_ALLms(hdf, iters=2, max_k=20, PCA_instead=True)
        #num_PCs = 1+(np.argmax(np.mean(LL, axis=0)))

        # Main PCA space:
        # Get cov matrix:
        cov_pca = np.cov(zscore_X.T)
        eig_val, eig_vec = np.linalg.eig(cov_pca)

        tot_var = sum(eig_val)
        cum_var_exp = np.cumsum(
            [i / tot_var for i in sorted(eig_val, reverse=True)])
        n_PCs = np.nonzero(cum_var_exp > 0.9)[0][0] + 1

        proj_mat = eig_vec[:, :n_PCs]
        proj_trans = np.mat(proj_mat) * np.mat(proj_mat.T)

        #PC matrices:
        return dict(fa_sharL=sharL,
                    fa_mu=mu_vect,
                    fa_shar_var_sc=shared_scalar,
                    fa_priv_var_sc=priv_scalar,
                    U=U,
                    Psi=Psi,
                    training_task_entry=training_task_entry,
                    FA_iterated_power=FA.iterated_power,
                    FA_score=FA.score(zscore_X),
                    FA_LL=np.array(FA.loglike_),
                    fa_main_shared=main_sharL,
                    fa_main_shared_sc=main_shar_scal,
                    fa_main_private_sc=main_priv_scal,
                    fa_main_shar_n_dim=n_dim_main_shared,
                    sing_vals=s,
                    own_pc_trans=proj_trans,
                    FA_model=FA,
                    uut_psi_inv=uut_psi_inv,
                    u_svd=u_svd)
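The shared/private split above uses sharL = U Uᵀ (U Uᵀ + Psi)⁻¹ applied to de-meaned data, which equals U · E[z | x] from the factor model. A small, hedged check of that identity on synthetic data (independent of the BMI-specific code above):

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 12))            # synthetic stand-in for z-scored spike counts
fa_check = FactorAnalysis(n_components=3).fit(X)

U = fa_check.components_.T                # features x factors loading matrix
Psi = np.diag(fa_check.noise_variance_)   # diagonal specific variances
A = U @ U.T                               # shared covariance
B = np.linalg.inv(A + Psi)                # inverse of the model covariance
demeaned = (X - fa_check.mean_).T         # features x samples

shared_direct = A @ B @ demeaned                    # the 'sharL * demn' route above
shared_via_posterior = U @ fa_check.transform(X).T  # U times the posterior factor means
print(np.allclose(shared_direct, shared_via_posterior))  # True (up to numerical error)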
Example #15
def dim_reduction(X, n_components=2, mode="MDS"):
    
    """Reduces the number of dimensions in which a dataset is defined.
    
    Arguments

    X       -   NumPy array with shape (N,M), where N is the number of
                observations, and M the number of features.
    
    Keyword Arguments
    
    n_components    -   Intended number of features after dimensionality
                        reduction. Default = 2
    
    mode            -   String that defines the type of dim reduction:
                        - None
                        - "PCA" principal component analysis
                        - "ICA" independent component analysis
                        - "FA" factor analysis
                        - "TSNE" t-stochastic neighbour embedding
                        - "UMAP" uniform manifold approximation and embedding
                        - "RANDOMPROJECTION"
                        - "FEATUREAGGLOMERATION"
                        - "ISOMAP"
                        - "LLE" local linear embedding
                        - "HESSIAN" Hessian eigenmaps
                        - "MLLE" modified local linear embedding
                        - "LTSA" local tangent space alignment
                        - "MDS" multi-dimensional scaling
                        - "DICTIONARY" dictionary learning
                        - "TSVD" truncated SVD (also known as "LSE")
                        Default = "MDS"
    
    Returns
    
    X       -   NumPy array with shape (N, n_components), containing the
                observations projected into the reduced space.
    """
    
    # Make sure the mode is in all caps.
    if type(mode) == str:
        mode = mode.upper()
    
    # Copy X into a new matrix.
    X_ = numpy.copy(X)

    # None
    if mode is None or mode == "NONE":
        # Literally nothing happens here for now.
        print("Fart noise!")
        
    # Principal component analysis.
    elif mode == 'PCA':
        # Initialise a new PCA.
        pca = decomposition.PCA(n_components=n_components)
        # Fit the PCA with the data.
        pca.fit(X_)
        # Transform the data.
        X_ = pca.transform(X_)
    
    # Independent component analysis.
    elif mode == 'ICA':
        # Initialise a new ICA.
        ica = decomposition.FastICA(n_components=n_components)
        # Fit the ICA with the data.
        ica.fit(X_)
        # Transform the data.
        X_ = ica.transform(X_)
    
    # Factor analysis.
    elif mode == 'FA':
        # Initialise a new factor analysis.
        fa = decomposition.FactorAnalysis(n_components=n_components)
        # Perform the factor analysis on the data.
        fa.fit(X_)
        # Transform the data.
        X_ = fa.transform(X_)
    
    # T-Distributed stochastic neighbour embedding.
    elif mode == 'TSNE':
        # Run several t-SNEs to find a good one.
        n_runs = 10
        Xs_ = []
        dkl = numpy.ones(n_runs, dtype=float) * numpy.inf
        print("Running %d t-SNEs to find lowest Kullback-Leibler divergence." \
            % (n_runs))
        for i in range(n_runs):
            # Initialise a new t-distributed stochastic neighbouring embedding
            #  (t-SNE) analysis.
            tsne = TSNE(n_components=n_components)
            # Copy the data into a new variable.
            Xs_.append(numpy.copy(X_))
            # Fit to and transform the data.
            Xs_[i] = tsne.fit_transform(Xs_[i])
            # Get the KL-divergence.
            dkl[i] = tsne.kl_divergence_
            print("\tCurrent KL-divergence = %.5f" % (dkl[i]))
        # Choose the solution with the lowest KL-divergence.
        X_ = numpy.copy(Xs_[numpy.argmin(dkl)])
        # Get rid of all the excess X copies.
        del Xs_
    
    # Uniform manifold approximation and projection.
    elif mode == 'UMAP':
        # Create a new UMAP instance.
        um = umap.UMAP(n_components=n_components, min_dist=0.01)
        # Fit and transform X.
        X_ = um.fit_transform(X_)
    
    # Gaussian Random Projection.
    elif mode == 'RANDOMPROJECTION':
        # Create a new GaussianRandomProjection instance.
        rp = GaussianRandomProjection(n_components=n_components)
        # Fit and transform X.
        X_ = rp.fit_transform(X_)
    
    # Feature Agglomeration.
    elif mode == 'FEATUREAGGLOMERATION':
        # Create a new FeatureAgglomeration instance.
        fa = cluster.FeatureAgglomeration(n_clusters=n_components)
        # Fit and transform X.
        X_ = fa.fit_transform(X_)
    
    # Isomap.
    elif mode == 'ISOMAP':
        # Create a new Isomap instance.
        im = Isomap(n_components=n_components)
        # Fit and transform X.
        X_ = im.fit_transform(X_)
    
    # Locally Linear Embedding.
    elif mode == 'LLE':
        # Create a new LocallyLinearEmbedding instance.
        lle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='standard', eigen_solver='dense')
        # Fit and transform X.
        X_ = lle.fit_transform(X_)
    
    # Hessian eigenmaps.
    elif mode == 'HESSIAN':
        # Create a new LocallyLinearEmbedding instance.
        hlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='hessian', eigen_solver='dense')
        # Fit and transform X.
        X_ = hlle.fit_transform(X_)
    
    # MLLE.
    elif mode == 'MLLE':
        # Create a new LocallyLinearEmbedding instance.
        mlle = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='modified', eigen_solver='dense')
        # Fit and transform X.
        X_ = mlle.fit_transform(X_)
    
    # LTSA.
    elif mode == 'LTSA':
        # Create a new LocallyLinearEmbedding instance.
        ltsa = LocallyLinearEmbedding(n_neighbors=10, n_components=n_components, \
            method='ltsa', eigen_solver='dense')
        # Fit and transform X.
        X_ = ltsa.fit_transform(X_)
    
    # Multi-dimensional scaling.
    elif mode == 'MDS':
        # Create a new MDS instance.
        mds = MDS(n_components=n_components)
        # Fit and transform X.
        X_ = mds.fit_transform(X_)
    
    # Dictionary Learning
    elif mode == "DICTIONARY":
        # Create a DictionaryLearning instance.
        dictlearn = decomposition.DictionaryLearning( \
            n_components=n_components, \
            fit_algorithm='cd', \
            # The 'omp' algorithm orthogonalises the whole thing, whereas
            # a lasso solution with a low alpha leaves a slightly more
            # scattered solution.
            transform_algorithm='lasso_cd', \
            transform_alpha=0.1, \
            )
        # Fit and transform X.
        X_ = dictlearn.fit_transform(X)
    
    # Truncated SVD (also known as 'Latent Semantic analysis' (LSE)
    elif mode in ['TSVD', 'LSE']:
        tsvd = decomposition.TruncatedSVD(n_components=n_components)
        # Fit and transform X.
        X_ = tsvd.fit_transform(X)
    
    else:
        raise Exception("Unrecognised dimensionality reduction mode '%s'" % (mode))
    
    return X_
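A brief usage sketch for dim_reduction, assuming the module-level imports it relies on (numpy, sklearn's decomposition, etc.) are in place; the data is random and purely illustrative.

import numpy
X = numpy.random.randn(200, 20)
X_fa = dim_reduction(X, n_components=2, mode="FA")
print(X_fa.shape)  # (200, 2)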
Example #16
    def __init__(self, **kwargs):
        super().__init__()
        self.estimator = sk_d.FactorAnalysis(**kwargs)
Example #17
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                #TODO maybe add regular express check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            #TODO regular express check ?
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernalCenter()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
Example #18
def learning_curve_metrics(hdf_list, epoch_size=56, n_factors=5):
    #hdf_list = [3822, 3834, 3835, 3840]
    #obstacle learning: hdf_list = [4098, 4100, 4102, 4104, 4114, 4116, 4118, 4119]
    rew_ix_list = []
    te_refs = []
    rpm_list = []
    hdf_dict = {}
    perc_succ = []
    time_list = []
    offs = 0

    #f, ax = plt.subplots()
    for te in hdf_list:
        hdf_t = dbfn.TaskEntry(te)
        hdf = hdf_t.hdf
        hdf_dict[te] = hdf

        rew_ix, rpm = pa.get_trials_per_min(hdf,
                                            nmin=2,
                                            rew_per_min_cutoff=0,
                                            ignore_assist=True,
                                            return_rpm=True)
        ix = 0
        #ax.plot(rpm)

        trial_ix = np.array([
            i for i in hdf.root.task_msgs[:] if i['msg'] in
            ['reward', 'timeout_penalty', 'hold_penalty', 'obstacle_penalty']
        ],
                            dtype=hdf.root.task_msgs.dtype)

        while (ix + epoch_size) < len(rew_ix):
            start_rew_ix = rew_ix[ix]
            end_rew_ix = rew_ix[ix + epoch_size]
            msg_ix_mod = np.nonzero(
                scipy.logical_and(trial_ix['time'] <= end_rew_ix,
                                  trial_ix['time'] > start_rew_ix))[0]
            all_msg = trial_ix[msg_ix_mod]
            perc_succ.append(
                len(np.nonzero(all_msg['msg'] == 'reward')[0]) /
                float(len(all_msg)))

            rew_ix_list.append(rew_ix[ix:ix + epoch_size])
            rpm_list.append(np.mean(rpm[ix:ix + epoch_size]))
            te_refs.append(te)
            time_list.append((0.5 * (start_rew_ix + end_rew_ix)) + offs)

            ix += epoch_size
        offs = offs + len(hdf.root.task)

    #For each epoch, fit FA model (stick w/ 5 factors for now):
    ratio = []
    for te, r_ix in zip(te_refs, rew_ix_list):
        print(te, len(r_ix))

        # use the HDF file for this task entry (not the last one left over from the first loop)
        update_bmi_ix = np.nonzero(
            np.diff(
                np.squeeze(hdf_dict[te].root.task[:]['internal_decoder_state']
                           [:, 3, 0])))[0] + 1
        bin_spk, targ_pos, targ_ix, z, zz = pa.extract_trials_all(
            hdf_dict[te], r_ix, time_cutoff=1000, update_bmi_ix=update_bmi_ix)
        zscore_X, mu = pa.zscore_spks(bin_spk)
        FA = skdecomp.FactorAnalysis(n_components=n_factors)
        FA.fit(zscore_X)

        #SOT Variance Ratio by target
        #Priv var / mean
        Cov_Priv = np.sum(FA.noise_variance_)
        U = np.mat(FA.components_).T
        Cov_Shar = np.trace(U * U.T)

        ratio.append(Cov_Shar / (Cov_Shar + Cov_Priv))
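The appended ratio is the shared-over-total variance of the FA model, trace(U Uᵀ) / (trace(U Uᵀ) + sum(Psi)). A standalone sketch of that computation on synthetic data (plain numpy/sklearn, no BMI-specific inputs):

import numpy as np
from sklearn import decomposition as skdecomp

rng = np.random.RandomState(1)
latent = rng.normal(size=(1000, 5))
loadings = rng.normal(size=(5, 30))
zscore_X = latent @ loadings + rng.normal(size=(1000, 30))  # shared structure + private noise

FA = skdecomp.FactorAnalysis(n_components=5).fit(zscore_X)
U = FA.components_.T                     # features x factors
Cov_Shar = np.trace(U @ U.T)             # shared variance
Cov_Priv = np.sum(FA.noise_variance_)    # private (specific) variance
print(Cov_Shar / (Cov_Shar + Cov_Priv))  # shared-over-total variance ratio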
Example #19
def fa(train, test, n_component):
    transformer = sk_d.FactorAnalysis(n_components=n_component, random_state=0)
    train_out = transformer.fit_transform(train)
    # project the test set with the loadings learned from the training data
    # (the original refit the model on the test set)
    test_out = transformer.transform(test)
    return train_out, test_out
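A quick usage sketch with random arrays (illustrative only; sk_d is the sklearn.decomposition alias the example assumes):

import numpy as np
from sklearn import decomposition as sk_d

train = np.random.randn(100, 10)
test = np.random.randn(40, 10)
train_out, test_out = fa(train, test, n_component=3)
print(train_out.shape, test_out.shape)  # (100, 3) (40, 3)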
Example #20
def faces_decomposition():
    import logging
    from numpy.random import RandomState  # random-number generator for sampling (Gaussian and other distributions)
    import matplotlib.pyplot as plt
    from time import time
    from sklearn.datasets import fetch_olivetti_faces
    from sklearn.cluster import MiniBatchKMeans
    from sklearn import decomposition

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    n_row, n_col = 2, 3
    n_components = n_row * n_col
    image_shape = (64, 64)
    rng = RandomState(0)

    # load the dataset
    dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
    faces = dataset.data

    n_samples, n_features = faces.shape

    faces_centered = faces - faces.mean(axis=0)

    faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)

    print("dataset consits of %d faces" % n_samples)  #样本个数

    def plot_gallery(title, images, n_col=n_col, n_row=n_row):
        plt.figure(figsize=(2. * n_col, 2.26 * n_row))
        plt.suptitle(title, size=16)
        for i, comp in enumerate(images):
            plt.subplot(n_row, n_col, i + 1)
            vmax = max(comp.max(), -comp.min())
            plt.imshow(comp.reshape(image_shape),
                       cmap=plt.cm.gray,
                       interpolation='nearest',
                       vmin=-vmax,
                       vmax=vmax)
            plt.xticks(())
            plt.yticks(())
        plt.subplots_adjust(0.01, 0.05, 0.99, 0.93, 0.04, 0.)

    estimators = [
        ('Eigenfaces - PCA using randomized SVD',
         decomposition.PCA(n_components=n_components,
                           svd_solver='randomized',
                           whiten=True), True),
        ('Non-negative components - NMF',
         decomposition.NMF(n_components=n_components, init='nndsvda',
                           tol=5e-3), False),
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True), True),
        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components,
                                          alpha=0.8,
                                          n_iter=100,
                                          batch_size=3,
                                          random_state=rng), True),
        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15,
                                                   alpha=0.1,
                                                   n_iter=50,
                                                   batch_size=3,
                                                   random_state=rng), True),
        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components,
                         tol=1e-3,
                         batch_size=20,
                         max_iter=50,
                         random_state=rng), True),
        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components,
                                      max_iter=2), True),
    ]

    # #############################################################################
    # Plot a sample of the input data

    plot_gallery("First centered Olivetti faces",
                 faces_centered[:n_components])

    # #############################################################################
    # Do the estimation and plot it

    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces
        if center:
            data = faces_centered
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)
        if hasattr(estimator, 'cluster_centers_'):
            components_ = estimator.cluster_centers_
        else:
            components_ = estimator.components_

        # Plot an image representing the pixelwise variance provided by the
        # estimator, e.g. its noise_variance_ attribute. The Eigenfaces estimator,
        # via the PCA decomposition, also provides a scalar noise_variance_
        # (the mean of pixelwise variance) that cannot be displayed as an image,
        # so we skip it.
        if (hasattr(estimator, 'noise_variance_')
                and estimator.noise_variance_.ndim >
                0):  # Skip the Eigenfaces case
            plot_gallery("Pixelwise variance",
                         estimator.noise_variance_.reshape(1, -1),
                         n_col=1,
                         n_row=1)
        plot_gallery('%s - Train time %.1fs' % (name, train_time),
                     components_[:n_components])

    plt.show()
Example #21
0
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
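A hypothetical input sketch for _eval_search_params (the sp_name/sp_list keys mirror the function above; the specific parameter names, and whether SafeEval accepts these literals, are assumptions that depend on the surrounding Galaxy tool globals such as NON_SEARCHABLE and N_JOBS):

params_builder = {
    'param_set': [
        # plain search list: evaluated with SafeEval into a list of values
        {'sp_name': 'svc__C', 'sp_list': '[0.1, 1.0, 10.0]'},
        # leading ':' asks for estimator evaluation; index 14 points at
        # decomposition.FactorAnalysis(random_state=0) in the preprocessings tuple
        {'sp_name': 'pre__preprocessing', 'sp_list': ': [14]'},
    ]
}
search_params = _eval_search_params(params_builder)
# expected shape of the result:
# {'svc__C': [0.1, 1.0, 10.0], 'pre__preprocessing': [FactorAnalysis(random_state=0)]}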
Example #22
0
print('RFR \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' %
      (rfr_best[0], rfr_best[1], rfr_best[2]))

#-------------------------------------------------------------------------------------------
#model with dimensionality reduction using Linear Discriminant Analysis (not for regression)
#-------------------------------------------------------------------------------------------
lda = LinearDiscriminantAnalysis()
best = CV(lda, x, y)
benchmark.append(best)
benchmark_names.append('LDA')
print('LDA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best)

#---------------------------------------------------------
#model with dimensionality reduction using Factor Analysis
#---------------------------------------------------------
fa = decomposition.FactorAnalysis(max_iter=2000)
best = CV(fa, x, y)
benchmark.append(best)
benchmark_names.append('FA')
print('FA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best)

#----------------------------------------------------------------------
#model with dimensionality reduction using Principal component analysis
#----------------------------------------------------------------------
pca = decomposition.PCA()
best = CV(pca, x, y)
benchmark.append(best)
benchmark_names.append('PCA')
print('PCA \nNumber of features %d \nError %.3f \nAccuracy %.3f\n' % best)

#----------------------------------------------------------------------------------
Example #23
0
    plt.title("Explained variance for each Principal Component")
    plt.plot([i for i in range(1, n_features + 1)],
             list(explained_variance_ratio),
             color="steelblue",
             linestyle="-")
    plt.xlabel("Principal Component Number")
    plt.ylabel("Explained variance")
    plt.xlim([0, n_features])
    plt.ylim([0, 0.35])

#  ##############################################################################
## Factor Analysis

if (FA):
    print("\n===== Factor Analysis =====")
    fa = decomposition.FactorAnalysis(n_components=n_features)
    print("Fit...")
    fa.fit(X_scaled)

    # Plot observations in the FA basis and label them using rank feature
    X_faprojected = fa.transform(X_scaled)  # Project X on principal components

    NFIG += 1
    plt.figure(NFIG)
    plt.title("Rank of athletes in the FA basis")
    plt.scatter(X_faprojected[:, 0], X_faprojected[:, 1], c=data.get('Rank'))

    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.colorbar()
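A possible follow-up sketch (an assumption, not part of the original script): FactorAnalysis exposes no explained_variance_ratio_, but an analogous per-factor share can be read off the loadings as the sum of squared loadings per factor divided by the total variance of the scaled data.

    fa_variance = (fa.components_ ** 2).sum(axis=1)        # sum of squared loadings per factor
    fa_variance_ratio = fa_variance / X_scaled.var(axis=0).sum()
    print(fa_variance_ratio)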
Example #24
0

DATASET = digits
N_LABELS = 10
N_COMPONENTS = 19
N_CLUSTERS = 10
MODE = 'nothing'

# General Options
TITLE = 'Neural Network Classifier'
N_REPEAT = 10
LEARNING_RATE = 1e-1
TOLERANCE = 1e-4
TOPOLOGY = (3, )

fa = decomposition.FactorAnalysis(n_components=N_COMPONENTS)
new_data = fa.fit_transform(DATASET.training_features)

report = {}
labels = []

kmeans = cluster.KMeans(n_clusters=N_CLUSTERS)
kmeans.fit(new_data)

for i, c in enumerate(kmeans.labels_):
    if c not in report:
        report[c] = {}
    real_label = DATASET.training_labels[i]
    if real_label not in labels:
        labels.append(real_label)
    if real_label not in report[c]:
Example #25
0
                     xy=(wi[0], wi[1]),
                     xytext=(wi[0] - 0.02, wi[1] + 0.02))

    plt.plot(S[:, 0], S[:, 1], 'ro', linestyle='none', ms=1)
    plt.show()


# load data
data = sio.loadmat('04cars.mat')
X = data['X'][:, 7:18]  # use real-value features
y = data['names']
X = spp.StandardScaler().fit_transform(X)
labels = np.array([
    'Retail', 'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'City MPG',
    'Highway MPG', 'Weight', 'Wheel Base', 'Length', 'Width'
])  # label for each column
print('X.shape: ', X.shape)
print('y.shape: ', y.shape)

# fit FA model
L = 2
FA = sd.FactorAnalysis(n_components=L)
FA.fit(X)
C = FA.components_.T  # N * L
Z = FA.transform(X)
print('FA.W: \n', FA.components_)
print('psi: \n', FA.noise_variance_)
print('latent Z: \n', Z)

biPlot(C, Z, labels)
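For reference, a minimal stand-in for the user-defined biPlot helper used above (an illustrative assumption, not the original implementation): it scatters the factor scores Z and overlays the loading vectors from C with their variable labels.

import matplotlib.pyplot as plt

def biplot_sketch(C, Z, labels):
    # Z: (n_samples, >=2) factor scores; C: (n_features, >=2) loadings
    plt.figure()
    plt.plot(Z[:, 0], Z[:, 1], 'ro', linestyle='none', ms=1)
    for (cx, cy), name in zip(C[:, :2], labels):
        plt.arrow(0, 0, cx, cy, color='b', head_width=0.02)
        plt.annotate(name, xy=(cx, cy), xytext=(cx - 0.02, cy + 0.02))
    plt.xlabel('Factor 1')
    plt.ylabel('Factor 2')
    plt.show()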
Example #26
0
def main():
    dataset = fetch_olivetti_faces(shuffle=True, random_state=rng)
    faces = dataset.data

    n_samples, n_features = faces.shape
    # global centering
    faces_centered = faces - faces.mean(axis=0)
    # local centering
    faces_centered -= faces_centered.mean(axis=1).reshape(n_samples, -1)
    print("Dataset consists of %d faces" % n_samples)

    estimators = [
        ('Eigenfaces - PCA using randomized SVD',
         decomposition.PCA(n_components=n_components,
                           svd_solver='randomized',
                           whiten=True), True),
        ('Non-negative components - NMF',
         decomposition.NMF(n_components=n_components, init='nndsvda',
                           tol=5e-3), False),
        ('Independent components - FastICA',
         decomposition.FastICA(n_components=n_components, whiten=True), True),
        ('Sparse comp. - MiniBatchSparsePCA',
         decomposition.MiniBatchSparsePCA(n_components=n_components,
                                          alpha=0.8,
                                          n_iter=100,
                                          batch_size=3,
                                          random_state=rng), True),
        ('MiniBatchDictionaryLearning',
         decomposition.MiniBatchDictionaryLearning(n_components=15,
                                                   alpha=0.1,
                                                   n_iter=50,
                                                   batch_size=3,
                                                   random_state=rng), True),
        ('Cluster centers - MiniBatchKMeans',
         MiniBatchKMeans(n_clusters=n_components,
                         tol=1e-3,
                         batch_size=20,
                         max_iter=50,
                         random_state=rng), True),
        ('Factor Analysis components - FA',
         decomposition.FactorAnalysis(n_components=n_components,
                                      max_iter=2), True),
    ]

    plot_gallery("First centered Olivetti faces",
                 faces_centered[:n_components])

    for name, estimator, center in estimators:
        print("Extracting the top %d %s..." % (n_components, name))
        t0 = time()
        data = faces
        if center:
            data = faces_centered
        estimator.fit(data)
        train_time = (time() - t0)
        print("done in %0.3fs" % train_time)
        if hasattr(estimator, 'cluster_centers_'):
            components_ = estimator.cluster_centers_
        else:
            components_ = estimator.components_
        # so we skip it.
        if (hasattr(estimator, 'noise_variance_')
                and estimator.noise_variance_.ndim >
                0):  # Skip the Eigenfaces case
            plot_gallery("Pixelwise variance",
                         estimator.noise_variance_.reshape(1, -1),
                         n_col=1,
                         n_row=1)
        plot_gallery('%s - Train time %.1fs' % (name, train_time),
                     components_[:n_components])

    plt.show()
Example #27
0
import pandas as pd
from sklearn import decomposition, preprocessing

data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv'
)
data = data.drop(["Channel", "Region"], axis=1)
data.head()

# =============================================================================
# sklearn.decomposition.FactorAnalysis
# =============================================================================
# scaling the data before FA
data_normal = preprocessing.scale(data)

fa = decomposition.FactorAnalysis(n_components=2)
fa.fit(data_normal)

print(fa.components_)
df = pd.DataFrame(fa.components_.transpose(),
                  index=data.columns,
                  columns=['factor 1', 'factor 2'])
df
"""
                  factor 1  factor 2
Fresh            -0.047160  0.423627
Milk              0.732284  0.360762
Grocery           0.968583  0.058966
Frozen           -0.072645  0.564214
Detergents_Paper  0.961895 -0.122233
Delicassen        0.167762  0.722710
"""
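A small follow-up sketch for interpreting the loadings above, continuing from the fitted fa object (the derived quantities are standard factor-analysis bookkeeping, not part of the original snippet):

loadings = fa.components_                  # shape (2, n_features)
communality = (loadings ** 2).sum(axis=0)  # variance each feature shares with the two factors
specific_var = fa.noise_variance_          # unique (noise) variance per feature
print(pd.DataFrame({'communality': communality,
                    'specific variance': specific_var},
                   index=data.columns))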
Example #28
0
def classify_neural_network(dataset,
                            method,
                            n_components,
                            X,
                            X_test,
                            y,
                            y_test,
                            k_means_clusters=0,
                            em_clusters=0):

    filename = ('-'.join([dataset, method, str(n_components)]))

    if method == 'pca':
        dr = decomposition.PCA(n_components=n_components,
                               svd_solver='auto',
                               random_state=random_state)
        title = dataset.title() + ': Neural Network (PCA)'
    elif method == 'ica':
        dr = decomposition.FastICA(n_components=n_components,
                                   random_state=random_state,
                                   whiten=True)
        title = dataset.title() + ': Neural Network (ICA)'
    elif method == 'rp':
        dr = GaussianRandomProjection(n_components=n_components)
        title = dataset.title() + ': Neural Network (RP)'
    elif method == 'fa':
        dr = decomposition.FactorAnalysis(n_components=n_components,
                                          svd_method='randomized',
                                          random_state=random_state)
        title = dataset.title() + ': Neural Network (FA)'

    X = dr.fit_transform(X)
    X_test_t = dr.transform(X_test)

    if k_means_clusters:
        title += ' (K-Means)'
        filename += '-km'
        estimator = KMeans(n_clusters=k_means_clusters,
                           init='k-means++',
                           n_init=10,
                           random_state=random_state)
        estimator.fit(X)

        new_features = estimator.predict(X)
        X = np.insert(X, 0, new_features, axis=1)

        new_features = estimator.predict(X_test_t)
        X_test_t = np.insert(X_test_t, 0, new_features, axis=1)

    elif em_clusters:
        title += ' (Expectation-Maximization)'
        filename += '-em'
        estimator = GaussianMixture(n_components=em_clusters,
                                    init_params='kmeans',
                                    n_init=10,
                                    random_state=random_state,
                                    covariance_type='full',
                                    reg_covar=1e-2)
        estimator.fit(X)

        new_features = estimator.predict(X)
        X = np.insert(X, 0, new_features, axis=1)

        new_features = estimator.predict(X_test_t)
        X_test_t = np.insert(X_test_t, 0, new_features, axis=1)

    clf = train_neural_network(X, y.astype('int'), title, filename)

    y_pred = clf.predict(X_test_t)
    print(f1_score(y_test.astype('int'), y_pred.astype('int'), average='macro'))

    if not k_means_clusters and not em_clusters:
        X_test_t = dr.fit_transform(X_test)
        y_pred = clf.predict(X_test_t)
        print(f1_score(y_test.astype('int'),
                       y_pred.astype('int'),
                       average='macro'))
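A hypothetical invocation sketch (the dataset name, split arrays, and cluster count are illustrative assumptions; the surrounding globals random_state and train_neural_network must already be defined):

classify_neural_network('digits', 'fa', n_components=10,
                        X=X_train, X_test=X_test,
                        y=y_train, y_test=y_test,
                        em_clusters=10)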
Example #29
0
#factor analysis
#no rotation
model_fa = fact.FactorAnalyzer(rotation=None)
#build the model - the data are standardized
model_fa.fit(x)
#compute the scores
f = model_fa.transform(x)
#plot the scores
functions.plot_scoruri(
    f[:, 0], f[:, 1], list(t.index), "F1", "F2",
    "Plot Scoruri - Analiza Factoriala"
)  #column 1 of the scores, corresponding to the first principal component

#loadings (variable-factor correlation) matrix
l = model_fa.loadings_

#factor variances
alpha_fa = model_fa.get_factor_variance()  #could also be presented as a table

# sklearn factor model - used when factor_analyzer is not available -- a different factorization method
model_fa_sk = dec.FactorAnalysis(n_components=3)
model_fa_sk.fit(x)
#extract the scores
f_sk = model_fa_sk.transform(x)
functions.plot_scoruri(
    f_sk[:, 0], f_sk[:, 1], list(t.index), "F1", "F2",
    "Plot Scoruri SK - Analiza Factoriala"
)  #column 1 of the scores, corresponding to the first principal component
functions.show()
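For orientation, a small sketch of how the two fitted models above expose their loadings; the shapes are the documented ones (factor_analyzer stores loadings_ as (n_features, n_factors), sklearn stores components_ as (n_components, n_features)):

print(l.shape)                          # (n_features, n_factors) from factor_analyzer
print(model_fa_sk.components_.T.shape)  # transposed sklearn loadings, same orientation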
def run_evolution(iterations, family_size, use_big_data, train_split,
                  use_binary, use_zero_threshold, drop_variable, decomp_mode,
                  pool_info, n_components):

    # Import metadata to know who is control and who is patient
    df = pd.read_csv('data/PAC2018_Covariates_pooling_red%s.csv' % pool_info,
                     index_col=0)

    if use_group == 1:
        df = df[df.Scanner == 1]
        postfix = 'scanner1'
    elif use_group == 0:
        df = df[df.Scanner != 1]
        postfix = 'scanner23'
    elif use_group == -1:
        df = df[df.Scanner != 0]
        postfix = 'scanner0'

    if drop_variable != '':
        df = df.drop(drop_variable, 1)
    labels = np.array(df['Label']) - 1
    sub = df.index

    # Get data in right form
    data = df.drop(['Label'], 1)

    # Compute the factor analysis
    if decomp_mode == 'faa':
        faa = decomposition.FactorAnalysis(n_components=n_components)
        faa.fit(data)
        data = faa.transform(data)

    elif decomp_mode == 'ica':
        ica = decomposition.FastICA(n_components=n_components)
        ica.fit(data)
        data = ica.transform(data)

    if use_big_data:
        new_data = np.copy(data)

        for i in range(data.shape[1]):
            temp = data * data[:, i][:, None]
            new_data = np.hstack((new_data, temp))
        data = np.copy(new_data)

    # Import test data for prediction
    df_test = pd.read_csv('data/PAC2018_Covariates_Test_pooling_red%s.csv' %
                          pool_info,
                          index_col=0)

    if use_group == 1:
        df_test = df_test[df_test.Scanner == 1]
    elif use_group == 0:
        df_test = df_test[df_test.Scanner != 1]
    elif use_group == -1:
        df_test = df_test[df_test.Scanner != 0]

    if drop_variable != '':
        df_test = df_test.drop(drop_variable, 1)

    # Get data in right form
    data_test = df_test.drop(['Label'], 1)

    if decomp_mode == 'faa':
        data_test = faa.transform(data_test)

    elif decomp_mode == 'ica':
        data_test = ica.transform(data_test)

    if use_big_data:
        new_data = np.copy(data_test)

        for i in range(data_test.shape[1]):
            temp = data_test * data_test[:, i][:, None]
            new_data = np.hstack((new_data, temp))
        data_test = np.copy(new_data)

    # Specify chromosome size
    chrom_size = data.shape[1]

    # Create new family
    family = [
        get_new_chromosom(chrom_size,
                          binary=use_binary,
                          zero_threshold=use_zero_threshold)
        for x in range(family_size)
    ]

    result_train = []
    result_test = []
    result_thresh = []
    evolution = []

    for k in range(iterations):

        if k % 50 == 0:

            # Balance dataset and create selecter
            max_label_size = np.min(
                [np.sum(lab == labels) for lab in np.unique(labels)])

            labels_1 = np.where(labels == 0)[0]
            np.random.shuffle(labels_1)
            labels_1 = labels_1[:max_label_size]

            labels_2 = np.where(labels == 1)[0]
            np.random.shuffle(labels_2)
            labels_2 = labels_2[:max_label_size]

            # Balance dataset
            new_data_id = np.hstack((labels_1, labels_2))
            np.random.shuffle(new_data_id)
            data_balanced = data[new_data_id]
            labels_balanced = labels[new_data_id]

            # Create selecter
            test_size = int(
                ((100 - (100 * train_split)) / 100.) * max_label_size)
            selecter = np.zeros(len(labels_balanced))
            selecter[:test_size] = 1
            selecter[max_label_size:max_label_size + test_size] = 1
            selecter = selecter.astype('bool')

        # Calculate fitness using multiprocessing (in parallel)
        fitness = judge_family(data_balanced, labels_balanced, family,
                               selecter)
        fit_train = np.array(fitness[0])
        fit_test = np.array(fitness[1])

        good_parents = fit_train.argsort()[-32:]

        # Save the best chromosomes
        evolution.append([
            family[good_parents[-1]], family[good_parents[-2]],
            family[good_parents[-3]]
        ])

        # Build a descriptive file identifier
        file_id = 'iter_%05d_family_%04d_bin_%s_zeroThresh_%s_group_%s_comp_%s%d' % (
            iterations, family_size, use_binary, use_zero_threshold, postfix,
            decomp_mode, n_components)

        # Create new family
        new_family = [family[g] for g in good_parents]

        # Create children
        for c in permutations(range(8), 2):

            new_child = np.zeros(chrom_size)
            if use_binary:
                new_child = new_child.astype('bool')
            half_id = int(chrom_size / 2)
            new_child[:half_id] = new_family[c[0]][1][:half_id]
            new_child[half_id:] = new_family[c[1]][1][half_id:]
            new_threshold = new_family[c[0]][0]
            new_family.append((new_threshold, new_child))

        # Vary threshold in good parents (if not zero threshold)
        if not use_zero_threshold:
            for f in [family[g] for g in good_parents]:
                new_threshold = np.random.randn()
                new_family.append((new_threshold, f[1]))

        # Create possible mutations for each family member
        family_length = len(new_family)
        for i in range(family_length):
            for j in [[0, 33], [33, 67], [67, 100]]:
                element = new_family[i]
                mut_rate = np.random.randint(j[0], j[1])
                mutation = get_new_chromosom(chrom_size,
                                             rate=np.random.randint(1, 100),
                                             binary=use_binary,
                                             zero_threshold=use_zero_threshold)

                if np.random.random() * 100 <= mut_rate:
                    mut_threshold = mutation[0]
                else:
                    mut_threshold = element[0]

                mutant = element[1].copy()

                mut_hit = (np.random.randint(1, 100, size=chrom_size) <
                           mut_rate).astype('bool')

                mutant[mut_hit] = mutation[1][mut_hit]
                new_family.append((mut_threshold, mutant))

        # Find duplicates
        analysis_format = [[float(f[0])] + list(f[1].astype('float'))
                           for f in new_family]
        a = np.asarray(analysis_format)
        b = np.ascontiguousarray(a).view(
            np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
        a = np.unique(b).view(a.dtype).reshape(-1, a.shape[1])

        if use_binary:
            new_family = [(newfam[0], newfam[1:].astype('bool'))
                          for newfam in a]
        else:
            new_family = [(newfam[0], newfam[1:]) for newfam in a]

        # Add new chromosomes
        for j in [[0, 20], [20, 40], [40, 60], [60, 80], [80, 100]]:
            for i in range(10):
                mut_rate = np.random.randint(j[0], j[1])
                new_family.append(
                    get_new_chromosom(chrom_size,
                                      rate=mut_rate,
                                      binary=use_binary,
                                      zero_threshold=use_zero_threshold))

        # Add the rest of the chromosomes
        for j in range(family_size - len(new_family)):
            new_family.append(
                get_new_chromosom(chrom_size,
                                  rate=np.random.randint(1, 100),
                                  binary=use_binary,
                                  zero_threshold=use_zero_threshold))

        # Reset the family
        family = new_family

        acc_train = np.round(fit_train.max() * 100, 4)
        acc_test = np.round(fit_test.max() * 100, 4)
        result_train.append(acc_train)
        result_test.append(acc_test)
        acc_threshold = round(new_family[0][0], 3)
        result_thresh.append(acc_threshold)

        print(k, acc_train, acc_test, acc_threshold)

        acc_both = [acc_train, acc_test]
        if np.mean(acc_both) >= 70 and np.min(acc_both) >= 67.5:
            strong_thresh = evolution[-1][0][0]
            strong_chrom = evolution[-1][0][1]

            predict = np.sum(data_test * strong_chrom, 1) >= strong_thresh
            predict = (predict + 1).tolist()
            predict = [[np.mean(acc_both)] + acc_both + predict]

            np.savetxt('results/evolution_pooling/strong_%s_%s_%s.txt' %
                       (pool_info, file_id, str(time())),
                       predict,
                       fmt='%f',
                       delimiter=',')

    title_text = ' Acc = %s - Iter: %04d - Family: %04d - Big: %s - ' % (round(
        acc_train, 2), iterations, family_size, use_big_data)
    title_text += 'Binary: %s - ZeroThresh: %s - Group: %s' % (
        use_binary, use_zero_threshold, postfix)
    if drop_variable != '':
        title_text += ' - Dropped: %s' % drop_variable

    title_text += ' - Comp: %s' % decomp_mode

    result_mean = (np.array(result_train) + np.array(result_test)) / 2

    figure(figsize=(16, 6))
    plot(result_train)
    plot(result_test)
    plot(result_mean)
    plot(np.array(result_thresh) + 60)
    legend([
        'Train [~%0.1f]' % np.mean(result_train[200:]),
        'Test [~%0.1f]' % np.mean(result_test[200:]),
        'Average [~%0.1f]' % np.mean(result_mean[200:]), 'Threshold [+60]'
    ])
    title('Fitness:%s - Threshold at %f' % (title_text, result_thresh[-1]))
    xlabel('Generation')
    ylabel('Accuracy [%]')
    tight_layout()
    savefig('results/evolution_pooling/fitness_%s_%s.png' %
            (pool_info, file_id))
    close()

    comp_name = ['comp_%03d' % (r + 1) for r in range(data.shape[1])]

    evolution = np.array([
        np.array([[float(f[0])] + list(f[1].astype('float')) for f in ev])
        for ev in evolution
    ])
    evolutionRGB = np.rollaxis(np.rollaxis(evolution, 2), 1).astype('float32')
    figure(figsize=(16, 8))
    imshow(evolutionRGB, aspect='auto')
    title('Chromosom:%s - Threshold at %f' % (title_text, result_thresh[-1]))
    ylabel('Generation')
    xticks(range(chrom_size + 1), ['Threshold'] + comp_name,
           rotation='vertical')
    subplots_adjust(left=0.04, right=0.99, top=0.96, bottom=0.15)
    savefig('results/evolution_pooling/chromosom_%s_%s.png' %
            (pool_info, file_id))
    close()

    family = np.array([[float(f[0])] + list(f[1].astype('float'))
                       for f in family])
    figure(figsize=(16, 8))
    imshow(family, aspect='auto')
    title('Final Family:%s - Threshold at %f' %
          (title_text, result_thresh[-1]))
    ylabel('Generation')
    xticks(range(chrom_size + 1), ['Threshold'] + comp_name,
           rotation='vertical')
    subplots_adjust(left=0.04, right=0.99, top=0.96, bottom=0.15)
    savefig('results/evolution_pooling/family_%s_%s.png' %
            (pool_info, file_id))
    close()

    # Predict Test data
    chromosoms = evolution[-1]

    predictions = []
    for chromi in chromosoms:
        threshold = chromi[0]
        chrom = chromi[1:]

        predict = np.sum(data_test * chrom, 1) >= threshold
        predictions.append((predict + 1).tolist())

    np.savetxt('results/evolution_pooling/prediction_%s_%s.txt' %
               (pool_info, file_id),
               predictions,
               fmt='%d',
               delimiter=',')

    print('Done %s.' % file_id)
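A hypothetical invocation sketch for the driver above (all argument values are illustrative assumptions; the script additionally expects the use_group global, the get_new_chromosom/judge_family helpers, and the CSV files under data/ to be available):

run_evolution(iterations=500, family_size=256, use_big_data=False,
              train_split=0.8, use_binary=True, use_zero_threshold=True,
              drop_variable='', decomp_mode='faa', pool_info='',
              n_components=20)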