def __init__(self, Xin, n_components=None, fit_type='sklearn'):
    self.n_samples, self.n_features = Xin.shape
    # Center data
    self.mean = np.mean(Xin, axis=0)
    X = Xin - self.mean
    if n_components is None:
        self.n_components = min(self.n_features, self.n_samples)
    elif not 0 <= n_components <= self.n_features:
        raise ValueError("n_components=%r invalid for n_features=%d"
                         % (n_components, self.n_features))
    else:
        self.n_components = n_components
    self.components = None
    self.variance = None
    self.ll = None
    self.n_iters = None
    if fit_type == 'sklearn':
        # Use the validated component count, not the raw argument (which may be None)
        fa = FactorAnalysis(n_components=self.n_components)
        fa.fit(X)
        self.components = fa.components_
        self.variance = fa.noise_variance_
        self.ll = fa.loglike_
        self.n_iters = fa.n_iter_
    else:
        self.my_fit(X)
def SlumIndex(rates):
    # Calculate the Slum Index.
    # Earlier attempts used PCA with 4 components:
    # pca = PCA(n_components=4)
    # pca.fit(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']])
    # rates['SlumIndex'] = zeros(len(rates))
    # weights = pca.explained_variance_ratio_
    # # pca.components_ are the transformation vectors, rates are the original ones
    # new_vectors = dot(transpose(pca.components_),
    #                   transpose(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']].values))
    # # Finally we get the index with the eigenvalues
    # rates['SlumIndex'] = transpose(dot(weights, new_vectors))
    #
    # pca = PCA(n_components=4)
    # new_vectors = pca.fit_transform(rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']])
    # rates['SlumIndex'] = dot(pca.explained_variance_ratio_, transpose(new_vectors))

    # Current approach: single-factor analysis; squared loadings weight each indicator.
    facAn = FactorAnalysis(n_components=1)
    facAn.fit(rates[['NoWater', 'DirtFloor', 'AvrPersPerRoom', 'NoSewage']])
    rates['SlumIndex'] = dot(facAn.components_**2,
                             transpose(rates[['NoWater', 'DirtFloor', 'AvrPersPerRoom', 'NoSewage']].values))[0]
    # rates['SlumIndex'] = rates[['NoWater','DirtFloor','AvrPersPerRoom','NoSewage']].values.sum(axis=1)
    return rates[['ID', 'SlumIndex']]
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:
        percentage: float, default: 0.8
            The percentage of the cumulative sum of the eigenvalues to be held.
            This number defines the number of loading factors in the analysis.

    Returns:
        X: array of floats [n_samples, n_factors]
            The transformed data after the factor analysis.
        components: array of floats [n_factors, n_features]
            The components of the factor analysis.
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    l, e = np.linalg.eigh(C)
    # eigh returns eigenvalues in ascending order; reverse for a descending cumulative share
    cs = np.cumsum(l[::-1]) / np.sum(l)
    n = np.sum(cs < percentage)
    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_, components
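# A self-contained sketch of the same component-count heuristic used above: keep
# enough eigenvalues of the fitted covariance to reach a cumulative share of 0.8.
# The synthetic matrix stands in for the module-level `data` that fit_factor_analysis reads.
import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
data_demo = rng.randn(200, 10) @ rng.randn(10, 10)              # correlated toy data
fa_demo = FactorAnalysis().fit(data_demo)
eigvals = np.linalg.eigvalsh(fa_demo.get_covariance())[::-1]    # descending order
cum_share = np.cumsum(eigvals) / eigvals.sum()
n_factors = int(np.sum(cum_share < 0.8)) + 1                    # factors needed to reach 80%
print("keeping", n_factors, "factors")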
class MyRegressor(ModelDesign):
    def __init__(self, data):
        ModelDesign.__init__(self, data)
        # self.reduction = PCA(n_components=150)   # superseded by the factor analysis below
        self.reduction = FactorAnalysis(n_components=150)
        self.model = svm.NuSVR()

    def train(self):
        data = self.data
        X, Y = data.getXY(data.trainSize)
        X = np.array(X, dtype=np.float32)
        Y = np.array(Y, dtype=np.float32)
        Y = np.reshape(Y, newshape=Y.size)
        print("running", self.model, "regressor for", self.name)
        t1 = time.time()
        self.reduction.fit(X)
        X = self.reduction.transform(X)
        self.model = self.model.fit(X, Y)
        t2 = time.time()
        print("finished in", t2 - t1, "s")
        X, Y = data.getTestData()
        if X is not None:
            X = self.reduction.transform(X)
            Y1 = self.model.predict(X)
            Y = np.reshape(Y, newshape=Y.size)
            loss = np.sqrt(np.mean(np.square(Y1 - Y)))
            print("test RMSE=", loss)

    def predict(self, x):
        x = self.reduction.transform(x)
        y = self.model.predict(x)
        Y = np.reshape(y, newshape=(y.size, 1))
        # Y = self.data.rescale(Y)
        return Y
def get_factors(shoppers, n_components=4, random_state=903, **kwargs):
    """
    Find factors to represent the shopper-level features in compressed space.
    These factors will be used to map simplified user input from the application
    to the full feature space used in modeling.

    Args:
        shoppers (pd.DataFrame): full set of shoppers in feature data (train + test)
        n_components (int): number of factors to mine. Defaults to 4 and should stay
            that way (application UI is based on these 4 analyzed factors)
        random_state (int): sets random state for the factor analysis algorithm.
            Defaults to 903 (and should stay that way)
        kwargs: additional keyword arguments for sklearn.decomposition.FactorAnalysis

    Returns:
        pd.DataFrame: will have n_components rows and n_features columns. The values
            of this matrix can be used to map factors to the full feature set
            (on a standard-normal scale).
    """
    # Remove columns which should not be considered in factor analysis
    x = shoppers
    for col in ['user_id', 'n_orders', 'label']:
        if col in x.columns:
            x = x.drop(columns=col)

    # Need to scale data as columns are on incommensurate scales
    cols = x.columns
    x = preprocessing.scale(x)

    fa = FactorAnalysis(n_components, random_state=random_state, **kwargs)
    fa.fit(x)
    return pd.DataFrame(fa.components_, columns=cols)
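# Hedged usage sketch for get_factors: the shopper columns below are made up for
# illustration; the real feature set comes from the application's training data.
import numpy as np
import pandas as pd

shoppers_demo = pd.DataFrame(np.random.randn(50, 3),
                             columns=['recency', 'frequency', 'monetary'])
loadings = get_factors(shoppers_demo, n_components=2)      # 2 x 3 loading matrix
# Map a point in factor space back to the standardized feature space.
factor_scores = np.array([[0.5, -1.0]])
approx_features = factor_scores @ loadings.values          # 1 x 3, std-normal scale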
def factor_analysis(data, num_features, components=1): print('-- Model: FactorAnalysis, numcomp: %d --' % components) fa = FactorAnalysis(n_components=components, random_state = 1) X = np.reshape(np.stack(data, axis=0), (-1,num_features)) #reshape to (data_size,num_vars) fa.fit(X) return fa
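# Usage sketch: `data` is a sequence of equal-width arrays; np.stack plus reshape
# flattens it to (n_samples, num_features) before fitting (toy chunks shown).
import numpy as np

chunks = [np.random.RandomState(i).randn(10, 4) for i in range(3)]
fa_model = factor_analysis(chunks, num_features=4, components=2)
print(fa_model.components_.shape)   # (2, 4)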
def test_factor_analysis(): """Test FactorAnalysis ability to recover the data covariance structure """ rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise assert_raises(ValueError, FactorAnalysis, svd_method='foo') fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' assert_raises(ValueError, fa_fail.fit, X) fas = [] for method in ['randomized', 'lapack']: fa = FactorAnalysis(n_components=n_components, svd_method=method) fa.fit(X) fas.append(fa) X_t = fa.transform(X) assert_equal(X_t.shape, (n_samples, n_components)) assert_almost_equal(fa.loglike_[-1], fa.score(X).sum()) diff = np.all(np.diff(fa.loglike_)) assert_greater(diff, 0., 'Log likelihood dif not increase') # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert_less(diff, 0.1, "Mean absolute difference is %f" % diff) fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) assert_raises(ValueError, fa.fit, X[:, :2]) f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always', ConvergenceWarning) fa1.max_iter = 1 fa1.verbose = True fa1.fit(X) assert_true(w[-1].category == ConvergenceWarning) warnings.simplefilter('always', DeprecationWarning) FactorAnalysis(verbose=1) assert_true(w[-1].category == DeprecationWarning)
def rotated_scaled_fa(n_comp, arr_pq,varimax_=True): '''Perform factor analysis on a matrix IN: - n_comp, int, number of latent dimensions - arr_pq, arr, shape: samples (persons) x features (questions) - varimax_, bool, whether to perform a varimax rotation (default=True) OUT: - arr_qd, arr, shape: features x latent-dimension - arr_pd, arr, shape: samples x latent dimensions ''' fa = FactorAnalysis(n_comp) fa.fit(arr_pq) arr_pd = fa.transform(arr_pq) arr_qd = fa.components_.T ## do the varimax-rotation if varimax_ == True: arr_dp = np.transpose(arr_pd) L1,T= fr.rotate_factors(arr_qd,'varimax') arr_qd_new = np.dot(arr_qd,T) T_m1 = np.linalg.inv(T) arr_pd_new = np.dot(T_m1,arr_dp) arr_pd_new = np.transpose(arr_pd_new) return arr_qd_new, arr_pd_new else: return arr_qd, arr_pd
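# Quick usage sketch on synthetic data; varimax_ is switched off here so the example
# does not depend on the module's `fr` rotation helper.
import numpy as np

arr_pq_demo = np.random.RandomState(1).randn(100, 8)
arr_qd_demo, arr_pd_demo = rotated_scaled_fa(3, arr_pq_demo, varimax_=False)
print(arr_qd_demo.shape, arr_pd_demo.shape)   # (8, 3) and (100, 3)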
def initialize(self):
    """
    Initialize the model.
    """
    # inverse variance weighted mean
    if np.sum(self.obsvar) != 0.0:
        self.mean = np.sum(self.data / self.obsvar, axis=0) / \
            np.sum(1.0 / self.obsvar, axis=0)
    else:
        self.mean = np.mean(self.data, axis=0)

    # use Factor Analysis to initialize factor loadings
    if self.M == 0:
        self.lam = np.zeros(1)
    else:
        fa = FactorAnalysis(n_components=self.M)
        fa.fit(self.data)
        self.lam = fa.components_.T

    # initialize jitter
    if self.jtype is None:
        self.jitter = np.array([])
    elif self.jtype == 'one':   # string comparison, not identity
        self.jitter = 0.0
    else:
        self.jitter = np.zeros(self.D)

    # save a copy
    self.initial_mean = self.mean.copy()
    self.initial_jitter = self.jitter.copy()
    self.initial_lambda = self.lam.copy()
def initializing(Y, K, singleSigma=False): N, D = Y.shape model = FactorAnalysis(n_components=K) zeroedY = deepcopy(Y) mus = np.zeros([D, 1]) for j in range(D): mus[j] = zeroedY[:, j].mean() zeroedY[:, j] = zeroedY[:, j] - mus[j] model.fit(zeroedY) A = model.components_.transpose() sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose() if singleSigma: sigmas = np.mean(sigmas) * np.ones(sigmas.shape) means = [] ps = [] for j in range(D): non_zero_idxs = np.abs(Y[:, j]) > 1e-6 means.append(Y[non_zero_idxs, j].mean()) ps.append(1 - non_zero_idxs.mean()) lamb, pcov = curve_fit(exp_lam, means, ps, p0=.05) lamb = lamb[0] return A, mus, sigmas, lamb
# This wrapper class shadows sklearn's FactorAnalysis, so the estimator is imported under an alias.
from sklearn.decomposition import FactorAnalysis as SkFactorAnalysis
import pandas as pd


class FactorAnalysis():
    def __init__(self, cols, n_components):
        self.n_components = n_components
        self.model = SkFactorAnalysis(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
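# Usage sketch for the DataFrame wrapper above on a small toy frame (column names are
# illustrative): the listed columns are replaced by fa_1/fa_2 scores, other columns kept.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.randn(20, 4), columns=['a', 'b', 'c', 'd'])
reducer = FactorAnalysis(cols=['a', 'b', 'c'], n_components=2)
out = reducer.fit_transform(df_demo)
print(out.columns.tolist())   # ['d', 'fa_1', 'fa_2']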
def aic(mm): aic = [] for i in range(1, 10): fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000) fa.fit(mm) d = n * i b = 100 * fa.score(mm) - d aic.append(b) return aic
def runFA(self): print("Starting FA") print("Dimensionality reduction") numFeatures = 30 if (self.dataset == "otto"): numFeatures = 93 n_components = range(1, numFeatures + 1) decisiontree = DecisionTreeClassifier(criterion='gini', max_depth=15, min_samples_split=5) fa = FactorAnalysis(max_iter=1000) pipe = Pipeline(steps=[('fa', fa), ('decisionTree', decisiontree)]) # Plot the fa spectrum fa.fit(self.dataX) X = fa.components_ import numpy as np centered_matrix = X - X.mean(axis=1)[:, np.newaxis] cov = np.dot(centered_matrix, centered_matrix.T) eigvals, eigvecs = np.linalg.eig(cov) best_n = 11 if (self.dataset == "otto"): best_n = 30 self.plotFAGraph(n_components, eigvals, best_n) fig, ax = plt.subplots() ax.bar(n_components, eigvals, linewidth=2, color='blue') plt.axis('tight') plt.xlabel('n_components') ax.set_ylabel('Eigen Values') gridSearch = GridSearchCV(pipe, dict(fa__n_components=n_components), cv=3) gridSearch.fit(self.dataX, self.dataY) results = gridSearch.cv_results_ ax1 = ax.twinx() #Plotting the accuracies and best component ax1.plot(results['mean_test_score'], linewidth=2, color='red', label="CV score") ax1.set_ylabel('Mean Cross Validation Accuracy') ax1.axvline(best_n, linestyle=':', label='best n_components = %s' % (str(best_n)), linewidth=2) plt.legend(prop=dict(size=12), loc="upper right") plt.title("Accuracy of DT and Eigen Values of Latent Variables [" + self.dataset + "]") plt.savefig("./fa/" + self.dataset + "_best-n_components.png") plt.close()
def run_fa(dataset, min_components, max_components): X, y = load_dataset(dataset) data = X n_samples, n_features = data.shape n_labels = len(np.unique(y)) labels = y results = [] for n_components in range(min_components, max_components): print('n_components: ', n_components) for svd_method in ['lapack', 'randomized']: scores = [] data = X.copy() fa = FactorAnalysis(n_components=n_components, svd_method=svd_method, random_state=random_state) t0 = time() fa.fit(X) scores.append(n_components) scores.append(svd_method) scores.append(time() - t0) scores.append(fa.score(X)) results.append(scores) # N-Components vs Log Likelihood plot_results(np.array(results), trends_index=1, x_axis_index=0, x_axis_label='K-Components', y_axis_index=[3], y_axis_label='Log Liklihood', title=dataset.title() + ': FactorAnalysis', filename='-'.join(['fa', dataset, 'loglike'])) # N-Components vs Time plot_results(np.array(results), trends_index=1, x_axis_index=0, x_axis_label='K-Components', y_axis_index=[2], y_axis_label='Time', title=dataset.title() + ': FactorAnalysis', filename='-'.join(['fa', dataset, 'time'])) results = np.array(results) np.savetxt('output-csv/' + ('-'.join([dataset, 'fa.csv'])), results, delimiter=",", fmt="%s")
def bic(mm): bic = [] for i in range(1, 10): fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000) fa.fit(mm) d = n * i b = 100 * fa.score(mm) - (math.log(100) * d) / 2 bic.append(b) return bic
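# Hedged sketch of using the aic/bic curves above to pick a component count. It
# assumes FactorAnalysis and math are imported at module level and that `n` (read as
# a global by aic/bic, presumably the feature count) is defined before the call; the
# hard-coded 100 in those helpers suggests roughly 100 samples.
import math
import numpy as np

mm_demo = np.random.RandomState(0).randn(100, 12)
n = mm_demo.shape[1]                       # feature count, read by aic()/bic()
bic_scores = bic(mm_demo)
best_k = int(np.argmax(bic_scores)) + 1    # +1 because the loop starts at 1 component
print("BIC favours", best_k, "components")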
def get_inv_diag_plus_low_rank_cov_op(X, rank=2): fa = FactorAnalysis(n_components=rank) fa.fit(X) components = fa.components_ noise_vars = fa.noise_variance_ activations = fa.transform(X) return _woodbury_inverse(_diagonal_operator(1. / noise_vars), aslinearoperator(np.linalg.inv(1. / len(activations) * activations.T.dot(activations))), components.T, components)
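# FactorAnalysis models the covariance as low rank plus diagonal,
#     Sigma = W.T @ W + diag(psi),
# which is exactly the structure that makes the Woodbury identity usable above.
# A quick numerical check of that structure for a fitted model:
import numpy as np
from sklearn.decomposition import FactorAnalysis

X_demo = np.random.RandomState(0).randn(300, 6)
fa_demo = FactorAnalysis(n_components=2).fit(X_demo)
lhs = fa_demo.get_covariance()
rhs = fa_demo.components_.T @ fa_demo.components_ + np.diag(fa_demo.noise_variance_)
print(np.allclose(lhs, rhs))   # True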
def factor_analysis_method(train_x, train_y, validate_x, validate_y, fa_threshold, is_split=1):
    # Fill missing values
    train_x = train_x.fillna(0)
    train_x = train_x.values
    validate_x = validate_x.fillna(0)
    validate_x = validate_x.values
    # Normalization requires no missing values beforehand and returns an ndarray
    # scaler = MinMaxScaler()
    # train_x = scaler.fit_transform(train_x)
    # validate_x = scaler.fit_transform(validate_x)

    # Convert the label DataFrames to plain ndarrays so they can be fed to the model
    train_y = train_y.values
    validate_y = validate_y.values

    if is_split == 1:
        # Pull the one-hot columns out first
        onehot_train_x_left = train_x[:, :30]
        train_x_mid = train_x[:, 30:454]
        # onehot_train_x_right = train_x[:, 454:]
        onehot_validate_x_left = validate_x[:, :30]
        validate_x_mid = validate_x[:, 30:454]
        # onehot_validate_x_right = validate_x[:, 454:]
    else:
        train_ts_code_1 = train_x[:, 0]
        train_x_mid = train_x[:, 1:]
        valid_ts_code_1 = validate_x[:, 0]
        validate_x_mid = validate_x[:, 1:]

    # Factor analysis: fit on the training block only, then project both sets with
    # the same fitted model so train and validation share one factor space
    fa = FactorAnalysis(n_components=fa_threshold)
    selected_train_x = fa.fit(train_x_mid).transform(train_x_mid)
    selected_validate_x = fa.transform(validate_x_mid)

    # Stitch the ts_code columns back on
    if is_split == 1:
        # ts_code occupies 30 one-hot columns
        selected_train_x = np.hstack((onehot_train_x_left, selected_train_x))
        selected_validate_x = np.hstack(
            (onehot_validate_x_left, selected_validate_x))
    else:
        # ts_code is a single column
        # print(train_ts_code_1.reshape(-1,1).shape)
        # print(selected_train_x.shape)
        selected_train_x = np.hstack(
            (train_ts_code_1.reshape(-1, 1), selected_train_x))
        selected_validate_x = np.hstack(
            (valid_ts_code_1.reshape(-1, 1), selected_validate_x))
    return selected_train_x, train_y, selected_validate_x, validate_y
def main_loop(self): self.aic_score = np.zeros(2 * self.M + 1) self.bic_score = np.zeros(2 * self.M + 1) for i in range(self.real_m - self.M, self.real_m + self.M + 1): self.m = i fa_model = FactorAnalysis(n_components=self.m) fa_model.fit(self.x) self.log_likelihood = fa_model.score(self.x) * self.N self.aic_score[i - self.real_m + self.M] = self.AIC() self.bic_score[i - self.real_m + self.M] = self.BIC() if self.verbose: self.show_line()
def initializeParams(Y, K, singleSigma=False, makePlot=False): """ initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting. Checked. Input: Y: data matrix, n_samples x n_genes K: number of latent components singleSigma: uses only a single sigma as opposed to a different sigma for every gene makePlot: makes a mu - p_0 plot and shows the decaying exponential fit. Returns: A, mus, sigmas, decay_coef: initialized model parameters. """ N, D = Y.shape model = FactorAnalysis(n_components=K) zeroedY = deepcopy(Y) mus = np.zeros([D, 1]) for j in range(D): non_zero_idxs = np.abs(Y[:, j]) > 1e-6 mus[j] = zeroedY[:, j].mean() zeroedY[:, j] = zeroedY[:, j] - mus[j] model.fit(zeroedY) A = model.components_.transpose() sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose() if singleSigma: sigmas = np.mean(sigmas) * np.ones(sigmas.shape) # Now fit decay coefficient means = [] ps = [] for j in range(D): non_zero_idxs = np.abs(Y[:, j]) > 1e-6 means.append(Y[non_zero_idxs, j].mean()) ps.append(1 - non_zero_idxs.mean()) decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05) decay_coef = decay_coef[0] mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means)**2)))) if (mse > 0) and makePlot: from matplotlib.pyplot import figure, scatter, plot, title, show figure() scatter(means, ps) plot(np.arange(min(means), max(means), .1), np.exp(-decay_coef * (np.arange(min(means), max(means), .1)**2))) title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse)) show() return A, mus, sigmas, decay_coef
def main(): print ("Running CV on Log Likelihood approach.") LL() start_time = time.time() totalX = [] totalY = [] flag = True countTrain = 0 print ("\n\nNow testing on separate data.") with open("creditcard.csv", "rb") as f: data = csv.reader(f) for row in data: if flag: flag = False continue countTrain += 1 if countTrain > 228000: #CV on 80% of data totalX.append([float(i) for i in row[:-1]]) totalY.append(int(row[-1])) #newTotalX = np.fft.fft(totalX) totalX = scalar.fit_transform(totalX) print ("Data Loaded") clf = FactorAnalysis() clf.fit(totalX) #logLik = clf.score(totalX) Y = [] llScores = clf.score_samples(totalX) #calculates log likelihood of each sample (instead of average of whole data set) for i in range(len(totalY)): if llScores[i] > -60 and llScores[i] < -25: Y.append(0) else: Y.append(1) #prints running time of algorithm print("%s seconds" % (time.time() - start_time)) #print results print ("Results") auc = roc_auc_score(totalY, Y) print("Area under curve : " + str(auc)) fpr, tpr, _ = roc_curve(totalY, Y) print ("False Positive Rate : " + str(fpr[1])) _, recall, _ = precision_recall_curve(totalY, Y) print ("Recall : " + str(recall[1])) #to plot ROC curve plt.title('Receiver Operating Characteristic') plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % auc) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.legend(loc="lower right") plt.show()
def fit(self, y): """Fit the GPFA model parameters to the obervations y. Parameters ---------- y : ndarray (time, features) """ if isinstance(y, np.ndarray) and y.ndim == 2: y = [y] y_all = np.concatenate(y) self.mean_ = y_all.mean(axis=0, keepdims=True) y = [yi - self.mean_ for yi in y] n = y[0].shape[1] T = [yi.shape[0] for yi in y] model = FA(self.n_factors, svd_method='lapack') model.fit(y_all) self.R_ = np.diag(model.noise_variance_) self.C_ = model.components_.T self.d_ = np.zeros(n) self.tau_ = self.tau_init + self.rng.rand(self.n_factors) # Allocated and reuse these C = self.C_ R = self.R_ big_K = { Ti: calc_big_K(Ti, self.n_factors, self.tau_, self.var_n) for Ti in set(T) } y_cov = { Ti: block_dot_B(block_dot_A(C, big_K[Ti], Ti), C.T, Ti) + make_block_diag(R, Ti) for Ti in set(T) } big_d = {Ti: np.tile(self.d_, Ti) for Ti in set(T)} big_y = [yi.ravel() for yi in y] ll_pre = log_likelihood(big_d, y_cov, big_y, T) if self.verbose: print("FA log likelihood:", ll_pre) converged = False for ii in range(self.max_iter): ll = self._em_iter(y, big_K) if abs(ll - ll_pre) / np.amax([abs(ll), abs(ll_pre), 1. ]) <= self.tol: converged = True break ll_pre = ll if not converged: warnings.warn("EM max_iter reached.", ConvergenceWarning) return self
def factor_analysis_dimensionality_score(data_in, dimensions, nfold, maxiter=1000, verbose=False): ''' Estimate the latent dimensionality of an input dataset by appling cross validated factor analysis (FA) to input data and returning the maximum likelihood values. Args: data_in (nt, nch): Time series data in dimensions (ndim): 1D Array of dimensions to compute FA for nfold (int): Number of cross validation folds to compute. Must be >= 1 maxiter (int): Maximum number of FA iterations to compute if there is no convergence. Defaults to 1000. verbose (bool): Display % of dimensions completed. Defaults to False Returns: tuple: Tuple containing: | **log_likelihood_score (ndim, nfold):** Array of MLE FA score for each dimension for each fold | **iterations_required (ndim, nfold):** How many iterations of FA were required to converge for each fold ''' # Initialize arrays log_likelihood_score = np.zeros((np.max(np.shape(dimensions)), nfold)) iterations_required = np.zeros((np.max(np.shape(dimensions)), nfold)) if verbose == True: print('Cross validating and fitting ...') # Compute the maximum likelihood score for each dimension using factor analysis for dim_idx in range(len(dimensions)): fold_idx = 0 # Handle the case without cross validation. if nfold == 1: fa = FactorAnalysis(n_components=dimensions[dim_idx], max_iter=maxiter) fafit = fa.fit(data_in.T) log_likelihood_score[dim_idx, fold_idx] = fafit.score(data_in.T) iterations_required[dim_idx, fold_idx] = fafit.n_iter_ warnings.warn("Without cross validation the highest dimensional model will always fit best.") # Every other case with cross validation else: for trainidx, testidx in model_selection.KFold(n_splits=nfold).split(data_in.T): fa = FactorAnalysis(n_components=dimensions[dim_idx], max_iter=maxiter) fafit = fa.fit(data_in[:, trainidx].T) log_likelihood_score[dim_idx, fold_idx] = fafit.score(data_in[:, testidx].T) iterations_required[dim_idx, fold_idx] = fafit.n_iter_ fold_idx += 1 if verbose == True: print(str((100 * (dim_idx + 1)) // len(dimensions)) + "% Complete") return log_likelihood_score, iterations_required
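# Hedged usage sketch with synthetic data. The array is passed channels-by-time so
# that, after the transposes inside the function, FA treats time points as samples
# and channels as features; the dimension range and fold count are illustrative.
import numpy as np

rng = np.random.RandomState(0)
latent = rng.randn(500, 3)                                               # 3 true factors
data_demo = (latent @ rng.randn(3, 12) + 0.1 * rng.randn(500, 12)).T    # channels x time
dims = np.arange(1, 7)
ll, iters = factor_analysis_dimensionality_score(data_demo, dims, nfold=3)
print("best dimensionality:", dims[np.argmax(ll.mean(axis=1))])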
def sd_fa(fname, components, result_name):
    '''
    Factor analysis computation
    '''
    cl_data, area_list = data_set(fname)
    values = cl_data.values
    fa = FactorAnalysis(n_components=components)
    # Standardize the data
    values = preprocessing.scale(values)
    try:
        fa.fit(values)
    except Exception as e:
        logging.error("factor analysis fit error: %s", e)
        sys.exit()
class FactorAnalysisImpl: def __init__(self, **hyperparams): self._hyperparams = hyperparams self._wrapped_model = Op(**self._hyperparams) def fit(self, X, y=None): if y is not None: self._wrapped_model.fit(X, y) else: self._wrapped_model.fit(X) return self def transform(self, X): return self._wrapped_model.transform(X)
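# Usage sketch, assuming `Op` above wraps sklearn's FactorAnalysis (as the class name
# suggests); the hyperparameters shown are ordinary FactorAnalysis keyword arguments.
import numpy as np

X_demo = np.random.RandomState(0).randn(100, 8)
impl = FactorAnalysisImpl(n_components=3, random_state=0)
Z = impl.fit(X_demo).transform(X_demo)
print(Z.shape)   # (100, 3)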
def makefac(df, ncomp, csv_path, comp_name=""): pca_df = PCA(n_components=ncomp) pca_df.fit(df) print(pca_df.explained_variance_ratio_.sum()) fac_df = FactorAnalysis(n_components=ncomp, svd_method='lapack') fac_df.fit(df) fac_df_df = pd.DataFrame(fac_df.components_, columns=df.columns) fac_df_df.to_csv(csv_path) new_cols = [] for i in range(ncomp): new_cols.append(comp_name + '_' + str(i)) trans_df = pd.DataFrame(fac_df.transform(df), index=df.index, columns=new_cols) return trans_df
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    # ---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)

    # ---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    X2 = fa.transform(X2)   # project the validation set with the model fitted on X1

    # ---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain), n_iter=1, train_size=0.25, test_size=.03, random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    # ---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))

    print('F1-score : ', f1_score(Yval, pred, average='binary'))
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
def dataTransformations(x): x.rename(columns={'OCUPVIVPAR': 'Dwellers'}, inplace=True) #water x['Water'] = x['VPH_AGUAFV'] / x['Houses'] #Sanitation use VPH_EXCSA and VPH_NODREN x['Sanitation'] = (x['Houses'] - x['VPH_EXCSA'] + x['VPH_NODREN']) / (2. * x['Houses']) #Overcrowding use VPH_1CUART and PRO_OCUP_C # x['Density'] = 1. - 1./(1. +x['PRO_OCUP_C']) x['Density'] = x['PRO_OCUP_C'] - 2. x.loc[x.Density < 0, 'Density'] = 0. x['Density'] = 1. - 1. / (1. + x.Density) x['Density'] = x['Density'] / x['Density'].max() #Structure VPH_1CUART and VPH_PISOTI x['Structure'] = (x['VPH_PISOTI'] + x['VPH_1CUART']) / (2 * x['Houses']) ssiData = pd.DataFrame( normalize(x[['Water', 'Structure', 'Density', 'Sanitation']], axis=0), columns=['Water', 'Structure', 'Density', 'Sanitation']) # x.loc[:,'Factor'] = zeros(len(x) facAn = FactorAnalysis(n_components=1) facAn.fit(ssiData) x.loc[:, 'Factor'] = dot(facAn.components_**2, transpose(ssiData.values))[0] #K-Means k_meansX = ssiData # do the clustering k_means = KMeans(n_clusters=4) k_means.fit(k_meansX) x.loc[:, 'K_Means'] = k_means.labels_ #linear combination x.loc[:, 'LC'] = x[['Water', 'Structure', 'Sanitation' ]].sum(axis=1) + (x['PRO_OCUP_C'] / x['PRO_OCUP_C'].max()) #save x to csv # x.to_csv(folderPath+'dataTrans.csv') return x
def initialization_point(y, J): """ Run factor analysis to get a reasonable initialization point for the optimisation process. :param y: An array of the data that has shape (N, D) where N is the number of stars and D is the dimensionality of the data. :param J: The number of latent factors. :returns: A dictionary of initial values that can be fed directly to Stan. """ fa = FactorAnalysis(J) fa.fit(y) # TODO: Re-order the matrix of elements such that the low absolute values # are in the upper triangular part of the matrix, and that the entries # along the diagonal are positive. N, D = y.shape L, psi = (fa.components_.T, fa.noise_variance_) # The beta diagonal values must be positive. beta_diag = np.clip(L.T[np.diag_indices(J)], 0, np.inf) + 1e-3 # A hack to get the lower triangular beta values is to set the upper # triangular (including the diagonal) to non-finite values then re-order # and flatten the array. beta_lower_triangular = np.copy(L) beta_lower_triangular[np.triu_indices_from(L, 0)] = np.nan beta_lower_triangular = beta_lower_triangular.T.flatten() _ = np.isfinite(beta_lower_triangular) beta_lower_triangular = beta_lower_triangular[_] sigma_L = np.std(beta_lower_triangular) init = dict(psi=psi, beta_diag=beta_diag, beta_lower_triangular=beta_lower_triangular, sigma_L=sigma_L) return init
def expMlpc(self): pca = PCA(n_components=self.pcaBest) pca.fit(self.pcaDataX) self.pcaDataX = pca.transform(self.pcaDataX) self.pcaTrainX, self.pcaTestX, self.pcaTrainY, self.pcaTestY = train_test_split( self.pcaDataX, self.pcaDataY, test_size=0.3, random_state=0) print(self.pcaTrainX.shape) ica = FastICA(n_components=self.icaBest, max_iter=1000) ica.fit(self.icaDataX) self.icaDataX = ica.transform(self.icaDataX) self.icaTrainX, self.icaTestX, self.icaTrainY, self.icaTestY = train_test_split( self.icaDataX, self.icaDataY, test_size=0.3, random_state=0) print(self.icaTrainX.shape) rp = random_projection.GaussianRandomProjection( n_components=self.rpBest) rp.fit(self.rpDataX) self.rpDataX = rp.transform(self.rpDataX) self.rpTrainX, self.rpTestX, self.rpTrainY, self.rpTestY = train_test_split( self.rpDataX, self.rpDataY, test_size=0.3, random_state=0) print(self.rpTrainX.shape) fa = FactorAnalysis(n_components=self.faBest, max_iter=1000) fa.fit(self.faDataX) self.faDataX = fa.transform(self.faDataX) self.faTrainX, self.faTestX, self.faTrainY, self.faTestY = train_test_split( self.faDataX, self.faDataY, test_size=0.3, random_state=0) print(self.faTrainX.shape) normalResults = self.mlpc(self.trainX, self.trainY, self.testX, self.testY) pcaResults = self.mlpc(self.pcaTrainX, self.pcaTrainY, self.pcaTestX, self.pcaTestY) icaResults = self.mlpc(self.icaTrainX, self.icaTrainY, self.icaTestX, self.icaTestY) rpResults = self.mlpc(self.rpTrainX, self.rpTrainY, self.rpTestX, self.rpTestY) faResults = self.mlpc(self.faTrainX, self.faTrainY, self.faTestX, self.faTestY) print(normalResults) print(pcaResults) print(icaResults) print(rpResults) print(faResults)
def fit_system(intervals, n_components): print("fit function entered") y_prime = [] y = [] #pca = PCA(n_components=n_components) pca = FactorAnalysis(n_components=n_components) pca.fit(np.concatenate(intervals)) for interval,t in iterate_intervals(intervals, new_binsize): transformed = pca.transform(interval) y_prime.append(np.gradient(transformed, t, axis=0)) y.append(transformed) y = np.concatenate(y) y_prime = np.concatenate(y_prime) A = np.linalg.lstsq(y, y_prime, rcond=None)[0] A = A.T return A, pca
def factor_analysis(x, dims=3): x = to_ndarray(x) s = scale(x, axis=0, with_mean=True, with_std=True, copy=True) fa_model = FactorAnalysis(n_components=dims, svd_method="lapack") fitted = fa_model.fit(s) y = fitted.transform(s) print("Factor Analysis - Reduced dims from {} to {}".format( x.shape, y.shape )) return y, fitted
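# Usage sketch with a toy matrix; relies on the module's own to_ndarray/scale helpers
# that factor_analysis calls internally.
import numpy as np

X_demo = np.random.RandomState(0).randn(60, 5)
y_demo, fitted_model = factor_analysis(X_demo, dims=2)
print(y_demo.shape)   # (60, 2)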
def fs_for_hybrid_data(x_train_left, y_train, x_validate_left, y_validate,
                       method=0, method_threshold=10, is_auto=1):
    if method == 0:
        # No feature selection / reduction
        selected_x_train = x_train_left
        selected_x_validate = x_validate_left
    elif method == 1:
        # PCA
        print("Using PCA; explained variance ratios:")
        if is_auto == 1:
            pca = PCA(n_components='mle', whiten=False)
        else:
            pca = PCA(n_components=method_threshold, whiten=False)
        # Fit on the training split only, then project the validation split with the
        # same model so both sets end up in the same (and equally sized) subspace
        selected_x_train = pca.fit(x_train_left).transform(x_train_left)
        print(pca.explained_variance_ratio_)
        selected_x_validate = pca.transform(x_validate_left)
    elif method == 2:
        # Factor analysis
        fa = FactorAnalysis(n_components=method_threshold)
        selected_x_train = fa.fit(x_train_left).transform(x_train_left)
        selected_x_validate = fa.transform(x_validate_left)
    else:
        # Chi-squared test
        selected_x_train = SelectKBest(chi2, k=method_threshold).fit_transform(
            x_train_left, y_train)
        selected_x_validate = SelectKBest(chi2, k=method_threshold).fit_transform(
            x_validate_left, y_validate)
    # Re-standardize after dimensionality reduction
    minmax_scaler = MinMaxScaler()
    selected_x_train = minmax_scaler.fit_transform(selected_x_train)
    selected_x_validate = minmax_scaler.fit_transform(selected_x_validate)
    return selected_x_train, selected_x_validate
def test_factor_analysis(): """Test FactorAnalysis ability to recover the data covariance structure """ rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) \ * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise fa = FactorAnalysis(n_components=n_components) fa.fit(X) X_t = fa.transform(X) assert_true(X_t.shape == (n_samples, n_components)) assert_almost_equal(fa.loglike_[-1], fa.score(X).sum()) # Make log likelihood increases at each iteration assert_true(np.all(np.diff(fa.loglike_) > 0.)) # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert_true(diff < 0.1, "Mean absolute difference is %f" % diff) fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) assert_raises(ValueError, fa.fit, X[:, :2])
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA  # sklearn.lda was removed in modern scikit-learn

iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit_transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

fa = FactorAnalysis(n_components=2)
X_r3 = fa.fit(X).transform(X)

# Percentage of variance explained for each component
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))
print(sum(pca.explained_variance_ratio_))

plt.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
plt.legend(loc="best")
plt.title('PCA of IRIS dataset')

plt.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    plt.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
plt.legend(loc="best")
def test_factor_analysis(): # Test FactorAnalysis ability to recover the data covariance structure rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise assert_raises(ValueError, FactorAnalysis, svd_method='foo') fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' assert_raises(ValueError, fa_fail.fit, X) fas = [] for method in ['randomized', 'lapack']: fa = FactorAnalysis(n_components=n_components, svd_method=method) fa.fit(X) fas.append(fa) X_t = fa.transform(X) assert_equal(X_t.shape, (n_samples, n_components)) assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum()) assert_almost_equal(fa.score_samples(X).mean(), fa.score(X)) diff = np.all(np.diff(fa.loglike_)) assert_greater(diff, 0., 'Log likelihood dif not increase') # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert_less(diff, 0.1, "Mean absolute difference is %f" % diff) fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) assert_raises(ValueError, fa.fit, X[:, :2]) f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) fa1.max_iter = 1 fa1.verbose = True assert_warns(ConvergenceWarning, fa1.fit, X) # Test get_covariance and get_precision with n_components == n_features # with n_components < n_features and with n_components == 0 for n_components in [0, 2, X.shape[1]]: fa.n_components = n_components fa.fit(X) cov = fa.get_covariance() precision = fa.get_precision() assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12)
import os

from data import load_data
from sklearn.decomposition import FactorAnalysis

try:
    import cPickle as pickle
except ImportError:
    import pickle

# Factor Analysis
# ================================================================
# Apply factor analysis on the tf-idf matrix and transform raw documents into
# an intermediate representation.
docs_tfidf, vocab_tfidf, vocabulary = load_data(subset='all')

n_components = 40
fa = FactorAnalysis(n_components=n_components)
fa.fit(docs_tfidf.toarray())
fa_words = fa.transform(vocab_tfidf.toarray())

# Create a dict to hold the new factor-analysis word vectors.
fa_dict = dict(zip(vocabulary, fa_words))

# Store the intermediate-representation word vectors on disk (pickle needs binary mode).
fa_dict_filename = 'fa_dict.pk'
if not os.path.exists(fa_dict_filename):
    with open(fa_dict_filename, 'wb') as fa_dict_file:
        pickle.dump(fa_dict, fa_dict_file)

# Store the estimator on disk for further usage.
fa_estimator_filename = 'fa_estimator.pk'
if not os.path.exists(fa_estimator_filename):
    fa_estimator_file = open(fa_estimator_filename, 'wb')
def factorAna(x,testData,n_components): fa = FactorAnalysis(n_components) fa.fit(x) newData = fa.transform(testData) return newData
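# Usage sketch: fit the factor model on training data and project held-out data with
# the same fitted model (toy arrays for illustration).
import numpy as np

rng = np.random.RandomState(0)
train_demo, test_demo = rng.randn(80, 6), rng.randn(20, 6)
test_factors = factorAna(train_demo, test_demo, n_components=2)
print(test_factors.shape)   # (20, 2)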
def initalizeParams(Y, k, method = 'standard'): """ initializes parameters. By default, (method set to "standard") initializes using a mixture model. If method is set to "high_dimensional", first does dimensionality reduction using factor analysis and then clusters the low-dimensional data. Checked. """ assert(method in ['high_dimensional', 'standard']) if method == 'high_dimensional': N, D = Y.shape #initialize using factor analysis. model = FactorAnalysis(n_components = 5) low_dim_Y = model.fit_transform(Y) kmeans_model = KMeans(n_clusters = k) z = kmeans_model.fit_predict(low_dim_Y) cluster_mus = np.zeros([D, k]) cluster_weights = np.zeros([k,]) cluster_sigmas = np.zeros([D, k]) for z_i in sorted(set(z)): idxs = (z == z_i) cluster_weights[z_i] = np.mean(idxs) cluster_Y = Y[idxs, :] cluster_Y_is_nonzero = np.abs(cluster_Y) > 1e-6 cluster_mus[:, z_i] = cluster_Y.sum(axis = 0) / cluster_Y_is_nonzero.sum(axis = 0) cluster_sigmas[:, z_i] = np.sqrt(((cluster_Y ** 2).sum(axis = 0) - 2 * cluster_mus[:, z_i] * (cluster_Y.sum(axis = 0)) + cluster_mus[:, z_i]**2 * cluster_Y_is_nonzero.sum(axis = 0)) / cluster_Y_is_nonzero.sum(axis = 0)) for j in range(1, 5): assert(np.abs(cluster_sigmas[j, z_i] - np.std(cluster_Y[cluster_Y_is_nonzero[:, j], j])) < 1e-4) if method == 'standard': N, D = Y.shape model = GMM(n_components = k) imputedY = deepcopy(Y) for j in range(D): non_zero_idxs = np.abs(Y[:, j]) > 1e-6 for i in range(N): if Y[i][j] == 0: imputedY[i][j] = np.random.choice(Y[non_zero_idxs, j]) model.fit(imputedY) cluster_mus = model.means_.transpose() cluster_weights = model.weights_ cluster_sigmas = np.sqrt(model.covars_.transpose()) #now fit decay coefficient means = [] ps = [] for j in range(D): non_zero_idxs = np.abs(Y[:, j]) > 1e-6 means.append(Y[non_zero_idxs, j].mean()) ps.append(1 - non_zero_idxs.mean()) decay_coef, pcov = curve_fit(exp_decay, means, ps) mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2)))) print 'Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse) decay_coef = decay_coef[0] assert(np.all(cluster_sigmas > 0)) return cluster_mus, cluster_sigmas, cluster_weights, decay_coef
kf = cross_validation.KFold(cdata.shape[0], n_folds=4) max_components=30 sc=numpy.zeros((max_components,4)) for n_components in range(1,max_components): fa=FactorAnalysis(n_components=n_components) fold=0 for train,test in kf: train_data=cdata[train,:] test_data=cdata[test,:] fa.fit(train_data) sc[n_components,fold]=fa.score(test_data) fold+=1 meanscore=numpy.mean(sc,1) meanscore[0]=-numpy.inf maxscore=numpy.argmax(meanscore) print ('crossvalidation suggests %d components'%maxscore) # now run it on full dataset to get components fa=FactorAnalysis(n_components=maxscore) fa.fit(cdata) for c in range(maxscore): s=numpy.argsort(fa.components_[c,:]) print('')
def learn(data): model=FA(n_components =2) model.fit(data) return PreferenceGenerator(model.components_)
def simulate(data, factors=0, maxtrials=5, multiplier=1, seed=0):
    n = len(data)
    dim = len(data[0])
    simulated = np.zeros((n, dim))
    distribution = np.zeros((n, dim))
    iteration = 0
    BestRMSR = 1
    trialsWithoutImprovement = 0

    # apply distribution from supplied data
    distribution = data.copy()
    TargetCorr = corr(data.T)
    IntermidiateCorr = TargetCorr.copy()
    BestCorr = IntermidiateCorr
    # print data.shape
    # print simulated.shape
    # print TargetCorr, TargetCorr.shape

    if factors == 0:
        eigvalsObserved = np.linalg.eigvals(IntermidiateCorr)
        eigvalsRandom = np.zeros((100, dim))
        randomData = np.zeros((n, dim))
        for i in range(0, 100):
            for j in range(0, dim):
                randomData[:, j] = np.random.permutation(distribution[:, j])
            eigvalsRandom[i, :] = np.linalg.eigvals(corr(randomData.T))
        eigvalsRandom = np.mean(eigvalsRandom, axis=0)
        factors = max(1, np.sum(eigvalsObserved > eigvalsRandom))

    # steps 5, 6
    SharedComp = np.random.normal(0, 1, (n, factors))
    UniqueComp = np.random.normal(0, 1, (n, dim))
    SharedLoad = np.zeros((dim, factors))
    UniqueLoad = np.zeros(dim)

    while trialsWithoutImprovement < maxtrials:
        iteration += 1

        # Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8)
        fa = FactorAnalysis()
        fa.n_components = factors
        fa.fit(IntermidiateCorr)
        FactLoadings = fa.components_.T
        # print FactLoadings.shape
        if factors == 1:
            SharedLoad[:, 0] = FactLoadings[:, 0]
        else:
            SharedLoad = FactLoadings
        # print SharedLoad
        SharedLoad = np.clip(SharedLoad, -1, 1)
        # print SharedLoad
        if SharedLoad[0, 0] < 0:
            SharedLoad *= -1
        # print SharedLoad
        SharedLoadSq = SharedLoad * SharedLoad
        # print SharedLoadSq
        for i in range(0, dim):
            SharedLoadSum = np.sum(SharedLoadSq[i, :])
            if SharedLoadSum < 1:
                UniqueLoad[i] = 1 - SharedLoadSum
            else:
                UniqueLoad[i] = 0
        UniqueLoad = np.sqrt(UniqueLoad)
        # print UniqueLoad
        MergedShare = np.dot(SharedComp, SharedLoad.T)
        for i in range(0, dim):
            simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i] * UniqueLoad[i]
        # print simulated

        # Replace normal with nonnormal distributions (step 9)
        for i in range(0, dim):
            indices = np.argsort(simulated[:, i])
            simulated = np.array(simulated)[indices]
            simulated[:, i] = distribution[:, i]
        # print simulated
        # print distribution

        # Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12)
        ReproducedCorr = corr(simulated.T)
        ResidualCorr = TargetCorr - ReproducedCorr
        # print ResidualCorr
        RMSR = np.sqrt(np.sum(np.tril(ResidualCorr) ** 2) / (0.5 * (dim * dim - dim)))
        # print RMSR
        if RMSR < BestRMSR:
            BestRMSR = RMSR
            BestCorr = IntermidiateCorr
            BestRes = ResidualCorr
            IntermidiateCorr = IntermidiateCorr + multiplier * ResidualCorr
            trialsWithoutImprovement = 0
        else:
            trialsWithoutImprovement += 1
            CurrentMultiplier = multiplier * (0.5 ** trialsWithoutImprovement)
            try:
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes
            except NameError:
                BestRes = ResidualCorr
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes

    # Construct the data set with the lowest RMSR correlation (step 13)
    fa = FactorAnalysis()
    fa.n_components = factors
    fa.fit(BestCorr)
    FactLoadings = fa.components_.T
    if factors == 1:
        SharedLoad[:, 0] = FactLoadings[:, 0]
    else:
        SharedLoad = FactLoadings
    SharedLoad = np.clip(SharedLoad, -1, 1)
    if SharedLoad[0, 0] < 0:
        SharedLoad *= -1
    SharedLoadSq = SharedLoad * SharedLoad
    for i in range(0, dim):
        SharedLoadSum = np.sum(SharedLoadSq[i, :])
        if SharedLoadSum < 1:
            UniqueLoad[i] = 1 - SharedLoadSum
        else:
            UniqueLoad[i] = 0
    UniqueLoad = np.sqrt(UniqueLoad)
    MergedShare = np.dot(SharedComp, SharedLoad.T)
    for i in range(0, dim):
        simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i] * UniqueLoad[i]
    simulated = preprocessing.scale(simulated)
    for i in range(0, dim):
        indices = np.argsort(simulated[:, i])
        simulated = np.array(simulated)[indices]
        simulated[:, i] = distribution[:, i]

    # return the simulated data set (step 14)
    # print 'RMSR', BestRMSR
    return simulated