def reduceDataset(self,nr=3,method='PCA'):
     '''It reduces the dimensionality of a given dataset using different techniques provided by the scikit-learn library
      Methods available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     #dataset=self.dataset[Model.in_columns]
     #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
     #PCA
     if method=='PCA':
         sklearn_pca = sklearnPCA(n_components=nr)
         reduced = sklearn_pca.fit_transform(dataset)
     #Factor Analysis
     elif method=='FactorAnalysis':
         fa=FactorAnalysis(n_components=nr)
         reduced=fa.fit_transform(dataset)
     #kernel pca with rbf kernel
     elif method=='KPCArbf':
         kpca=KernelPCA(nr,kernel='rbf')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with poly kernel
     elif method=='KPCApoly':
         kpca=KernelPCA(nr,kernel='poly')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with cosine kernel
     elif method=='KPCAcosine':
         kpca=KernelPCA(nr,kernel='cosine')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with sigmoid kernel
     elif method=='KPCAsigmoid':
         kpca=KernelPCA(nr,kernel='sigmoid')
         reduced=kpca.fit_transform(dataset)
     # Incremental PCA
     elif method=='IPCA':
         ipca=IncrementalPCA(nr)
         reduced=ipca.fit_transform(dataset)
     #Fast ICA
     elif method=='FastICAParallel':
         fip=FastICA(nr,algorithm='parallel')
         reduced=fip.fit_transform(dataset)
     elif method=='FastICADeflation':
         fid=FastICA(nr,algorithm='deflation')
         reduced=fid.fit_transform(dataset)
     elif method == 'All':
         self.dimensionalityReduction(nr=nr)
         return self
     
     self.ModelInputs.update({method:reduced})
     self.datasetsAvailable.append(method)
     return self
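The method above relies on several aliased scikit-learn imports that are not shown in this snippet. A minimal sketch of the import block it appears to assume (names inferred from how they are used; the original module may differ):

from sklearn.decomposition import PCA as sklearnPCA
from sklearn.decomposition import FactorAnalysis, KernelPCA, IncrementalPCA, FastICA
from sklearn.manifold import Isomap, LocallyLinearEmbedding  # used by the commented-out manifold branches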
Example 2
    def initialize(self):
        """
        Initialize the model.
        """
        # inverse variance weighted mean
        if np.sum(self.obsvar) != 0.0:
            self.mean = np.sum(self.data / self.obsvar, axis=0) / \
                np.sum(1.0 / self.obsvar, axis=0)
        else:
            self.mean = np.mean(self.data, axis=0)

        # use Factor Analysis to initialize factor loadings
        if self.M == 0:
            self.lam = np.zeros(1)
        else:
            fa = FactorAnalysis(n_components=self.M)
            fa.fit(self.data)
            self.lam = fa.components_.T

        # initialize jitter
        if self.jtype is None:
            self.jitter = np.array([])
        elif self.jtype == 'one':
            self.jitter = 0.0
        else:
            self.jitter = np.zeros(self.D)

        # save a copy
        self.initial_mean = self.mean.copy()
        self.initial_jitter = self.jitter.copy()
        self.initial_lambda = self.lam.copy()
def factor_analysis(results_dir):
	data_array = np.transpose(np.genfromtxt(os.path.join(results_dir,'summary.csv'),delimiter=','))
	fa = FactorAnalysis(n_components = 2)
	new_array = fa.fit_transform(data_array)
	print(fa.get_covariance().shape)
	print(new_array)
	np.savetxt(os.path.join(results_dir,'FA-datasets-2.csv'), new_array, delimiter=',')
 def dimensionalityReduction(self,nr=5):
     '''It applies all the dimensionality reduction techniques available in this class:
     Techniques available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     sklearn_pca = sklearnPCA(n_components=nr)
     p_components = sklearn_pca.fit_transform(dataset)
     fa=FactorAnalysis(n_components=nr)
     factors=fa.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='rbf')
     rbf=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='poly')
     poly=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='cosine')
     cosine=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='sigmoid')
     sigmoid=kpca.fit_transform(dataset)
     ipca=IncrementalPCA(nr)
     i_components=ipca.fit_transform(dataset)
     fip=FastICA(nr,algorithm='parallel')
     fid=FastICA(nr,algorithm='deflation')
     ficaD=fid.fit_transform(dataset)
     ficaP=fip.fit_transform(dataset)
     '''isomap=Isomap(n_components=nr).fit_transform(dataset)
     try:
         lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
     except ValueError:
         lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
     try:
         
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
     except ValueError:
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset) 
     try:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
     except ValueError:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
     values=[p_components,factors,rbf,poly,cosine,sigmoid,i_components,ficaD,ficaP]#,isomap,lle1,lle2,lle3]
     keys=['PCA','FactorAnalysis','KPCArbf','KPCApoly','KPCAcosine','KPCAsigmoid','IPCA','FastICADeflation','FastICAParallel']#,'Isomap','LLE','LLEmodified','LLEltsa']
     self.ModelInputs.update(dict(zip(keys, values)))
     [self.datasetsAvailable.append(key) for key in keys ]
     
     #debug
     #dataset=pd.DataFrame(self.ModelInputs['Dataset'])
     #dataset['Output']=self.ModelOutput
     #self.debug['Dimensionalityreduction']=dataset
     ###
     return self
def factor_analysis(x, dims=3):
  x = to_ndarray(x)
  s = scale(x, axis=0, with_mean=True, with_std=True, copy=True)
  fa_model = FactorAnalysis(n_components=dims, svd_method="lapack")
  fitted = fa_model.fit(s)
  y = fitted.transform(s)
  print("Factor Analysis - Reduced dims from {} to {}".format( x.shape, y.shape ))
  return y, fitted
Example 6
def run_fa(dataset, min_components, max_components):

    X, y = load_dataset(dataset)
    data = X

    n_samples, n_features = data.shape
    n_labels = len(np.unique(y))
    labels = y

    results = []

    for n_components in range(min_components, max_components):
        print('n_components: ', n_components)

        for svd_method in ['lapack', 'randomized']:

            scores = []
            data = X.copy()
            fa = FactorAnalysis(n_components=n_components,
                                svd_method=svd_method,
                                random_state=random_state)

            t0 = time()
            fa.fit(X)

            scores.append(n_components)
            scores.append(svd_method)
            scores.append(time() - t0)
            scores.append(fa.score(X))

            results.append(scores)

    # N-Components vs Log Likelihood
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[3],
                 y_axis_label='Log Likelihood',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'loglike']))

    # N-Components vs Time
    plot_results(np.array(results),
                 trends_index=1,
                 x_axis_index=0,
                 x_axis_label='K-Components',
                 y_axis_index=[2],
                 y_axis_label='Time',
                 title=dataset.title() + ': FactorAnalysis',
                 filename='-'.join(['fa', dataset, 'time']))

    results = np.array(results)
    np.savetxt('output-csv/' + ('-'.join([dataset, 'fa.csv'])),
               results,
               delimiter=",",
               fmt="%s")
def factor_analysis(y_mat, num_components):
    from sklearn.decomposition import FactorAnalysis
    F = FactorAnalysis(num_components)
    transformed = F.fit_transform(
        y_mat.transpose())  # shape: time x components
    components = F.components_
    mn = F.mean_
    noise_variance = F.noise_variance_
    return transformed, components, mn, noise_variance
Example 8
def factorAnalysis(data, percentage=0.535):
    dataMat = np.array(data)
    newData, meanVal = zeroMean(data)  # mean-centering (subtract column means)
    covMat = covArray(newData)  #covariance matrix
    eigVals, eigVects = featureMatrix(covMat)
    n_components = percentage2n(eigVals, percentage)
    clf = FactorAnalysis(n_components=n_components)
    new_data = clf.fit_transform(dataMat)
    return new_data
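The zeroMean, covArray, featureMatrix and percentage2n helpers are not shown. As a hedged sketch, percentage2n presumably picks the smallest number of components whose leading eigenvalues cover the requested fraction of the total variance, e.g.:

import numpy as np

def percentage2n(eig_vals, percentage):
    # Hypothetical helper: smallest n whose largest eigenvalues account
    # for at least `percentage` of the total variance.
    sorted_vals = np.sort(np.real(eig_vals))[::-1]
    cumulative = np.cumsum(sorted_vals) / np.sum(sorted_vals)
    return int(np.searchsorted(cumulative, percentage) + 1)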
Example 9
    def runFA(self):
        print("Starting FA")
        print("Dimensionality reduction")
        numFeatures = 30
        if (self.dataset == "otto"):
            numFeatures = 93
        n_components = range(1, numFeatures + 1)

        decisiontree = DecisionTreeClassifier(criterion='gini',
                                              max_depth=15,
                                              min_samples_split=5)
        fa = FactorAnalysis(max_iter=1000)

        pipe = Pipeline(steps=[('fa', fa), ('decisionTree', decisiontree)])

        # Plot the fa spectrum
        fa.fit(self.dataX)
        X = fa.components_
        import numpy as np
        centered_matrix = X - X.mean(axis=1)[:, np.newaxis]
        cov = np.dot(centered_matrix, centered_matrix.T)
        eigvals, eigvecs = np.linalg.eig(cov)
        best_n = 11
        if (self.dataset == "otto"):
            best_n = 30

        self.plotFAGraph(n_components, eigvals, best_n)

        fig, ax = plt.subplots()
        ax.bar(n_components, eigvals, linewidth=2, color='blue')
        plt.axis('tight')
        plt.xlabel('n_components')
        ax.set_ylabel('Eigen Values')

        gridSearch = GridSearchCV(pipe,
                                  dict(fa__n_components=n_components),
                                  cv=3)
        gridSearch.fit(self.dataX, self.dataY)
        results = gridSearch.cv_results_
        ax1 = ax.twinx()

        #Plotting the accuracies and best component
        ax1.plot(results['mean_test_score'],
                 linewidth=2,
                 color='red',
                 label="CV score")
        ax1.set_ylabel('Mean Cross Validation Accuracy')
        ax1.axvline(best_n,
                    linestyle=':',
                    label='best n_components = %s' % (str(best_n)),
                    linewidth=2)

        plt.legend(prop=dict(size=12), loc="upper right")
        plt.title("Accuracy of DT and Eigen Values of Latent Variables [" +
                  self.dataset + "]")
        plt.savefig("./fa/" + self.dataset + "_best-n_components.png")
        plt.close()
Example 10
def aic(mm):
    aic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i
        b = 100 * fa.score(mm) - d
        aic.append(b)
    return aic
Example 11
def FAforAllworkloads(n_c, frame):
    all_metrics_data = frame.values

    all_metrics_data_Trans = all_metrics_data.T
    tmp_all_transformer = FactorAnalysis(n_components=n_c, random_state=0)
    tmp_workload_A_transformed = tmp_all_transformer.fit_transform(
        all_metrics_data_Trans)

    return tmp_workload_A_transformed
Example 12
def bic(mm):
    bic = []
    for i in range(1, 10):
        fa = FactorAnalysis(n_components=i, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        d = n * i
        b = 100 * fa.score(mm) - (math.log(100) * d) / 2
        bic.append(b)
    return bic
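Both aic and bic above depend on a module-level n (used as the per-component parameter count) and hard-code the sample size as 100: fa.score returns the average log-likelihood per sample, so 100 * fa.score(mm) is the total log-likelihood of a 100-sample dataset. A self-contained sketch with those assumptions made explicit (the parameter count mirrors the d = n * i used above and ignores means and noise variances):

import math
import numpy as np
from sklearn.decomposition import FactorAnalysis

def information_criteria(mm, max_components=10):
    # mm: array of shape (n_samples, n_features)
    n_samples, n_features = np.asarray(mm).shape
    aic, bic = [], []
    for k in range(1, max_components):
        fa = FactorAnalysis(n_components=k, tol=0.0001, max_iter=5000)
        fa.fit(mm)
        loglik = fa.score(mm) * n_samples   # total log-likelihood
        n_params = n_features * k           # loading-matrix entries only
        aic.append(loglik - n_params)
        bic.append(loglik - (math.log(n_samples) * n_params) / 2)
    return aic, bic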
Example 13
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])

    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0],
                    Z[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.title('True Latent Positions\nFraction of Zeros %2.3f' %
                  (Y == 0).mean())
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0],
                    Zhat[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('ZIFA Estimated Latent Positions')
        # title(titles[method])

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0],
                    factor_analysis_Zhat[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
Example 14
def dimension_reduction(train_x, train_y, test_x, n_col, method='fact'):
    # Obtain column names
    attr_list = train_x.columns

    # Using RFE to rank features and then select
    if method == 'RFE':
        # Using RFE to rank attributes
        lin_reg = LinearRegression()
        rfe = RFE(lin_reg, n_col)
        fit = rfe.fit(train_x, train_y)

        # Select the most relevant attributes for machine learning
        fit_list = fit.support_.tolist()
        indexes = [
            index for index in range(len(fit_list)) if fit_list[index] == True
        ]

        # Print out attributes selected and ranking
        print('\nAttributes selected are: ', itemgetter(*indexes)(attr_list))
        print('\nAttributes Ranking: ', fit.ranking_)

        train_x_returned = train_x.iloc[:, indexes]
        test_x_returned = test_x.iloc[:, indexes]

    # Using factor analysis
    elif method == 'fact':
        fact_anal = FactorAnalysis(n_components=n_col)
        train_x_returned = pd.DataFrame(fact_anal.fit_transform(train_x))
        test_x_returned = pd.DataFrame(fact_anal.transform(test_x))

        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)
        ]

    # Using PCA
    elif method == 'PCA':
        pca_down = PCA(n_components=n_col)
        train_x_returned = pd.DataFrame(pca_down.fit_transform(train_x))
        test_x_returned = pd.DataFrame(pca_down.transform(test_x))

        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)
        ]

    # Returned selected or regenerated features
    return train_x_returned, test_x_returned
Example 15
def fa_run(tol=0.01):
    fa = FactorAnalysis(n_components=2, tol=tol)

    fa_data = fa.fit(data).transform(data)

    fig, axs = plt.subplots(1, 1)

    axs.scatter(fa_data[:, 0], fa_data[:, 1], c=labels, cmap='rainbow')

    plt.show()
Example 16
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)
def get_inv_diag_plus_low_rank_cov_op(X, rank=2):
    fa = FactorAnalysis(n_components=rank)
    fa.fit(X)
    components = fa.components_
    noise_vars = fa.noise_variance_
    activations = fa.transform(X)

    return _woodbury_inverse(_diagonal_operator(1. / noise_vars),
                 aslinearoperator(np.linalg.inv(1. / len(activations) * 
                                  activations.T.dot(activations))),
                 components.T, components)
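The Woodbury construction above works because a fitted FactorAnalysis model represents the data covariance as a diagonal plus a low-rank term, i.e. get_covariance() equals diag(noise_variance_) + components_.T @ components_. A quick sanity check of that identity (independent of the _woodbury_inverse and _diagonal_operator helpers, which are not shown here):

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
X = rng.randn(200, 8)

fa = FactorAnalysis(n_components=2).fit(X)
low_rank_cov = np.diag(fa.noise_variance_) + fa.components_.T @ fa.components_
assert np.allclose(low_rank_cov, fa.get_covariance())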
Example 18
    def compute_scores(x):
        pca = PCA(svd_solver='full')  # instantiate the models
        fa = FactorAnalysis()

        pca_scores, fa_scores = [], []
        for n in n_components:
            pca.n_components = n
            fa.n_components = n
            pca_scores.append(np.mean(cross_val_score(pca, x)))  # estimate scores via cross-validation
            fa_scores.append(np.mean(cross_val_score(fa, x)))
        return pca_scores, fa_scores
Example 19
def model_process(X, y):
    """
    调用训练模型进行数据处理
    :param X: 自变量
    :param y: 因变量
    :return: result
    """
    fa = FactorAnalysis()
    fa.fit_transform(X, y)
    # print(fa.get_covariance())
    print(fa.components_)
Example 20
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X, cv=5)))
        fa.n_components = n
        fa_scores.append(np.mean(cross_val_score(fa, X, cv=5)))

    return pca_scores, fa_scores
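A typical way to use a compute_scores helper like the one above is to sweep candidate dimensionalities and keep the one with the best cross-validated log-likelihood. A small usage sketch (reusing the compute_scores defined just above; the iris data is only a stand-in):

import numpy as np
from sklearn.datasets import load_iris

X = load_iris().data
n_components = np.arange(1, X.shape[1] + 1)

pca_scores, fa_scores = compute_scores(X, n_components)
print("best n for PCA:", n_components[int(np.argmax(pca_scores))])
print("best n for FA: ", n_components[int(np.argmax(fa_scores))])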
Example 21
def factor_analysis_method(train_x,
                           train_y,
                           validate_x,
                           validate_y,
                           fa_threshold,
                           is_split=1):
    # Fill missing values
    train_x = train_x.fillna(0)
    train_x = train_x.values
    validate_x = validate_x.fillna(0)
    validate_x = validate_x.values

    # Normalization: there must be no missing values beforehand; the result automatically becomes an ndarray
    # scaler = MinMaxScaler()
    # train_x = scaler.fit_transform(train_x)
    # validate_x = scaler.fit_transform(validate_x)

    # Turn the DataFrame into an unlabeled ndarray so it can be fed into the model
    train_y = train_y.values
    validate_y = validate_y.values

    if is_split == 1:
        # Pull out the one-hot columns first
        onehot_train_x_left = train_x[:, :30]
        train_x_mid = train_x[:, 30:454]
        # onehot_train_x_right = train_x[:, 454:]
        onehot_validate_x_left = validate_x[:, :30]
        validate_x_mid = validate_x[:, 30:454]
        # onehot_validate_x_right = validate_x[:, 454:]
    else:
        train_ts_code_1 = train_x[:, 0]
        train_x_mid = train_x[:, 1:]
        valid_ts_code_1 = validate_x[:, 0]
        validate_x_mid = validate_x[:, 1:]

    # factor analysis
    fa = FactorAnalysis(n_components=fa_threshold)
    selected_train_x = fa.fit(train_x_mid).transform(train_x_mid)
    # transform the validation set with the model fitted on the training set
    selected_validate_x = fa.transform(validate_x_mid)

    # Stitch ts_code back on
    if is_split == 1:  # ts_code has 30 one-hot columns
        selected_train_x = np.hstack((onehot_train_x_left, selected_train_x))
        selected_validate_x = np.hstack(
            (onehot_validate_x_left, selected_validate_x))
    else:  # ts_code has only one column
        # print(train_ts_code_1.reshape(-1,1).shape)
        # print(selected_train_x.shape)
        selected_train_x = np.hstack(
            (train_ts_code_1.reshape(-1, 1), selected_train_x))
        selected_validate_x = np.hstack(
            (valid_ts_code_1.reshape(-1, 1), selected_validate_x))

    return selected_train_x, train_y, selected_validate_x, validate_y
Example 22
def factor_analysis(df, Data_path_exit, df2_index):
    # Compute descriptive statistics and the correlation matrix
    df_des_stat = df.describe()
    df_cor_stat = df.corr()
    # Write the descriptive statistics and correlation matrix to CSV
    df_des_stat.to_csv(Data_path_exit + 'des_stat.csv',
                       sep=';',
                       float_format='%.3f')
    df_cor_stat.to_csv(Data_path_exit + 'cor_stat.csv',
                       sep=';',
                       float_format='%.3f')
    # Build scatter plots and histograms
    matrix = scatter_matrix(df, figsize=[20, 20], alpha=0.2)
    # Save the scatter matrix figure
    plt.savefig(Data_path_exit + 'Scatter_matrix' + '.png',
                format='png',
                dpi=300)

    df_scaled = preprocessing.scale(
        df)  # array of standardized data
    # Project the variables onto a plane with PCA; extract 4 principal factors (more are possible)
    pca = PCA(n_components=4)
    pca1 = pca.fit(df_scaled)
    print('Share of variance explained by the factors: ',
          pca.explained_variance_ratio_)

    # Compute the values of the principal factors
    zzz = pca.transform(df_scaled)
    values_factors = pd.DataFrame(zzz)
    values_factors.to_csv(Data_path_exit + 'factor_values.csv',
                          sep=';',
                          float_format='%.3f')
    #print (zzz)

    # Factor analysis
    fa = FactorAnalysis(n_components=4)  # Number of factors
    fac_1 = fa.fit(df_scaled)
    df_fa = pd.DataFrame(fa.components_, columns=df.columns)
    df_fa.to_csv(Data_path_exit + 'factor_result.csv',
                 sep=';',
                 float_format='%.3f'
                 )  # Factor coordinates in the space of the original variables
    # Uniqueness of each variable, i.e. variance not explained by the factors (the larger, the worse the variable is explained), stored in this attribute
    fac_2 = pd.Series(fa.noise_variance_, df.columns)
    fac_2.to_csv(
        Data_path_exit + 'Unic_values.csv', sep=';',
        float_format='%.3f')  # Uniqueness values per variable
    print('Uniqueness values:\n', fac_2)
    scores = pd.DataFrame(fa.transform(df_scaled),
                          columns=['factor1', 'factor2', 'factor3', 'factor4'])
    scores = scores.set_index(df2_index.index)
    scores.to_csv(
        Data_path_exit + 'factor_vectors.csv', sep=';',
        float_format='%.3f')  # Factor scores; the main result
Example 23
 def compute_scores(self, max_n):
     n_components = np.arange(0, max_n, 1)
     pca = PCA(svd_solver='full')
     fa = FactorAnalysis()
     pca_scores, fa_scores = [], []
     for n in n_components:
         pca.n_components = n
         fa.n_components = n
         pca_scores.append(np.mean(cross_val_score(pca, self.sample)))
         fa_scores.append(np.mean(cross_val_score(fa, self.sample)))
     return pca_scores, fa_scores
Example 24
def dim_reduction_fa(df, pca_ncomps=10):
    df_pca = FactorAnalysis(n_components=pca_ncomps).fit(df.T)

    df_pcs = df_pca.transform(df.T)
    df_pcs = pd.DataFrame(df_pcs,
                          index=df.T.index,
                          columns=pc_labels(pca_ncomps))
    df_loadings = pd.DataFrame(df_pca.components_,
                               index=pc_labels(pca_ncomps),
                               columns=df.T.columns)

    return df_pcs, df_loadings
Example 25
def initializeParams(Y, K, singleSigma=False, makePlot=False):
    """
	initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting.
	Checked.
	Input:
	Y: data matrix, n_samples x n_genes
	K: number of latent components
	singleSigma: uses only a single sigma as opposed to a different sigma for every gene
	makePlot: makes a mu - p_0 plot and shows the decaying exponential fit.
	Returns:
	A, mus, sigmas, decay_coef: initialized model parameters.
	"""

    N, D = Y.shape
    model = FactorAnalysis(n_components=K)
    zeroedY = deepcopy(Y)
    mus = np.zeros([D, 1])

    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        mus[j] = zeroedY[:, j].mean()
        zeroedY[:, j] = zeroedY[:, j] - mus[j]

    model.fit(zeroedY)

    A = model.components_.transpose()
    sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
    if singleSigma:
        sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

    # Now fit decay coefficient
    means = []
    ps = []
    for j in range(D):
        non_zero_idxs = np.abs(Y[:, j]) > 1e-6
        means.append(Y[non_zero_idxs, j].mean())
        ps.append(1 - non_zero_idxs.mean())

    decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
    decay_coef = decay_coef[0]

    mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means)**2))))

    if (mse > 0) and makePlot:
        from matplotlib.pyplot import figure, scatter, plot, title, show
        figure()
        scatter(means, ps)
        plot(np.arange(min(means), max(means), .1),
             np.exp(-decay_coef * (np.arange(min(means), max(means), .1)**2)))
        title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
        show()

    return A, mus, sigmas, decay_coef
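The exp_decay function handed to curve_fit is not part of this snippet; from how the fitted coefficient is reused later (np.exp(-decay_coef * means ** 2)), a matching definition would presumably be:

import numpy as np

def exp_decay(x, decay_coef):
    # Dropout probability modeled as a decaying exponential of the
    # squared mean expression, matching how decay_coef is used above.
    return np.exp(-decay_coef * np.asarray(x) ** 2)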
def compute_scores(X):
    pca = PCA(svd_solver="full")
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores
Example 27
def run_FA(X,y,title):
    
    fa = FA(random_state=5)
    fa.fit_transform(X)
    vn = fa.noise_variance_
    print(vn)
    plt.plot(list(range(len(vn))), vn, 'm-')
    plt.xlabel('component')
    plt.ylabel('noise variance')
    plt.tick_params('y', colors='m')
    plt.title("FA Noise Variance: "+ title)
    plt.show()
Example 28
 def main_loop(self):
     self.aic_score = np.zeros(2 * self.M + 1)
     self.bic_score = np.zeros(2 * self.M + 1)
     for i in range(self.real_m - self.M, self.real_m + self.M + 1):
         self.m = i
         fa_model = FactorAnalysis(n_components=self.m)
         fa_model.fit(self.x)
         self.log_likelihood = fa_model.score(self.x) * self.N
         self.aic_score[i - self.real_m + self.M] = self.AIC()
         self.bic_score[i - self.real_m + self.M] = self.BIC()
     if self.verbose:
         self.show_line()
Example 29
def factor_dim(df):
    # Principal component analysis
    pmodel = PCA(n_components=3)
    lower_mat = pmodel.fit_transform(df)
    df_array   = df.values[:]
    lower_df = DataFrame(lower_mat,columns=["factor1","factor2","factor3"])
    # Factor analysis
    fmodel = FactorAnalysis(n_components=3, random_state=0)
    lower_fac = fmodel.fit_transform(df)
    # lower_df = DataFrame(lower_fac, columns=["factor1", "factor2", "factor3"])
    print(lower_df)
    return lower_df
Example 30
def compute_scores(X):
    pca = PCA()
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores
Example 31
def compute_pca_scores(X, n_features=15):
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()
    n_components = np.arange(0, n_features, 5)
    pca_scores, fa_scores = [], []
    for n in n_components:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores
Example 32
def initializeParams(Y, K, singleSigma=False, makePlot=False):
	"""
	initializes parameters using a standard factor analysis model (on imputed data) + exponential curve fitting.
	Checked.
	Input:
	Y: data matrix, n_samples x n_genes
	K: number of latent components
	singleSigma: uses only a single sigma as opposed to a different sigma for every gene
	makePlot: makes a mu - p_0 plot and shows the decaying exponential fit.
	Returns:
	A, mus, sigmas, decay_coef: initialized model parameters.
	"""

	N, D = Y.shape
	model = FactorAnalysis(n_components=K)
	zeroedY = deepcopy(Y)
	mus = np.zeros([D, 1])

	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		mus[j] = zeroedY[:, j].mean()
		zeroedY[:, j] = zeroedY[:, j] - mus[j]

	model.fit(zeroedY)

	A = model.components_.transpose()
	sigmas = np.atleast_2d(np.sqrt(model.noise_variance_)).transpose()
	if singleSigma:
		sigmas = np.mean(sigmas) * np.ones(sigmas.shape)

	# Now fit decay coefficient
	means = []
	ps = []
	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		means.append(Y[non_zero_idxs, j].mean())
		ps.append(1 - non_zero_idxs.mean())

	decay_coef, pcov = curve_fit(exp_decay, means, ps, p0=.05)
	decay_coef = decay_coef[0]

	mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))

	if (mse > 0) and makePlot:
		from matplotlib.pyplot import figure, scatter, plot, title, show
		figure()
		scatter(means, ps)
		plot(np.arange(min(means), max(means), .1), np.exp(-decay_coef * (np.arange(min(means), max(means), .1) ** 2)))
		title('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
		show()

	return A, mus, sigmas, decay_coef
def main():
    print ("Running CV on Log Likelihood approach.")
    LL()

    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    print ("\n\nNow testing on separate data.")
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:          #CV on 80% of data
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))

    #newTotalX = np.fft.fft(totalX)
    totalX = scalar.fit_transform(totalX)
    print ("Data Loaded")
    clf = FactorAnalysis()
    clf.fit(totalX)
    #logLik = clf.score(totalX)
    Y = []
    llScores = clf.score_samples(totalX)						#calculates log likelihood of each sample (instead of average of whole data set)
    for i in range(len(totalY)):
        if llScores[i] > -60 and llScores[i] < -25:
            Y.append(0)
        else:
            Y.append(1)
	#prints running time of algorithm
    print("%s seconds" % (time.time() - start_time))
	#print results
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print ("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print ("Recall : " + str(recall[1]))

	#to plot ROC curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
Example 34
def cluster_sk_factor_analysis(content):
    """ SK FA | components: N, data:[[]], classes:[] """
    _config = FactorAnalysis(n_components=content['n_components'],
                             svd_method=content['svd_method'],
                             tol=content['tol'])
    _result = _config.fit(content['data']).transform(content['data'])
    return httpWrapper(
        json.dumps({
            'result': _result.tolist(),
            'loglike': _config.loglike_,
            'noiseVariance': _config.noise_variance_.tolist(),
            'nIter': _config.n_iter_
        }))
Example 35
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        print('Processing dimension {}'.format(n))
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores
def do_fa(df):
    columns = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    X = df[columns]
    X_std = StandardScaler().fit_transform(X)

    fa = FactorAnalysis(n_components=4, random_state=100)
    X_fa = fa.fit_transform(X_std)
    fa_summary = pd.DataFrame(fa.components_, columns=columns)
    print(fa_summary)
    fa_plot(X_fa[:, 0:2], np.transpose(fa.components_[0:2, :]), columns)
def factor_analysis( data ):
    fa = FactorAnalysis()
    features = numerical_features + categorical_features
    fa_data = fa.fit_transform( data[features] )
    plt.figure()
    plt.subplot(2,2,1)
    plt.scatter( fa_data[:,0], fa_data[:,1], c=data[target] )
    plt.subplot(2,2,2)
    plt.scatter( fa_data[:,2], fa_data[:,3], c=data[target] )
    plt.subplot(2,2,3)
    plt.scatter( fa_data[:,4], fa_data[:,5], c=data[target] )
    plt.subplot(2,2,4)
    plt.scatter( fa_data[:,6], fa_data[:,7], c=data[target] )
    return fa_data
Example 38
class FactorAnalysisImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
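A minimal usage sketch for this wrapper, assuming Op is bound to sklearn.decomposition.FactorAnalysis as the (not shown) surrounding import suggests:

import numpy as np
from sklearn.decomposition import FactorAnalysis as Op  # assumed binding

X = np.random.RandomState(0).randn(100, 6)
impl = FactorAnalysisImpl(n_components=2, random_state=0)
X_low = impl.fit(X).transform(X)
print(X_low.shape)  # (100, 2)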
Example 39
def sd_fa(fname,components,result_name):
    '''
    Factor analysis computation
    '''
    cl_data,area_list = data_set(fname)
    values = cl_data.values
    fa = FactorAnalysis(n_components=components)
    # Standardize the data
    values = preprocessing.scale(values)
    try:
        fa.fit(values)
    except Exception as e:
        logging.error("factor analysis fit error")
        sys.exit()
    def fit(self, y):
        """Fit the GPFA model parameters to the obervations y.

        Parameters
        ----------
        y : ndarray (time, features)
        """
        if isinstance(y, np.ndarray) and y.ndim == 2:
            y = [y]
        y_all = np.concatenate(y)
        self.mean_ = y_all.mean(axis=0, keepdims=True)
        y = [yi - self.mean_ for yi in y]
        n = y[0].shape[1]
        T = [yi.shape[0] for yi in y]
        model = FA(self.n_factors, svd_method='lapack')
        model.fit(y_all)

        self.R_ = np.diag(model.noise_variance_)
        self.C_ = model.components_.T
        self.d_ = np.zeros(n)
        self.tau_ = self.tau_init + self.rng.rand(self.n_factors)
        # Allocated and reuse these
        C = self.C_
        R = self.R_
        big_K = {
            Ti: calc_big_K(Ti, self.n_factors, self.tau_, self.var_n)
            for Ti in set(T)
        }
        y_cov = {
            Ti: block_dot_B(block_dot_A(C, big_K[Ti], Ti), C.T, Ti) +
            make_block_diag(R, Ti)
            for Ti in set(T)
        }
        big_d = {Ti: np.tile(self.d_, Ti) for Ti in set(T)}
        big_y = [yi.ravel() for yi in y]
        ll_pre = log_likelihood(big_d, y_cov, big_y, T)
        if self.verbose:
            print("FA log likelihood:", ll_pre)

        converged = False
        for ii in range(self.max_iter):
            ll = self._em_iter(y, big_K)
            if abs(ll - ll_pre) / np.amax([abs(ll), abs(ll_pre), 1.
                                           ]) <= self.tol:
                converged = True
                break
            ll_pre = ll
        if not converged:
            warnings.warn("EM max_iter reached.", ConvergenceWarning)
        return self
Example 41
 def factorLoadings(self):
     '''
     Returns a pandas dataframe containing the raw and standardized factor loadings of each item on a single factor.
     
     This method provides the unstandardized "rawLoadings", and the standardized "stdLoadings" for the items on a
     single factor, using scikit-learn's FactorAnalysis algorithm. This is used for determining which items fit best
     with the construct. 
     '''
     return  pd.DataFrame({
         'rawLoadings' : pd.Series(FactorAnalysis(n_components=1).fit(self._data).components_[0], 
                                   index=self.data.columns),
         'stdLoadings' : pd.Series(FactorAnalysis(n_components=1).fit(self.stdData).components_[0], 
                                   index=self.data.columns)
         })
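For reference, a stand-alone sketch of the same computation, assuming _data / stdData hold the raw and z-scored item responses as the docstring implies:

import pandas as pd
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import scale

def single_factor_loadings(items):
    # Raw and standardized loadings of each item on one common factor.
    raw = FactorAnalysis(n_components=1).fit(items).components_[0]
    std = FactorAnalysis(n_components=1).fit(scale(items)).components_[0]
    return pd.DataFrame({'rawLoadings': raw, 'stdLoadings': std},
                        index=items.columns)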
def compute_scores(X, n_components):
  """
  This is the "y" data of the plots -- the CV scores.
  """
  pca = PCA()
  fa = FactorAnalysis()
  
  pca_scores, fa_scores = [], []
  for n in n_components:
    pca.n_components = n
    fa.n_components = n
    pca_scores.append(np.mean(cross_val_score(pca, X)))
    fa_scores.append(np.mean(cross_val_score(fa, X)))
  
  return pca_scores, fa_scores
Example 43
def compute_scores(X, n_components):
    pca = PCA()
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components:
        start = time.time()
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))
        end = time.time()
        print('PCA scores (%3d)' % n, pca_scores)
        print('FA  scores (%3d)' % n, fa_scores)
        print('TIME:           ', end - start)

    return pca_scores, fa_scores
Example 44
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])

    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0], Z[ids == id, 1], color=colors[id - 1], s=4)
        plt.title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0], Zhat[ids == id, 1], color=colors[id - 1], s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('ZIFA Estimated Latent Positions')
        # title(titles[method])

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0], factor_analysis_Zhat[ids == id, 1], color = colors[id - 1], s = 4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
Example 45
def dataTransformations(x):

	x.rename(columns={'OCUPVIVPAR': 'Dwellers'}, inplace=True)
	#water
	x['Water'] = x['VPH_AGUAFV']/x['Houses']

	#Sanitation use VPH_EXCSA and VPH_NODREN
	x['Sanitation'] = (x['Houses'] - x['VPH_EXCSA'] + x['VPH_NODREN']) / (2.*x['Houses'])

	#Overcrowding use VPH_1CUART and PRO_OCUP_C
	# x['Density'] = 1. - 1./(1. +x['PRO_OCUP_C'])
	x['Density'] = x['PRO_OCUP_C']-2.
	x.loc[x.Density<0,'Density'] = 0.
	x['Density'] = 1. - 1./(1. + x.Density)
	x['Density'] = x['Density']/x['Density'].max()
	
	#Structure VPH_1CUART and VPH_PISOTI
	x['Structure'] = (x['VPH_PISOTI'] + x['VPH_1CUART']) / (2*x['Houses'])

	ssiData = pd.DataFrame(normalize(x[['Water','Structure','Density','Sanitation']],axis=0), columns=['Water','Structure','Density','Sanitation'])

	# x.loc[:,'Factor'] = zeros(len(x)	
	facAn = FactorAnalysis(n_components = 1)
	facAn.fit(ssiData)
	x.loc[:,'Factor'] = dot(facAn.components_**2,transpose(ssiData.values))[0]

	#K-Means
	k_meansX = ssiData

	# do the clustering
	k_means = KMeans(n_clusters=4)
	k_means.fit(k_meansX) 
	x.loc[:,'K_Means'] = k_means.labels_

	#linear combination

	x.loc[:,'LC'] = x[['Water','Structure','Sanitation']].sum(axis=1) + (x['PRO_OCUP_C']/ x['PRO_OCUP_C'].max())

	


	#save x to csv
	# x.to_csv(folderPath+'dataTrans.csv')
	return x
Example 46
def factor_analysis(tests):
	from sklearn.decomposition import FactorAnalysis
	from sklearn.cross_validation import cross_val_score
	
	matrix = correct_matrix(tests,kind='ctrl')
	print(matrix.shape)
	# matrix must have a number of rows divisible by 3.  
	# if it does not, eliminate some rows, or pass cv=a to cross_val_score,
	# where 'a' is a number by which the number of rows is divisible.  
	fa = FactorAnalysis()
	fa_scores = []
	n_components = np.arange(1,41)
	for n in n_components:
		fa.n_components = n
		fa_scores.append(np.mean(cross_val_score(fa, matrix)))

	plt.plot(n_components,fa_scores)
	
	return n_components,fa_scores
Example 47
def factor_analyses(results_dir):
	data_array = np.genfromtxt(os.path.join(results_dir,'summary.csv'),delimiter=',')
	fa1 = FactorAnalysis(n_components = 1)
	new_array_gbm = fa1.fit_transform(np.transpose(data_array[range(15)]))
	print(new_array_gbm.shape)
	fa2 = FactorAnalysis(n_components = 1)
	new_array_tree = fa2.fit_transform(np.transpose(data_array[list(range(41,51)) + list(range(54,64))]))
	print(new_array_tree.shape)

	fa3 = FactorAnalysis(n_components = 1)
	new_array_lin = fa3.fit_transform(np.transpose(data_array[list(range(27,41)) + list(range(51,54))]))

	fa4 = FactorAnalysis(n_components = 1)
	new_array_knn = fa4.fit_transform(np.transpose(data_array[range(16,27)]))

	datasets = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
	methods = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]
	figure()
	pretty_scatter(new_array_tree, [1 for x in range(115)], data_array[46], 200*np.ones(new_array_tree.shape), ['' for d in datasets])
	xlabel('Dimension 1')
	ylabel('Arbitrary Dimension 2')
	colorbar()

	figure()

	plot(new_array_lin, new_array_tree, 'bo')
	xlabel('Linear')
	ylabel('Tree + RF')

	figure()
	subplot(2,2,1)
	scatter(new_array_gbm, new_array_tree)
	xlabel('GBM')
	ylabel('Tree + RF')

	#figure()
	subplot(2,2,2)
	scatter(new_array_knn, new_array_tree)
	xlabel('KNN')
	ylabel('Tree + RF')

	#figure()
	subplot(2,2,3)
	scatter(new_array_knn, new_array_lin)
	xlabel('KNN')
	ylabel('Linear')

	subplot(2,2,4)
	scatter(new_array_gbm, new_array_lin)
	xlabel('GBM')
	ylabel('Linear')
	show()
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:

        percentage: float, default:0.8

        The percentage of the cumulative sum of the eigenvalues to be retained. This number defines the number of loading factors in the analysis.

    Returns:
        
        X: array of floats [n_samples,n_factors]

            The transformed data after the factor analysis.

        components: array of floats [n_factors,n_features]

            The components of the factor analysis
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    l,e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1])/np.sum(l)
    n = np.sum(cs<percentage)

    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_,components
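Here the number of factors comes from the eigenvalues of the fitted model covariance rather than from cross-validation. A self-contained sketch of that selection rule (taking data as an argument instead of the module-level variable assumed above):

import numpy as np
from sklearn.decomposition import FactorAnalysis

def n_factors_by_variance(data, percentage=0.8):
    # Number of factors whose leading covariance eigenvalues explain
    # less than `percentage` of the total variance (as in cs < percentage).
    fa = FactorAnalysis().fit(data)
    eigvals = np.linalg.eigvalsh(fa.get_covariance())[::-1]  # descending
    cumulative = np.cumsum(eigvals) / np.sum(eigvals)
    return int(np.sum(cumulative < percentage))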
Example 49
def initialize(trials, params, config):
    """Make skeleton"""
    # TODO: fast initialization for large dataset
    from sklearn.decomposition import FactorAnalysis

    zdim = params["zdim"]
    xdim = params["xdim"]

    # TODO: use only a subsample of trials?
    y = np.concatenate([trial["y"] for trial in trials], axis=0)
    subsample = np.random.choice(y.shape[0], max(y.shape[0] // 10, 50))
    ydim = y.shape[-1]
    fa = FactorAnalysis(n_components=zdim, random_state=0)
    z = fa.fit_transform(y[subsample, :])
    a = fa.components_
    b = np.log(np.maximum(np.mean(y, axis=0, keepdims=True), config["eps"]))
    noise = np.var(y[subsample, :] - z @ a, ddof=0, axis=0)

    # stupid way of update
    # two cases
    # 1) no key
    # 2) empty value (None)
    if params.get("a") is None:
        params.update(a=a)
    if params.get("b") is None:
        params.update(b=b)
    if params.get("noise") is None:
        params.update(noise=noise)

    for trial in trials:
        length = trial["y"].shape[0]

        if trial.get("mu") is None:
            trial.update(mu=fa.transform(trial["y"]))

        if trial.get("x") is None:
            trial.update(x=np.ones((length, xdim, ydim)))

        trial.update({"w": np.zeros((length, zdim)), "v": np.zeros((length, zdim))})
Example 50
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) \
                * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    fa = FactorAnalysis(n_components=n_components)
    fa.fit(X)
    X_t = fa.transform(X)
    assert_true(X_t.shape == (n_samples, n_components))

    assert_almost_equal(fa.loglike_[-1], fa.score(X).sum())

    # Make log likelihood increases at each iteration
    assert_true(np.all(np.diff(fa.loglike_) > 0.))

    # Sample Covariance
    scov = np.cov(X, rowvar=0., bias=1.)

    # Model Covariance
    mcov = fa.get_covariance()
    diff = np.sum(np.abs(scov - mcov)) / W.size
    assert_true(diff < 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])
derived_dir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset)


data,surveykey=get_survey_data(dataset)

cdata=data.values


kf = cross_validation.KFold(cdata.shape[0], n_folds=4)

max_components=30

sc=numpy.zeros((max_components,4))

for n_components in range(1,max_components):
    fa=FactorAnalysis(n_components=n_components)
    fold=0
    for train,test in kf:
        train_data=cdata[train,:]
        test_data=cdata[test,:]

        fa.fit(train_data)
        sc[n_components,fold]=fa.score(test_data)
        fold+=1

meanscore=numpy.mean(sc,1)
meanscore[0]=-numpy.inf
maxscore=numpy.argmax(meanscore)
print ('crossvalidation suggests %d components'%maxscore)

# now run it on full dataset to get components
Example 52
# A = np.array([[1, 0.2], [0.2, 1]])  # Mixing matrix
# X = np.dot(S, A.T)  # Generate observations

rng = np.random.RandomState(42)
S = rng.normal(scale=0.01,size=(10000, 2))
S[:,1][::2] *= 1.7
S[:,0][::2] /= 1.7
S[:,1][1::2] /= 1.7
S[:,0][1::2] *= 1.7
X=deepcopy(S)
X[:,1] = X[:,0]/-2+X[:,1]

pca = PCA()
S_pca_ = pca.fit_transform(X)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(X)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(X)  # Estimate the sources


###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10,
                color='steelblue', alpha=0.5)
    if axis_list is not None:
        colors = ['orange', 'red']
        for color, axis in zip(colors, axis_list):
pca = decomposition.PCA()
sub_pca_prime = pca.fit_transform(sub_pca_imputed) 

pca.n_components_  # the estimated number of components
pca.components_  # principal component loadings
pca.explained_variance_ratio_ # percentage of variance explained by each principal components
pca.explained_variance_ratio_.cumsum()  # cumulative sum of percentage of variance explained


# Factor Analysis
GSS = pd.read_csv("GSS_Cum.csv")
sub = GSS.ix[:,'confinan':'conarmy']

# impute missing value in DataFrame sub
from sklearn import preprocessing
impute = preprocessing.Imputer()
sub_imputed = impute.fit_transform(sub)

# use FactorAnalysis package 
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components = 5, max_iter = 100) #Here we set dimensionality of latent space to be 5 and maximum number of iterations to be 100
sub_fa = fa.fit_transform(sub_imputed)

fa.components_  # factor loadings
fa.loglike_   # the log likelihood at each iteration
fa.n_iter_   # Number of iterations run



Example 54
def simulate(data, factors=0, maxtrials=5, multiplier=1, seed=0):
    n = len(data)
    dim = len(data[0])
    simulated = np.zeros((n,dim))
    distribution = np.zeros((n,dim))
    iteration = 0
    BestRMSR = 1
    trialsWithoutImprovement = 0

    #apply distribution from supplied data
    distribution = data.copy()
    TargetCorr = corr(data.T)
    IntermidiateCorr = TargetCorr.copy()
    BestCorr = IntermidiateCorr
    #print data.shape
    #print simulated.shape
    #print TargetCorr, TargetCorr.shape

    if(factors == 0):
        eigvalsObserved = np.linalg.eigvals(IntermidiateCorr)
        eigvalsRandom = np.zeros((100,dim))
        randomData = np.zeros((n,dim))

        for i in range(0, 100):
            for j in range(0, dim):
                randomData[:, j] = np.random.permutation(distribution[:, j])
            eigvalsRandom[i, :] = np.linalg.eigvals(corr(randomData.T))
        eigvalsRandom = np.mean(eigvalsRandom, axis=0)
        factors = max(1, np.sum(eigvalsObserved > eigvalsRandom))

    #steps 5,6
    SharedComp = np.random.normal(0, 1, (n, factors))
    UniqueComp = np.random.normal(0, 1, (n, dim))
    SharedLoad = np.zeros((dim, factors))
    UniqueLoad = np.zeros(dim)

    while trialsWithoutImprovement < maxtrials:
        iteration += 1

        #Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8)
        fa = FactorAnalysis()
        fa.n_components = factors
        fa.fit(IntermidiateCorr)
        FactLoadings = fa.components_.T
        #print FactLoadings.shape

        if (factors == 1):
            SharedLoad[:, 0] = FactLoadings[:, 0]
        else:
            SharedLoad = FactLoadings
        #print SharedLoad

        SharedLoad = np.clip(SharedLoad, -1, 1)
        #print SharedLoad

        if (SharedLoad[0, 0] < 0):
            SharedLoad *= -1
        #print SharedLoad

        SharedLoadSq = SharedLoad * SharedLoad
        #print SharedLoadSq

        for i in range(0, dim):
            SharedLoadSum = np.sum(SharedLoadSq[i, :])
            if(SharedLoadSum < 1):
                UniqueLoad[i] = 1 - SharedLoadSum
            else:
                UniqueLoad[i] = 0
        UniqueLoad = np.sqrt(UniqueLoad)
        #print UniqueLoad

        MergedShare = np.dot(SharedComp, SharedLoad.T)
        for i in range(0, dim):
            simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i]*UniqueLoad[i]
        #print simulated

        #Replace normal with nonnormal distributions (step 9)
        for i in range(0, dim):
            indices = np.argsort(simulated[:, i])
            simulated = np.array(simulated)[indices]
            simulated[:, i] = distribution[:, i]
        #print simulated
        #print distribution

        #Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12)
        ReproducedCorr = corr(simulated.T)
        ResidualCorr = TargetCorr - ReproducedCorr;
        #print ResidualCorr

        RMSR = np.sqrt(np.sum(np.tril(ResidualCorr) ** 2) / (0.5 * (dim*dim - dim)))
        #print RMSR

        if (RMSR < BestRMSR):
            BestRMSR = RMSR
            BestCorr = IntermidiateCorr
            BestRes = ResidualCorr
            IntermidiateCorr = IntermidiateCorr + multiplier*ResidualCorr
            trialsWithoutImprovement = 0
        else:
            trialsWithoutImprovement += 1
            CurrentMultiplier = multiplier * (0.5 ** trialsWithoutImprovement)
            try:
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes
            except NameError:
                BestRes = ResidualCorr
                IntermidiateCorr = BestCorr + CurrentMultiplier * BestRes

    #Construct the data set with the lowest RMSR correlation (step 13)
    fa = FactorAnalysis()
    fa.n_components = factors
    fa.fit(BestCorr)
    FactLoadings = fa.components_.T

    if (factors == 1):
        SharedLoad[:, 0] = FactLoadings[:, 0]
    else:
        SharedLoad = FactLoadings

    SharedLoad = np.clip(SharedLoad, -1, 1)

    if (SharedLoad[0, 0] < 0):
        SharedLoad *= -1

    SharedLoadSq = SharedLoad * SharedLoad

    for i in range(0, dim):
        SharedLoadSum = np.sum(SharedLoadSq[i, :])
        if(SharedLoadSum < 1):
            UniqueLoad[i] = 1 - SharedLoadSum
        else:
            UniqueLoad[i] = 0
    UniqueLoad = np.sqrt(UniqueLoad)

    MergedShare = np.dot(SharedComp, SharedLoad.T)
    for i in range(0, dim):
        simulated[:, i] = MergedShare[:, i] + UniqueComp[:, i]*UniqueLoad[i]

    simulated = preprocessing.scale(simulated)

    for i in range(0, dim):
        indices = np.argsort(simulated[:, i])
        simulated = np.array(simulated)[indices]
        simulated[:, i] = distribution[:, i]

    #return the simulated data set (step 14)
    #print 'RMSR', BestRMSR

    return simulated
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        diff = np.all(np.diff(fa.loglike_))
        assert_greater(diff, 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
import os

from data import load_data
from sklearn.decomposition import FactorAnalysis
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Factor Analysis

# ================================================================
# Apply factor analysis on the tf-idf matrix and transform raw documents into
# intermediate representation.
docs_tfidf, vocab_tfidf, vocabulary = load_data(subset='all')
n_components = 40
fa = FactorAnalysis(n_components=n_components)
fa.fit(docs_tfidf.toarray())
fa_words = fa.transform(vocab_tfidf.toarray())

# Create a dict to hold the new pca words.
fa_dict = dict(zip(vocabulary, fa_words))

# Store the intermediate representation pca words on disk.
fa_dict_filename = 'fa_dict.pk'
if not os.path.exists(fa_dict_filename):
    fa_dict_file = open(fa_dict_filename, 'wb')
    pickle.dump(fa_dict, fa_dict_file)

# Store estimator on dist for further usage.
fa_estimator_filename = 'fa_estimator.pk'
if not os.path.exists(fa_estimator_filename):
Example 57
def learn(data):
    model=FA(n_components =2)
    model.fit(data)
    return PreferenceGenerator(model.components_)
Example 58
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

rng = np.random.RandomState(42)
s = rng.normal(scale=0.01,size=(4,1000))
S = np.ones((3,1000))
S[0] = s[0]
S[1] = s[1]
S[2] = s[0]+s[1]

pca = PCA()
S_pca_ = pca.fit_transform(S.T)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(S.T)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(S.T)  # Estimate the sources

def plot_3d(data, ax, axis_list=None):
	data /= np.std(data)
	ax.scatter(data[0] ,data[1], data[2] , s=2, marker='o', zorder=10, color='steelblue', alpha=0.5)
	ax.set_xlim(-4, 4)
	ax.set_ylim(-4, 4)
	ax.set_zlim(-4, 4)
	ax.set_xlabel('x')
	ax.set_ylabel('y')
	ax.set_zlabel('z')
	for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
Example 59
def base(
    use_filter="default",
    data_path="~/data/faons/latest.csv",
    filter_name="default.csv",
    participant_subset="",
    drop_metadata=True,
    drop=[],
    clean=7,
    components=5,
    facecolor="#ffffff",
):

    data_path = path.expanduser(data_path)
    filter_path = path.join(path.dirname(path.realpath(__file__)), "filters", filter_name)

    filters = pd.read_csv(
        filter_path, index_col=0, header=None
    ).transpose()  # transpose filters because of .csv file formatting, specify index_col to not get numbered index
    all_data = pd.read_csv(data_path)

    all_data = all_data[list(map(lambda y: len(set(y)) > clean, np.array(all_data)))]

    # drops metadata
    if drop_metadata == True:
        all_data = all_data.drop(filters["metadata"][pd.Series.notnull(filters["metadata"])], axis=1)

        # compile list of column names to be dropped:
    drop_list = []
    for drop_item in drop:
        drop_list += list(filters[drop_item][pd.Series.notnull(filters[drop_item])])
    drop_list = list(
        set(drop_list)
    )  # get unique column names (the list may contain duplicates if overlaying multiple filters)
    all_data = all_data.drop(drop_list, axis=1)

    if participant_subset == "odd":
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "even":
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "male":
        filtered_data = all_data[all_data["My legal gender:"] == "Male"]
    elif participant_subset == "female":
        filtered_data = all_data[all_data["My legal gender:"] == "Female"]
    else:
        filtered_data = all_data

        # convert to correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype="float64")

    filtered_data_array = filtered_data_array / 100

    pca = PCA()
    S_pca_ = pca.fit_transform(filtered_data_array)

    fa = FactorAnalysis(svd_method="lapack")
    S_fa_ = fa.fit_transform(filtered_data_array)

    ica = FastICA(n_components=components, max_iter=20000, tol=0.00001)
    S_ica_ = ica.fit_transform(filtered_data_array)  # Estimate the sources

    load = ica.mixing_

    remapped_cmap = remappedColorMap(
        cm.PiYG,
        start=(np.max(load) - abs(np.min(load))) / (2 * np.max(load)),
        midpoint=abs(np.min(load)) / (np.max(load) + abs(np.min(load))),
        name="shrunk",
    )
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5), facecolor=facecolor)
    graphic = ax.imshow(load, cmap=remapped_cmap, interpolation="none")