Example #1
0
def get_MFA_params(zl, kl, rl_nextl):
    ''' Determine clusters with a GMM and then adjust a Factor Model over each cluster
    zl (ndarray): The lth layer latent variable 
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimension of the lth layer and (l+1)th layer
    -----------------------------------------------------
    returns (dict): Dict with the parameters of the MFA approximated by GMM + FA:
        'H' (kl x rl_nextl[0] x rl_nextl[1]): loadings of each cluster's factor model
        'psi' (kl x rl_nextl[0] x rl_nextl[0]): diagonal matrices of uniquenesses
        'z_nextl' (numobs x rl_nextl[1]): factor scores of each observation,
            computed with the factor model of the cluster it was assigned to
        'eta' (kl x rl_nextl[0]): per-cluster mean of zl
        'classes' (numobs,): cluster label given to each observation by the GMM
    raises RuntimeError: if no GMM fit out of max_trials attempts assigns at
        least one observation to each of the kl components.
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]
    r_l, r_next = rl_nextl[0], rl_nextl[1]
    max_trials = 100

    # Refit until every component owns at least one observation: an empty
    # cluster cannot be handed to the factor analysis below. Only emptiness
    # is checked; very small clusters are still accepted.
    for _ in range(max_trials):
        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)

        if len(np.unique(s)) == kl:
            break
    else:
        # Reached only when none of the max_trials fits succeeded, so a
        # success on the very last trial is no longer reported as a failure.
        raise RuntimeError(
            'Could not find a GMM init that presents the '
            'proper number of groups:', kl)

    psi = np.zeros((kl, r_l, r_l))
    H = np.zeros((kl, r_l, r_next))
    eta = np.zeros((kl, r_l))
    # NaN-initialised so that any row left unassigned would be noticeable.
    z_nextl = np.full((numobs, r_next), np.nan)

    #========================================================
    # And then a MFA on each of those group
    #========================================================

    for j in range(kl):
        members = (s == j)
        zl_j = zl[members]

        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=r_next)
        fa.fit(zl_j)

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        z_nextl[members] = fa.transform(zl_j)
        eta[j] = np.mean(zl_j, axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl, 'eta': eta, 'classes': s}
    return params
        0: 'Purchase',
        1: 'Marking',
        2: 'Post Purchase',
        3: 'Product Position'
    })

    del dfX['Post Purchase']
    return dfX


# Prepare significant X values for regression.
# NOTE(review): only the tail of dropInsignificantX is visible in this excerpt
# (it deletes the 'Post Purchase' column and returns the frame) — confirm the
# rest of its behaviour against its full definition.
trainDF = dropInsignificantX(X_train_transformed)
# NOTE(review): 'X_test_tranformed' (sic) is bound outside this excerpt; the
# spelling must match that binding — confirm before renaming.
testDF = dropInsignificantX(X_test_tranformed)

# Train regression model on training data
model = LinearRegression()
model.fit(trainDF, y_train)

# Show model statistics.
# The fitted model is reported against the test target and test features.
showModelSummary(model, y_test, testDF)

# Check coefficient significance.
# The training features are passed as a plain array (.values), not a DataFrame.
showCoefficientPValues(y_train, trainDF.values)

# Print diagnostics of the factor-analysis object.
# NOTE(review): `fa` is not created in this excerpt; presumably it is the
# fitted analyzer that produced the transformed X frames — confirm.
print("FA Factor Variance")
print(fa.get_factor_variance())
print("FA Get Communalities")
print(fa.get_communalities())
print("FA Uniqunesses")
print(fa.get_uniquenesses())
Example #3
0
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Fit a `FactorAnalyzer()` on one test data set and collect its
    outputs in a dictionary.

    Parameters
    ----------
    test_name : str
        The name of the test; the data is read from
        ``<top_dir>/<test_name>.csv``.
    factors : int
        The number of factors
    method : str
        The fitting method. ``'uls'`` is passed on as ``'minres'``.
    rotation : str
        The type of rotation. ``'none'`` means no rotation.
    use_corr_matrix : bool, optional
        Whether to fit on the correlation matrix of the data
        instead of the data itself.
        Defaults to False.
    top_dir : str, optional
        The top directory for test data
        Defaults to `DATA_DIR`

    Returns
    -------
    output : dict
        A dictionary with the keys 'value', 'evalues', 'structure',
        'loading', 'uniquenesses', 'communalities' and 'scores'.
        'evalues' and 'value' are, in that order, the two arrays
        returned by `get_eigenvalues()`. 'scores' is always computed
        from the raw data, even when the fit used the correlation matrix.
    """
    data_root = DATA_DIR if top_dir is None else top_dir
    data = pd.read_csv(join(data_root, test_name + '.csv'))

    # The analyzer is fitted either on the correlation matrix or on a copy
    # of the raw observations, never on `data` itself.
    fit_input = data.corr() if use_corr_matrix else data.copy()

    # Translate the spellings used by the test scenarios into the values
    # that are handed to FactorAnalyzer.
    if rotation == 'none':
        rotation = None
    if method == 'uls':
        method = 'minres'

    analyzer = FactorAnalyzer(n_factors=factors,
                              method=method,
                              rotation=rotation,
                              is_corr_matrix=use_corr_matrix)
    analyzer.fit(fit_input)

    first_eigen, second_eigen = analyzer.get_eigenvalues()

    output = {}
    output['value'] = second_eigen
    output['evalues'] = first_eigen
    output['structure'] = analyzer.structure_
    output['loading'] = analyzer.loadings_
    output['uniquenesses'] = analyzer.get_uniquenesses()
    output['communalities'] = analyzer.get_communalities()
    output['scores'] = analyzer.transform(data)
    return output
Example #4
0
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):

    """ Validate `df`, fit a minres factor analysis on it and display diagnostic plots.

        You want "varimax" rotation if you want orthogonal (highly differentiable) with very high and low variable loading. common
        You want "oblimin" for non-orthogonal loading. Increases eigenvalues, but reduced interpretability.
        You want "promax" if you want Oblimin on large datasets.

        See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for increased explanation.

        Parameters:
            df: DataFrame of the variables to analyse. Must not contain NaN or
                infinite values, and every column must have a non-zero standard deviation.
            rotation: rotation passed to FactorAnalyzer.
            n_factors: number of factors passed to FactorAnalyzer.
            transform: when True, return fa.transform(df); otherwise return None.

        Raises:
            AssertionError: if `df` contains NaN/inf values or a column with zero
                standard deviation.
            Exception: if the Bartlett sphericity p-value is above 0.1 or the KMO
                model score is below 0.6.

        Side effects: prints the KMO score and renders the eigenvalue plot, the
        loading / communality / uniqueness heatmaps and the factor variance table
        (plus factor correlation and structure heatmaps for "promax").
    """

    # Raise explicitly rather than with `assert` so the validation still runs
    # under `python -O`. AssertionError is kept so existing callers that catch
    # it keep working.
    has_nan = df.isnull().values.any()
    # isnull() does not flag +/-inf, so infinite values are checked separately
    # to honour what the message promises.
    has_inf = df.isin([float("inf"), float("-inf")]).values.any()
    if has_nan or has_inf:
        raise AssertionError("Data must not contain any nan or inf values")
    if not all(df.std().values > 0):
        raise AssertionError("Columns used in Factor Analysis must have a non-zero Std. Dev. (aka more than a single value)")

    def data_suitable(df, kmo_value = False, ignore = False):
        # Check that `df` is adequate for factor analysis; optionally return the KMO score.

        # Test to ensure data is not identity Matrix
        _, p_value = calculate_bartlett_sphericity(df)

        # Test to ensure that observed data is adequate for FA. Scores below 0.6 are rejected.
        _, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and not ignore:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}.  KMO model Score: {}".format(p_value, kmo_model))

        return kmo_model if kmo_value else None

    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres",
                        rotation = rotation,
                        n_factors = n_factors)

    fa.fit(df)

    def eigenplot(eigenvalues):
        # Line plot of the eigenvalues with a dashed reference line at 1.
        eigen_df = pd.DataFrame(eigenvalues)

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x = eigen_df.index.values,
                y = eigen_df[0].values,
                mode = 'lines'
            )
        )

        fig.add_shape(
            type = "line",
            y0 = 1,
            x0 = 0,
            y1 = 1,
            x1 = len(eigen_df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )

        layout = dict(
            title = "Factor Eigenvalues",
            yaxis_title = "Eigenvalue",
            xaxis_title = "Factor"
        )

        # Crop the x axis at the last positive eigenvalue. When none is
        # positive the axis range is left to the plotting default instead of
        # failing with an IndexError.
        positive_positions = eigen_df[eigen_df[0] > 0].index.values
        if len(positive_positions) > 0:
            layout["xaxis"] = dict(range = [0, positive_positions[-1]])

        fig.update_layout(**layout)

        fig.show()
        return

    # Plot the second array returned by get_eigenvalues().
    # NOTE(review): presumably the common-factor eigenvalues — confirm against
    # the factor_analyzer documentation.
    eigenplot(fa.get_eigenvalues()[1])
    Plotting.LabeledHeatmap(fa.loadings_, y = list(df.columns), title = "Factor Loading", expand = True, height = 2000, width = 2000)

    # Drop the first row of get_factor_variance(); the two remaining rows are
    # shown as proportional and cumulative variance.
    tmp = pd.DataFrame(fa.get_factor_variance()[1:])
    tmp.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(tmp)

    # Factor correlation and structure matrices are only displayed for promax.
    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation", expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns), title = "Variable-Factor Correlation", expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T,
                            title = "Varience Explained",
                            x = list(df.columns),
                            description = "The proportion of each variables varience that can be explained by the factors.",
                            expand = True,
                            height = 300,
                            width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T,
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True,
                            height = 300,
                            width = 2000)

    if transform:
        return fa.transform(df)

    return None