Example #1
 def compute_higher_order_factors(self, c=None, rotate='oblimin'):
     """ Return higher order EFA """
     if c is None:
         c = self.get_c()
         print('# of components not specified, using BIC determined #')
     if ('factor_tree_%s' % rotate in self.results.keys() and 
         c in self.results['factor_tree_Rout_%s' % rotate].keys()):
         # get factor correlation matrix
         scores = get_attr(self.results['factor_tree_Rout_%s' % rotate][c], 'scores')
         phi = pd.DataFrame(np.corrcoef(scores.T))
         # bail out if the factors are essentially uncorrelated, since there
         # is then no higher-order structure to extract
         if np.mean(np.abs(np.tril(phi, -1))) < 1e-4:
             return
         n_obs = self.data.shape[0]
         labels = list(self.results['factor_tree_%s' % rotate][c].columns)
         BIC_c, BICs = find_optimal_components(phi, 
                                               metric='BIC', 
                                               nobs=n_obs)
         if BIC_c != 0:
             if 'factor2_tree_%s' % rotate not in self.results.keys():
                 self.results['factor2_tree_%s' % rotate] = {}
                 self.results['factor2_tree_Rout_%s' % rotate] = {}
             Rout, higher_order_out = psychFA(phi, BIC_c, nobs=n_obs)
             loadings = get_loadings(higher_order_out, labels)
             self.results['factor2_tree_%s' % rotate][c] = loadings
             self.results['factor2_tree_Rout_%s' % rotate][c] = Rout
         else:
             print('Higher order factors could not be calculated')
     else:
         print('No %s factor solution computed yet!' % c)
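A minimal standalone sketch of the idea behind this method: with an oblique rotation the first-order factors may correlate, and the correlation matrix of their scores (phi) becomes the input to a second EFA. The scores below are synthetic stand-ins; psychFA and get_attr are not needed for the illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
scores = rng.normal(size=(200, 5))         # synthetic: 200 subjects x 5 first-order factor scores
phi = pd.DataFrame(np.corrcoef(scores.T))  # 5 x 5 factor correlation matrix
# compute_higher_order_factors runs a second EFA on a matrix like phi to
# extract higher-order factors from the first-order correlations
print(phi.round(2))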
Example #2
 def verify_factor_solution(self):
     fa, output = psychFA(self.data, 10)
     scores = output['scores'] # factor scores per subjects derived from psychFA
     scaled_data = scale(self.data)
     redone_scores = scaled_data.dot(output['weights'])
     redone_score_diff = np.abs(scores - redone_scores).mean()
     assert redone_score_diff < 1e-5
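The identity under test: psych-style factor scores are (approximately) the standardized data multiplied by the factor score weights. A toy version with synthetic data and hypothetical weights in place of psychFA's output:

import numpy as np
from sklearn.preprocessing import scale

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 8))
weights = rng.normal(size=(8, 3))   # hypothetical factor score weights
scores = scale(data).dot(weights)   # how the scores are reconstructed above
redone_scores = scale(data).dot(weights)
assert np.abs(scores - redone_scores).mean() < 1e-5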
 def get_loading(self, c=None, bootstrap=False, rotate='oblimin',
                 recompute=False, copy=True):
     """ Return the loading for an EFA solution at the specified c """
     if c is None:
         c = self.get_c()
         print('# of components not specified, using BIC determined #')
     n_iter = 1
     if bootstrap:
         n_iter = self.boot_iter
     if 'factor_tree_%s' % rotate not in self.results.keys():
         self.results['factor_tree_%s' % rotate] = {}
         self.results['factor_tree_Rout_%s' % rotate] = {}
     if (not recompute and  # recomputing isn't wanted
         c in self.results['factor_tree_%s' % rotate].keys() and  # c factors have been computed
         (n_iter == 1 or 'cis' in self.results['factor_tree_Rout_%s' % rotate][c].names)):
         if copy:
             return self.results['factor_tree_%s' % rotate][c].copy()
         else:
             return self.results['factor_tree_%s' % rotate][c]
     else:
         print('No %s factor solution computed yet! Computing...' % c)
         fa, output = psychFA(self.data, c, method='ml', rotate=rotate,
                              n_iter=n_iter)
         loadings = get_loadings(output, labels=self.data.columns)
         self.results['factor_tree_%s' % rotate][c] = loadings
         self.results['factor_tree_Rout_%s' % rotate][c] = fa
         if copy:
             return loadings.copy()
         else:
             return loadings
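get_loading is essentially a memoization wrapper around psychFA. The caching idiom in isolation, with a hypothetical compute function standing in for the factor analysis:

def cached(results, key, c, compute, recompute=False, copy=True):
    tree = results.setdefault(key, {})   # one cache dict per rotation
    if recompute or c not in tree:
        tree[c] = compute(c)             # the expensive call, e.g. psychFA
    return tree[c].copy() if copy else tree[c]

# hypothetical usage: loadings = cached(self.results, 'factor_tree_oblimin', 5, run_efa)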
Example #6
def drop_EFA(data, measures, c):
    """ Run EFA after dropping all columns that match the given measures """
    to_drop = data.filter(regex='|'.join(measures)).columns
    subset = data.drop(to_drop, axis=1)
    fa, output = psychFA(subset, c, method='ml', rotate='oblimin')
    loadings = get_loadings(output, labels=subset.columns)
    return loadings
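The drop step uses DataFrame.filter with an alternation regex, so any column whose name contains one of the measure strings is removed before the EFA is rerun. For example, with hypothetical column names:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(10, 3)),
                    columns=['stroop.rt', 'stroop.acc', 'nback.acc'])
to_drop = data.filter(regex='|'.join(['stroop'])).columns
print(data.drop(to_drop, axis=1).columns.tolist())  # ['nback.acc']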
Example #7
def find_optimal_components(data, minc=1, maxc=50, nobs=0, metric='BIC'):
    """
    Fit EFA over a range of components and returns the best c. If metric = CV
    uses sklearn. Otherwise uses psych
    metric: str, method to use for optimal components. Options 'BIC', 'SABIC',
            and 'CV'
    """
    steps_since_best = 0 # count steps since last best metric.
    metrics = {}
    maxc = min(maxc, data.shape[1])
    n_components = range(minc, maxc)
    if metric != 'CV':
        best_metric = float("Inf")
        best_c = 0
        for c in n_components:
            out = psychFA(data, c, method='ml', nobs=nobs)
            if out is None:
                break
            fa, output = out
            curr_metric = output[metric]
            # increment counter if the new metric isn't better than the best so far
            if curr_metric >= best_metric:
                steps_since_best += 1
            else:
                steps_since_best = 0
                best_c = c
                best_metric = curr_metric
            metrics[c] = curr_metric
            if steps_since_best > 2:
                break
    else:
        for c in n_components:
            fa = FactorAnalysis(c)
            scaler = StandardScaler()
            imputer = Imputer()  # sklearn's legacy imputer (SimpleImputer in current versions)
            pipe = Pipeline(steps=[('impute', imputer),
                                   ('scale', scaler),
                                   ('fa', fa)])
            cv_score = np.mean(cross_val_score(pipe, data, cv=10))
            # increment counter if the new score isn't better than the previous one
            if len(metrics) > 0:
                if cv_score < metrics[c - 1]:
                    steps_since_best += 1
                else:
                    steps_since_best = 0
            metrics[c] = cv_score
            if steps_since_best > 2:
                break
        best_c = max(metrics, key=metrics.get)
    return best_c, metrics
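The search above is an early-stopping sweep: increase c, track the best criterion value, and stop after three consecutive non-improving steps. The same pattern in isolation (lower is better, as with BIC):

def minimize_with_patience(criterion, candidates, patience=3):
    best_x, best_val, since_best = None, float('inf'), 0
    for x in candidates:
        val = criterion(x)
        if val < best_val:
            best_x, best_val, since_best = x, val, 0
        else:
            since_best += 1
            if since_best >= patience:
                break
    return best_x, best_val

# e.g. minimize_with_patience(lambda c: (c - 4) ** 2, range(1, 50)) returns (4, 0)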
Example #9
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns if ".logTr" in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns if ".ReflogTr" in i]
    DVs = [i.replace('.logTr','').replace('.ReflogTr','') for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    
    # load and clean retest data exactly like original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete','Retest'),
                                     file='meaningful_variables.csv')
    shared_ids = sorted(set(retest_data_raw.index) & set(data_raw.index))
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    for name, data in raw_data.items():  
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)  
        scaled_tmp_data = scale(tmp_data_imputed)
        imputed_data[name] = scaled_tmp_data
    
    # get subjects not in the retest set
    ind_data = orig_data.loc[sorted(set(orig_data.index) - set(shared_ids))]
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'], 
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    scores = {}
    for name, data in imputed_data.items():
        suffix = ''
        if name == 'T2':
            suffix = 'T2'
        tmp_scores = pd.DataFrame(data.dot(weights),
                                  index=shared_ids,
                                  columns=[i+' '+suffix for i in orig_scores.columns])
        scores[name] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    cross_diag = [combined.corr().iloc[i,i+len(orig_scores.columns)] 
                    for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
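The cross_diag computation relies on the column layout of combined: k T1 score columns followed by the same k columns at T2, so entry (i, i + k) of the correlation matrix is the test-retest correlation of factor i. A toy version with synthetic scores:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
k, n = 3, 50
t1 = pd.DataFrame(rng.normal(size=(n, k)), columns=['F%s' % i for i in range(k)])
t2 = t1 * 0.8 + rng.normal(scale=0.6, size=(n, k))   # noisy synthetic "retest" scores
t2.columns = ['F%s T2' % i for i in range(k)]
combined = pd.concat([t1, t2], axis=1)
cross_diag = [combined.corr().iloc[i, i + k] for i in range(k)]
print(np.round(cross_diag, 2))   # per-factor T1/T2 correlations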
Example #11
def create_factor_tree(data,
                       component_range=(1, 13),
                       component_list=None,
                       rotate='oblimin'):
    """
    Runs "visualize_factors" at multiple dimensionalities and saves them
    to a pdf
    data: dataframe to run EFA on at multiple dimensionalities
    groups: group list to be passed to visualize factors
    filename: filename to save pdf
    component_range: limits of EFA dimensionalities. e.g. (1,5) will run
                     EFA with 1 component, 2 components... 5 components.
    component_list: list of specific components to calculate. Overrides
                    component_range if set
    """
    def get_similarity_order(lower_dim, higher_dim):
        "Helper function to reorder factors into correspondance between two dimensionalities"
        subset = corr_lower_higher(higher_dim, lower_dim)
        max_factors = np.argmax(abs(subset.values), axis=0)
        return np.argsort(max_factors)

    EFA_results = {}
    full_fa_results = {}
    # determine which dimensionalities to run
    if component_list is None:
        components = range(component_range[0], component_range[1] + 1)
    else:
        components = component_list
    for c in components:
        fa, output = psychFA(data, c, method='ml', rotate=rotate)
        tmp_loading_df = get_loadings(output, labels=data.columns)
        if (c - 1) in EFA_results.keys():
            reorder_index = get_similarity_order(tmp_loading_df,
                                                 EFA_results[c - 1])
            tmp_loading_df = tmp_loading_df.iloc[:, reorder_index]
            tmp_loading_df.columns = sorted(tmp_loading_df.columns)
        EFA_results[c] = tmp_loading_df
        full_fa_results[c] = fa
    return EFA_results, full_fa_results
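The reordering helper matches each factor at dimensionality c to the factor at c-1 it correlates with most strongly, then sorts the new columns into that order so adjacent solutions line up. corr_lower_higher is the project's own helper; a plain corrcoef block stands in for it below:

import numpy as np

rng = np.random.default_rng(0)
lower = rng.normal(size=(100, 3))                 # "new" factor scores
higher = lower[:, [2, 0, 1]] + rng.normal(scale=0.1, size=(100, 3))  # permuted + noise
corr = np.corrcoef(higher.T, lower.T)[:3, 3:]     # higher (rows) x lower (columns)
max_factors = np.argmax(abs(corr), axis=0)        # best-matching higher factor per column
reorder_index = np.argsort(max_factors)
print(reorder_index)                              # [2 0 1]: order that aligns lower with higher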
 def run_EFA(data, c, rotation, orig_scores):
     fa, out = psychFA(data, c, rotate=rotation)
     scores = pd.DataFrame(out['scores'], index=data.index)
     scores = reorder_FA(orig_scores, scores)
     return scores
 def run_EFA(data, c, rotation, orig_loadings):
     fa, out = psychFA(data, c, rotate=rotation)
     loadings = pd.DataFrame(out['loadings'], index=data.columns)
     loadings = reorder_FA(orig_loadings, loadings)
     return loadings