def transfer_scores(data, results, rotate='oblimin'):
    """Calculate factor scores in a new dataset based on a reference results object."""
    ref_data = results.data
    EFA = results.EFA
    c = EFA.results['num_factors']
    loadings = EFA.get_loading(c=c, rotate=rotate)
    # transform the new data exactly as the reference data was transformed
    positive_skewed = [i.replace('.logTr', '') for i in ref_data.columns
                       if '.logTr' in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in ref_data.columns
                       if '.ReflogTr' in i]
    DVs = [i.replace('.logTr', '').replace('.ReflogTr', '')
           for i in ref_data.columns]
    data = data.loc[:, DVs]
    data = transform_remove_skew(data,
                                 positive_skewed=positive_skewed,
                                 negative_skewed=negative_skewed,
                                 drop_failed=False)
    data = remove_outliers(data)
    data_imputed, error = missForest(data)
    subset = data_imputed.loc[:, loadings.index]
    scaled_data = scale(subset)
    # calculate scores by projecting the scaled data onto the reference weights
    weights = get_attr(EFA.results['factor_tree_Rout_%s' % rotate][c], 'weights')
    scores = pd.DataFrame(scaled_data.dot(weights),
                          index=data_imputed.index,
                          columns=loadings.columns)
    return scores
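# A minimal, self-contained sketch (not part of the pipeline) of the projection
# step above: factor scores are obtained by z-scoring the measures and taking
# the dot product with the reference weight matrix. All names and data below
# are hypothetical and exist only to illustrate the linear algebra.
def _demo_score_projection():
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import scale
    rng = np.random.RandomState(0)
    measures = pd.DataFrame(rng.randn(20, 4),
                            columns=['DV1', 'DV2', 'DV3', 'DV4'])
    # hypothetical 4 measures x 2 factors weight matrix
    weights = np.array([[0.6, 0.0],
                        [0.5, 0.1],
                        [0.0, 0.7],
                        [0.1, 0.6]])
    scores = pd.DataFrame(scale(measures).dot(weights),
                          index=measures.index,
                          columns=['Factor1', 'Factor2'])
    return scores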
def get_retest_comparison_data():
    subsets = ['meaningful_variables_noDDM.csv',
               'meaningful_variables_EZ.csv',
               'meaningful_variables_hddm.csv']
    dataset = pd.DataFrame()
    for subset in subsets:
        df = get_behav_data(file=subset)
        df_clean = remove_outliers(df)
        df_clean = transform_remove_skew(df_clean)
        drop_columns = set(dataset) & set(df_clean)
        df_clean.drop(labels=drop_columns, axis=1, inplace=True)
        dataset = pd.concat([dataset, df_clean], axis=1)
    return dataset
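# A small self-contained illustration (toy column names) of the merge logic
# above: columns already present in the accumulated dataset are dropped from
# each new subset before concatenation, so shared variables appear only once.
def _demo_dedup_concat():
    import pandas as pd
    a = pd.DataFrame({'shared': [1, 2], 'only_a': [3, 4]})
    b = pd.DataFrame({'shared': [1, 2], 'only_b': [5, 6]})
    combined = pd.DataFrame()
    for df in (a, b):
        df = df.drop(labels=list(set(combined) & set(df)), axis=1)
        combined = pd.concat([combined, df], axis=1)
    return combined  # columns: shared, only_a, only_b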
def calc_EFA_retest_held_out(results, rotate='oblimin', verbose=True):
    name = results.ID.split('_')[0].title()
    orig_data = results.data
    positive_skewed = [i.replace('.logTr', '') for i in orig_data.columns
                       if '.logTr' in i]
    negative_skewed = [i.replace('.ReflogTr', '') for i in orig_data.columns
                       if '.ReflogTr' in i]
    DVs = [i.replace('.logTr', '').replace('.ReflogTr', '')
           for i in orig_data.columns]
    orig_scores = results.EFA.get_scores(rotate=rotate)
    # load and clean retest data exactly like the original data
    data_raw = get_behav_data(dataset=results.dataset,
                              file='meaningful_variables.csv')
    retest_data_raw = get_behav_data(dataset=results.dataset.replace('Complete', 'Retest'),
                                     file='meaningful_variables.csv')
    shared_ids = sorted(set(retest_data_raw.index) & set(data_raw.index))
    data_raw = data_raw.loc[shared_ids, :]
    retest_data_raw = retest_data_raw.loc[shared_ids, :]
    raw_data = {'T1': data_raw, 'T2': retest_data_raw}
    imputed_data = {}
    for session, data in raw_data.items():
        tmp_data = data.loc[:, DVs]
        tmp_data = transform_remove_skew(tmp_data,
                                         positive_skewed=positive_skewed,
                                         negative_skewed=negative_skewed)
        tmp_data = remove_outliers(tmp_data)
        tmp_data_imputed, error = missForest(tmp_data)
        imputed_data[session] = scale(tmp_data_imputed)
    # fit the EFA on subjects that are not in the retest set
    held_out_ids = sorted(set(orig_data.index) - set(shared_ids))
    ind_data = orig_data.loc[held_out_ids]
    fa, output = psychFA(ind_data, results.EFA.results['num_factors'],
                         method='ml', rotate=rotate)
    weights = get_attr(fa, 'weights')
    # project both sessions onto the held-out factor solution
    scores = {}
    for session, data in imputed_data.items():
        suffix = 'T2' if session == 'T2' else ''
        tmp_scores = pd.DataFrame(data.dot(weights),
                                  index=shared_ids,
                                  columns=[i + ' ' + suffix
                                           for i in orig_scores.columns])
        scores[session] = tmp_scores
    combined = pd.concat([scores['T1'], scores['T2']], axis=1)
    cross_diag = [combined.corr().iloc[i, i + len(orig_scores.columns)]
                  for i in range(len(orig_scores.columns))]
    # get ICCs
    ICCs = []
    for col in scores['T1'].columns:
        tmp = combined.filter(regex=col)
        out = psych.ICC(tmp)
        ICCs.append(list(out[0][1])[-1])
    return combined, cross_diag, ICCs, (fa, output)
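# A self-contained sketch (synthetic data, hypothetical factor names) of the
# cross_diag computation above: with k T1 score columns followed by the same k
# T2 columns, the test-retest correlation of factor i is entry [i, i + k] of
# the combined correlation matrix.
def _demo_cross_diag():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    k, n = 2, 50
    t1 = pd.DataFrame(rng.randn(n, k), columns=['FactorA', 'FactorB'])
    t2 = t1 * 0.8 + rng.randn(n, k) * 0.2  # noisy retest of the same scores
    t2.columns = ['FactorA T2', 'FactorB T2']
    combined = pd.concat([t1, t2], axis=1)
    cross_diag = [combined.corr().iloc[i, i + k] for i in range(k)]
    return cross_diag  # one test-retest correlation per factor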
# make subset without acc/rt vars and just EZ DDM
EZ_subset = drop_vars(subset, drop_vars=['_acc', '_rt', 'hddm'],
                      saved_vars=['simple_reaction_time.avg_rt', 'dospert_rt_survey'])
EZ_subset.to_csv(path.join(directory, 'meaningful_variables_EZ.csv'))
readme_lines += ["meaningful_variables_EZ.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by EZ DDM params)\n\n"]
# make subset without acc/rt vars and just hddm DDM
hddm_subset = drop_vars(subset, drop_vars=['_acc', '_rt', 'EZ'],
                        saved_vars=['simple_reaction_time.avg_rt', 'dospert_rt_survey'])
hddm_subset.to_csv(path.join(directory, 'meaningful_variables_hddm.csv'))
readme_lines += ["meaningful_variables_hddm.csv: subset of exhaustive data to only meaningful variables with rt/acc parameters removed (replaced by hddm DDM params)\n\n"]
# save files that are selected for use
# selected_variables = hddm_subset  # OG
selected_variables = noDDM_subset  # HENRY
selected_variables.to_csv(path.join(directory, 'meaningful_variables.csv'))
# readme_lines += ["meaningful_variables.csv: Same as meaningful_variables_hddm.csv\n\n"]  # OG
readme_lines += ["meaningful_variables.csv: Same as meaningful_variables_noDDM.csv\n\nUsed for Replication Study.\n\n"]  # HENRY
# clean data
selected_variables_clean = transform_remove_skew(selected_variables)
selected_variables_clean = remove_outliers(selected_variables_clean)
selected_variables_clean = remove_correlated_task_variables(selected_variables_clean)
selected_variables_clean.to_csv(path.join(directory, 'meaningful_variables_clean.csv'))
readme_lines += ["meaningful_variables_clean.csv: same as meaningful_variables.csv with skewed variables transformed and then outliers removed\n\n"]
# imputed data
selected_variables_imputed, error = missForest(selected_variables_clean)
selected_variables_imputed.to_csv(path.join(directory, 'meaningful_variables_imputed.csv'))
readme_lines += ["meaningful_variables_imputed.csv: meaningful_variables_clean.csv after imputation with missForest\n\n"]
# save selected variables
selected_variables_reference = valence_df
selected_variables_reference.loc[selected_variables.columns].to_csv(
    path.join(reference_dir, 'selected_variables_reference.csv'))
# save task data subset
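# The project's drop_vars helper is used above to build each subset; a plausible
# reading of its call signature is "drop any column whose name contains one of
# the drop patterns, unless it is explicitly listed in saved_vars". The sketch
# below only illustrates that assumed behavior with toy arguments; it is not the
# project's actual implementation.
def _demo_drop_vars_like(df, drop_patterns, saved_vars=()):
    keep = [c for c in df.columns
            if c in saved_vars or not any(p in c for p in drop_patterns)]
    return df.loc[:, keep]
# e.g. _demo_drop_vars_like(df, ['_acc', '_rt'], saved_vars=['simple_reaction_time.avg_rt'])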