Code Example #1
def get_fmri_pay(data):
    assert 'ontask_time' in data.columns, \
        'Task time not found. Must run "calc_time_taken" first.'
    all_exps = data.experiment_exp_id.unique()
    exps_completed = data.groupby('worker_id').experiment_exp_id.unique()
    exps_not_completed = exps_completed.map(lambda x: list(
        set(all_exps) - set(x) - set(['selection_optimization_compensation'])))
    completed = exps_completed[exps_completed.map(lambda x: len(x) >= 63)]
    not_completed = exps_not_completed[exps_not_completed.map(
        lambda x: len(x) > 0)]
    # calculate time taken
    # get time taken for each task from previous mturk sample
    time_path = os.path.join(get_info('base_directory'), 'references',
                             'experiment_lengths.json')
    task_time = json.load(open(time_path))
    time_missed = exps_not_completed.map(lambda x: np.sum(
        [task_time[i] if task_time[i] is not None else 3 for i in x]) / 60)
    # calculate pay
    completed_pay = pd.Series(data=100, index=completed.index)
    prorate_pay = 100 - time_missed[not_completed.index] * 10
    #remove anyone who was double counted
    pay = pd.concat([completed_pay, prorate_pay
                     ]).map(lambda x: round(x, 1)).to_frame(name='base')
    pay['bonuses'] = get_bonuses(data, 15, 10)
    pay['total'] = pay.sum(axis=1)
    return pay
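A short usage sketch for the function above; the pickle path and output filename are hypothetical, and the dataframe is assumed to already contain the 'ontask_time' column added by "calc_time_taken".

# Hypothetical usage sketch; file paths are illustrative only.
import pandas as pd

data = pd.read_pickle('fmri_followup_data_post.pkl')  # assumed to already include 'ontask_time'
pay = get_fmri_pay(data)                               # DataFrame with 'base', 'bonuses', 'total'
pay.to_csv('fmri_followup_pay.csv')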
Code Example #2
def convert_var_names(to_convert):
    '''Convert an array of variable names or the columns/index of a dataframe. Assumes that all
    values come from either short or long variable names. If a dataframe is passed, variable
    conversion is done in place.
    '''
    assert(isinstance(to_convert, (list, np.ndarray, pd.DataFrame))), \
        'Object to convert must be a list, numpy array or pandas DataFrame'
    reference_location = os.path.join(get_info('base_directory'), 'references', 'variable_name_lookup.csv')
    var_lookup = pd.Series.from_csv(reference_location)
    inverse_lookup = pd.Series(index = var_lookup.values, data = var_lookup.index)
    
    if type(to_convert) == pd.DataFrame:
        # convert columns if there are dependent variable names
        if to_convert.columns[0] in var_lookup:
            new_columns = [var_lookup.loc[c] if c in var_lookup.index else c for c in to_convert.columns]
        elif to_convert.columns[0] in inverse_lookup:
            new_columns = [inverse_lookup.loc[c] if c in inverse_lookup.index else c for c in to_convert.columns]
        else:
            new_columns = to_convert.columns
        to_convert.columns = new_columns
        # convert index if there are dependent variable names
        if to_convert.index[0] in var_lookup:
            new_index = [var_lookup.loc[i] if i in var_lookup.index else i for i in to_convert.index]
        elif to_convert.index[0] in inverse_lookup:
            new_index = [inverse_lookup.loc[i] if i in inverse_lookup.index else i for i in to_convert.index]
        else: 
            new_index = to_convert.index
        to_convert.index = new_index
    elif isinstance(to_convert, (list, np.ndarray)):
        if to_convert[0] in var_lookup:
            return  [var_lookup.loc[c] if c in var_lookup.index else c for c in to_convert]
        elif to_convert[0] in inverse_lookup:
            return  [inverse_lookup.loc[c] if c in inverse_lookup.index else c for c in to_convert]
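A hedged usage sketch: the short names below are placeholders for entries in references/variable_name_lookup.csv, and the behavioral data file is the one loaded elsewhere in these examples.

# Hypothetical usage sketch; the variable names are illustrative placeholders.
from selfregulation.utils.utils import get_behav_data

short_names = ['stop_signal.SSRT', 'stroop.stroop']   # placeholders for lookup-table entries
long_names = convert_var_names(short_names)           # list in, converted list out
round_trip = convert_var_names(long_names)            # converting back works the same way

df = get_behav_data(file='taskdata_imputed.csv')      # columns use one naming scheme
convert_var_names(df)                                 # DataFrame columns/index renamed in place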
Code Example #3
 def get_plot_dir(self):
     if self.results_dir is None:
         results_dir = get_info('results_directory')
     else:
         results_dir = self.results_dir
     plot_dir = path.join(results_dir, 'dimensional_structure', 
                          self.dataset, 'Plots', self.ID)
     makedirs(plot_dir, exist_ok = True)
     return plot_dir
Code Example #4
 def get_plot_dir(self):
     if self.results_dir is None:
         results_dir = get_info('results_directory')
     else:
         results_dir = self.results_dir
     plot_dir = path.join(results_dir, 'dimensional_structure', 
                          self.dataset, 'Plots', self.ID)
     makedirs(plot_dir, exist_ok = True)
     return plot_dir
Code Example #5
def get_demographic_model_type(demographics, verbose=False):
    base = get_info('base_directory')
    R.source(os.path.join(base, 'selfregulation', 'utils', 'utils.R'))
    
    get_vartypes = robjects.globalenv['get_vartypes']
    out=get_vartypes(demographics, verbose)
    model_types = pandas.DataFrame(numpy.reshape(numpy.matrix(out),(-1,2), 'F'))
    model_types.iloc[:, 0] = demographics.columns
    return model_types
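A hedged usage sketch, assuming rpy2's pandas conversion has been activated so the DataFrame can be handed to the R helper; the demographics filename is illustrative.

# Hypothetical usage sketch; the filename and conversion setup are assumptions.
from rpy2.robjects import pandas2ri
from selfregulation.utils.utils import get_behav_data

pandas2ri.activate()                                          # let DataFrames cross into R
demographics = get_behav_data(file='demographic_health.csv')  # illustrative filename
model_types = get_demographic_model_type(demographics, verbose=True)
# column 0: variable name, column 1: model family chosen by the R function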
Code Example #6
def get_demographic_model_type(demographics, verbose=False):
    base = get_info('base_directory')
    R.source(os.path.join(base, 'selfregulation', 'utils', 'utils.R'))

    get_vartypes = robjects.globalenv['get_vartypes']
    out = get_vartypes(demographics, verbose)
    model_types = pandas.DataFrame(
        numpy.reshape(numpy.matrix(out), (-1, 2), 'F'))
    model_types.iloc[:, 0] = demographics.columns
    return model_types
Code Example #7
def load_results(datafile, name=None, results_dir=None):
    if results_dir is None:
        results_dir = get_info('results_directory')
    results = {}
    result_files = glob(os.path.join(results_dir, 'dimensional_structure/%s/Output/*results.pkl' % (datafile)))
    if name is not None:
        result_files = [i for i in result_files if name in i]
    for filey in result_files:
        name = os.path.basename(filey).split('_')[0]
        results[name] = Results(saved_obj_file=filey)
    return results
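Typical call pattern, mirroring Code Example #18 (the dataset name is the one used there):

# Usage sketch: load every saved *results.pkl for a dataset.
results = load_results(datafile='Complete_03-29-2018')
task_results = results['task']        # keyed by the prefix of each results filename
survey_results = results['survey']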
    
Code Example #8
def convert_var_names(to_convert):
    '''Convert an array of variable names or the columns/index of a dataframe. Assumes that all
    values come from either short or long variable names. If a dataframe is passed, variable
    conversion is done in place.
    '''
    assert(isinstance(to_convert, (list, np.ndarray, pd.DataFrame))), \
        'Object to convert must be a list, numpy array or pandas DataFrame'
    reference_location = os.path.join(get_info('base_directory'), 'references',
                                      'variable_name_lookup.csv')
    var_lookup = pd.Series.from_csv(reference_location)
    inverse_lookup = pd.Series(index=var_lookup.values, data=var_lookup.index)

    if type(to_convert) == pd.DataFrame:
        # convert columns if there are dependent variable names
        if to_convert.columns[0] in var_lookup:
            new_columns = [
                var_lookup.loc[c] if c in var_lookup.index else c
                for c in to_convert.columns
            ]
        elif to_convert.columns[0] in inverse_lookup:
            new_columns = [
                inverse_lookup.loc[c] if c in inverse_lookup.index else c
                for c in to_convert.columns
            ]
        else:
            new_columns = to_convert.columns
        to_convert.columns = new_columns
        # convert index if there are dependent variable names
        if to_convert.index[0] in var_lookup:
            new_index = [
                var_lookup.loc[i] if i in var_lookup.index else i
                for i in to_convert.index
            ]
        elif to_convert.index[0] in inverse_lookup:
            new_index = [
                inverse_lookup.loc[i] if i in inverse_lookup.index else i
                for i in to_convert.index
            ]
        else:
            new_index = to_convert.index
        to_convert.index = new_index
    elif isinstance(to_convert, (list, np.ndarray)):
        if to_convert[0] in var_lookup:
            return [
                var_lookup.loc[c] if c in var_lookup.index else c
                for c in to_convert
            ]
        elif to_convert[0] in inverse_lookup:
            return [
                inverse_lookup.loc[c] if c in inverse_lookup.index else c
                for c in to_convert
            ]
Code Example #9
def gen_reference_item_text(items_df):
    base_directory = get_info('base_directory')
    reference_location = os.path.join(base_directory,'references','variable_name_lookup.csv')
    ref = pd.read_csv(reference_location)
    # add item text
    item_text_lookup = items_df.groupby('item_ID').item_text.unique().apply(lambda x: x[0]).to_dict()
    item_text = [item_text_lookup[i] if i in item_text_lookup.keys() else np.nan for i in ref['Variable Name']]
    # add response text
    response_text_lookup = items_df.groupby('item_ID').response_text.unique().apply(lambda x: ', '.join(x))
    response_text = [response_text_lookup[i].replace('\n','') if i in response_text_lookup.keys() else np.nan for i in ref['Variable Name']]
    ref.loc[:,'Question'] = item_text
    ref.loc[:,'Responses'] = response_text
    ref.to_csv(reference_location, index = False)
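A hedged usage sketch: items_df is assumed to be a long-format dataframe of item-level responses with 'item_ID', 'item_text', and 'response_text' columns, since that is what the function body requires; the source filename is illustrative.

# Hypothetical usage sketch; the items filename is illustrative.
from selfregulation.utils.utils import get_behav_data

items_df = get_behav_data(file='items.csv.gz')  # assumed columns: item_ID, item_text, response_text
gen_reference_item_text(items_df)               # rewrites variable_name_lookup.csv with Question/Responses columns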
Code Example #10
def get_fmri_pay(data):
    assert 'ontask_time' in data.columns, \
        'Task time not found. Must run "calc_time_taken" first.' 
    all_exps = data.experiment_exp_id.unique()
    exps_completed = data.groupby('worker_id').experiment_exp_id.unique()
    exps_not_completed = exps_completed.map(lambda x: list(set(all_exps) - set(x) - set(['selection_optimization_compensation'])))
    completed = exps_completed[exps_completed.map(lambda x: len(x)>=63)]
    not_completed = exps_not_completed[exps_not_completed.map(lambda x: len(x)>0)]
    # calculate time taken
    # get time taken for each task from previous mturk sample
    time_path = os.path.join(get_info('base_directory'),'references','experiment_lengths.json')
    task_time = json.load(open(time_path))
    time_missed = exps_not_completed.map(lambda x: np.sum([task_time[i] if task_time[i] is not None else 3 for i in x])/60)
    # calculate pay
    completed_pay = pd.Series(data = 100, index = completed.index)
    prorate_pay = 100-time_missed[not_completed.index]*10
    #remove anyone who was double counted
    pay= pd.concat([completed_pay, prorate_pay]).map(lambda x: round(x,1)).to_frame(name = 'base')
    pay['bonuses'] = get_bonuses(data, 15, 10)
    pay['total'] = pay.sum(axis = 1)
    return pay
Code Example #11
def gen_reference_item_text(items_df):
    base_directory = get_info('base_directory')
    reference_location = os.path.join(base_directory, 'references',
                                      'variable_name_lookup.csv')
    ref = pd.read_csv(reference_location)
    # add item text
    item_text_lookup = items_df.groupby('item_ID').item_text.unique().apply(
        lambda x: x[0]).to_dict()
    item_text = [
        item_text_lookup[i] if i in item_text_lookup.keys() else np.nan
        for i in ref['Variable Name']
    ]
    # add response text
    response_text_lookup = items_df.groupby(
        'item_ID').response_text.unique().apply(lambda x: ', '.join(x))
    response_text = [
        response_text_lookup[i].replace('\n', '')
        if i in response_text_lookup.keys() else np.nan
        for i in ref['Variable Name']
    ]
    ref.loc[:, 'Question'] = item_text
    ref.loc[:, 'Responses'] = response_text
    ref.to_csv(reference_location, index=False)
Code Example #12
from expanalysis.experiments.utils import remove_duplicates, result_filter
from expanalysis.results import get_filters, get_result_fields
from expanalysis.results import Result
from os import path, makedirs
import pickle

from selfregulation.utils.utils import get_info

#set token and data directory
token = get_info('expfactory_token', infile='/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Self_Regulation_Retest_Settings.txt')

data_dir=path.join('/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Data/','Retest_01-23-2018', 'Local')

if not path.exists(data_dir):
    makedirs(data_dir)


# Set up filters
filters = get_filters()
drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
         'experiment_name','experiment_cognitive_atlas_task']
for col in drop_columns:
    filters[col] = {'drop': True}

# Strip token from specified file
f = open(token)
access_token = f.read().strip()

# Set up variables for the download request
battery = 'Self Regulation Retest Battery' 
url = 'http://www.expfactory.org/new_api/results/62/'
Code Example #13
    find_intersection, get_fully_connected_threshold, remove_island_variables
from selfregulation.utils.graph_utils import  Graph_Analysis, threshold, \
    threshold_proportional_sign
from selfregulation.utils.utils import get_behav_data, get_info

import bct
import igraph
import numpy as np
from os.path import join, exists
from os import makedirs
import pandas as pd
import seaborn as sns

# generic variables
save_plots = False
plot_dir = join(get_info('base_directory'), 'dimensional_structure', 'Plots')

# get dependent variables
graph_data = get_behav_data(file='taskdata_imputed.csv')


def run_graph_analysis(adj_dict, save_plots=False):
    """
    Takes in a dictionary with two keys: "name" and "adj", specifying
    an adjacency matrix (as a dataframe) and its corresponding name
    """
    def plot_name(name):
        return join(plot_dir, adj_name, name)

    adj_name = adj_dict['name']
    adj = adj_dict['adj']
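A sketch of building the adj_dict argument described in the docstring above; using the absolute correlation matrix of graph_data is just one illustrative choice of adjacency.

# Hypothetical sketch of the expected input: a named adjacency DataFrame.
adj = graph_data.corr().abs()                       # square variables-by-variables DataFrame
adj_dict = {'name': 'task_abs_corr', 'adj': adj}    # 'name' is an arbitrary label
run_graph_analysis(adj_dict, save_plots=False)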
Code Example #14
    default='all')
parser.add_argument(
    '--sample',
    help=
    'Specifies what sample to run. Options: "discovery", "validation", "incomplete".',
    nargs='+',
    default=['discovery', 'validation', 'incomplete'])

# get options
args = parser.parse_args()
job = args.job
sample = args.sample

print('Running Script. Job %s, sample: %s' % (job, sample))
#load Data
token = get_info('expfactory_token')
try:
    data_dir = get_info('data_directory')
except Exception:
    data_dir = path.join(get_info('base_directory'), 'Data')

if job == 'download' or job == "all":
    print('Beginning "Download"')
    #***************************************************
    # ********* Load Data **********************
    #**************************************************
    pd.set_option('display.width', 200)
    figsize = [16, 12]
    #set up filters
    filters = get_filters()
    drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
Code Example #15
from selfregulation.utils.utils import get_info

parser = argparse.ArgumentParser(
    description='fMRI Analysis Entrypoint Script.')
parser.add_argument(
    '--job',
    help=
    'Specifies what part of the script to run. Options: download, extras, post, all.',
    default='post')

# get options
args = parser.parse_args()
job = args.job

#load Data
token = get_info('expfactory_token')
try:
    data_dir = get_info('data_directory')
except Exception:
    data_dir = path.join(get_info('base_directory'), 'Data')

if job == 'download' or job == "all":
    #***************************************************
    # ********* Load Data **********************
    #**************************************************
    pd.set_option('display.width', 200)
    figsize = [16, 12]
    #set up filters
    filters = get_filters()
    drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
             'experiment_name','experiment_cognitive_atlas_task']
Code Example #16
from expanalysis.experiments.utils import remove_duplicates, result_filter
from expanalysis.results import get_filters, get_result_fields
from expanalysis.results import Result
from os import path, makedirs
import pickle

from selfregulation.utils.utils import get_info

#set token and data directory
token = get_info(
    'expfactory_token',
    infile=
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Self_Regulation_Retest_Settings.txt'
)

data_dir = path.join(
    '/oak/stanford/groups/russpold/users/zenkavi/Self_Regulation_Ontology/Data/',
    'Retest_01-23-2018', 'Local')

if not path.exists(data_dir):
    makedirs(data_dir)

# Set up filters
filters = get_filters()
drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
         'experiment_name','experiment_cognitive_atlas_task']
for col in drop_columns:
    filters[col] = {'drop': True}

# Strip token from specified file
f = open(token)
Code Example #17
    parser.add_argument('-j',"--n_jobs", help="number of processors",type=int,
                            default=2)
    parser.add_argument('-w',"--workdir", help="working directory")
    parser.add_argument('-r',"--resultsdir", help="results directory")
    parser.add_argument("--singlevar", nargs='*',help="run with single variables")
    parser.add_argument('--imputer',help='imputer to use',
                            default='SimpleFill')
    parser.add_argument("--smote_threshold", help="threshold for applying smote (distance from 0.5)",
                        type=float,default=0.05)
    args=parser.parse_args()

    # parameters to set

    if args.resultsdir is None:
        try:
            output_base=get_info('results_directory')
        except:
            output_base='.'
    else:
        output_base=args.resultsdir
    output_dir=os.path.join(output_base,'prediction_outputs')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    #assert args.dataset in ['survey','mirt','task','all','baseline']
    assert args.classifier in ['lasso','rf']
    # don't regress out baseline vars for baseline model
    if args.dataset=='baseline' or args.no_baseline_vars:
        baselinevars=False
        if args.verbose:
            print("turning off inclusion of baseline vars")
Code Example #18
from dimensional_structure.utils import transfer_scores
from selfregulation.utils.utils import get_behav_data, get_info
from selfregulation.utils.result_utils import load_results

# get contextualizing results
results_dataset = 'Complete_03-29-2018'
results = load_results(datafile=results_dataset)

# get fmri data
fmri_dataset = 'Fmri_Followup_10-22-2018'
data = get_behav_data(dataset=fmri_dataset)
# remove data where participants are missing more than 20% of the tasks
tasks = data.copy()
tasks.columns = [i.split('.')[0] for i in tasks.columns]
successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \
                       .groupby(['index', 'variable']).mean() \
                       .groupby('index').sum()<12)
successful_subjects = successful_subjects[successful_subjects['value']]
data = data.loc[successful_subjects.index]

task_scores = transfer_scores(data, results['task'])
survey_scores = transfer_scores(data, results['survey'])

# save the scores
basename = 'factorscores_results-%s.csv' % results_dataset
task_scores.to_csv(
    path.join(get_info('base_directory'), 'Data', fmri_dataset,
              'task_' + basename))
survey_scores.to_csv(
    path.join(get_info('base_directory'), 'Data', fmri_dataset,
              'survey_' + basename))
Code Example #19
    find_intersection, get_fully_connected_threshold, remove_island_variables
from selfregulation.utils.graph_utils import  Graph_Analysis, threshold, \
    threshold_proportional_sign
from selfregulation.utils.utils import get_behav_data, get_info

import bct
import igraph
import numpy as np
from os.path import join, exists
from os import makedirs
import pandas as pd
import seaborn as sns

# generic variables
save_plots = False
plot_dir = join(get_info('base_directory'),'dimensional_structure','Plots')

# get dependent variables
graph_data = get_behav_data(file = 'taskdata_imputed.csv')  



def run_graph_analysis(adj_dict, save_plots=False):
    """
    Takes in a dictionary with two keys: "name" and "adj", specifying
    an adjacency matrix (as a dataframe) and its corresponding name
    """
    def plot_name(name):
        return join(plot_dir,adj_name,name)
        
    adj_name = adj_dict['name']
Code Example #20
from selfregulation.utils.utils import get_info, get_recent_dataset, get_retest_data
from selfregulation.utils.result_utils import load_results

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
# import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import seaborn as sns

# In[ ]:

dataset = get_recent_dataset()
results_dir = get_info('results_directory')
ontology_results_dir = path.join(results_dir, 'ontology_reconstruction',
                                 dataset, '*', 'oblimin')
retest_data = get_retest_data(dataset.replace('Complete', 'Retest'))
plot_dir = glob(path.join(ontology_results_dir, 'Plots'))[0]
save = True

# In[ ]:

results = load_results(dataset)['task']
c = results.EFA.get_c()

# # Load Reconstructions

# In[ ]:
Code Example #21
#!/usr/bin/env python3
import argparse
from expanalysis.experiments.processing import get_exp_DVs
from glob import glob
from os import path
import pandas as pd

from selfregulation.utils.utils import get_info

try:
    data_dir=get_info('data_directory')
except Exception:
    data_dir=path.join(get_info('base_directory'),'Data')


# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('exp_id')
parser.add_argument('data')
parser.add_argument('--no_group', action='store_false')
# HDDM params
parser.add_argument('--out_dir', default=data_dir)
parser.add_argument('--hddm_samples', default=None, type=int)
parser.add_argument('--hddm_burn', default=None, type=int)
parser.add_argument('--hddm_thin', default=None, type=int)
parser.add_argument('--no_parallel', action='store_false')
parser.add_argument('--num_cores', default=None, type=int)
parser.add_argument('--mode', default=None, type=str)

args = parser.parse_args()
Code Example #22
from collections import OrderedDict as odict
from os import makedirs, path
import numpy as np
import pandas as pd
import pickle
from ideological_prediction.plot_utils import (plot_RSA,
                                               plot_outcome_ontological_similarity,
                                               plot_prediction, plot_prediction_scatter,
                                               importance_bar_plots,
                                               importance_polar_plots,
                                               plot_predictors_comparison)
from selfregulation.utils.utils import get_info

results_dir = path.join(get_info('results_directory'), 'ideology_prediction')
plot_dir = path.join(results_dir, 'Plots')
makedirs(plot_dir, exist_ok=True)


# load predictions
rotate='oblimin'
ext = 'pdf'
data = pickle.load(open(path.join(results_dir, 
                                'ideo_predictions.pkl'), 'rb'))
all_predictions = data['all_predictions']
all_shuffled_predictions = data['all_shuffled_predictions']
predictors = data['predictors']
targets = data['targets']
RSA = {}

# plot RSA for ideological variables
ideological_correlations = {}
Code Example #23
from os import path
import pandas as pd
from selfregulation.utils.data_preparation_utils import calc_trial_order, \
    convert_date, convert_fmri_ids, download_data, get_bonuses, get_fmri_pay, \
    quality_check_correction
from selfregulation.utils.utils import get_info

parser = argparse.ArgumentParser(description='fMRI Analysis Entrypoint Script.')
parser.add_argument('--job', help='Specifies what part of the script to run. Options: download, extras, post, all.', default='post')

# get options
args = parser.parse_args()
job = args.job

#load Data
token = get_info('expfactory_token')
try:
    data_dir=get_info('data_directory')
except Exception:
    data_dir=path.join(get_info('base_directory'),'Data')

if job == 'download' or job == "all":
    #***************************************************
    # ********* Load Data **********************
    #**************************************************        
    pd.set_option('display.width', 200)
    figsize = [16,12]
    #set up filters
    filters = get_filters()
    drop_columns = ['battery_description', 'experiment_reference', 'experiment_version', \
             'experiment_name','experiment_cognitive_atlas_task']
Code Example #24
file in the same folder as these metadata files.


"""

from metadata_validator import validate_exp
from collections import OrderedDict
import pandas,unicodedata

import os,pickle,sys,math
import json
from selfregulation.utils.utils import get_info,get_behav_data,get_item_metadata,get_var_category

from measure_dictionaries import measure_longnames,measure_termurls,measure_sobcurls

basedir=get_info('base_directory')
dataset=get_info('dataset')
outdir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset)

def get_subscale_vars():
    subscale_data=pandas.read_csv('../references/survey_subscale_reference.csv',
                                index_col=0)
    subscale_var_dict={}
    for v in subscale_data.index:
        if subscale_data.loc[v].iloc[2]=='sum items':
            subscale_var_dict[v]='SumAll'
        elif subscale_data.loc[v].iloc[2]=='mean items':
            subscale_var_dict[v]='MeanAll'
        else:
            d=[]
            for i in subscale_data.loc[v]:
Code Example #25
                varNums = ''
            if k == 'eating_survey':
                varNums = 'custom(%s)' % varNums
            measure_level_data.append([
                vname, shortname, m['title'], m['dataElements'][e]['title'],
                m['measureType'], varNums
            ])

measure_level_df = pandas.DataFrame(measure_level_data,
                                    columns=[
                                        'ExpFactoryName', 'ShortName',
                                        'MeasureName', 'VariableName',
                                        'MeasureType', 'SubscaleVarNums'
                                    ])
# doublecheck that all meaningful variables are here
dataset = get_info('dataset')
behavdata = get_behav_data(dataset)
measurevars = measure_level_df.ExpFactoryName.tolist()
for v in behavdata.columns:
    assert v in measurevars
measure_level_df.to_csv('meaningful_variables_metadata.csv', index=False)

#save item level data
item_level_df = pandas.DataFrame(item_level_data,
                                 columns=[
                                     'ExpFactoryName', 'MeasureName',
                                     'QuestionNumber', 'QuestionText',
                                     'Scoring', 'ResponseOptions'
                                 ])
item_level_df.to_csv('item_level_metadata.csv', index=False)
Code Example #26
#!/usr/bin/env python3
from os import path
import pandas as pd
from selfregulation.utils.utils import get_info

try:
    data_dir = get_info('data_directory')
except Exception:
    data_dir = path.join(get_info('base_directory'), 'Data')

complete = None
# concatenate discovery and validation data into one complete
discovery_path = path.join(data_dir, 'mturk_discovery_data_post.pkl')
validation_path = path.join(data_dir, 'mturk_validation_data_post.pkl')
complete_path = path.join(data_dir, 'mturk_complete_data_post.pkl')
if path.exists(discovery_path) and path.exists(validation_path):
    discovery = pd.read_pickle(discovery_path)
    validation = pd.read_pickle(validation_path)
    complete = pd.concat([discovery, validation])
    complete.to_pickle(complete_path)

# separate complete into two data subsets for particularly memory intensive analyses (DDM)
if path.exists(complete_path):
    if complete is None:  # avoid ambiguous truth-value check on a DataFrame
        complete = pd.read_pickle(complete_path)
    workers = complete.worker_id.unique()
    mid = len(workers) // 2
    subset1 = complete.query('worker_id in %s' % list(workers)[:mid])
    subset2 = complete.query('worker_id in %s' % list(workers)[mid:])
    subset1.to_pickle(
        path.join(data_dir, 'mturk_complete_subset1_data_post.pkl'))
Code Example #27
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs, path
import pandas as pd
from scipy.spatial.distance import squareform
from sklearn.manifold import MDS
import seaborn as sns
from dimensional_structure.HCA_plots import abs_pdist
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_info, get_recent_dataset

# get dataset of interest
basedir = get_info('base_directory')
dataset = get_recent_dataset()
dataset = path.join(basedir, 'Data', dataset)
datafile = dataset.split(path.sep)[-1]

# load data
results = load_results(datafile)
data = results['task'].data
out = results['task'].EFA.get_loading()
nfactors = out.shape[1]
task_subset = pd.concat([
    out.filter(regex='choice_reaction_time', axis=0),
    out.filter(regex='^stop_signal\.(hddm|SSRT)', axis=0)[1:5]
])
task_subset_data = data.loc[:, task_subset.index]
task_variables = list(task_subset.index)
plot_dir = output_dir = path.join(get_info('results_directory'),
                                  'ontology_reconstruction',
                                  results['task'].ID, 'Plots')
Code Example #28
import matplotlib.pyplot as plt
import numpy as np
from os import path
import pandas as pd
import seaborn as sns
from shutil import copyfile
from selfregulation.utils.utils import (get_behav_data, get_info, get_recent_dataset, 
                                        get_retest_data, get_var_category)
from selfregulation.utils.plot_utils import format_num
sns.set_palette("Set1", 8, .75)

base_dir = get_info('base_directory')
ext = 'pdf'
dpi = 300

# Raw Data Plots
"""
# Load data if plots need to be regenerated

post_process_data_loc = ''
data = pd.read_pickle(post_process_data_loc)

# plt total time on tasks
(data.groupby('worker_id').ontask_time.sum()/3600).hist(bins=40, 
                                                        grid=False, 
                                                        density=True,
                                                        figsize=(12,8))
plt.xlabel('Time (Hours)')
plt.title('Total Time on Tasks', weight='bold')

Code Example #29
#!/usr/bin/env python3
"""
export metadata to csv for Mackinnon group
"""

import os,pickle,sys
import json

from selfregulation.utils.utils import get_info,get_behav_data
basedir=get_info('base_directory')
dataset=get_info('dataset')
print('using dataset:',dataset)
datadir=os.path.join(basedir,'Data/%s'%dataset)
outdir=os.path.join(basedir,'Data/Derived_Data/%s'%dataset)
metadata=pickle.load(open(os.path.join(outdir,'survey_metadata.pkl'),'rb'))
surveys=list(metadata.keys())
surveys.sort()
with open(os.path.join(outdir,'survey_metadata.tsv'),'w') as f:
    for s in surveys:
        items=list(metadata[s].keys())
        items.sort()
        items.remove('MeasurementToolMetadata')
        for i in items:
            print(metadata[s][i])
            levels=list(metadata[s][i]['Levels'].keys())
            levels.sort()
            options='\t'.join(['%s:%s'%(k,metadata[s][i]['Levels'][k]) for k in levels])
            f.write('%s\t%s\t%s\n'%(i,
                metadata[s][i]['Description'],
                options))
Code Example #30
            if 'subscaleVarNums' in m['dataElements'][e]:
                varNums=m['dataElements'][e]['subscaleVarNums']
            else:
                varNums=''
            if k=='eating_survey':
                varNums='custom(%s)'%varNums
            measure_level_data.append([vname,shortname,
                                    m['title'],m['dataElements'][e]['title'],
                                    m['measureType'],varNums])


measure_level_df=pandas.DataFrame(measure_level_data,
    columns=['ExpFactoryName','ShortName','MeasureName','VariableName',
                'MeasureType','SubscaleVarNums'])
# doublecheck that all meaningful variables are here
dataset=get_info('dataset')
behavdata=get_behav_data(dataset)
measurevars=measure_level_df.ExpFactoryName.tolist()
for v in behavdata.columns:
    assert v in measurevars
measure_level_df.to_csv('meaningful_variables_metadata.csv',index=False)

#save item level data
item_level_df=pandas.DataFrame(item_level_data,
    columns=['ExpFactoryName','MeasureName','QuestionNumber',
    'QuestionText','Scoring','ResponseOptions'])
item_level_df.to_csv('item_level_metadata.csv',index=False)

outcome_df=pandas.DataFrame(outcome_data,
    columns=['ExpFactoryName','MeasureName',
    'QuestionText','Scoring','ResponseOptions'])
Code Example #31
    else:
        dataset = get_recent_dataset()


# In[ ]:


# additional setup
np.random.seed(12412)
results = load_results(dataset)['task']
c = results.EFA.results['num_factors']

classifiers = {'Ridge': Ridge(fit_intercept=False),
               'LR': LinearRegression(fit_intercept=False)}
# get output dir to store results
output_dir = path.join(get_info('results_directory'),
                       'ontology_reconstruction', results.ID)
makedirs(output_dir, exist_ok=True)


# In[ ]:


# get a random subset of variables to perform the calculation on if n_vars is set
if n_vars is not None:
    var_list = np.random.choice(results.data.columns, n_vars, replace=False)
else:
    var_list = results.data.columns


# Run simulation for every variable at different population sizes. 
Code Example #32
import matplotlib.pyplot as plt
import numpy as np
from os import makedirs, path
import pandas as pd
from scipy.spatial.distance import  squareform
from sklearn.manifold import MDS
import seaborn as sns
from dimensional_structure.HCA_plots import abs_pdist
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.utils import get_info, get_recent_dataset

# get dataset of interest
basedir=get_info('base_directory')
dataset = get_recent_dataset()
dataset = path.join(basedir,'Data',dataset)
datafile = dataset.split(path.sep)[-1]

# load data
results = load_results(datafile)
data = results['task'].data
out = results['task'].EFA.get_loading()
nfactors = out.shape[1]
task_subset = pd.concat([
    out.filter(regex='choice_reaction_time', axis=0),
    out.filter(regex='^stop_signal\.(hddm|SSRT)', axis=0)[1:5]])
task_subset_data = data.loc[:, task_subset.index]
task_variables = list(task_subset.index)
plot_dir = output_dir = path.join(get_info('results_directory'),
                       'ontology_reconstruction', results['task'].ID, 'Plots')
makedirs(plot_dir, exist_ok=True)
Code Example #33
# ## Additional Setup

# In[ ]:

# Load dataset
np.random.seed(12412)
results = load_results(dataset)[result_subset]
c = results.EFA.get_c()

# Define classifiers
classifiers = {
    'Ridge': Ridge(fit_intercept=False),
    'LR': LinearRegression(fit_intercept=False)
}
# get output dir to store results
output_dir = path.join(get_info('results_directory'),
                       'ontology_reconstruction', dataset, results.ID,
                       EFA_rotation)
makedirs(output_dir, exist_ok=True)
# get plot dir to store plots
plot_dir = path.join(output_dir, 'Plots')
makedirs(plot_dir, exist_ok=True)

# In[ ]:

# get a random subset of variables to perform the calculation on if n_vars is set
measures = np.unique([i.split('.')[0] for i in results.data.columns])
if n_measures is not None:
    measure_list = np.random.choice(measures, n_measures, replace=False)
else:
    measure_list = measures
Code Example #34
import sys, os
import random
import pickle
import pandas, numpy
from selfregulation.utils.utils import get_info, get_behav_data
import fancyimpute
import matplotlib.pyplot as plt

dataset = 'Complete_10-27-2017'
basedir = get_info('base_directory')
nruns = int(sys.argv[1])
outdir = sys.argv[2]
datafile = sys.argv[3]

if not os.path.exists(outdir):
    os.mkdir(outdir)
#datafile=os.path.join(basedir,
#        'Data/Derived_Data/%s/behavdata_imputed_cleaned.csv'%dataset)

df = pandas.read_csv(datafile)

# shuffle, recompute, and store maximum for each run


def col_shuffle(df, test=False):
    """
    shuffle data within each column
    """
    if test:
        return (df)
    df_shuf = df.copy()
Code Example #35
from selfregulation.utils.result_utils import load_results

# get contextualizing results
results_dataset = 'Complete_03-29-2018'
results = load_results(datafile=results_dataset)

# get fmri data
fmri_dataset= 'Fmri_Followup_10-22-2018'
data = get_behav_data(dataset=fmri_dataset)
# remove data where participants are missing more than 20% of the tasks
tasks = data.copy()
tasks.columns = [i.split('.')[0] for i in tasks.columns]
successful_subjects = (tasks.isnull().reset_index().melt(id_vars=['index']) \
                       .groupby(['index', 'variable']).mean() \
                       .groupby('index').sum()<12)
successful_subjects = successful_subjects[successful_subjects['value']]
data = data.loc[successful_subjects.index]

task_scores = transfer_scores(data, results['task'])
survey_scores = transfer_scores(data, results['survey'])

# save the scores
basename = 'factorscores_results-%s.csv' % results_dataset
task_scores.to_csv(path.join(get_info('base_directory'),
                             'Data',
                             fmri_dataset,
                             'task_'+basename))
survey_scores.to_csv(path.join(get_info('base_directory'),
                             'Data',
                             fmri_dataset,
                             'survey_'+basename))