# Assumed imports for the snippets below. The helper functions
# (extract_experiment, result_filter, get_groupby, data_check, and the
# process_* functions) are assumed to come from the expanalysis package and
# the project's own utilities; their exact module paths are not shown here.
import os
from os import path, makedirs
from time import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def save_task_data(data_loc, data):
    path = os.path.join(data_loc, 'Individual_Measures')
    if not os.path.exists(path):
        os.makedirs(path)
    for exp_id in np.sort(data.experiment_exp_id.unique()):
        print('Saving %s...' % exp_id)
        extract_experiment(data, exp_id).to_csv(
            os.path.join(path, exp_id + '.csv.gz'), compression='gzip')
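# Usage sketch (assumptions: 'results_df' stands for the flat results
# DataFrame used throughout these snippets; the output directory is
# illustrative).
# save_task_data('output', results_df)
# # -> writes output/Individual_Measures/<exp_id>.csv.gz, one file per task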
def get_items(data):
    """Extracts all numerically-coded survey item responses into a long-format
    DataFrame (one row per worker x item)."""
    excluded_surveys = ['holt_laury_survey']
    items = []
    responses = []
    responses_text = []
    options = []
    workers = []
    item_nums = []
    exps = []
    for exp in data.experiment_exp_id.unique():
        if 'survey' in exp and exp not in excluded_surveys:
            survey = extract_experiment(data, exp)
            try:
                responses += list(survey.response.map(float))
            except ValueError:
                # skip surveys whose responses cannot be coded numerically
                continue
            items += list(survey.text)
            responses_text += [str(i) for i in list(survey.response_text)]
            options += list(survey.options)
            workers += list(survey.worker_id)
            item_nums += list(survey.question_num)
            exps += [exp] * len(survey.text)

    items_df = pd.DataFrame({'survey': exps, 'worker': workers,
                             'item_text': items, 'coded_response': responses,
                             'response_text': responses_text,
                             'options': options})
    items_df.loc[:, 'item_num'] = [str(i).zfill(2) for i in item_nums]
    items_df.loc[:, 'item_ID'] = items_df['survey'] + '.' + items_df['item_num'].astype(str)
    items_df = items_df[['worker', 'item_ID', 'coded_response', 'item_text',
                         'response_text', 'options', 'survey', 'item_num']]
    return items_df
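# Usage sketch (assumption: 'data' is the flat results DataFrame used by the
# script further below); it mirrors how the items are pivoted into a
# subjects x items matrix later in this file.
# items_df = get_items(data)
# subjectsxitems = items_df.pivot(index='worker', columns='item_ID',
#                                 values='coded_response')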
def quality_check_correction(data):
    """
    Re-runs the quality check for the stop signal tasks, restricting the RT
    and missed-response criteria to go trials (see the NOTE in quality_check:
    the original check counted stop trials, where no response is expected,
    as missed responses).
    """
    for exp in ['stop_signal', 'motor_selective_stop_signal',
                'stim_selective_stop_signal']:
        df = extract_experiment(data, exp)
        rt_thresh = 200
        acc_thresh = .6
        missed_thresh = .25
        response_thresh = .95
        passed_rt = df.query('rt != -1 and SS_trial_type=="go"').groupby(
            'worker_id').rt.median() >= rt_thresh
        passed_miss = df.query('SS_trial_type=="go"').groupby(
            'worker_id').rt.agg(lambda x: np.mean(x == -1)) < missed_thresh
        passed_acc = df.query('rt != -1').groupby(
            'worker_id').correct.mean() >= acc_thresh
        passed_response = np.logical_not(
            df.query('rt != -1').groupby('worker_id').key_press.agg(
                lambda x: np.any(pd.value_counts(x) >
                                 pd.value_counts(x).sum() * response_thresh)))
        passed_df = pd.concat(
            [passed_rt, passed_acc, passed_miss, passed_response],
            axis=1).fillna(False)
        passed = passed_df.all(axis=1)
        failed = passed[passed == False]
        succeeded = passed[passed == True]
        for subj in failed.index:
            data.loc[(data.experiment_exp_id == exp) &
                     (data.worker_id == subj), 'passed_QC'] = False
        for subj in succeeded.index:
            data.loc[(data.experiment_exp_id == exp) &
                     (data.worker_id == subj), 'passed_QC'] = True
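# The passed_response criterion above flags workers who press (nearly) the
# same key on almost every trial. A self-contained toy illustration of that
# rule (the worker ids and key codes below are made up):
def _demo_response_check(response_thresh=.95):
    toy = pd.DataFrame({
        'worker_id': ['A'] * 4 + ['B'] * 4,
        'key_press': [37, 39, 37, 39,      # worker A alternates two keys
                      37, 37, 37, 37]})    # worker B always presses the same key
    passed_response = np.logical_not(
        toy.groupby('worker_id').key_press.agg(
            lambda x: np.any(x.value_counts() > x.value_counts().sum() *
                             response_thresh)))
    return passed_response  # A -> True (passes), B -> False (flagged)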
def get_average_variable(results, var):
    '''Returns a dict mapping each experiment to the mean of the column `var`
    '''
    averages = {}
    for exp in results.get_experiments():
        data = extract_experiment(results, exp)
        try:
            averages[exp] = data[var].mean()
        except TypeError:
            print("Cannot average %s for %s" % (var, exp))
            averages[exp] = np.nan
    return averages
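# Usage sketch (assumptions: 'results' is an expanalysis Results object and
# 'rt' is a column present in most task DataFrames).
# mean_rts = get_average_variable(results, 'rt')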
def results_check(data, exp_id=None, worker=None, columns=['correct', 'rt'],
                  remove_practice=True, use_groups=True, plot=False,
                  silent=False):
    """Outputs info for a basic data check on the results object.

    Uses data_check to group, describe and plot dataframes. The function first
    filters the results object as specified, then loops through each experiment
    and worker it contains, performs some basic dataframe manipulation and runs
    data_check.
    :param data: the data from an expanalysis Results object
    :param exp_id: a string or array of strings to select the experiment(s) before calculating basic stats
    :param worker: a string or array of strings to select the worker(s) before calculating basic stats
    :param columns: array of columns to subset summary statistics, if they exist
    :param remove_practice: bool, default True. If True, removes any rows labeled "practice" in the "exp_stage" column, if it exists
    :param use_groups: bool, default True. If True, looks up grouping variables using get_groupby for the experiment
    :param silent: bool, default False. If True, does not print output
    :param plot: bool, default False. If True, plots data using plot_groups
    :return stats: dict mapping experiment -> worker -> {'summary': summary, 'plot': p}
    """
    assert 'worker_id' in data.columns and 'experiment_exp_id' in data.columns, \
        "Results data must have 'worker_id' and 'experiment_exp_id' in columns"
    stats = {}
    results = result_filter(data, exp_id=exp_id, worker=worker)
    orig_plot = plot
    orig_silent = silent
    display = not silent or plot
    input_text = ''
    if display:
        print('******************************************************************************')
        print('Input: Type "exit" to end, "skip" to skip to the next experiment, or hit enter to continue')
        print('******************************************************************************')
    for experiment in np.unique(results['experiment_exp_id']):
        stats[experiment] = {}
        if display:
            print('******************************************************************************')
            print('    Experiment: ', experiment)
            print('******************************************************************************')
        if use_groups:
            groupby = get_groupby(experiment)
        else:
            groupby = []
        experiment_df = extract_experiment(results, experiment)
        for worker in pd.unique(experiment_df['worker_id']):
            if display:
                print('******************************************************************************')
                print('    Worker: ', worker)
                print('******************************************************************************')
            df = experiment_df.query('worker_id == "%s"' % worker)
            summary, p = data_check(df, columns, remove_practice, groupby,
                                    silent, plot)
            # add summary and plot to dictionary of summaries
            stats[experiment][worker] = {'summary': summary, 'plot': p}
            if not silent or plot:
                input_text = input("Press Enter to continue...")
                plt.close()
                if input_text in ['skip', 'save']:
                    plot = False
                    silent = True
                    display = not silent or plot
                elif input_text == 'exit':
                    break
        if display:
            if input_text not in ['exit', 'save']:
                plot = orig_plot
                silent = orig_silent
                display = not silent or plot
            elif input_text == 'exit':
                break
    return stats
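# Usage sketch (assumption: 'stroop' is only an illustrative exp_id; the
# function is interactive, so it is best run from a console).
# stats = results_check(data, exp_id='stroop', columns=['correct', 'rt'],
#                       plot=True)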
def quality_check(data):
    """
    Checks data to make sure each experiment passed some "gut check" measures
    Used to exclude data on individual tasks or whole subjects if they fail
    too many tasks.
    NOTE: This function has an issue such that it inappropriately evaluates
    stop signal tasks based on the number of missed responses. Rather than 
    changing the function (which would affect our samples which are already
    determined) I am leaving it, and introducing a quality check correction
    that will be performed after subjects are already rejected
    """
    start_time = time()
    rt_thresh_lookup = {
        'angling_risk_task_always_sunny': 0,
        'simple_reaction_time': 150
    }
    acc_thresh_lookup = {
        'digit_span': 0,
        'hierarchical_rule': 0,
        'information_sampling_task': 0,
        'probabilistic_selection': 0,
        'ravens': 0,
        'shift_task': 0,
        'spatial_span': 0,
        'tower_of_london': 0
    }
    missed_thresh_lookup = {
        'information_sampling_task': 1,
        'go_nogo': 1,
        'tower_of_london': 2
    }
    
    response_thresh_lookup = {
        'angling_risk_task_always_sunny': np.nan,
        'columbia_card_task_cold': np.nan,
        'discount_titrate': np.nan,
        'digit_span': np.nan,
        'go_nogo': .98,
        'kirby': np.nan,
        'simple_reaction_time': np.nan,
        'spatial_span': np.nan,
    }
    
    templates = data.groupby('experiment_exp_id').experiment_template.unique()
    data.loc[:,'passed_QC'] = True
    for exp in data.experiment_exp_id.unique():
        try:
            if templates.loc[exp] == 'jspsych':
                print('Running QC on ' + exp)
                df = extract_experiment(data, exp)
                rt_thresh = rt_thresh_lookup.get(exp,200)
                acc_thresh = acc_thresh_lookup.get(exp,.6)
                missed_thresh = missed_thresh_lookup.get(exp,.25)
                response_thresh = response_thresh_lookup.get(exp,.95)
                
                # special cases...
                if exp == 'information_sampling_task':
                    df.groupby('worker_id').which_click_in_round.value_counts()
                    passed_response = df.groupby('worker_id').which_click_in_round.mean() > 2
                    passed_rt = pd.Series([True] * len(passed_response), index = passed_response.index)
                    passed_miss = pd.Series([True] * len(passed_response), index = passed_response.index)
                    passed_acc = pd.Series([True] * len(passed_response), index = passed_response.index)
                elif exp == 'go_nogo':
                    passed_rt = df.query('rt != -1').groupby('worker_id').rt.median() >= rt_thresh
                    passed_miss = df.groupby('worker_id').rt.agg(lambda x: np.mean(x == -1)) < missed_thresh
                    df.correct = pd.to_numeric(df.correct)
                    passed_acc = df.groupby('worker_id').correct.mean() >= acc_thresh
                    passed_response = np.logical_not(df.groupby('worker_id').key_press.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                elif exp == 'psychological_refractory_period_two_choices':
                    passed_rt = (df.groupby('worker_id')[['choice1_rt', 'choice2_rt']].median() >= rt_thresh).all(axis=1)
                    passed_acc = df.query('choice1_rt != -1').groupby('worker_id').choice1_correct.mean() >= acc_thresh
                    passed_miss = ((df.groupby('worker_id').choice1_rt.agg(lambda x: np.mean(x!=-1) >= missed_thresh)) \
                                        + (df.groupby('worker_id').choice2_rt.agg(lambda x: np.mean(x>-1) >= missed_thresh))) == 2
                    passed_response1 = np.logical_not(df.query('choice1_rt != -1').groupby('worker_id').choice1_key_press.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    passed_response2 = np.logical_not(df.query('choice2_rt != -1').groupby('worker_id').choice2_key_press.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    passed_response = np.logical_and(passed_response1,passed_response2)
                elif exp == 'ravens':
                    passed_rt = df.query('rt != -1').groupby('worker_id').rt.median() >= rt_thresh
                    passed_acc = df.query('rt != -1').groupby('worker_id').correct.mean() >= acc_thresh
                    passed_response = np.logical_not(df.groupby('worker_id').stim_response.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    passed_miss = pd.Series([True] * len(passed_rt), index = passed_rt.index)
                elif exp == 'tower_of_london':
                    passed_rt = df.groupby('worker_id').rt.median() >= rt_thresh
                    passed_acc = df.query('trial_id == "feedback"').groupby('worker_id').correct.mean() >= acc_thresh
                    # Labeling someone as "missing" too many problems if they don't make enough moves
                    passed_miss = (df.groupby(['worker_id','problem_id']).num_moves_made.max().reset_index().groupby('worker_id').mean() >= missed_thresh).num_moves_made
                    passed_response = pd.Series([True] * len(passed_rt), index = passed_rt.index)
                elif exp == 'two_stage_decision':
                    passed_rt = (df.groupby('worker_id')[['rt_first', 'rt_second']].median() >= rt_thresh).all(axis=1)
                    passed_miss = df.groupby('worker_id').trial_id.agg(lambda x: np.mean(x == 'incomplete_trial')) < missed_thresh
                    passed_acc = pd.Series([True] * len(passed_rt), index = passed_rt.index)
                    passed_response = pd.Series([True] * len(passed_rt), index = passed_rt.index)
                    passed_response1 = np.logical_not(df.query('rt_first != -1').groupby('worker_id').key_press_first.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    passed_response2 = np.logical_not(df.query('rt_second != -1').groupby('worker_id').key_press_second.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    passed_response = np.logical_and(passed_response1,passed_response2)
                elif exp == 'writing_task':
                    passed_response = df.query('trial_id == "write"').groupby('worker_id').final_text.agg(lambda x: len(x[0]) > 100)
                    passed_acc = pd.Series([True] * len(passed_response), index = passed_response.index)
                    passed_rt = pd.Series([True] * len(passed_response), index = passed_response.index)
                    passed_miss = pd.Series([True] * len(passed_response), index = passed_response.index)
                # everything else
                else:
                    passed_rt = df.query('rt != -1').groupby('worker_id').rt.median() >= rt_thresh
                    passed_miss = df.groupby('worker_id').rt.agg(lambda x: np.mean(x == -1)) < missed_thresh
                    if 'correct' in df.columns:
                        df.correct = pd.to_numeric(df.correct)
                        passed_acc = df.query('rt != -1').groupby('worker_id').correct.mean() >= acc_thresh
                    else:
                        passed_acc = pd.Series([True] * len(passed_rt), index = passed_rt.index)
                    if 'mouse_click' in df.columns:
                        passed_response = np.logical_not(df.query('rt != -1').groupby('worker_id').mouse_click.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    elif 'key_press' in df.columns:
                        passed_response = np.logical_not(df.query('rt != -1').groupby('worker_id').key_press.agg(
                                                            lambda x: np.any(pd.value_counts(x) > pd.value_counts(x).sum()*response_thresh)))
                    else:
                        # no response column to check for this task; do not fail anyone on this criterion
                        passed_response = pd.Series([True] * len(passed_rt), index=passed_rt.index)
                passed_df = pd.concat([passed_rt,passed_acc,passed_miss,passed_response], axis = 1).fillna(False, inplace = False)
                passed = passed_df.all(axis = 1)
                failed = passed[passed == False]
                for subj in failed.index:
                    data.loc[(data.experiment_exp_id == exp) & (data.worker_id == subj),'passed_QC'] = False
        except AttributeError as e:
            print('QC could not be run on experiment %s' % exp)
            print(e)
    finish_time = (time() - start_time) / 60
    print('Finished QC. Time taken: %.1f minutes' % finish_time)
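# Typical downstream use (an assumption, not shown in this file): run the
# correction after the main check, then keep only rows that passed QC.
# quality_check(data)
# quality_check_correction(data)
# clean_data = data.query('passed_QC == True')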
        DVs_valence = []
    datasets.append((data,directory, DVs, DVs_valence))
    
# calculate DVs
for data, directory, DV_df, valence_df in datasets:
    readme_lines = []
    meta_dir = path.join(directory, 'metadata')
    reference_dir = path.join(directory, 'references')
    if not path.exists(meta_dir):
        makedirs(meta_dir)
    if not path.exists(reference_dir):
        makedirs(reference_dir)
    # save target datasets
    print('Saving to %s...' % directory)
    print('Saving target measures...')
    demog_data = extract_experiment(data, 'demographics_survey')
    demog_data = process_demographics(demog_data, directory, meta_dir)
    alcohol_drug_data = extract_experiment(data, 'alcohol_drugs_survey')
    alcohol_drug_data = process_alcohol_drug(alcohol_drug_data, directory, meta_dir)
    health_data = extract_experiment(data, 'k6_survey')
    health_data = process_health(health_data, directory, meta_dir)
    activity_level = DV_df.pop('leisure_time_activity_survey.activity_level')
    # concatenate targets
    target_data = pd.concat([demog_data, alcohol_drug_data,
                             health_data, activity_level], axis=1)
    target_data.to_csv(path.join(directory, 'demographic_health.csv'))
    # save items
    items_df = get_items(data)
    print('Saving items...')
    subjectsxitems = items_df.pivot(index='worker', columns='item_ID',
                                    values='coded_response')
    # ensure there are the correct number of items