def generate_control_population_pickle(exam):
    """Save a dated pickle of processed data for the given exam, used as the control population."""

    host = utils_db.get_host()
    if host == 'staging':
        raise Exception("Cannot generate controls file from staging")

    # get all visit_exams
    visit_exams_df = utils_db.get_visit_exams(get_visits(n=np.inf))

    # get the most recent exam of each type per user
    subset = filter_most_recent_exams(visit_exams_df)

    # keep only the exam that we're looking for
    subset = subset[subset['exam'] == exam]

    # load the processed data for all of these visits
    all_processed_data = utils_db.load_visit_exams_processed(subset)

    # get the date so that we can know when this file was generated
    today = datetime.date.today()
    d = today.strftime("%b-%d-%Y")

    # save as pickle file in the control_data folder
    filename = './.control_data/' + exam + '_ProcessedData_' + d + '.pickle'
    with open(filename, 'wb') as f:
        pickle.dump(all_processed_data, f)
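
# A minimal usage sketch, not part of the original example: read one of the
# control pickles written above. The exam name and date suffix below are
# assumptions for illustration; the real filename depends on when and for
# which exam generate_control_population_pickle was run.
import pickle

exam = 'Prosaccade'                # hypothetical exam name
d = 'Jan-01-2024'                  # hypothetical generation date
filename = './.control_data/' + exam + '_ProcessedData_' + d + '.pickle'
with open(filename, 'rb') as f:
    control_population_df = pickle.load(f)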
Example #2
def test_analysis():
    """

    The prep for this can be found in test_analysis.py

    :return:
    """
    expected_output = {
                        'PupillaryReflex': "{'pupil_max_dilation': 2.0675, 'pupil_max_constric': 0.04, 'pupil_range': 2.0583, 'pupil_dilation_velocity': -0.39, 'aniscoria': 'No', 'next_exam_params': None}",
                        'Convergence': "{'stimulus_phase_lag': 2.3588, 'stimulus_correlation': 0.2872, 'LR_phase_lag': 0.4175, 'LR_correlation': 0.5057, 'next_exam_params': None}",
                        'Prosaccade': "{'number_of_saccades': 43, 'saccades_per_movement': 1.23, 'median_reaction_time': 0.2, 'duration': 41.42, 'abnormal_path_proportion': 0.02, 'next_exam_params': None}",
                        'SmoothPursuit2D': "{'error_magnitude': 1.8545, 'error_angular': 18.8197, 'error_radial': 1.7848, 'next_exam_params': None}",
                        'SelfPacedSaccade': "{'sacc_per_sec': 6.18, 'median_vel': 23.53, 'vel_acc': 23.53, 'accuracy': 1, 'next_exam_params': None}",
                        'SmoothPursuit': "{'median_lag_left': 0.0708, 'median_lag_right': 0.085, 'next_exam_params': None}",
                        'CategoryFluency': "{'num_correct': 3, 'num_repeats': 0, 'num_intrusions': 1, 'next_exam_params': 'vegetables'}",
                        'Stroop': "{'speed_median': 0.6784, 'accuracy': 90.0, 'num_correct': 27, 'next_exam_params': None}",
                        'TrailMaking': "{'total_time': 12.1003, 'error_count': 12, 'repeat_count': 10, 'num_correct': 25, 'next_exam_params': None}",
                        'TrailMaking2': "{'total_time': 7.7975, 'error_count': 2, 'repeat_count': 12, 'num_correct': 25, 'next_exam_params': None}",
                        'LetterFluency': "{'num_correct': 1, 'num_repeats': 0, 'num_intrusions': 22, 'next_exam_params': 'c'}",
                        'BostonNaming': "{'speed_median': 0.7647, 'accuracy': 93.3333, 'num_correct': 14, 'next_exam_params': None}",
                        'DigitSpanForward': "{'max_level_perfect': 0, 'max_level': 5, 'next_exam_params': 4}",
                        'MemoryEncoding': "{'max_words_correct': 10, 'num_intrusions': 0, 'next_exam_params': 15}",
                        'Tapping': "{'right_section_right_presses': 88, 'right_section_left_presses': 1, 'left_section_right_presses': 0, 'left_section_left_presses': 86, 'alternate_section_right_presses': 74, 'alternate_section_left_presses': 66, 'ordering_errors': 8, 'next_exam_params': None}",
                        'DigitSpanBackward': "{'max_level_perfect': 0, 'max_level': 0, 'next_exam_params': 3}",
                        'MemoryRecall': "{'recall_num_correct': 0, 'recall_num_intrusions': 0, 'recognize_num_correct': 0, 'next_exam_params': None}"
                        }


    params = {"videos": False, "host": "local",
              "control_subj_quantity": 0,
              "overwrite": False}

    visit_exams_df = get_test_visit_exams_df()
    processed_df = utils_db.load_visit_exams_processed(visit_exams_df, params)
    output = {row['exam']: str(round_metrics(row['processed'])) for idx, row in processed_df.iterrows()}

    print("Checking database metrics haven't changed")
    check_output(output, expected_output)

    params = {"videos": False, "host": "local",
              "control_subj_quantity": 0,
              "overwrite": True}

    processed_df = utils_db.load_visit_exams_processed(visit_exams_df, params)
    output = {row['exam']: str(round_metrics(row['processed'])) for idx, row in processed_df.iterrows()}
    print("Checking reprocessing results in the same output")
    check_output(output, expected_output)
Example #3
def process_visit_exam(visit_exam_id, params):
    """Load and process a single visit_exam, POST its metrics, and chart it."""
    visit_exam_df = udb.get_visit_exams_by_id(visit_exam_id)
    visit_exam_processed_df = udb.load_visit_exams_processed(visit_exam_df,
                                                             params=params)

    if len(visit_exam_processed_df) == 0:
        logger.warning(
            'No exam associated with visit_exam_id: {}'.format(visit_exam_id))

    else:
        # do charting, saving and POST metrics for each exam
        visit_id = visit_exam_processed_df.iloc[0]['visit_id']

        subj_series = visit_exam_processed_df.iloc[0]
        cur_exam = subj_series['exam'].lower()

        # fall back to sensible defaults for any params the caller did not supply
        control_subj_qty = params.get('control_subj_quantity', 100)
        year_range = params.get('year_range', 20)
        skip_gender = params.get('skip_gender', False)
        params['overwrite'] = params.get('overwrite', False)

        # get processed_df with controls, POST metrics, and chart
        if check_params_for_exam(subj_series, params):
            print("running: " + subj_series['exam'] + ", visit_exam: " +
                  str(subj_series['visit_exam_id']) + ", visit_id: " +
                  subj_series['visit_id'])

            # get the results from analysis for all rows
            complete_processed_df = ucon.load_processed_controls_from_pickle(
                subj_series,
                n=control_subj_qty,
                max_controls=False,
                year_range=year_range)
            if len(complete_processed_df) <= 10:
                logger.warning(
                    "Not enough control data to accurately calculate performance relative to population."
                )

            # POST metrics to database
            response = post_metrics(complete_processed_df, visit_id, cur_exam)

            # todo: check more error codes
            if '201' not in str(response):
                logger.warning(
                    'Metrics may not have POSTed to the database; response: {}'.format(response))

            # do charting on the visit_exam
            if cur_exam in EXAM_TO_VIZ_FUNC:
                logger.info("PROCESSING: " + subj_series['exam'])

                func = EXAM_TO_VIZ_FUNC[cur_exam]
                fig = chart_and_save(func, complete_processed_df, params)

    return
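
# A minimal usage sketch, not from the original source: the visit_exam_id below
# is a hypothetical placeholder, and the params keys are the ones this function
# reads; any key that is omitted falls back to the defaults shown above.
example_params = {
    'control_subj_quantity': 100,
    'year_range': 20,
    'skip_gender': False,
    'overwrite': False,
}
process_visit_exam('00000000-0000-0000-0000-000000000000', example_params)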
Example #4
def test_charts():
    print("Imma testing chartssS!!!")

    visit_exam_df = get_test_visit_exams_df()

    params = {"videos": False, "host": "local",
              "control_subj_quantity": 0,
              "overwrite": False}

    processed_df = utils_db.load_visit_exams_processed(visit_exam_df, params)

    plot_funcs = processed_df['exam'].apply(lambda w: getattr(chart, "plot_" + w.lower())).values
    for ii in range(len(processed_df)):
        func = plot_funcs[ii]
        exam_id = processed_df['visit_exam_id'].iloc[ii]
        func(processed_df[ii:ii + 1], exam_id)
Example #5
def todo_test_charts_with_controls():
    """Todo: chart one group per exam type using processed data that includes controls."""

    visit_exam_df = get_test_with_controls_visit_exams_df()

    params = {"videos": False, "host": "local",
              "control_subj_quantity": 0,
              "overwrite": False}

    processed_df = utils_db.load_visit_exams_processed(visit_exam_df, params)

    exam_to_chart = lambda w: getattr(chart, "plot_" + w.lower())

    # todo: check that this works that the groups are formatted correctly
    # select just one example of each exam (with pandas groupby?)
    groups = processed_df.groupby("exam")
    for exam, group in groups:
        func = exam_to_chart(exam)
        exam_id = group['visit_exam_id'].iloc[0]
        func(group, exam_id)
Example #6
def get_processed_df_for_visit(visit_id, params=None):
    # avoid a mutable default argument for params
    if params is None:
        params = {}
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    return processed_df
Example #7
def test_load_processed():
    print("testing load processed")

    test_begin_time = datetime.datetime.now(datetime.timezone.utc)

    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"

    # todo: add each exam as it is finished
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))

    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    exams_to_test = [
        'TrailMaking', 'TrailMaking2', 'CategoryFluency', 'LetterFluency'
    ]  #, 'Prosaccade']
    visit_exams_df = visit_exams_df[visit_exams_df['exam'].apply(
        lambda w: w in exams_to_test)]

    # note: host is AWS so that this doesn't leave a mess
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': True
    }

    print("Test recomputing results")
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    # Check that all the status passed
    for idx, subj_series in processed_df.iterrows():
        # check that each exam reports an error status and that it is False
        assert 'has_error' in subj_series, "missing error status field"
        assert subj_series['has_error'] == False, \
            "error occurred in: " + subj_series['exam']
        assert 'has_error' in subj_series['processed'], \
            "missing status element in exam: " + subj_series['exam']
        assert subj_series['processed']['has_error'] == False, \
            "processing has error in exam: " + subj_series['exam'] + \
            ", has_error: " + str(subj_series['processed']['has_error'])

    # test that the processed files have all been modified by that process
    for idx, subj_series in processed_df.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] > test_begin_time, \
            "overwrite: True, processed file was not modified when it should have been for exam: " + subj_series['exam']

    # tests below measure a few cases of the overwrite_now function
    # check that the files don't get reprocessed again if param is set to false
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['overwrite'] = False
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] < test_begin_time, \
            "overwrite: False, processed file was modified when it shouldn't have been for exam: " + subj_series['exam']

    # test batch overwrite: the files don't get reprocessed again for a long-running batch
    params['overwrite'] = 'batch'
    # pretend the batch started 2 hours ago
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['batch_begin_time'] = test_begin_time - datetime.timedelta(hours=2)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] < test_begin_time, \
            "overwrite: 'batch', processed file was modified when it shouldn't have been for exam: " + subj_series['exam']

    # Test batch overwrite: the files *do* get reprocessed if older than when the batch started
    params['batch_begin_time'] = datetime.datetime.now(datetime.timezone.utc)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] > params['batch_begin_time'], \
            "overwrite: 'batch', processed file was not modified when it should have been for exam: " + subj_series['exam']
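
# A compact sketch, inferred only from the assertions in this test, of the three
# overwrite modes that load_visit_exams_processed appears to accept; the exact
# semantics are an assumption based on the checks above.
import datetime

params_keep = {'overwrite': False}      # never reprocess existing files
params_redo = {'overwrite': True}       # always reprocess
params_batch = {
    'overwrite': 'batch',               # reprocess only files older than the batch start
    'batch_begin_time': datetime.datetime.now(datetime.timezone.utc),
}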
Example #8
def test_processing_functions():
    """
    test results of the processing functions

    :return:
    """
    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"

    # todo: add each exam as it is finished
    exams_to_test = [
        'TrailMaking2', 'CategoryFluency', 'LetterFluency', 'TrailMaking'
    ]  #, 'Prosaccade']

    # note: assumption is that the processing was already re-done in the earlier test function.
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': False
    }

    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)

    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    processed_df = processed_df[processed_df['exam'].apply(
        lambda w: w in exams_to_test)]

    for idx, subj_series in processed_df.iterrows():

        if subj_series['exam'] == 'TrailMaking':
            assert subj_series['processed']['metrics'][
                'error_count'] == 13, "trailmaking produced wrong error count"
            assert subj_series['processed']['metrics'][
                'repeat_count'] == 14, "trailmaking produced wrong repeat count"
            assert subj_series['processed'][
                'active time'] == 22.0552, "trailmaking produced wrong active time"

        elif subj_series['exam'] == 'TrailMaking2':
            assert subj_series['processed']['metrics'][
                'error_count'] == 16, "trailmaking2 produced wrong error count"
            assert subj_series['processed']['metrics'][
                'repeat_count'] == 11, "trailmaking2 produced wrong repeat count"
            assert subj_series['processed'][
                'active time'] == 19.5775, "trailmaking2 produced wrong active time"

        elif subj_series['exam'] == 'CategoryFluency':
            assert subj_series['processed']['data']['responses'].iloc[0][
                'transcript'] == 'cucumbers', "transcript first word incorrect"
            assert subj_series['processed']['data']['responses'].iloc[1][
                'transcript'] == 'carrots', "transcript second word incorrect"
            assert subj_series['processed']['data']['responses'].iloc[2][
                'transcript'] == 'celery', "transcript third word incorrect"
            assert subj_series['processed']['metrics'][
                'num_correct'] == 6, "number correct wrong"

        elif subj_series['exam'] == 'LetterFluency':
            assert subj_series['processed']['data']['responses'].iloc[0][
                'transcript'] == 'apples'
            assert subj_series['processed']['data']['responses'].iloc[1][
                'transcript'] == 'asparagus'
            assert subj_series['processed']['data']['responses'].iloc[2][
                'transcript'] == 'australia'
            assert subj_series['processed']['metrics']['num_correct'] == 4

def update_controls_processed_pickle(exam):
    """Merge any newer exams per subject into the existing control data for
    an exam and return the combined DataFrame."""

    host = utils_db.get_host()
    if host == 'staging':
        raise Exception("Cannot update controls from staging")

    if exam not in EXAM_CONTROL_FILE_DICT:
        logger.warning(
            'No control pickle file to update for exam: {}. Please check the exam '
            'name and generate a pickle if necessary.'.format(exam))
    else:
        # open the old control file
        filepath = './.control_data/' + EXAM_CONTROL_FILE_DICT[exam]
        with open(filepath, 'rb') as f:
            exams_on_file = pickle.load(f)

        # drop the 'processed' column so that we can compare to get_visit_exams
        without_processed_column = exams_on_file.drop(columns=['processed'])

        # get all visit_exams
        current_visit_exams_df = utils_db.get_visit_exams(get_visits(n=np.inf))

        # get the most recent exam of each type per user
        subset = filter_most_recent_exams(current_visit_exams_df)

        # keep only the exam that we're looking for
        subset = subset[subset['exam'] == exam]

        # concatenate them together and drop any rows that are exactly the same
        check_differences = pd.concat([without_processed_column, subset],
                                      sort=False).drop_duplicates(keep=False)

        # drop older versions of the exam for each subject
        check_differences = check_differences.sort_values(
            'visit_exam_id',
            ascending=False).drop_duplicates(subset=['subject_id'],
                                             keep='last').sort_index()

        # for each row of the new results, check whether the subject_id already
        # exists in the old file; keep the row if the subject is new or if the
        # exam comes from a different (newer) created_date
        new_exam_rows = []
        for i, result in check_differences.iterrows():
            if result['subject_id'] in exams_on_file['subject_id'].values:
                subject_in_old = exams_on_file.loc[
                    exams_on_file['subject_id'] == result['subject_id']]
                if result['created_date'] != subject_in_old.iloc[0]['created_date']:
                    new_exam_rows.append(result)
            else:
                new_exam_rows.append(result)

        # DataFrame.append is deprecated, so build the frame from the collected rows
        new_exams = pd.DataFrame(new_exam_rows)

        # load the new processed_data
        new_processed = utils_db.load_visit_exams_processed(new_exams)

        # concatenate the new and old exams and drop older versions of the exam for each subject
        old_and_new_exams = pd.concat([exams_on_file, new_processed])
        old_and_new_exams = old_and_new_exams.sort_values(
            'visit_exam_id',
            ascending=False).drop_duplicates(subset=['subject_id'],
                                             keep='last').sort_index()

        # TODO: FIGURE OUT WHAT TO DO WITH THE OLD CONTROL FILES (need to keep for FDA probably)

        return old_and_new_exams
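
# A possible follow-up, not part of the original function (the TODO above leaves
# persistence undecided): one way to write the merged control set to a new dated
# pickle, mirroring generate_control_population_pickle. The helper name below is
# an assumption for illustration.
import datetime
import pickle


def save_updated_controls_pickle(exam, old_and_new_exams):
    d = datetime.date.today().strftime("%b-%d-%Y")
    filename = './.control_data/' + exam + '_ProcessedData_' + d + '.pickle'
    with open(filename, 'wb') as f:
        pickle.dump(old_and_new_exams, f)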