def generate_control_population_pickle(exam):
    """Generate a date-stamped pickle of processed control data for one exam type.

    Pulls every visit_exam, keeps only the most recent exam of each type per
    user, filters down to `exam`, loads the processed data for those rows and
    dumps the resulting DataFrame to ``./.control_data/<exam>_ProcessedData_<date>.pickle``.

    :param exam: exam name to build the control population for (e.g. 'Stroop')
    :raises Exception: when running against the staging host, to avoid
        generating control files from staging data
    :return: None (side effect: writes a pickle file)
    """
    import os  # local import: only needed for the directory guard below

    host = utils_db.get_host()
    if host == 'staging':
        raise Exception("Cannot generate controls file from staging")

    # get all visit_exams
    visit_exams_df = utils_db.get_visit_exams(get_visits(n=np.inf))
    # get the most recent exam of each type per user
    subset = filter_most_recent_exams(visit_exams_df)
    # filter out the exam that we're looking for
    subset = subset[subset['exam'] == exam]
    # load the processed data for all of these visits
    all_processed_data = utils_db.load_visit_exams_processed(subset)

    # get the date so that we can know when this file was generated
    today = datetime.date.today()
    d = today.strftime("%b-%d-%Y")

    # make sure the output folder exists; open(..., 'wb') does not create it
    os.makedirs('./.control_data', exist_ok=True)

    # save as pickle file in the control_data folder
    filename = './.control_data/' + exam + '_ProcessedData_' + d + '.pickle'
    with open(filename, 'wb') as f:
        pickle.dump(all_processed_data, f)
def test_analysis():
    """
    The prep for this can be found in test_analysis.py
    :return:
    """
    # Golden metrics per exam, as produced by round_metrics on the processed data.
    expected_output = {
        'PupillaryReflex': "{'pupil_max_dilation': 2.0675, 'pupil_max_constric': 0.04, 'pupil_range': 2.0583, 'pupil_dilation_velocity': -0.39, 'aniscoria': 'No', 'next_exam_params': None}",
        'Convergence': "{'stimulus_phase_lag': 2.3588, 'stimulus_correlation': 0.2872, 'LR_phase_lag': 0.4175, 'LR_correlation': 0.5057, 'next_exam_params': None}",
        'Prosaccade': "{'number_of_saccades': 43, 'saccades_per_movement': 1.23, 'median_reaction_time': 0.2, 'duration': 41.42, 'abnormal_path_proportion': 0.02, 'next_exam_params': None}",
        'SmoothPursuit2D': "{'error_magnitude': 1.8545, 'error_angular': 18.8197, 'error_radial': 1.7848, 'next_exam_params': None}",
        'SelfPacedSaccade': "{'sacc_per_sec': 6.18, 'median_vel': 23.53, 'vel_acc': 23.53, 'accuracy': 1, 'next_exam_params': None}",
        'SmoothPursuit': "{'median_lag_left': 0.0708, 'median_lag_right': 0.085, 'next_exam_params': None}",
        'CategoryFluency': "{'num_correct': 3, 'num_repeats': 0, 'num_intrusions': 1, 'next_exam_params': 'vegetables'}",
        'Stroop': "{'speed_median': 0.6784, 'accuracy': 90.0, 'num_correct': 27, 'next_exam_params': None}",
        'TrailMaking': "{'total_time': 12.1003, 'error_count': 12, 'repeat_count': 10, 'num_correct': 25, 'next_exam_params': None}",
        'TrailMaking2': "{'total_time': 7.7975, 'error_count': 2, 'repeat_count': 12, 'num_correct': 25, 'next_exam_params': None}",
        'LetterFluency': "{'num_correct': 1, 'num_repeats': 0, 'num_intrusions': 22, 'next_exam_params': 'c'}",
        'BostonNaming': "{'speed_median': 0.7647, 'accuracy': 93.3333, 'num_correct': 14, 'next_exam_params': None}",
        'DigitSpanForward': "{'max_level_perfect': 0, 'max_level': 5, 'next_exam_params': 4}",
        'MemoryEncoding': "{'max_words_correct': 10, 'num_intrusions': 0, 'next_exam_params': 15}",
        'Tapping': "{'right_section_right_presses': 88, 'right_section_left_presses': 1, 'left_section_right_presses': 0, 'left_section_left_presses': 86, 'alternate_section_right_presses': 74, 'alternate_section_left_presses': 66, 'ordering_errors': 8, 'next_exam_params': None}",
        'DigitSpanBackward': "{'max_level_perfect': 0, 'max_level': 0, 'next_exam_params': 3}",
        'MemoryRecall': "{'recall_num_correct': 0, 'recall_num_intrusions': 0, 'recognize_num_correct': 0, 'next_exam_params': None}"
    }

    visit_exams_df = get_test_visit_exams_df()

    # First pass uses cached processing; second pass forces a reprocess and
    # must produce identical metrics.
    passes = [
        (False, "Checking database metrics haven't changed"),
        (True, "Checking reprocessing results in the same output"),
    ]
    for overwrite_flag, banner in passes:
        run_params = {"videos": False, "host": "local",
                      "control_subj_quantity": 0, "overwrite": overwrite_flag}
        processed_df = utils_db.load_visit_exams_processed(visit_exams_df,
                                                           run_params)
        actual = {row['exam']: str(round_metrics(row['processed']))
                  for idx, row in processed_df.iterrows()}
        print(banner)
        check_output(actual, expected_output)
def process_visit_exam(visit_exam_id, params):
    """Process one visit_exam: attach controls, POST metrics, and chart it.

    :param visit_exam_id: id of the visit_exam to process
    :param params: dict of processing options. Recognised keys:
        'control_subj_quantity' (default 100), 'year_range' (default 20),
        'overwrite' (default False — written back into params because params
        is forwarded to chart_and_save).
    :return: None
    """
    visit_exam_df = udb.get_visit_exams_by_id(visit_exam_id)
    visit_exam_processed_df = udb.load_visit_exams_processed(visit_exam_df,
                                                             params=params)
    if len(visit_exam_processed_df) == 0:
        logger.warning(
            'No exam associated with visit_exam_id: {}'.format(visit_exam_id))
        return

    # do charting, saving and POST metrics for each exam
    visit_id = visit_exam_processed_df.iloc[0]['visit_id']
    subj_series = visit_exam_processed_df.iloc[0]
    cur_exam = subj_series['exam'].lower()

    # dict.get replaces the original "'k' not in params" ternaries
    control_subj_qty = params.get('control_subj_quantity', 100)
    year_range = params.get('year_range', 20)
    # NOTE(review): the original also read 'skip_gender' into a local that was
    # never used; the dead local has been dropped (params is unchanged).
    # 'overwrite' must end up IN params since params is forwarded downstream.
    params.setdefault('overwrite', False)

    # get processed_df with controls, POST metrics, and chart
    if not check_params_for_exam(subj_series, params):
        return

    print("running: " + subj_series['exam'] + ", visit_exam: " +
          str(subj_series['visit_exam_id']) + ", visit_id: " +
          subj_series['visit_id'])

    # get the results from analysis for all rows
    complete_processed_df = ucon.load_processed_controls_from_pickle(
        subj_series, n=control_subj_qty, max_controls=False,
        year_range=year_range)
    if len(complete_processed_df) <= 10:
        logger.warning(
            "Not enough control data to accurately calculate performance relative to population."
        )

    # POST metrics to database
    response = post_metrics(complete_processed_df, visit_id, cur_exam)
    # todo: check more error codes
    if '201' not in str(response):
        logger.warning(
            'Metrics may not POST to database with error code {}.'.format(
                response))

    # do charting on the visit_exam
    if cur_exam in EXAM_TO_VIZ_FUNC:
        logger.info("PROCESSING: " + subj_series['exam'])
        func = EXAM_TO_VIZ_FUNC[cur_exam]
        # chart_and_save handles persistence; its returned figure was unused
        chart_and_save(func, complete_processed_df, params)
    return
def test_charts():
    """Smoke-test the per-exam chart function for every row of the test data."""
    print("Imma testing chartssS!!!")
    visit_exam_df = get_test_visit_exams_df()
    params = {"videos": False, "host": "local", "control_subj_quantity": 0,
              "overwrite": False}
    processed_df = utils_db.load_visit_exams_processed(visit_exam_df, params)
    # look the plot function up by name with getattr instead of eval:
    # same resolution (chart.plot_<exam>), without executing arbitrary strings
    plot_funcs = processed_df['exam'].apply(
        lambda w: getattr(chart, "plot_" + w.lower())).values
    for ii in range(len(processed_df)):
        func = plot_funcs[ii]
        exam_id = processed_df['visit_exam_id'].iloc[ii]
        # each chart function takes a single-row DataFrame slice
        func(processed_df[ii:ii + 1], exam_id)
def todo_test_charts_with_controls():
    """Chart one example of each exam type from the with-controls test data.

    Prefixed with ``todo_`` so pytest does not collect it yet.
    """
    visit_exam_df = get_test_with_controls_visit_exams_df()
    params = {"videos": False, "host": "local", "control_subj_quantity": 0,
              "overwrite": False}
    processed_df = utils_db.load_visit_exams_processed(visit_exam_df, params)
    # resolve chart.plot_<exam> by name with getattr instead of eval
    exam_to_chart = lambda w: getattr(chart, "plot_" + w.lower())
    # todo: check that this works that the groups are formatted correctly
    # select just one example of each exam (with pandas groupby?)
    groups = processed_df.groupby("exam")
    for exam, group in groups:
        func = exam_to_chart(group.iloc[0]['exam'])
        exam_id = processed_df['visit_exam_id'].iloc[0]
        func(group, exam_id)
def get_processed_df_for_visit(visit_id, params=None):
    """Load the processed DataFrame for every exam of one visit.

    :param visit_id: id of the visit whose exams should be loaded
    :param params: optional dict of processing options forwarded to
        load_visit_exams_processed; defaults to an empty dict.
        (A `None` sentinel replaces the original mutable default `{}`,
        which was shared across calls and could be mutated by callees.)
    :return: DataFrame of processed visit_exams for the visit
    """
    if params is None:
        params = {}
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    return processed_df
def test_load_processed():
    """End-to-end test of load_visit_exams_processed and its overwrite modes.

    Covers four cases against one known visit on AWS:
      1. overwrite=True  — every exam reprocesses cleanly and its S3 file is
         rewritten after the test start time;
      2. overwrite=False — no S3 file is touched;
      3. overwrite='batch' with a batch_begin_time in the past — files newer
         than the batch start are NOT reprocessed;
      4. overwrite='batch' with batch_begin_time = now — files older than the
         batch start ARE reprocessed.
    """
    print("testing load processed")
    # timezone-aware timestamp, comparable to S3's LastModified
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"
    # todo: add each exam as it is finished
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    exams_to_test = [
        'TrailMaking', 'TrailMaking2', 'CategoryFluency', 'LetterFluency'
    ]  #, 'Prosaccade']
    visit_exams_df = visit_exams_df[visit_exams_df['exam'].apply(
        lambda w: w in exams_to_test)]
    # note: host is AWS so that this doesn't leave a mess
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': True
    }
    print("Test recomputing results")
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    # Check that all the status passed
    for idx, subj_series in processed_df.iterrows():
        # check that each exam
        assert 'has_error' in subj_series, "missing error status field"
        assert subj_series[
            'has_error'] == False, "error occured in: " + subj_series['exam']
        assert 'has_error' in subj_series[
            'processed'], "missing status element in exam: " + subj_series[
                'exam']
        # check that the status is true
        # NOTE(review): the failure message concatenates a (presumably bool)
        # has_error to str, which would itself raise TypeError on failure —
        # confirm the stored value's type
        assert subj_series['processed']['has_error'] == False, "processing has error in exam: " + subj_series['exam'] +\
            ", has_error: " + subj_series['processed']['has_error']
    # test that the processed files have all been modified by that process
    for idx, subj_series in processed_df.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response[
            'LastModified'] > test_begin_time, "overwrite: True, processed file was not modified when it should have been for exam: " + subj_series[
                'exam']
    # tests below measure a few cases of the overwrite_now function
    # check that the files don't get reprocessed again if param is set to false
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['overwrite'] = False
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response[
            'LastModified'] < test_begin_time, "overwrite:'False' processed file was modified when it shouldn't have been for exam: " + subj_series[
                'exam']
    # test batch overwrite: the files don't get reprocessed again for a long-running batch
    params['overwrite'] = 'batch'
    # pretend the batch started 2 hours ago
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['batch_begin_time'] = test_begin_time - datetime.timedelta(hours=2)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response[
            'LastModified'] < test_begin_time, "overwrite:'batch' processed file was modified when it shouldn't have been for exam: " + subj_series[
                'exam']
    # Test batch overwrite: the files *do* get reprocessed if older than when the batch started
    params['batch_begin_time'] = datetime.datetime.now(datetime.timezone.utc)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] > params[
            'batch_begin_time'], "overwrite:'batch' processed file was not modified when it should have been for exam: " + subj_series[
                'exam']
def test_processing_functions():
    """
    test results of the processing functions

    Checks exam-specific metrics and transcripts for one known visit,
    assuming the processed data was already regenerated by the earlier
    test (overwrite is False here).
    :return:
    """
    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"
    # todo: add each exam as it is finished
    exams_to_test = [
        'TrailMaking2', 'CategoryFluency', 'LetterFluency', 'TrailMaking'
    ]  #, 'Prosaccade']
    # note: assumption is that the processing was already re-done in the earlier test function.
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': False
    }
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    processed_df = processed_df[processed_df['exam'].apply(
        lambda w: w in exams_to_test)]
    for idx, subj_series in processed_df.iterrows():
        if subj_series['exam'] == 'TrailMaking':
            assert subj_series['processed']['metrics'][
                'error_count'] == 13, "trailmaking produced wrong error count"
            assert subj_series['processed']['metrics'][
                'repeat_count'] == 14, "trailmaking produced wrong repeat count"
            assert subj_series['processed'][
                'active time'] == 22.0552, "trailmaking produced wrong active time"
        elif subj_series['exam'] == 'TrailMaking2':
            assert subj_series['processed']['metrics'][
                'error_count'] == 16, "trailmaking2 produced wrong error count"
            assert subj_series['processed']['metrics'][
                'repeat_count'] == 11, "trailmaking2 produced wrong repeat count"
            assert subj_series['processed'][
                'active time'] == 19.5775, "trailmaking2 produced wrong active time"
        elif subj_series['exam'] == 'CategoryFluency':
            assert subj_series['processed']['data']['responses'].iloc[0][
                'transcript'] == 'cucumbers', "transcript first word incorrect"
            assert subj_series['processed']['data']['responses'].iloc[1][
                'transcript'] == 'carrots', "transcript second word incorrect"
            # bugfix: this message said "second word" — it checks the third
            assert subj_series['processed']['data']['responses'].iloc[2][
                'transcript'] == 'celery', "transcript third word incorrect"
            assert subj_series['processed']['metrics'][
                'num_correct'] == 6, "number correct wrong"
        elif subj_series['exam'] == 'LetterFluency':
            assert subj_series['processed']['data']['responses'].iloc[0][
                'transcript'] == 'apples'
            assert subj_series['processed']['data']['responses'].iloc[1][
                'transcript'] == 'asparagus'
            assert subj_series['processed']['data']['responses'].iloc[2][
                'transcript'] == 'australia'
            assert subj_series['processed']['metrics']['num_correct'] == 4
def update_controls_processed_pickle(exam):
    """Refresh the control-data pickle for one exam with any newer visit_exams.

    Compares the subjects in the on-disk control pickle against the current
    database, reprocesses only subjects that are new or have a newer exam,
    and returns the merged set keeping the most recent exam per subject.

    :param exam: exam name whose control file should be updated
    :raises Exception: when running against the staging host
    :return: DataFrame of old + newly processed control exams (newest per
        subject), or None if no control pickle is registered for the exam
    """
    host = utils_db.get_host()
    if host == 'staging':
        raise Exception("Cannot update controls from staging")
    if exam not in EXAM_CONTROL_FILE_DICT:
        logger.warning(
            'No control pickle file to update for exam. Please check exam and generate pickle if necessary'
        )
        return

    # open the old control file; the original leaked the handle via
    # pickle.load(open(...)) with no close
    filepath = './.control_data/' + EXAM_CONTROL_FILE_DICT[exam]
    with open(filepath, "rb") as f:
        exams_on_file = pickle.load(f)

    # take out the processed column so that we can compare to get_visit_exams
    without_processed_column = exams_on_file.iloc[:, 0:-1]
    # get all visit_exams
    current_visit_exams_df = utils_db.get_visit_exams(get_visits(n=np.inf))
    # get the most recent exam of each type per user
    subset = filter_most_recent_exams(current_visit_exams_df)
    # filter out the exam that we're looking for
    subset = subset[subset['exam'] == exam]

    # concatenate them together and drop any rows that are exactly the same
    check_differences = pd.concat([without_processed_column, subset],
                                  sort=False).drop_duplicates(keep=False)
    # drop older versions of the exam for each subject
    check_differences = check_differences.sort_values(
        'visit_exam_id',
        ascending=False).drop_duplicates(subset=['subject_id'],
                                         keep='last').sort_index()

    # for each row of the new results check to see if the subject_id exists in
    # the old one; if so, check whether it's a newer exam and queue it for
    # load_processed. Rows are collected in a list because DataFrame.append
    # was removed in pandas 2.0.
    new_rows = []
    for i, result in check_differences.iterrows():
        if result['subject_id'] in exams_on_file['subject_id'].values:
            subject_in_old = exams_on_file.loc[exams_on_file['subject_id'] ==
                                               result['subject_id']]
            # a differing created_date means the subject has a newer exam
            if result['created_date'] != subject_in_old.iloc[0][
                    'created_date']:
                new_rows.append(result)
        else:
            new_rows.append(result)
    new_exams = pd.DataFrame(new_rows)

    # load the new processed_data
    new_processed = utils_db.load_visit_exams_processed(new_exams)
    # concatenate the new and old exams and drop older versions of the exam
    # for each subject
    old_and_new_exams = pd.concat([exams_on_file, new_processed])
    old_and_new_exams = old_and_new_exams.sort_values(
        'visit_exam_id',
        ascending=False).drop_duplicates(subset=['subject_id'],
                                         keep='last').sort_index()
    # TODO: FIGURE OUT WHAT TO DO WITH THE OLD CONTROL FILES (need to keep for FDA probably)
    return old_and_new_exams