def parse_args():
    parser = GooeyParser(description='Format REDCap CBCL export for ASEBA import')

    required = parser.add_argument_group('Required Arguments')
    required.add_argument('--redcap_export', required=True, widget='FileChooser',
                          help='Demographics + CBCL export from REDCap')
    required.add_argument('--study_name', required=True, choices=['NEWT', 'NT', 'other'])
    required.add_argument('--form_type', required=True, choices=['cbcl', 'ycbcl'])
    required.add_argument('--outdir', widget='DirChooser', required=True,
                          help='where to store output zip')

    other = parser.add_argument_group(
        'Other study options (can ignore if using named study)',
        gooey_options={'columns': 1})
    other.add_argument('--varfile', widget='FileChooser',
                       help='csv file with REDCap to ASEBA mapping (see H:/REDCap Scripts/static/*cbcl_column_map.csv for examples)')
    other.add_argument('--wide', action='store_true',
                       help='if REDCap export contains multiple sessions per row (vs. each session on its own line)')

    args = parser.parse_args()
    if args.study_name == 'other' and not args.varfile:
        parser.error('Must supply varfile if not using named study')
    return args
def parse_args():
    parser = GooeyParser(
        description='Converts data in REDCap format (one row per session) to SPSS format '
                    '(one row per subject) and vice versa')
    parser.add_argument('input_file', widget='FileChooser',
                        help='csv file (full path) to be formatted (first column should contain subject ids)')
    parser.add_argument('output_file',
                        help='csv file (full path) to store formatted data (if file does not exist, it will be created)')

    args = parser.parse_args()
    if not args.input_file.endswith('.csv') or not args.output_file.endswith('.csv'):
        parser.error('Input and output files must be of type csv')
    return args
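# --- Illustrative sketch (not part of the original scripts) ----------------------
# The converter above reshapes between REDCap "long" format (one row per session)
# and SPSS "wide" format (one row per subject). The real work is done elsewhere by
# redcap_common.expand/flatten; this minimal pandas example only shows the idea.
# The column names 'study_id' and 'session', and the 's1_' prefix convention, are
# assumptions made for illustration.
import pandas as pd


def _long_to_wide_example(df):
    # one row per session -> one row per subject, columns prefixed s1_, s2_, ...
    # (assumes integer session numbers in a 'session' column)
    wide = df.set_index(['study_id', 'session']).unstack('session')
    wide.columns = ['s{}_{}'.format(ses, var) for var, ses in wide.columns]
    return wide.reset_index()


def _wide_to_long_example(df):
    # one row per subject -> one row per session
    long_df = df.set_index('study_id')
    long_df.columns = pd.MultiIndex.from_tuples(
        [tuple(col.split('_', 1)[::-1]) for col in long_df.columns],  # ('var', 's1') pairs
        names=[None, 'session'])
    return long_df.stack('session').reset_index()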
def parse_args():
    parser = GooeyParser(description='Format ASEBA score export for REDCap import')

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--aseba_export', required=True, widget='FileChooser',
                          help='Excel scores export from ASEBA')
    required.add_argument('--study_name', required=True, choices=['NEWT', 'NT', 'other'])
    required.add_argument('--form_type', required=True, choices=['cbcl', 'ycbcl'])

    other = parser.add_argument_group(
        'Other study options (can ignore if using named study)',
        gooey_options={'columns': 1})
    other.add_argument('--varfile', widget='FileChooser',
                       help='csv file with REDCap to ASEBA mapping (see H:/REDCap Scripts/static/*cbcl_column_map.csv for examples)')
    other.add_argument('--wide', action='store_true',
                       help='if REDCap has multiple sessions per row (vs. each session on its own line)')

    args = parser.parse_args()
    if args.study_name == 'other' and not args.varfile:
        parser.error('Must supply varfile if not using named study')
    return args
def format_track_data():
    # set up expected arguments and associated help text
    parser = GooeyParser(description='Formats TRACK data from REDCap csv export')

    required = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required.add_argument('--input_file', required=True, widget='FileChooser',
                          help='REDCap export file')
    required.add_argument('--output_file', required=True, widget='FileChooser',
                          help='CSV file to store formatted data in')
    required.add_argument('--api_password', required=True, widget='PasswordField',
                          help='Password to access API token')

    optional = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 2})
    optional.add_argument('-c', '--consecutive', type=int, metavar='num_consecutive_years',
                          help='Limit results to participants with data for a number of consecutive years')
    optional.add_argument('-d', '--duration', action='store_true',
                          help='Calculate diabetes diagnosis duration')

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) '
        'participants must have data for in export',
        gooey_options={'columns': 1, 'show_border': True})
    variable_options.add_argument('--all', nargs='+', default=None,
                                  help='All specified data points required for participant to be included in result')
    variable_options.add_argument('--any', nargs='+', default=None,
                                  help='At least one specified data point required for participant to be included in result')

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={'columns': 2, 'show_border': True})
    format_options.add_argument('-e', '--expand', action='store_true',
                                help='Arrange data with one row per subject per session')
    format_options.add_argument('-t', '--transpose', action='store_true', help='Transpose the data')

    args = parser.parse_args()
    if not args.input_file.endswith('.csv') or not args.output_file.endswith('.csv'):
        parser.error('Input and output files must be of type csv')

    # create initial dataframe structure
    df = redcap_common.create_df(args.input_file)
    df = df[df[TRACK_STUDY_ID].str.contains(r'TRACK\d+')]  # remove test rows

    project = None
    if any(arg is not None for arg in [args.all, args.any, args.duration, args.consecutive]):
        project = redcap_common.get_redcap_project('track', args.api_password)

    if args.all:
        df = redcap_common.check_for_all(df, args.all, project)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project)

    fields = None
    if args.duration or args.consecutive:
        fields = redcap_common.get_matching_columns(project.field_names,
                                                    r'\w*(' + '|'.join(DURATION_FIELDS) + ')')
        df = redcap_common.merge_api_data(df, project, fields, [TRACK_STUDY_ID])

    # expand/rename after api merge to ensure column names match up
    df, non_session_cols = redcap_common.expand(df.set_index(TRACK_STUDY_ID))
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    df[redcap_common.SESSION_DATE] = pd.to_datetime(df[redcap_common.SESSION_DATE])
    df = df[pd.notnull(df[redcap_common.SESSION_DATE])]  # remove rows for non-attended sessions

    if args.duration:
        df = redcap_common.prepare_age_calc(df)
        dx_vars = {'dx_date': 'db_dx_date', 'dx_age': 'db_onset_age'}
        df['db_dx_date'] = pd.to_datetime(df['db_dx_date'])
        dx_age_df = df.loc[df[redcap_common.SESSION_NUMBER] == 's1'].apply(
            redcap_common.get_diagnosis_age, args=(dx_vars,), axis=1)
        df = df.groupby([redcap_common.STUDY_ID]).apply(
            redcap_common.calculate_diagnosis_duration, 'db', dx_age_df)
        df = df.drop('session_age', axis=1)

    if args.consecutive:
        df[redcap_common.SESSION_YEAR] = df[redcap_common.SESSION_DATE].apply(
            lambda x: x.year if x else None)
        df = df.groupby([redcap_common.STUDY_ID]).apply(
            redcap_common.get_consecutive_years, args.consecutive)
        df = df.drop([redcap_common.SESSION_YEAR], axis=1)

    df = redcap_common.rename_common_columns(df, RENAMES, True)  # rename common columns back to original names pre-flattening
    df = df.set_index([TRACK_STUDY_ID, redcap_common.SESSION_NUMBER])

    if not args.expand:
        non_session_cols = {col: 's1_' + col for col in df.columns if not re.match(r's\d_', col)}
        df = df.rename(columns=non_session_cols)
        df = redcap_common.flatten(df)  # always reflatten at end, unless expand flag is set

    if args.transpose:
        df = df.transpose()

    # clean up dataframe
    revert_non_session_cols = {'s1_' + col: col for col in non_session_cols.keys()}
    df = df.rename(columns=revert_non_session_cols)
    if fields:
        drop_fields = fields if not args.expand else DURATION_FIELDS  # if leaving expanded, then the columns we brought in don't match the current columns
        df = redcap_common.cleanup_api_merge(df, drop_fields)

    redcap_common.write_results_and_open(df, args.output_file)
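# --- Illustrative sketch (not part of the original scripts) ----------------------
# The diabetes-duration step above boils down to: take each participant's diagnosis
# age from their first (s1) session and subtract it from their age at every session.
# This hypothetical groupby-apply helper shows the shape of that calculation; the
# column names 'session_number', 'session_age', 'db_onset_age', and 'db_duration'
# are assumptions, and the real logic lives in redcap_common.calculate_diagnosis_duration.
def _duration_example(subject_df):
    # subject_df: all rows for one participant, as produced by df.groupby(STUDY_ID)
    dx_age = subject_df.loc[subject_df['session_number'] == 's1', 'db_onset_age'].iloc[0]
    subject_df = subject_df.copy()
    subject_df['db_duration'] = subject_df['session_age'] - dx_age
    return subject_df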
def format_wolfram_data():
    # set up expected arguments and associated help text
    parser = GooeyParser(description='Formats Wolfram data from REDCap csv export')

    required = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required.add_argument('--input_file', required=True, widget='FileChooser',
                          help='REDCap export file')
    # required.add_argument('--output_file', required=True, widget='FileChooser', help='CSV file to store formatted data in')

    optional = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 1})
    optional.add_argument('-c', '--consecutive', type=int, metavar='num_consecutive_years',
                          help='Limit results to participants with data for a number of consecutive years')
    optional.add_argument('-d', '--duration', nargs='*', dest='dx_types', widget='Listbox',
                          default=None, choices=ALL_DX_TYPES,
                          help='Calculate diagnosis duration for specified diagnosis types')
    # optional.add_argument('--duration-type', dest='duration_type', default='clinic date', choices=['clinic date','MRI date','MRI date if available, otherwise clinic date ("mri_or_clinic")'], help='Visit date to use when calculating dx durations')
    optional.add_argument('--duration-type', dest='duration_type', default='clinic date',
                          choices=['clinic date', 'MRI date'],
                          help='Visit date to use when calculating dx durations')
    optional.add_argument('--drop_non_mri', action='store_true',
                          help='Drop all sessions that do not have an "mri_date" entry.')
    optional.add_argument('--old-db', action='store_true',
                          help='whether data was sourced from old Wolfram database')
    optional.add_argument('--api_token', widget='PasswordField',
                          help='REDCap API token (if not specified, will not pull anything from REDCap)')

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) '
        'participants must have data for in export',
        gooey_options={'columns': 1, 'show_border': True})
    variable_options.add_argument('--all', nargs='+', default=None,
                                  help='All specified data points required for participant to be included in result')
    variable_options.add_argument('--any', nargs='+', default=None,
                                  help='At least one specified data point required for participant to be included in result')

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={'columns': 2, 'show_border': True})
    format_options.add_argument('-f', '--flatten', action='store_true',
                                help='Arrange all session data in single row for participant')
    format_options.add_argument('--flatten_by', default='session number',
                                choices=['session number', 'clinic year'],
                                help='Flatten data by session number or clinic year')
    format_options.add_argument('-t', '--transpose', action='store_true', help='Transpose the data')
    format_options.add_argument('-s', '--sort_by', default='variable',
                                choices=['variable', 'session'],
                                help='Sort flattened data by session or variable')

    args = parser.parse_args()

    dur_label = ''
    flatten_label = ''
    mri_label = ''

    if not args.old_db:
        print('### "old_db" not checked, only pulling data from the "new" database ###')
    if not args.input_file.endswith('.csv'):
        parser.error('Input file must be of type csv')

    # create dataframe from REDCap data
    df = redcap_common.create_df(args.input_file)
    df = df[df[WFS_STUDY_ID].str.contains(r'WOLF_\d{4}_.+')]  # remove Test and Wolf_AN rows

    num_clinic_years = len(df['redcap_event_name'].unique()) - 1  # FIXME: should be counting max number of sessions for participants (still may cause error because they might not be consecutive)
    print('### Number of clinic years detected = {} ###'.format(num_clinic_years))

    # get number of subjects in dataframe
    num_subjects = len(df[WFS_STUDY_ID].unique())
    print('### Number of subjects detected in {} = {} ###'.format(args.input_file, num_subjects))

    # only create API project if actions require it and data needed is not already present, AND if API token is given
    project = None

    # check for fields missing from csv df
    fields = [WFS_SESSION_NUMBER, WFS_CLINIC_YEAR] if WFS_SESSION_NUMBER not in df.columns else []  # always need to get session number if not in data (used to determine which rows to keep)
    if MISSED_SESSION not in df.columns:
        fields.append(MISSED_SESSION)  # need missed_session var to remove rows for unattended session
    if args.dx_types is not None:
        for dx_type in args.dx_types:
            dx_age_field = get_dx_column(dx_type, 'best_age_calc')
            if dx_age_field not in df.columns:
                fields.append(dx_age_field)
        for non_dx_field in NON_DX_FIELDS_FOR_DURATION:
            if non_dx_field not in df.columns:
                fields.append(non_dx_field)

    if fields:  # missing some fields, go get them from REDCap
        print('### need to get some fields from REDCap ###')
        if not args.api_token:
            raise RuntimeError('There are missing fields in the input csv, so we need to get data from REDCap, '
                               'but no API token is given. Ask Jon about REDCap API access.')
        else:
            redcap_project_key = 'itrack' if not args.old_db else 'wolfram'
            project = project if project else redcap_common.get_redcap_project(redcap_project_key, args.api_token)
            df = redcap_common.merge_api_data(df, project, fields, [WFS_STUDY_ID, 'redcap_event_name'])

    # rename common columns after api merge to ensure column names match up
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    if args.consecutive is not None and args.consecutive not in range(2, num_clinic_years + 1):
        parser.error('Consecutive years must be greater than 1 and cannot exceed number of clinic years ({})'
                     .format(num_clinic_years))

    # keep the stable-patient-characteristics rows by filling their missing session number with 0
    df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'), [redcap_common.SESSION_NUMBER]] = \
        df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'), [redcap_common.SESSION_NUMBER]].fillna(0)

    # remove rows for sessions not attended (will have a flag saying they did not attend)
    df = df[pd.notnull(df[redcap_common.SESSION_NUMBER])]
    df = df[pd.isnull(df[MISSED_SESSION])]
    df[redcap_common.SESSION_NUMBER] = df[redcap_common.SESSION_NUMBER].astype(int)  # once NaNs are gone, we can cast as int (nicer for flatten display)

    # if duration argument specified, calculate diagnosis duration for types specified or all (if none specified)
    if args.dx_types is not None:  # explicit None check because empty array is valid
        # this puts a 'session_age' field into the df using dob and session_date (where session_date is from clinic_date)
        df = redcap_common.prepare_age_calc(df)
        df = mri_age_calc(df)
        if args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
            df['mri_or_clinic_age'] = df.apply(lambda row: select_best_age(row), axis=1)
        for dx_type in args.dx_types:
            dx_vars = {'dx_age': get_dx_column(dx_type, 'best_age_calc')}
            # df[dx_vars['dx_date']] = pd.to_datetime(df[dx_vars['dx_date']], errors='coerce')
            dx_age_df = df.loc[df['redcap_event_name'] == 'stable_patient_cha_arm_1'].apply(
                redcap_common.get_diagnosis_age, args=(dx_vars,), axis=1)
            if args.duration_type == 'clinic date':
                dur_label = '_clinic_duration'
                dx_type_clinic = '_'.join([dx_type, 'clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_clinic, dx_age_df, 'session_age')
                dx_dur_field = get_dx_column(dx_type, 'clinic_duration')
                df.loc[~(df[dx_dur_field] > 0), dx_dur_field] = np.nan
            elif args.duration_type == 'MRI date':
                dur_label = '_mri_duration'
                dx_type_mri = '_'.join([dx_type, 'mri'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri, dx_age_df, 'mri_age')
                dx_mri_dur_field = get_dx_column(dx_type, 'mri_duration')
                df.loc[~(df[dx_mri_dur_field] > 0), dx_mri_dur_field] = np.nan
            elif args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
                dur_label = '_mri_or_clinic_duration'
                dx_type_mri_or_clinic = '_'.join([dx_type, 'mri_or_clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri_or_clinic, dx_age_df, 'mri_or_clinic_age')
                dx_best_dur_field = get_dx_column(dx_type, 'mri_or_clinic_duration')
                df.loc[~(df[dx_best_dur_field] > 0), dx_best_dur_field] = np.nan
            else:
                raise Exception('ERROR: dx_types chosen, but no duration_type chosen')
        # df = df.drop(['session_age', 'redcap_event_name'], axis=1)

    # if variables are specified, filter out rows that don't have data for them (if null or non-numeric)
    if args.all:
        df = redcap_common.check_for_all(df, args.all, project, True)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project, True)

    # remove session data for participants that did not occur in consecutive years
    if args.consecutive:
        df = df.groupby([redcap_common.STUDY_ID]).apply(redcap_common.get_consecutive_years, args.consecutive)

    if df.empty:
        stderr.write('No data to return. Selections have filtered out all rows.')
        exit(1)

    # add clinic_year
    df['clinic_year'] = df.apply(lambda row: get_clinic_year(row), axis=1)

    # rename common columns back to original names
    df = redcap_common.rename_common_columns(df, RENAMES, True)

    # if we have brought in dx info/demographics from the API, remove it after the calculation and rename columns that were suffixed due to merge
    if fields != [WFS_SESSION_NUMBER, WFS_CLINIC_YEAR] and args.api_token:  # don't need to go through deletion logic if only field is session number
        if WFS_SESSION_NUMBER in fields:
            fields.remove(WFS_SESSION_NUMBER)  # remove session number from fields
        df = redcap_common.cleanup_api_merge(df, fields)

    # rename session_age to clinic_age
    df = df.rename(columns={'session_age': 'clinic_age'})

    # remove dob, clinic date and MRI date
    df = df.drop(['dob'], axis=1, errors='ignore')
    df = df.drop(['clinic_date'], axis=1, errors='ignore')
    df = df.drop(['mri_date'], axis=1, errors='ignore')
    df = df.drop(['redcap_event_name'], axis=1, errors='ignore')

    # drop non-MRI sessions
    if args.drop_non_mri:
        df = df[(df[MRI_AGE] > 0.0) | (df['clinic_year'] == 0)]
        mri_label = '_just_mri'

    # df.to_csv(r'C:\temp\df_before_flatten.csv')

    # puts all sessions/clinic years for a participant on one line (suffixed with year/session)
    if args.flatten:
        # multi-index column for flattening
        if args.flatten_by == 'session number':
            flatten_by_column = 'wolfram_sessionnumber'
            flatten_label = '_flattened_by_session'
            # df.set_index([redcap_common.STUDY_ID, redcap_common.SESSION_NUMBER], inplace=True)
            flatten_group_prefix = 's'
        elif args.flatten_by == 'clinic year':
            flatten_by_column = 'clinic_year'
            flatten_label = '_flattened_by_clinic'
            # df.set_index([redcap_common.STUDY_ID, 'clinic_year'], inplace=True)
            flatten_group_prefix = 'c'
        else:
            raise Exception('ERROR: flatten_by check failed')
        sort = args.sort_by == 'session'
        df = redcap_common.flatten(df, flatten_by_column, sort, flatten_group_prefix)

    if args.transpose:
        df = df.transpose()

    # df.to_csv(r'C:\temp\df_right_before_save.csv')

    # make output_file name
    output_file = args.input_file.replace('.csv', '{}{}{}.csv'.format(dur_label, flatten_label, mri_label))
    redcap_common.write_results_and_open(df, output_file)
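# --- Illustrative sketch (not part of the original scripts) ----------------------
# Both the TRACK and Wolfram scripts filter participants by consecutive years of data
# via redcap_common.get_consecutive_years. A minimal version of that check, under the
# assumption that it only needs to find a run of n consecutive calendar years, could
# look like this (the helper name and signature are hypothetical):
def _has_n_consecutive_years(years, n):
    """Return True if the iterable of clinic years contains a run of n consecutive years."""
    years = sorted({int(y) for y in years if y == y})  # y == y drops NaN values
    if n <= 1:
        return bool(years)
    run = 1
    for prev, cur in zip(years, years[1:]):
        run = run + 1 if cur == prev + 1 else 1
        if run >= n:
            return True
    return False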
def dot_dbs_import():
    parser = GooeyParser(description='Formats kinematics data for REDCap import')
    required_group = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required_group.add_argument('--folder', widget='DirChooser', required=True,
                                help='Folder containing subject directories to be processed')
    optional_group = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 1})
    optional_group.add_argument('-s', '--subjects', nargs='+',
                                help='Space-separated list of subject ids to run for (if blank, runs all in folder)')

    args = parser.parse_args()
    if not exists(args.folder):
        parser.error('Specified folder does not exist.')

    script_dir = getcwd()
    chdir(args.folder)

    result = None
    subject_dirs = [d for d in listdir(getcwd()) if re.match(r'DOTDBS(\d)+$', d)] if not args.subjects else args.subjects
    if not subject_dirs:
        stderr.write('No subject directories found matching pattern: {}'.format('DOTDBS##'))
        exit(1)

    for i, subject in enumerate(subject_dirs):
        chdir(subject)
        summary_files = [f for f in listdir(getcwd()) if f.endswith('.xlsx')]
        if not summary_files:
            print('Subject {} does not have a summary file'.format(subject))
            chdir('..')
            continue

        subject_df = pd.read_excel(summary_files[0], index_col=0, sheet_name=[0, 1, 2])
        temp_df = None
        for sheet in subject_df.keys():
            print('Processing: {}, block {}'.format(subject, sheet + 1))
            subject_df[sheet] = subject_df[sheet][subject_df[sheet].index.notnull()]
            measures = subject_df[sheet].index
            measures = [''.join([w[0].lower() for w in s]) for s in [titlecase(m).split() for m in measures]]  # get strings of lowercased first letters for each word in each measure
            measures = ['_'.join([m[:2], m[2:]]).replace('rt', 'at').replace('ft_pvcov', 'ft_pvcv').replace('b11h', '') for m in measures]  # split var into action type and measure and rename mismatched var names
            subject_df[sheet].index = measures
            subject_df[sheet].insert(0, 'record_id', subject)
            subject_df[sheet].insert(1, 'block', 'Block' + str(sheet + 1))
            subject_df[sheet].set_index(['record_id', 'block'], append=True, inplace=True)
            subject_df[sheet] = subject_df[sheet].reorder_levels([1, 2, 0])
            subject_df[sheet] = subject_df[sheet].dropna(axis=1, how='all')
            subject_df[sheet].columns = [col.lower() for col in subject_df[sheet].columns]
            subject_df[sheet] = subject_df[sheet].rename(columns={'left avg': 'Left', 'right avg': 'Right'})
            subject_df[sheet] = subject_df[sheet].drop(
                [col for col in subject_df[sheet].columns if col not in ['Left', 'Right']], axis=1)
            subject_df[sheet] = subject_df[sheet].unstack([1, 2])  # .sort_index(1, level=1)
            subject_df[sheet].columns = [('_'.join([tup[1], tup[2], tup[0]])).lower() for tup in subject_df[sheet].columns]
            temp_df = subject_df[sheet] if sheet == 0 else pd.concat([temp_df, subject_df[sheet]], axis=1)

        result = temp_df if i == 0 else pd.concat([result, temp_df], axis=0)
        chdir('..')

    ft_mpv_columns = {col: col.replace('right', 'left_right') for col in result.columns if col.endswith('ft_mpv_right')}
    result.rename(columns=ft_mpv_columns, inplace=True)  # hack to fix deviation from regular naming convention ('_left_right' instead of '_right')
    # unused_cols = [ col for col in result.columns if any(var in col for var in UNUSED_VARS) ]
    # result.drop(unused_cols, axis=1, inplace=True)

    for i in range(1, 4):
        result['_'.join(['block', str(i), 'kinematics_complete'])] = 1

    redcap_common.write_results_and_open(result, join(script_dir, 'formatted_dotdbs.csv'))
    return