def get_units(cols_input):
    """Return the unit label belonging to every column in *cols_input*.

    Columns that are not present in the numeric-variable lookup keep an
    empty string. For the other columns the unit is the 'Option Name' of
    the option whose conversion factor in the lookup table equals 1.0.

    The result is a plain list with one entry per element of *cols_input*.
    """
    # connect to castor api to fetch information on variable lists
    settings = configparser.ConfigParser()
    settings.read('../user_settings.ini')  # create this once, never upload
    credentials = settings['CastorCredentials']
    c = ca.CastorApi(credentials['local_private_path'])
    c.select_study_by_name(credentials['study_name'])
    option_groups = c.request_study_export_optiongroups()
    structure = c.request_study_export_structure()

    names = pd.Series(cols_input)
    units = pd.Series(cols_input)
    units[:] = ''

    lookup_dict, numeric_vars = get_unit_lookup_dict()
    for name in names.to_list():
        if name not in numeric_vars:
            continue
        unit_variable = numeric_vars[name]
        for option_value, factor in lookup_dict[unit_variable].items():
            # the option with 1.0 as conversion factor is the one in use
            if factor != 1.0:
                continue
            group_ids = structure.loc[
                structure['Field Variable Name'] == unit_variable,
                'Field Option Group']
            group_options = option_groups.loc[
                option_groups['Option Group Id'] == group_ids.values[0],
                ['Option Name', 'Option Value']]
            is_selected = (
                group_options['Option Value'].values.astype(int)
                == option_value)
            label = group_options.loc[is_selected, 'Option Name']
            units.loc[names == name] = label.values[0]
    return units.to_list()
def setUp(self):
    """Build the Castor API client that the test methods use as ``self.c``.

    The client id and secret are read from the environment variables
    'castor_clientid' and 'castor_secret'.
    """
    # These are dummy credentials for a fake study with a fake account;
    # the secret and client id are stored in github secrets.
    self.c = ca.CastorApi(client_id=os.getenv('castor_clientid'),
                          client_secret=os.getenv('castor_secret'))
def import_study_report_structure(path_to_api_creds=None,
                                  dailyreportsonly=True):
    """Fetch the Castor study structure and split it by form type.

    Parameters
    ----------
    path_to_api_creds : str, optional
        Private folder where the client & secret files (no extension,
        1 string only per file) from Castor are saved by the user. When
        omitted, 'local_private_path' from the 'CastorCredentials' section
        of the 'user_settings.ini' next to this module is used.
        See also:
        https://helpdesk.castoredc.com/article/124-application-programming-interface-api
    dailyreportsonly : bool, optional
        When true (the default), only report fields whose
        'Form Collection Name' is 'Daily case record form' are returned.

    Returns
    -------
    tuple
        ``(study_structure, reports_structure, optiongroups_struct)``:
        the fields of 'Study' forms, the fields of 'Report' forms (may
        contain duplicates) and the option groups export.

    Raises
    ------
    KeyError
        If the settings file lacks the 'CastorCredentials' section or one
        of the keys that is read from it.
    """
    config = configparser.ConfigParser()
    config.read(os.path.join(os.path.dirname(__file__), 'user_settings.ini'))
    if path_to_api_creds is None:
        path_to_api_creds = config['CastorCredentials']['local_private_path']

    c = ca.CastorApi(path_to_api_creds)  # e.g. in user dir outside of GIT repo

    # select the study that is named in the settings file
    c.select_study_by_name(config['CastorCredentials']['study_name'])

    # STEP 0: collect answer options from optiongroups
    optiongroups_struct = c.request_study_export_optiongroups()

    # STEP 1: get the main study structure (i.e. questions)
    structure = c.request_study_export_structure()

    # sort on form collection order and field order
    # (this matches how data is filled in castor)
    structure_filtered = structure.sort_values(
        ['Form Collection Name', 'Form Collection Order',
         'Form Order', 'Field Order'])

    # Drop variables that have no Field Variable Name; these fields do not
    # record data. Boolean indexing returns a new frame, so the result has
    # to be assigned (it used to be computed and thrown away).
    structure_filtered = structure_filtered[
        ~structure_filtered['Field Variable Name'].isna()]

    # select only study variables
    study_structure = structure_filtered[
        structure_filtered['Form Type'].isin(['Study'])]

    # select only report variables (may contain duplicates)
    reports_structure = structure_filtered[
        structure_filtered['Form Type'].isin(['Report'])]
    if dailyreportsonly:
        reports_structure = reports_structure[
            reports_structure['Form Collection Name'].isin(
                ['Daily case record form'])]

    return study_structure, reports_structure, optiongroups_struct
def import_data_by_record(path_to_api_creds=None):
    """Load study and daily-report data record by record.

    Alternative for import_data if import_data fails due to server-side
    timeout errors (i.e. for large datasets); this alternative loops over
    the records and report instances to load the data.

    path_to_api_creds is the private folder where the client & secret files
    (no extension, 1 string only per file) from Castor are saved by the
    user; it defaults to 'local_private_path' from the 'user_settings.ini'
    next to this module. See also:
    https://helpdesk.castoredc.com/article/124-application-programming-interface-api

    Returns (df_study, df_structure_study, df_report, df_structure_report,
    df_optiongroups_structure); rows of the test institute and of archived
    records have been dropped from df_study and df_report.
    """
    settings = configparser.ConfigParser()
    settings.read(os.path.join(os.path.dirname(__file__),
                               'user_settings.ini'))
    if path_to_api_creds is None:
        path_to_api_creds = settings['CastorCredentials']['local_private_path']

    c = ca.CastorApi(path_to_api_creds)  # e.g. in user dir outside of GIT repo

    # select the study that is named in the settings file
    c.select_study_by_name(settings['CastorCredentials']['study_name'])

    exported = c.records_reports_all(report_names=['Daily'],
                                     add_including_center=True)
    (df_study, df_structure_study, df_report, df_structure_report,
     df_optiongroups_structure) = exported

    # collect the record ids to remove: everything in the (first) test
    # institute plus every archived (deleted) record
    test_institutes = [inst for inst in c.request_institutes()
                       if 'test' in inst['name'].lower()]
    test_institute = test_institutes[0]
    excluded_ids = [
        rec['record_id'] for rec in
        c.request_study_records(institute=test_institute['institute_id'])]
    excluded_ids += [rec['record_id'] for rec in c.request_study_records()
                     if rec['archived'] == 1]

    # drop the matching rows in place, first for the study frame and then
    # for the report frame
    for frame in (df_study, df_report):
        stale_index = frame.index[frame['Record Id'].isin(excluded_ids)]
        frame.drop(index=stale_index, inplace=True)

    return (df_study, df_structure_study, df_report,
            df_structure_report, df_optiongroups_structure)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 3 12:46:07 2020

@author: wouterpotters
"""
import configparser
import os

import castorapi as ca

# Credentials location and study name come from the user_settings.ini that
# lives one directory above this script.
config = configparser.ConfigParser()
config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini'))
c = ca.CastorApi(config['CastorCredentials']['local_private_path'])
study_id = c.select_study_by_name(config['CastorCredentials']['study_name'])

varname = 'Outcome'

# select AMC + VUmc + MUMC
# The institute list is requested once and reused for the selection.
institutes = c.request_institutes()
inst_amc_vumc = [inst['institute_id'] for inst in institutes
                 if inst['name'] in ('AUMC - AMC', 'AUMC - VUmc', 'MUMC')]

# Gather the records of every selected institute. Hard-coded indices 0 and 1
# would silently skip the third institute and raise IndexError when fewer
# than two of the names are present.
records = []
for institute_id in inst_amc_vumc:
    records += c.request_study_records(institute=institute_id)

# answer options and recorded values for the chosen variable
options = c.request_fieldoptiongroup(
    optiongroup_id=c.field_optiongroup_by_variable_name(varname))
values = c.field_values_by_variable_name(varname, records=records)
def import_study_report_structure(path_to_api_creds=None):
    """Fetch the Castor structure for study fields and daily-report fields.

    path_to_api_creds is the private folder where the client & secret files
    (no extension, 1 string only per file) from Castor are saved by the
    user; it defaults to 'local_private_path' from the 'user_settings.ini'
    next to this module. See also:
    https://helpdesk.castoredc.com/article/124-application-programming-interface-api

    Returns (study_structure_filtered, reports_structure_filtered,
    optiongroups_struct).
    """
    settings = configparser.ConfigParser()
    settings.read(os.path.join(os.path.dirname(__file__),
                               'user_settings.ini'))
    if path_to_api_creds is None:
        path_to_api_creds = settings['CastorCredentials']['local_private_path']

    c = ca.CastorApi(path_to_api_creds)  # e.g. in user dir outside of GIT repo

    # select the study that is named in the settings file
    c.select_study_by_name(settings['CastorCredentials']['study_name'])

    # STEP 0: answer option groups
    optiongroups_struct = c.request_study_export_optiongroups()

    # STEP 1: the main study structure (i.e. questions)
    full_structure = c.request_study_export_structure()

    kept_columns = ['Form Type', 'Form Collection Name',
                    'Form Collection Order', 'Form Name', 'Form Order',
                    'Field Variable Name', 'Field Label', 'Field ID',
                    'Field Type', 'Field Order', 'Calculation Template',
                    'Field Option Group']
    sort_keys = ['Form Order', 'Form Collection Name',
                 'Form Collection Order', 'Field Order']
    usable_field_types = ['radio', 'date', 'dropdown', 'checkbox', 'string',
                          'numeric', 'calculation', 'time']

    # keep the relevant columns only and sort on form collection order and
    # field order (this matches how data is filled in castor)
    ordered = full_structure.filter(kept_columns, axis=1) \
        .sort_values(sort_keys)

    # shared conditions: a usable field type and a non-missing variable name
    # (fields without a Field Variable Name are dropped)
    usable = (ordered['Field Type'].isin(usable_field_types)
              & ~ordered['Field Variable Name'].isna())

    # study forms only; reports can exist multiple times
    study_structure_filtered = ordered[
        usable & ordered['Form Type'].isin(['Study'])]

    # report forms, restricted to the daily case record form
    reports_structure_filtered = ordered[
        usable
        & ordered['Form Type'].isin(['Report'])
        & ordered['Form Collection Name'].isin(['Daily case record form'])]

    return (study_structure_filtered, reports_structure_filtered,
            optiongroups_struct)
def import_data(path_to_api_creds=None):
    """Export study data and daily-report data from Castor in one go.

    path_to_api_creds is the private folder where the client & secret files
    (no extension, 1 string only per file) from Castor are saved by the
    user; it defaults to 'local_private_path' from 'user_settings.ini',
    which is read relative to the current working directory. Create that
    config once using covid19_createconfig and never upload it to git.
    See also:
    https://helpdesk.castoredc.com/article/124-application-programming-interface-api

    Returns (study_data_final, study_structure_filtered, reports_data_final,
    reports_structure_filtered, optiongroups_struct):
      - study_data_final: one row per 'Record ID' (the named index), one
        column per 'Field Variable Name';
      - reports_data_final: one row per daily report instance with
        'Record ID' and 'assessment_dt' as leading columns. Record ID can
        not be the named index here, because multiple entries exist.
    """
    # STEP 0: connect to API
    settings = configparser.ConfigParser()
    settings.read('user_settings.ini')
    if path_to_api_creds is None:
        path_to_api_creds = settings['CastorCredentials']['local_private_path']

    c = ca.CastorApi(path_to_api_creds)  # e.g. in user dir outside of GIT repo

    # select the study that is named in the settings file
    c.select_study_by_name(settings['CastorCredentials']['study_name'])

    # STEP 0: collect answer options from optiongroups
    optiongroups_struct = c.request_study_export_optiongroups()

    # STEP 1: collect data from study
    # get the main study structure (i.e. questions)
    study_structure = c.request_study_export_structure()

    kept_columns = ['Form Type', 'Form Collection Name',
                    'Form Collection Order', 'Form Name', 'Form Order',
                    'Field Variable Name', 'Field Label', 'Field ID',
                    'Field Type', 'Field Order', 'Calculation Template',
                    'Field Option Group']
    sort_keys = ['Form Order', 'Form Collection Name',
                 'Form Collection Order', 'Field Order']
    usable_field_types = ['radio', 'date', 'dropdown', 'checkbox', 'string',
                          'numeric', 'calculation', 'time']

    # filter unused columns; sort on form collection order and field order
    # (this matches how data is filled in castor)
    ordered = study_structure.filter(kept_columns, axis=1) \
        .sort_values(sort_keys)

    # shared conditions: keep the usable field types and drop fields that
    # have no Field Variable Name
    usable = (ordered['Field Type'].isin(usable_field_types)
              & ~ordered['Field Variable Name'].isna())

    # keep only study forms; reports can exist multiple times
    study_structure_filtered = ordered[
        usable & ordered['Form Type'].isin(['Study'])]

    # Get study data
    study_data = c.request_study_export_data()
    study_data['Record ID'] = study_data['Record ID'].astype(str)

    # archived data (=DELETED data) is recognised by its record id prefix
    is_archived = study_data['Record ID'].str.match('^ARCHIVED-.*')

    # keep study entries only (i.e. no reports, no complications)
    study_rows = study_data[
        study_data['Form Type'].isin(['Study']) & ~is_archived]
    study_rows = study_rows.filter(
        ['Record ID', 'Field ID', 'Form Type', 'Value', 'Date'], axis=1)

    # combine study data (patients and values) and study structure
    # (variables), then turn it into one row per record
    study_long = pandas.merge(
        study_structure_filtered[['Field Variable Name', 'Field ID']],
        study_rows[['Record ID', 'Value', 'Field ID']],
        on='Field ID')
    study_data_final = study_long.pivot(
        index='Record ID', columns='Field Variable Name', values='Value')

    # STEP 2A: collect data from DAILY reports
    # raw report rows without archived records and without record ids that
    # match '000001'; rows without a form instance id are ignored as junk
    report_rows = study_data[
        study_data['Form Type'].isin(['Report'])
        & ~is_archived
        & ~study_data['Record ID'].str.match('000001')]
    report_rows = report_rows[~report_rows['Form Instance ID'].isna()]

    # problem: daily reports are dynamic, changing over time.
    # As are their ID's. On top of that people can rename the form.
    # solution: look for all reports that start with 'Daily' and find their
    # Form Instance ID. Then use that to select all reports.
    daily_ids = report_rows['Form Instance ID'][
        report_rows['Form Instance Name'].str.match('^Daily .*')].unique()
    is_daily = [instance_id in daily_ids
                for instance_id in report_rows['Form Instance ID']]
    report_rows = report_rows[is_daily]
    report_rows = report_rows.filter(
        ['Record ID', 'Field ID', 'Form Type', 'Form Instance ID',
         'Form Instance Name', 'Value', 'Date'])

    # structure of the report variables, limited to the daily case record
    # form
    reports_structure_filtered = ordered[
        usable
        & ordered['Form Type'].isin(['Report'])
        & ordered['Form Collection Name'].isin(['Daily case record form'])]

    # merge the structure and the data to get the full dataset with one row
    # per form instance
    reports_long = pandas.merge(
        reports_structure_filtered[['Field Variable Name', 'Field ID']],
        report_rows[['Record ID', 'Value', 'Form Instance ID', 'Field ID']],
        on='Field ID')
    reports_wide = reports_long.pivot(
        index='Form Instance ID', columns='Field Variable Name',
        values='Value')

    # Record ID has vanished in the pivot; add it again.
    # (probably smarter to do this using pivot_table, but that did not work)
    reports_wide = pandas.merge(
        reports_wide,
        report_rows[['Record ID', 'Form Instance ID']],
        on='Form Instance ID').drop_duplicates()

    # reorganize data to put record id and assessment date in front and
    # drop the form instance id, which is not needed
    ordered_cols = reports_wide.columns.tolist()
    for leading in ('assessment_dt', 'Record ID'):
        ordered_cols.remove(leading)
        ordered_cols.insert(0, leading)
    ordered_cols.remove('Form Instance ID')
    reports_data_final = reports_wide.reindex(columns=ordered_cols)

    # STEP 2B: collect data from COMPLICATIONS reports
    # PLEASE NOTE THAT THIS WORKS, but as of 31/3 no complications data is
    # present; hence this option is disabled.
    # if you enable it, make sure to add two outputs as well.
    if False:
        complications_struct = ordered[
            ordered['Form Type'].isin(['Report'])
            & ~ordered['Field Variable Name'].isna()
            & ordered['Form Collection Name'].isin(['Complications'])]

        # TODO: get actual complications
        # get raw data, ignore junk instances
        complications_data = study_data[
            study_data['Form Type'].isin(['Complications'])]
        complications_data_filtered = complications_data[
            ~complications_data['Form Instance ID'].isna()]

        # same approach as for the daily reports: find the form instances
        # by name and use their Form Instance ID to select the rows
        complication_ids = complications_data_filtered['Form Instance ID'][
            complications_data_filtered['Form Instance Name']
            .str.match('.*Complications.*')].unique()
        is_complication = [
            instance_id in complication_ids
            for instance_id in complications_data_filtered['Form Instance ID']]
        complications_data_filtered = \
            complications_data_filtered[is_complication]
        complications_data_filtered = complications_data_filtered.filter(
            ['Record ID', 'Field ID', 'Form Type', 'Form Instance ID',
             'Form Instance Name', 'Value', 'Date'])

    # STEP 3: CLEANUP of the intermediates that are not returned
    del (c, ordered_cols, report_rows, reports_wide, study_structure,
         study_rows, study_data, daily_ids, is_daily)

    # STEP 4: RETURN THIS DATA
    # STEP 5: (TODO) summarize data from reports and add the summary stats
    #         to study_data_final
    return (study_data_final, study_structure_filtered, reports_data_final,
            reports_structure_filtered, optiongroups_struct)
# step 2: install package castorapi # like this: https://docs.anaconda.com/anaconda/navigator/tutorials/manage-packages/#installing-a-package # # step 1 and 2 combined: # right click on the green arrow in environment, click run terminal # run: `conda install -c wouterpotters castorapi` in the terminal # store the secret and client files as described here: # https://github.com/wouterpotters/castorapi/blob/master/README.md#usage # now use the package import castorapi path_to_client_secret = r'C:/path/to/api_secret_a' # FORWARD SLASHES! OR \\ for each backward slash c = castorapi.CastorApi( path_to_client_secret) # e.g. in user dir outside of GIT repo # get study ID for Parkinson study c.select_study_by_name( 'parkinson') # change name to match study name in castor ### STEP 0: collect answer options from optiongroups # get answer option groups for multiple choice questions df_answeroptions_struct = c.request_study_export_optiongroups() # get the main study structure (i.e. questions) df_study_structure = c.request_study_export_structure() # filter unused columns from df_study_structure, sort fields df_study_structure = df_study_structure \
return study_structure, reports_structure, optiongroups_struct if __name__ == "__main__": config = configparser.ConfigParser() config.read(os.path.join(os.path.dirname(__file__), '../user_settings.ini')) path_to_api_creds = config['CastorCredentials']['local_private_path'] # input: private folder where client & secret files (no extension, # 1 string only per file) from castor are saved by the user # see also: # https://helpdesk.castoredc.com/article/124-application-programming-interface-api c = ca.CastorApi(path_to_api_creds) # e.g. in user dir outside of GIT repo # get study ID for COVID study if False: name = 'COVID-19 NL' excel_postfix = '' else: name = 'Clinical features of COVID-19 positive patients in VieCuri' excel_postfix = '_viecurie.xlsx' study_id = c.select_study_by_name(name) study_name = c.request_study(study_id=study_id)['name'] # # Get all data from Castor database (without any selection criterium) # Note that you need export rights for every individual center.
Created on Thu Mar 26 21:51:39 2020 @author: wouterpotters """ import time import statistics import castorapi as ca import configparser config = configparser.ConfigParser() config.read('../user_settings.ini') # put both the secret, client and the tokens_slack file here location_castor_slack_api_data = config['SlackAPI']['local_private_path'] c = ca.CastorApi( location_castor_slack_api_data) # e.g. in user dir outside of GIT repo # get study ID for COVID study study_id = c.select_study_by_name('COVID-19 NL') # Posting to a Slack channel def send_message_to_slack(text): from urllib import request import json post = {"text": "{0}".format(text)} try: json_data = json.dumps(post) # the tokens_slack file should contain the full URL with the token to submit data to slack