def create_summary_diagnosis(join_tables_output):
    """Kedro node: build the summary discharge-diagnosis table.

    Args:
        join_tables_output: result dict of the upstream join node, or None
            if that node did not complete.

    Returns:
        dict with a success status (so Kedro does not raise a data error
        for a node with no output), or None when the upstream node failed.

    On any unexpected error the failure is appended to the cron log and
    the process exits with status 1.
    """
    try:
        # Only run once the upstream join node reported success.
        if join_tables_output is not None:
            sql_script = summary_discharge_diagnosis_query()
            inject_sql(sql_script, "create-summary-diagnosis")
            # Add Return Value For Kedro Not To Throw Data Error
            return dict(status='Success',
                        message="Creating Summary Diagnosis Complete")
        logging.error(
            "Creating Summary Diagnosis Did Not Execute To Completion")
        return None
    except Exception as e:
        logging.error("!!! An error occured creating summary diagnosis: ")
        # `with` guarantees the cron log is closed even if write() fails.
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0} Instance: {1} Status: Failed Stage: Creating Summary Diagnosis "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def create_summary_vitalsigns():
    """Kedro node: build the vital-signs summary tables.

    Runs the overall summary query plus the day-1/2/3 breakdowns, but only
    when derived.vitalsigns exists and contains data. Returns nothing; on
    any unexpected error the failure is appended to the cron log and the
    process exits with status 1.
    """
    vital_signs_count = 0
    try:
        if table_exists('derived', 'vitalsigns'):
            vital_signs_count = table_data_count('derived', 'vitalsigns')
            if vital_signs_count > 0:
                # (script, label) pairs: overall summary plus day 1-3 views.
                scripts = [
                    (summary_vital_signs_query(),
                     "create-summary-vital-signs"),
                    (summary_day_one_vitals_query(),
                     "create-summary-day1-vital-signs"),
                    (summary_day_two_vitals_query(),
                     "create-summary-day2-vital-signs"),
                    (summary_day_three_vitals_query(),
                     "create-summary-day3-vital-signs"),
                ]
                for sql_script, label in scripts:
                    inject_sql(sql_script, label)
    except Exception as e:
        logging.error("!!! An error occured creating Vital Signs Summaries: ")
        # BUG FIX: the failure stage was previously logged as
        # "Creating Summary Maternal Outcomes" (copy-paste from another node).
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0} Instance: {1} Status: Failed Stage: Creating Summary Vital Signs "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def create_summary_baseline(join_tables_output):
    """Kedro node: build the summary baseline table.

    Args:
        join_tables_output: result dict of the upstream join node, or None
            if that node did not complete.

    Returns:
        dict(status='Success', ...) on success, dict(status='Skipped') when
        derived.baseline does not exist, or None when the upstream node
        failed. Logs to the cron log and exits with status 1 on error.
    """
    try:
        # Test if the table exists before executing the query.
        if not table_exists('derived', 'baseline'):
            return dict(status='Skipped')
        # Only run once the upstream join node reported success.
        if join_tables_output is not None:
            sql_script = summary_baseline_query()
            inject_sql(sql_script, "create-summary-baseline")
            # Add Return Value For Kedro Not To Throw Data Error
            return dict(status='Success',
                        message="Creating Summary Baseline Complete")
        logging.error(
            "Creating Summary Baseline Did Not Execute To Completion")
        return None
    except Exception as e:
        logging.error("!!! An error occured creating summary baseline: ")
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0} Instance: {1} Status: Failed Stage: Creating Summary Baseline "
                .format(cron_time, mode))
        # BUG FIX: an unconditional `raise e` here made the log line and
        # sys.exit(1) below unreachable, and was inconsistent with every
        # sibling node, which logs the error and exits with status 1.
        logging.error(formatError(e))
        sys.exit(1)
def create_maternal_completeness_summary():
    """Kedro node: build the maternal-completeness summary table.

    Runs the summary query only when derived.maternity_completeness exists
    and contains data. Returns nothing; on any unexpected error the failure
    is appended to the cron log and the process exits with status 1.
    """
    try:
        maternal_completeness_count = 0
        if table_exists('derived', 'maternity_completeness'):
            maternal_completeness_count = table_data_count(
                'derived', 'maternity_completeness')
            if maternal_completeness_count > 0:
                sql_script = summary_maternal_completeness_query()
                inject_sql(sql_script, "create-summary-maternal-completeness")
    except Exception as e:
        # BUG FIX: this message previously said "Vital Signs Summaries"
        # (copy-pasted from create_summary_vitalsigns).
        logging.error(
            "!!! An error occured creating Maternal Completeness Summary: ")
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0} Instance: {1} Status: Failed Stage: Creating Summary Maternal Completeness "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def grant_privileges(create_summary_counts_output):
    """Kedro node: grant usage privileges on the generated tables and log
    total pipeline execution time.

    Args:
        create_summary_counts_output: truthy result of the upstream node;
            the grant only runs when that node completed.

    Returns:
        dict with a success status (so Kedro does not throw a data error),
        or None when the upstream node did not complete.

    NOTE(review): unlike the sibling nodes this function writes to a
    module-level, already-open `cron_log` handle (not opened here) and
    reads module-level `start`, `cron_time` and `mode` — confirm those are
    set before this node runs. Also, if an exception occurs *after*
    cron_log.close() in the success path, the except-branch write would
    raise on a closed file — TODO confirm intended.
    """
    try:
        #Test If Previous Node Has Completed Successfully
        if create_summary_counts_output:
            sql_script = grant_usage_query()
            inject_sql_procedure(sql_script, "grant-usage-on-tables")
            # Measure wall-clock time since the module-level `start` stamp.
            end = time.time()
            execution_time = end-start
            execution_time_seconds = 0
            execution_time_minutes = 0
            if execution_time > 0:
                # Split elapsed seconds into whole minutes + remainder.
                execution_time_minutes = round(execution_time//60)
                execution_time_seconds = round(execution_time % 60)
            cron_log.write("StartTime: {0} Instance: {1} Status: Success ExecutionTime: {2} mins {3} seconds \n".format(cron_time,mode,execution_time_minutes,execution_time_seconds))
            cron_log.close()
            #Add Return Value For Kedro Not To Throw Data Error
            return dict(
                status='Success',
                message = "Granting Priviledges Complete"
            )
        else:
            logging.error(
                "Granting Priviledges Complete Did Not Execute To Completion")
            return None
    except Exception as e:
        logging.error(
            "!!! An error occured Granting Priviledges: ")
        cron_log.write("StartTime: {0} Instance: {1} Status: Failed Stage: Granting Privileges".format(cron_time,mode))
        cron_log.close()
        logging.error(formatError(e))
        sys.exit(1)
def inject_sql_procedure(sql_script, file_name):
    """Execute `sql_script` against the shared SQLAlchemy engine in
    AUTOCOMMIT mode.

    Args:
        sql_script: SQL text to execute.
        file_name: human-readable label used only in the success log line.

    Exits the process (sys.exit()) if execution fails.
    """
    try:
        # BUG FIX: the Connection returned by engine.connect() was never
        # closed; the context manager returns it to the pool in all cases.
        with engine.connect() as connection:
            connection.execution_options(
                isolation_level="AUTOCOMMIT").execute(sql_script)
    except Exception as e:
        logging.error('Something went wrong with the SQL file')
        logging.error(formatError(e))
        sys.exit()
    logging.info('... {0} has successfully run'.format(file_name))
def manually_fix_admissions(tidy_data_output):
    """Kedro node: apply manual data corrections to the admissions table.

    Args:
        tidy_data_output: result of the upstream tidy node, or None if it
            did not complete.

    Returns:
        dict with a success status (so Kedro does not throw a data error),
        or None when the upstream node failed. Logs to the cron log and
        exits with status 1 on unexpected error.
    """
    try:
        # Only run once the upstream tidy node reported success.
        if tidy_data_output is not None:
            sql_script = manually_fix_admissions_query()
            inject_sql(sql_script, "manually-fix-admissions")
            # Add Return Value For Kedro Not To Throw Data Error
            return dict(status='Success',
                        message="Manual Fixing Of Admissions Complete")
        logging.error(
            "Manual Fixing Of Admissions Did Not Execute To Completion")
        return None
    except Exception as e:
        logging.error("!!! An error occured manually fixing admissions: ")
        # `with` guarantees the cron log is closed even if write() fails.
        with open(cron_log_file, "a+") as cron_log:
            cron_log.write(
                "StartTime: {0} Instance: {1} Status: Failed Stage: Manually Fixing Admissions "
                .format(cron_time, mode))
        logging.error(formatError(e))
        sys.exit(1)
def restructure_new_format(k, v, mcl):
    """Unpack one new-format raw entry into a (key, value, mcl) triple.

    Args:
        k: entry key (column name).
        v: raw payload containing a 'values' dict with parallel 'label'
            and 'value' lists.
        mcl: running list of multi-value column names (mutated in place).

    Returns:
        (k, v, mcl): multi-value entries keep the whole 'values' dict and
        the key is appended to mcl for later exploding; single-value
        entries are flattened to {'label': ..., 'value': ...}; entries with
        empty label/value lists are returned unchanged.
    """
    try:
        values = v['values']
        # Multi-value column: keep the full values object, flag for exploding.
        if len(values['label']) > 1:
            mcl.append(k)
            v = values
        elif len(values['label']) > 0 and len(values['value']) > 0:
            k = str(k).strip()
            # Unpack the values object to get single values.
            v = {
                'label': values['label'][0],
                'value': values['value'][0]
            }
            # Free-text '…Oth' columns and AdmReason are also exploded later.
            if str(k).endswith('Oth') or k == "AdmReason":
                mcl.append(k)
        return k, v, mcl
    except Exception as ex:
        logging.error(v)
        logging.error(formatError(ex))
        # BUG FIX: previously fell through and implicitly returned None on
        # error, which broke 3-tuple unpacking at the call site; return the
        # inputs unchanged instead.
        return k, v, mcl
def get_key_values(data_raw):
    """Flatten raw session rows into per-row dicts of key/value entries.

    Args:
        data_raw: DataFrame of raw rows; each row has a JSON-like
            'entries' mapping plus metadata columns (uid, facility,
            timestamps, appVersion, ...).

    Returns:
        (data_new, mcl): data_new is a list of flattened dicts (one per
        row); mcl is the de-duplicated set of multi-value column names
        collected by the restructure helpers.
    """
    mcl = []
    # Will store the final list of uid, ingested_at & reformed key-value pairs
    data_new = []
    for index, row in data_raw.iterrows():
        # to store all the restructured keys & values for each row
        try:
            new_entry = {}
            # add uid and ingested_at first
            app_version = None
            if 'appVersion' in row:
                app_version = row['appVersion']
                if (app_version != None and app_version != ''):
                    #Remove any Other Characters that are non-numeric
                    app_version = int(''.join(d for d in app_version
                                              if d.isdigit()))
            if 'facility' in row:
                new_entry['facility'] = row['facility']
            # Convert All UIDS TO UPPER CASE
            new_entry['uid'] = str(row['uid']).upper()
            # Admission/discharge-specific ingest stamps both map onto the
            # single 'ingested_at' key; a later match overwrites an earlier.
            if 'ingested_at_admission' in row:
                new_entry['ingested_at'] = row['ingested_at_admission']
            if 'ingested_at_discharge' in row:
                new_entry['ingested_at'] = row['ingested_at_discharge']
            if 'started_at' in row:
                new_entry['started_at'] = row['started_at']
            # NOTE(review): duplicated check below — harmless but redundant.
            if 'started_at' in row:
                new_entry['started_at'] = row['started_at']
            if 'completed_at' in row:
                new_entry['completed_at'] = row['completed_at']
            if 'ingested_at' in row:
                new_entry['ingested_at'] = row['ingested_at']
            # iterate through key, value and add to dict
            for c in row['entries']:
                #RECORDS FORMATTED WITH NEW FORMAT, CONTAINS THE jsonFormat Key and C is the Key
                # App versions above 454 (or with a leading major digit >= 5)
                # use the new entry layout.
                if (app_version != '' and app_version != None
                        and (app_version > 454
                             or int(str(app_version)[:1]) >= 5)):
                    k, v, mcl = restructure_new_format(c, row['entries'][c],
                                                       mcl)
                    #SET UID FOR ZIM DISCHARGES WHICH COME WITH NULL UID NEW FORMAT
                    # NOTE(review): new_entry['uid'] is set via str(...) above
                    # and so is never literally None; also v is a dict here,
                    # so v.value looks like it would raise AttributeError —
                    # confirm restructure_new_format's return type.
                    if ((k == 'NeoTreeID' or k == 'NUID_BC' or k == 'NUID_M'
                         or k == 'NUID_S') and new_entry['uid'] is None):
                        new_entry['uid'] = v.value
                #ELSE USE THE OLD FORMAT
                else:
                    k, v, mcl = restructure(c, mcl)
                    #SET UID FOR ZIM DISCHARGES WHICH COME WITH NULL UID OLD FORMAT
                    if ((k == 'NeoTreeID' or k == 'NUID_BC' or k == 'NUID_M'
                         or k == 'NUID_S') and new_entry['uid'] is None):
                        new_entry['uid'] = v.value
                new_entry[k] = v
            # for each row add all the keys & values to a list
            data_new.append(new_entry)
        except Exception as ex:
            # Log and continue with the next row rather than aborting.
            logging.error(formatError(ex))
    return data_new, set(mcl)
# --- Module-level cron bootstrap: schedule the pipeline as a recurring job ---
mode = params['env']
interval = 1
cronDir = os.getcwd()
#The number of hours before next execution of the next job as set in the database.ini file
if 'cron_interval' in params:
    interval = int(params['cron_interval'])
try:
    # Set The User To Run The Cron Job
    cron = CronTab(user=True)
    # Set The Command To Run The Data Pipeline script and activate the virtual environment
    # NOTE(review): os.getcwd() never returns None, so the else branch below
    # is unreachable as written — confirm whether cronDir was meant to come
    # from the database.ini config instead.
    if cronDir is not None:
        job = cron.new(
            command='cd {0} && env/bin/python -m kedro run --env={1}'.format(
                cronDir, mode))
    else:
        logging.info(
            'Please specify directory to find your kedro project in your database.ini file'
        )
        sys.exit()
    # Set The Time For The Cron Job
    # Use job.minute for quick testing
    job.every(interval).hours()
    # Write the Job To CronTab
    cron.write(user=True)
except Exception as e:
    logging.error("!!Cron Job Failed To Start Due To Errors: ")
    logging.error(formatError(e))
    sys.exit(1)
def neolab_cleanup(df: pd.DataFrame, position):
    """Normalise the organism columns of one neolab row, in place.

    Unifies spellings of coagulase-negative staph in Org1.label, and when
    Org1.value is 'Oth' matches the free-text OtherOrg1.value against known
    organism names (including common misspellings seen in the data) to
    back-fill canonical Org1.label / Org1.value. Rows that are not 'Oth'
    just get surrounding whitespace stripped from the label.

    Args:
        df: neolab DataFrame (mutated in place).
        position: row index/label to clean.

    Exits the process with status 1 on any unexpected error.
    """
    # (keywords, canonical label, canonical value) — applied in order, so a
    # later match overwrites an earlier one, mirroring the original if-chain.
    rules = [
        (("staphyloc", "coagulase negative", "stapgylococcus"),
         'Coagulase negative staphylococcus', 'CONS'),
        (("klesiella", "klebsiella", "kleb"), 'Klebsiella sp.', 'KLS'),
        (("streptococcus pyogenes", "streptococcus pygenes",
          "streptococcus pyoges", "s payogenes", "strptococcus pyogenes",
          "b-haemolytic strep", "streptococcus agalactiae"),
         'Streptococcus pyogenes (Group A Beta haemolytic Strep)', 'StrepPy'),
        (("streptococcus species",), 'Streptococcus sp.', 'StrepSp'),
        (("s.aureus",), 'Staphylococcus aureus', 'SA'),
        (("citrobacter", "citribacter"), 'Citrobacter sp.', 'Cit'),
        (("proteus", "ptoteus"), 'Proteus sp.', 'Prot'),
        (("yeasts excluding candida albicans", "yeasts"),
         'Yeasts (excluding candida)', 'Yea'),
        (("enterobacter",), 'Enterobacter sp.', 'Ent'),
        (("group d",), 'Group D Strep', 'GDS'),
        (("non-haemolytic strep",), 'Non haemolytic streptococcus', 'NHS'),
        # BUG FIX: this rule previously re-used the "non-haemolytic strep"
        # keyword (copy-paste), so non-haemolytic strep rows were relabelled
        # NLFC and genuine non-lactose fermenters never matched.
        (("non-lactose ferment",), 'Non-lactose fermenting coliform', 'NLFC'),
        (("pseudomonas",), 'Pseudomonas aeruginosa', 'Pseud'),
        (("viridans",), 'Viridans streptococcus', 'VirSt'),
    ]
    try:
        if "Org1.label" in df.columns:
            # Unify spelling/case of coagulase-negative staph in the label.
            if str(df.at[position, "Org1.label"]).lower().strip().find(
                    "coagulase negative staph") > -1:
                df.at[position,
                      "Org1.label"] = 'Coagulase negative staphylococcus'
            if (df.at[position, "Org1.value"] == 'Oth'
                    and "OtherOrg1.value" in df.columns):
                other = str(df.at[position, "OtherOrg1.value"]).lower()
                # BUG FIX: the old CONS check ended with a bare
                # `.find("stapgylococcus")` (no `> -1`); str.find returns -1
                # when absent, which is truthy, so almost every 'Oth' row was
                # first mislabelled CONS before later branches ran.
                for keywords, label, value in rules:
                    if any(keyword in other for keyword in keywords):
                        df.at[position, "Org1.label"] = label
                        df.at[position, "Org1.value"] = value
            else:
                # Not free-text: just remove surrounding whitespace.
                df.at[position, "Org1.label"] = str(
                    df.at[position, "Org1.label"]).strip()
    except Exception as ex:
        logging.error("Something Happened Cleaning Up Neolab")
        logging.error(formatError(ex))
        sys.exit(1)
def tidy_tables(): # try: # tuples = fix_duplicate_uid() # duplicate_df = pd.DataFrame(tuples,columns=['id','uid','DateAdmission']); # if not duplicate_df.empty: # unique_uids = duplicate_df['uid'].copy().unique(); # alphabet = "0A1B2C3D4E5F6789" # for ind in unique_uids: # dup_df = duplicate_df[(duplicate_df['uid'] == str(ind))].copy().reset_index(drop=True) # if not dup_df.empty and len(dup_df)>1: # prev_record = None; # for dup_index, dup in dup_df.iterrows(): # if dup_index >=1 and dup['DateAdmission'] is not None: # adm_date = str(dup['DateAdmission']) # prev_adm_date = None # if prev_record is not None and prev_record['DateAdmission'] is not None: # prev_adm_date = str(prev_record['DateAdmission']) # if adm_date == prev_adm_date: # # RECORD IS A DUPLICATE AND WILL BE DELT WITH DURING DEDUPLICATION PROCESS ON NEXT RUN OF PIPELINE # pass; # else: # # #GENERATE NEW UID # uid = '78'.join((random.choice(alphabet)) for x in range(2))+'-'+str(random.randint(1000,9999)); # update_uid('public','sessions',dup['id'],uid); # prev_record = dup; # logging.info("...DONE WITH UPDATE......") # sys.exit() # except Exception as ex: # raise ex; # Read the raw admissions and discharge data into dataframes logging.info("... 
Fetching raw admission and discharge data") try: #Read Admisiions From The Kedro Catalog adm_raw = catalog.load('read_admissions'); #Read Discharges From The Kedro Catalog dis_raw = catalog.load('read_discharges'); #Read Maternal OutComes from Kedro Catalog mat_outcomes_raw = catalog.load('read_maternal_outcomes') #Read Vital Signs from Kedro Catalog vit_signs_raw = catalog.load('read_vital_signs') #Read Neo Lab Data from Kedro Catalog neolab_raw = catalog.load('read_neolab_data') #Read Baseline Data from Kedro Catalog baseline_raw = catalog.load('read_baseline_data') #Read Diagnoses Data from Kedro Catalog diagnoses_raw = catalog.load('read_diagnoses_data') #Read Maternity Completeness Data from Kedro Catalog mat_completeness_raw = catalog.load('read_mat_completeness_data') except Exception as e: logging.error("!!! An error occured fetching the data: ") logging.error(formatError(e)) # Now let's fetch the list of properties recorded in that table logging.info("... Extracting keys") try: adm_new_entries, adm_mcl = get_key_values(adm_raw) dis_new_entries, dis_mcl = get_key_values(dis_raw) mat_outcomes_new_entries,mat_outcomes_mcl = get_key_values(mat_outcomes_raw) vit_signs_new_entries,vit_signs_mcl = get_key_values(vit_signs_raw) neolab_new_entries,noelab_mcl = get_key_values(neolab_raw) baseline_new_entries,baseline_mcl = get_key_values(baseline_raw) diagnoses_new_entries = get_diagnoses_key_values(diagnoses_raw) mat_completeness_new_entries,mat_completeness_mcl = get_key_values(mat_completeness_raw) except Exception as e: logging.error("!!! An error occured extracting keys: ") logging.error(formatError(e)) # Create the dataframe (df) where each property is pulled out into its own colum logging.info( "... 
Creating normalized dataframes - one for admissions and one for discharges") try: adm_df = pd.json_normalize(adm_new_entries) if "uid" in adm_df: adm_df.set_index(['uid']) dis_df = pd.json_normalize(dis_new_entries) if "uid" in dis_df: dis_df.set_index(['uid']) mat_outcomes_df =pd.json_normalize(mat_outcomes_new_entries) if "uid" in mat_outcomes_df: mat_outcomes_df.set_index(['uid']) vit_signs_df = pd.json_normalize(vit_signs_new_entries) if "uid" in vit_signs_df: vit_signs_df.set_index(['uid']) neolab_df = pd.json_normalize(neolab_new_entries) baseline_df = pd.json_normalize(baseline_new_entries) if "uid" in baseline_df: baseline_df.set_index(['uid']) diagnoses_df = pd.json_normalize(diagnoses_new_entries) # if "uid" in diagnoses_df: # diagnoses_df.set_index(['uid']) mat_completeness_df = pd.json_normalize(mat_completeness_new_entries) if "uid" in mat_completeness_df: mat_completeness_df.set_index(['uid']) # INITIALISE THE EPISODE COLUMN ON NEOAB DF SO THAT THE COLUMN GETS CREATED # ADD TIME SPENT TO ALL DFs if "started_at" in adm_df and 'completed_at' in adm_df : format_date_without_timezone(adm_df,'started_at'); format_date_without_timezone(adm_df,'completed_at'); adm_df['time_spent'] = (adm_df['completed_at'] - adm_df['started_at']).astype('timedelta64[m]') else: adm_df['time_spent'] = None if "started_at" in dis_df and 'completed_at' in dis_df : format_date_without_timezone(dis_df,'started_at'); format_date_without_timezone(dis_df,'completed_at'); dis_df['time_spent'] = (dis_df['completed_at'] -dis_df['started_at']).astype('timedelta64[m]') else: dis_df['time_spent'] = None if "started_at" in mat_outcomes_df and 'completed_at' in mat_outcomes_df : format_date_without_timezone(mat_outcomes_df,'started_at'); format_date_without_timezone(mat_outcomes_df,'completed_at'); mat_outcomes_df['time_spent'] = (mat_outcomes_df['completed_at'] - mat_outcomes_df['started_at']).astype('timedelta64[m]') else: mat_outcomes_df['time_spent'] = None if "started_at" in 
vit_signs_df and 'completed_at' in vit_signs_df : format_date_without_timezone(vit_signs_df,'started_at'); format_date_without_timezone(vit_signs_df,'completed_at'); vit_signs_df['time_spent'] = (vit_signs_df['completed_at']-vit_signs_df['started_at']).astype('timedelta64[m]') else: vit_signs_df['time_spent'] = None if "started_at" in neolab_df and 'completed_at' in neolab_df : format_date_without_timezone(neolab_df,'started_at'); format_date_without_timezone(neolab_df,'completed_at'); neolab_df['time_spent'] = (neolab_df['completed_at'] - neolab_df['started_at']).astype('timedelta64[m]') else: neolab_df['time_spent'] = None if "started_at" in baseline_df and 'completed_at' in baseline_df : format_date_without_timezone(baseline_df,'started_at'); format_date_without_timezone(baseline_df,'completed_at'); baseline_df['time_spent'] = (baseline_df['completed_at'] -baseline_df['started_at']).astype('timedelta64[m]') else: baseline_df['time_spent'] = None if ("DateBCR.value" in neolab_df and 'DateBCT.value' in neolab_df and neolab_df['DateBCR.value'] is not None and neolab_df['DateBCT.value'] is not None): neolab_df['BCReturnTime'] = (pd.to_datetime(neolab_df['DateBCR.value'], format='%Y-%m-%dT%H:%M:%S',utc=True).astype('datetime64[ns]') - pd.to_datetime(neolab_df['DateBCT.value'], format='%Y-%m-%dT%H:%M:%S',utc=True).astype('datetime64[ns]')).astype('timedelta64[h]') else: neolab_df['BCReturnTime'] = None baseline_df['LengthOfStay.value'] = None baseline_df['LengthOfStay.label'] = None baseline_df['LengthOfLife.value'] = None baseline_df['LengthOfLife.label'] = None #Length of Life and Length of Stay on Baseline Data date_format = "%Y-%m-%d" for index, row in baseline_df.iterrows(): baseline_df['LengthOfStay.label'].iloc[index] ="Length of Stay" if (is_date(str(row['DateTimeDischarge.value'])) and is_date(str(row['DateTimeAdmission.value']))): DateTimeDischarge = dt.strptime(str(str(row['DateTimeDischarge.value']))[:10].strip().replace('T',''),date_format) 
DateTimeAdmission = dt.strptime(str(str(row['DateTimeAdmission.value']))[:10].strip().replace('T',''),date_format) delta_los = DateTimeDischarge-DateTimeAdmission baseline_df['LengthOfStay.value'].iloc[index] = delta_los.days else: baseline_df['LengthOfStay.value'].iloc[index] = None baseline_df['LengthOfLife.label'].iloc[index] ="Length of Life" if 'DateTimeDeath.value' in row and (is_date(str(row['DateTimeDeath.value'])) and is_date(str(row['DateTimeAdmission.value']))): DateTimeDeath = dt.strptime(str(str(row['DateTimeDeath.value']))[:10].strip().replace('T',''), date_format) DateTimeAdmission = dt.strptime(str(str(row['DateTimeAdmission.value']))[:10].strip().replace('T',''), date_format) delta_lol = DateTimeDeath - DateTimeAdmission baseline_df['LengthOfLife.value'].iloc[index] = delta_lol.days; else: baseline_df['LengthOfLife.value'].iloc[index] = None; # watch out for time zone (tz) issues if you change code (ref: https://github.com/pandas-dev/pandas/issues/25571) set_key_to_none(adm_df,'DateHIVtest.value') set_key_to_none(adm_df,'DateHIVtest.label') set_key_to_none(adm_df,'HIVtestResult.value') set_key_to_none(adm_df,'HIVtestResult.label') set_key_to_none(adm_df,'ANVDRLDate.value') set_key_to_none(adm_df,'ANVDRLDate.label') set_key_to_none(adm_df,'HAART.value') set_key_to_none(adm_df,'HAART.label') set_key_to_none(adm_df,'LengthHAART.value') set_key_to_none(adm_df,'LengthHAART.label') set_key_to_none(adm_df,'NVPgiven.value') set_key_to_none(adm_df,'NVPgiven.label') set_key_to_none(adm_df,'DateTimeAdmission.value') set_key_to_none(adm_df,'DateTimeAdmission.label') set_key_to_none(adm_df,'ROMlength.label') set_key_to_none(adm_df,'ROMlength.value') set_key_to_none(adm_df,'ROMLength.label') set_key_to_none(adm_df,'ROMLength.value') #Format Dates Admissions Tables format_date(adm_df,'DateTimeAdmission.value') format_date(adm_df,'EndScriptDatetime.value') format_date(adm_df,'DateHIVtest.value') format_date(adm_df,'ANVDRLDate.value') #Format Dates Discharge Table 
format_date(dis_df,'DateAdmissionDC.value') format_date(dis_df,'DateDischVitals.value') format_date(dis_df,'DateDischWeight.value') format_date(dis_df,'DateTimeDischarge.value') format_date(dis_df,'EndScriptDatetime.value') format_date(dis_df,'DateWeaned.value') format_date(dis_df,'DateTimeDeath.value') format_date(dis_df,'DateAdmission.value') format_date(dis_df,'BirthDateDis.value') format_date(dis_df,'DateHIVtest.value') format_date(dis_df,'DateVDRLSameHIV.value') # Maternal Outcomes set_key_to_none(mat_outcomes_df,'TypeBirth.label') set_key_to_none(mat_outcomes_df,'Presentation.label') set_key_to_none(mat_outcomes_df,'BabyNursery.label') set_key_to_none(mat_outcomes_df,'Reason.label') set_key_to_none(mat_outcomes_df,'ReasonOther.label') set_key_to_none(mat_outcomes_df,'CryBirth.label') set_key_to_none(mat_outcomes_df,'Apgar1.value') set_key_to_none(mat_outcomes_df,'Apgar5.value') set_key_to_none(mat_outcomes_df,'Apgar10.value') set_key_to_none(mat_outcomes_df,'PregConditions.label') set_key_to_none(mat_outcomes_df,'BirthDateDis.value') # Baselines Tables format_date(baseline_df,'DateTimeAdmission.value') format_date(baseline_df,'DateTimeDischarge.value') format_date(baseline_df,'DateTimeDeath.value') set_key_to_none(baseline_df,'AWGroup.value') set_key_to_none(baseline_df,'BWGroup.value') #Vital Signs Table format_date(vit_signs_df,'D1Date.value') format_date(vit_signs_df,'TimeTemp1.value') format_date(vit_signs_df,'TimeTemp2.value') format_date(vit_signs_df,'EndScriptDatetime.value') # CREATE AGE CATEGORIES if not adm_df.empty: for position,admission in adm_df.iterrows(): age_list =[] period = 0 if 'Age.value' in admission and str(admission['Age.value']).isdigit(): period = admission['Age.value'] else: if 'Age.value' in admission and str(admission['Age.value']) != 'nan': # Get The Value which is a string e.g 3 days, 4 hours age_list = str(admission['Age.value']).split(",") else: if 'AgeB.value' in admission and str(admission['AgeB.value']) != 'nan': age_list = 
str(admission['AgeB.value']).split(",") # Initialise Hours hours = 0 # If size of List is 1 it either means its days only or hours only if len(age_list) == 1: age = age_list[0] # Check if hours or Days if 'hour' in age: hours= [int(s) for s in age.replace("-","").split() if s.isdigit()] # Check if value contains figures if len(hours) >0: period = hours[0] else: if "an" in age: # IF AN HOUR period = 1 elif 'day' in age: hours = [int(s) for s in age.replace("-","").split() if s.isdigit()] if len(hours) >0: period = hours[0] * 24 elif 'second' in age: # FEW SECONDS CAN BE ROUNDED OFF 1 HOUR period = 1 elif 'minute' in age: # MINUTES CAN BE ROUNDED OFF 1 HOUR period = 1 pass; # Contains Both Hours and Days elif len(age_list) == 2: age_days = age_list[0] age_hours = age_list[1] if 'day' in age_days and 'hour' in age_hours: number_hours_days= [int(s) for s in age_days.split() if s.isdigit()] number_hours = [int(s) for s in age_hours.split() if s.isdigit()] if (len(number_hours) >0 and len(number_hours_days)>0): period = (number_hours_days[0]) * 24 +(number_hours[0]) else: pass; if period>0: adm_df.loc[position,'Age.value'] = period if period< 2: adm_df.loc[position,'AgeCategory'] = 'Fresh Newborn (< 2 hours old)' elif period>2 and period<=23: adm_df.loc[position,'AgeCategory'] = 'Newborn (2 - 23 hrs old)' elif period>23 and period<=47: adm_df.loc[position,'AgeCategory']= 'Newborn (1 day - 1 day 23 hrs old)' elif period>47 and period<= 71: adm_df.loc[position,'AgeCategory']= 'Infant (2 days - 2 days 23 hrs old)' else: adm_df.loc[position,'AgeCategory'] = 'Infant (> 3 days old)' ########################## UPDATE ADMISSION SCRIPT WITH NEW KEYS ######################## if "BirthWeight.value" in admission and str(admission["BirthWeight.value"]) != 'nan' and admission["BirthWeight.value"] is not None: pass; else: key_change(adm_df,admission,position,'BW.value','BirthWeight.value') if "Convulsions.value" in admission and str(admission["Convulsions.value"]) != 'nan' and 
admission["Convulsions.value"] is not None: pass; else: key_change(adm_df,admission,position,'Conv.value','Convulsions.value') if ('SymptomReviewNeurology.value' in admission and str(admission["SymptomReviewNeurology.value"]) != 'nan' and admission["SymptomReviewNeurology.value"] is not None): pass; else: key_change(adm_df,admission,position,'SRNeuroOther.value','SymptomReviewNeurology.value') if 'LowBirthWeight.value' in admission and str(admission["LowBirthWeight.value"]) !='nan' and admission["LowBirthWeight.value"] is not None: pass; else: key_change(adm_df,admission,position,'LBW.value','LowBirthWeight.value') if 'AdmissionWeight.value' in admission and str(admission["AdmissionWeight.value"]) != 'nan' and admission["AdmissionWeight.value"] is not None : pass; else: key_change(adm_df,admission,position,'AW.value','AdmissionWeight.value') #Fix differences in Column data type definition if 'BSUnitmg.value' in admission and str(admission["BSUnitmg.value"]) !='nan' and admission["BSUnitmg.value"] is not None: pass; else: key_change(adm_df,admission,position,'BSmgdL.value','BSUnitmg.value') if 'BSmmol.value' in admission and str(admission["BSmmol.value"])!='nan' and admission["BSmmol.value"] is not None: key_change(adm_df,admission,position,'BSmmol.value','BloodSugarmmol.value'); if 'BSmg.value' in admission and str(admission["BSmg.value"])!='nan' and admission["BSmg.value"] is not None: key_change(adm_df,admission,position,'BSmg.value','BloodSugarmg.value') if "ROMlength.value" in admission and str(admission["ROMlength.value"]) != 'nan' and admission["ROMlength.value"] is not None: key_change(adm_df,admission,position,'ROMlength.value','ROMLength.value'); if "ROMlength.label" in admission and str(admission["ROMlength.label"]) != 'nan' and admission["ROMlength.label"] is not None: key_change(adm_df,admission,position,'ROMlength.label','ROMLength.label'); if "Age.value" in adm_df: adm_df['Age.value'] = pd.to_numeric(adm_df['Age.value'], errors='coerce') if 
'AdmissionWeight.value' in adm_df: adm_df['AdmissionWeight.value'] = pd.to_numeric(adm_df['AdmissionWeight.value'], errors='coerce') if 'BirthWeight.value' in adm_df: adm_df['BirthWeight.value'] = pd.to_numeric(adm_df['BirthWeight.value'], errors='coerce') if not dis_df.empty: for position,discharge in dis_df.iterrows(): if 'BirthWeight.value' in discharge and str(discharge['BirthWeight.value'])!='nan' and discharge['BirthWeight.value'] is not None: pass; else: key_change(dis_df,discharge,position,'BWTDis.value','BirthWeight.value') if 'DOBTOB.value' in discharge and str(discharge['DOBTOB.value'])!='nan' and discharge['DOBTOB.value'] is not None: pass; else: key_change(dis_df,discharge,position,'BirthDateDis.value','DOBTOB.value') if 'ModeDelivery.value' in discharge and str(discharge['ModeDelivery.value'])!='nan' and discharge['ModeDelivery.value'] is not None: pass; else: key_change(dis_df,discharge,position,'Delivery.value','ModeDelivery.value') if 'Temperature.value' in discharge and str(discharge['Temperature.value'])!='nan' and discharge['Temperature.value'] is not None: pass; else: key_change(dis_df,discharge,position,'NNUAdmTemp.value','Temperature.value') if 'Gestation.value' in discharge and str(discharge['Gestation.value'])!='nan' and discharge['Gestation.value'] is not None: pass; else: key_change(dis_df,discharge,position,'GestBirth.value','Gestation.value') if 'AdmReason.value' in discharge and str(discharge['AdmReason.value'])!='nan' and discharge['AdmReason.value'] is not None: pass; else: key_change(dis_df,discharge,position,'PresComp.value','AdmReason.value') # Join Maternal Completeness and Maternal Outcomes /A Case For Malawi if not mat_outcomes_df.empty and not mat_completeness_df.empty: latest_mat_outcomes_df = mat_outcomes_df[pd.to_datetime(mat_outcomes_df['DateAdmission.value']) >='2021-10-01'] previous_mat_outcomes_df = mat_completeness_df[pd.to_datetime(mat_completeness_df['DateAdmission.value']) <='2021-09-30'] mat_completeness_df = 
pd.concat([latest_mat_outcomes_df, previous_mat_outcomes_df], ignore_index=True) # Create Episode Column for Neolab Data if not neolab_df.empty: # Initialise the column neolab_df['episode'] = 0 # Initialise BCR TYPE neolab_df['BCType']= None neolab_df['DateBCT.value']=pd.to_datetime(neolab_df['DateBCT.value']) for index,row in neolab_df.iterrows(): # Data Cleaning neolab_cleanup(neolab_df,index) #Set Episodes control_df = neolab_df[neolab_df['uid'] == row['uid']].copy().sort_values(by=['DateBCT.value']).reset_index(drop=True) if not control_df.empty: episode =1; if neolab_df.at[index,'episode'] ==0: for innerIndex, innerRow in control_df.iterrows() : if innerIndex == 0: #Episode Remains 1 pass; else: control_df_date_bct = control_df.at[innerIndex,'DateBCT.value'] prev_control_df_date_bct = control_df.at[innerIndex-1,'DateBCT.value'] if len(str(control_df_date_bct)) >9 and len(str(prev_control_df_date_bct)) > 9 : if str(control_df_date_bct)[:10] == str(prev_control_df_date_bct)[:10]: # Episode Remains the same as previous Episode pass; else: episode = episode+1; # Set The Episode Value For All Related Episodes in the Main DF control_df.loc[innerIndex,'episode']= episode; neolab_df.loc[(neolab_df['uid'] ==control_df.at[innerIndex,'uid']) & (neolab_df['DateBCT.value'] ==control_df.at[innerIndex,'DateBCT.value']) & (neolab_df['DateBCR.value'] == control_df.at[innerIndex,'DateBCR.value']),'episode'] = episode #Add BCR TYPE TO CONTROL DF # Loop is necessary since BCType is dependant on the set episodes for control_index, bct_row in control_df.iterrows() : bct_type_df = control_df[(control_df['uid'] == bct_row['uid']) & (control_df['episode'] == bct_row['episode'])].copy().sort_values(by=['DateBCR.value']).reset_index(drop=True) if not bct_type_df.empty: preliminary_index= 1; for bct_index, row in bct_type_df.iterrows(): bct_value = None; bct_values_from_df = neolab_df.loc[(neolab_df['uid'] ==bct_type_df.at[bct_index,'uid']) & (neolab_df['DateBCT.value'] 
==bct_type_df.at[bct_index,'DateBCT.value']) & (neolab_df['DateBCR.value'] == bct_type_df.at[bct_index,'DateBCR.value'])]['BCType'].values if len(bct_values_from_df) >0: bct_value = bct_values_from_df[0] if bct_value is None: if (bct_type_df.at[bct_index,'BCResult.value'] != 'Pos' and bct_type_df.at[bct_index,'BCResult.value'] != 'Neg' and bct_type_df.at[bct_index,'BCResult.value'] != 'PC'): bct_type_df.loc[bct_index,'BCType'] = "PRELIMINARY-"+str(preliminary_index); preliminary_index=preliminary_index+1 else: if bct_index == len(bct_type_df)-1: bct_type_df.loc[bct_index,'BCType'] = "FINAL"; else: bct_type_df.loc[bct_index,'BCType'] = "PRELIMINARY-"+str(preliminary_index); preliminary_index = preliminary_index+1; # Set The BCR Type For All Related Records in the Main DFclear if bct_type_df.at[bct_index,'BCType'] is not None: neolab_df.loc[(neolab_df['uid'] ==bct_type_df.at[bct_index,'uid']) & (neolab_df['DateBCT.value'] ==bct_type_df.at[bct_index,'DateBCT.value']) & (neolab_df['DateBCR.value'] == bct_type_df.at[bct_index,'DateBCR.value']),'BCType'] = bct_type_df.at[bct_index,'BCType'] # Make changes to admissions and baseline data to match fields in power bi if not adm_df.empty: adm_df = create_columns(adm_df) if not baseline_df.empty: baseline_df = create_columns(baseline_df) except Exception as e: logging.error( "!!! An error occured normalized dataframes/changing data types: ") logging.error(formatError(e)) # Now write the cleaned up admission and discharge tables back to the database logging.info( "... 
Writing the tidied admission and discharge back to the database") try: #Save Derived Admissions To The DataBase Using Kedro if not adm_df.empty: adm_df.columns = adm_df.columns.str.replace(r"[()-]", "_") catalog.save('create_derived_admissions',adm_df) #Save Derived Admissions To The DataBase Using Kedro if not dis_df.empty: catalog.save('create_derived_discharges',dis_df) #Save Derived Maternal Outcomes To The DataBase Using Kedro if not mat_outcomes_df.empty: catalog.save('create_derived_maternal_outcomes',mat_outcomes_df) #Save Derived Vital Signs To The DataBase Using Kedro if not vit_signs_df.empty: catalog.save('create_derived_vital_signs',vit_signs_df) #Save Derived NeoLab To The DataBase Using Kedro if not neolab_df.empty: #SET INDEX if "uid" in neolab_df: neolab_df.set_index(['uid']) if ("episode" in neolab_df): neolab_df.sort_values(by=['uid','episode']) catalog.save('create_derived_neolab',neolab_df) #Save Derived Baseline To The DataBase Using Kedro if not baseline_df.empty: catalog.save('create_derived_baselines',baseline_df) #Save Derived Diagnoses To The DataBase Using Kedro if not diagnoses_df.empty: catalog.save('create_derived_diagnoses',diagnoses_df) #Save Derived Maternity Completeness To The DataBase Using Kedro if not mat_completeness_df.empty: catalog.save('create_derived_maternity_completeness',mat_completeness_df) except Exception as e: logging.error( "!!! An error occured writing admissions and discharge output back to the database: ") logging.error(formatError(e)) logging.info("... 
Creating MCL count tables") try: if not adm_df.empty: explode_column(adm_df, adm_mcl,"") if not dis_df.empty: explode_column(dis_df, dis_mcl,"disc_") if not mat_outcomes_df.empty: explode_column(mat_outcomes_df,mat_outcomes_mcl,"mat_") if not vit_signs_df.empty: explode_column(vit_signs_df,vit_signs_mcl,"vit_") if not baseline_df.empty: explode_column(baseline_df,baseline_mcl,"bsl_") if not mat_completeness_df.empty: explode_column(mat_completeness_df,mat_completeness_mcl,"matcomp_") except Exception as e: logging.error("!!! An error occured exploding MCL columns: ") logging.error(formatError(e))