def _make_sure_patient(self):
  if not self._patient_df:
    # Loading Parquet files and flattening only happens once.
    self._patient_df = self._spark.read.parquet(self._file_root + '/Patient')
    # TODO create inspection functions
    common.custom_log('Number of Patient resources= {}'.format(
        self._patient_df.count()))
def _make_sure_obs(self):
  if not self._obs_df:
    self._obs_df = self._spark.read.parquet(self._file_root + '/Observation')
    common.custom_log('Number of Observation resources= {}'.format(
        self._obs_df.count()))
  if not self._flat_obs:
    self._flat_obs = _SparkPatientQuery._flatten_obs(
        self._obs_df, self._code_system)
    common.custom_log('Number of flattened obs rows = {}'.format(
        self._flat_obs.count()))
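# A minimal sketch (not the module's actual `_flatten_obs`) of the kind of
# flattening the step above performs: exploding the nested FHIR Observation
# structure so each row carries one patient/encounter/code triple, optionally
# restricted to a single code system. The nested field names used below
# (`subject.patientId`, `encounter.encounterId`, `code.coding`) are
# assumptions about the Observation Parquet schema, not confirmed names.
def _flatten_obs_sketch(obs_df, code_system: str = None):
  """Returns one row per Observation coding, with patient and encounter ids."""
  from pyspark.sql import functions as F
  flat = obs_df.withColumn('coding', F.explode('code.coding')).select(
      F.col('subject.patientId').alias('patientId'),
      F.col('encounter.encounterId').alias('encounterId'),
      F.col('coding.code').alias('code'))
  if code_system:
    # Keep only codings from the requested code system.
    flat = flat.where(F.col('coding.system') == code_system)
  return flat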
def calc_TX_TB(patient_agg_obs: pd.DataFrame, TX_TB_plan: str, ARV_plan: str,
               TX_TB_plan_answer: List[str], ART_plan_answer: List[str],
               TB_screening: str, YES_CODE: str,
               end_date_str: str = None) -> pd.DataFrame:
  """Calculates the TX_TB indicator with its corresponding disaggregations.

  TX_TB counts the number of ART patients who were screened for TB in the
  semiannual reporting period and who started TB treatment.

  Args:
    patient_agg_obs: A DataFrame generated by
      `patient_query.find_patient_aggregates()`.
    TX_TB_plan: The concept question code for TB TREATMENT PLAN.
    ARV_plan: The concept question code for ANTIRETROVIRAL PLAN.
    TX_TB_plan_answer: The concept answer codes for START DRUG, CONTINUE
      REGIMEN, REFILLED.
    ART_plan_answer: The concept answer codes for START DRUG, CONTINUE
      REGIMEN, REFILLED.
    TB_screening: The concept question code for SCREENED FOR TB.
    YES_CODE: The concept answer code for YES.
    end_date_str: The string representation of the last date of the reporting
      period.

  Returns:
    The aggregated DataFrame with age/gender buckets.
  """
  end_date = datetime.today()
  if end_date_str:
    end_date = date_parser.parse(end_date_str)
  # Check the TB TREATMENT PLAN (START/RELAPSE means a diagnosis was done).
  tb_tx_df = patient_agg_obs[(patient_agg_obs['code'] == TX_TB_plan)].copy()
  tb_tx_df['TX_TB_status'] = tb_tx_df['last_value_code'].isin(
      TX_TB_plan_answer)
  # Check whether the patient is on ART.
  art_tx_df = patient_agg_obs[(patient_agg_obs['code'] == ARV_plan)].copy()
  art_tx_df['ART_TX'] = art_tx_df['last_value_code'].isin(ART_plan_answer)
  # Check whether the patient was screened for TB.
  tb_screen_df = patient_agg_obs[
      (patient_agg_obs['code'] == TB_screening)].copy()
  tb_screen_df['TB_screening'] = tb_screen_df['last_value_code'].isin(
      [YES_CODE])
  # Join the three DataFrames on patientId.
  temp_df = tb_tx_df.merge(
      art_tx_df[['patientId', 'ART_TX']], on='patientId').merge(
          tb_screen_df[['patientId', 'TB_screening']], on='patientId')
  # A patient is counted for TX_TB only when all three flags are true.
  temp_df['TX_TB'] = (temp_df['ART_TX'] & temp_df['TX_TB_status'] &
                      temp_df['TB_screening'])
  common.custom_log('Number of rows in TX_TB temp_df= {}'.format(
      temp_df.index.size))
  temp_df = _gen_counts_and_ratio(temp_df, end_date, 'TX_TB')
  return temp_df
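# Illustrative sketch (not part of the original module): one way `calc_TX_TB`
# might be invoked for a reporting period. Every concept code below is a
# hypothetical placeholder; real deployments pass the question/answer codes
# from their own concept dictionary.
def _example_calc_TX_TB(patient_agg_obs: pd.DataFrame) -> pd.DataFrame:
  return calc_TX_TB(
      patient_agg_obs=patient_agg_obs,
      TX_TB_plan='TB_TREATMENT_PLAN_CODE',  # hypothetical question code
      ARV_plan='ANTIRETROVIRAL_PLAN_CODE',  # hypothetical question code
      TX_TB_plan_answer=[
          'START_DRUG_CODE', 'CONTINUE_REGIMEN_CODE', 'REFILLED_CODE'],
      ART_plan_answer=[
          'START_DRUG_CODE', 'CONTINUE_REGIMEN_CODE', 'REFILLED_CODE'],
      TB_screening='TB_SCREENING_CODE',  # hypothetical question code
      YES_CODE='YES_CODE',  # hypothetical answer code
      end_date_str='2021-06-30')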
def get_patient_obs_view(
    self, sample_count: tp.Optional[int] = None) -> pandas.DataFrame:
  """See super-class doc."""
  self._make_sure_spark()
  self._make_sure_patient()
  self._make_sure_obs()
  self._make_sure_encounter()
  base_patient_url = "Patient/"
  # Recalculating the rest is needed since the constraints can be updated.
  flat_enc = self._flatten_encounter(
      "Encounter/", force_location_type_columns=False)
  # TODO figure where `context` comes from and why.
  join_df = self._flat_obs.join(
      flat_enc, flat_enc.encounterId == self._flat_obs.encounterId).where(
          self._all_constraints_sql())
  agg_obs_df = SparkPatientQuery._aggregate_patient_codes(join_df)
  common.custom_log("Number of aggregated obs= {}".format(agg_obs_df.count()))
  self._patient_agg_obs_df = SparkPatientQuery._join_patients_agg_obs(
      self._patient_df, agg_obs_df, base_patient_url)
  common.custom_log("Number of joined patient_agg_obs= {}".format(
      self._patient_agg_obs_df.count()))
  # Spark is supposed to automatically cache DFs after shuffle but it seems
  # this is not happening!
  self._patient_agg_obs_df.cache()
  temp_pd_df = self._patient_agg_obs_df.toPandas()
  common.custom_log("patient_obs_view size= {}".format(temp_pd_df.index.size))
  temp_pd_df["last_value"] = temp_pd_df.max_date_value.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df["first_value"] = temp_pd_df.min_date_value.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df["last_value_code"] = temp_pd_df.max_date_value_code.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df["first_value_code"] = temp_pd_df.min_date_value_code.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  # This is good for debug!
  # return temp_pd_df
  return temp_pd_df[[
      "patientId", "birthDate", "gender", "code", "num_obs", "min_value",
      "max_value", "min_date", "max_date", "first_value", "last_value",
      "first_value_code", "last_value_code",
  ]]
def get_patient_obs_view(self, base_url: str) -> pandas.DataFrame:
  """See super-class doc."""
  self._make_sure_spark()
  self._make_sure_patient()
  self._make_sure_obs()
  self._make_sure_encounter()
  base_patient_url = base_url + 'Patient/'
  # Recalculating the rest is needed since the constraints can be updated.
  flat_enc = self._flatten_encounter(
      base_url + 'Encounter/', force_location_type_columns=False)
  # TODO figure where `context` comes from and why.
  join_df = self._flat_obs.join(
      flat_enc, flat_enc.encounterId == self._flat_obs.encounterId).where(
          self.all_constraints_sql())
  agg_obs_df = _SparkPatientQuery._aggregate_patient_codes(join_df)
  common.custom_log('Number of aggregated obs= {}'.format(agg_obs_df.count()))
  self._patient_agg_obs_df = _SparkPatientQuery._join_patients_agg_obs(
      self._patient_df, agg_obs_df, base_patient_url)
  common.custom_log('Number of joined patient_agg_obs= {}'.format(
      self._patient_agg_obs_df.count()))
  # Spark is supposed to automatically cache DFs after shuffle but it seems
  # this is not happening!
  self._patient_agg_obs_df.cache()
  temp_pd_df = self._patient_agg_obs_df.toPandas()
  common.custom_log('patient_obs_view size= {}'.format(temp_pd_df.index.size))
  temp_pd_df['last_value'] = temp_pd_df.max_date_value.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df['first_value'] = temp_pd_df.min_date_value.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df['last_value_code'] = temp_pd_df.max_date_value_code.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  temp_pd_df['first_value_code'] = temp_pd_df.min_date_value_code.str.split(
      DATE_VALUE_SEPARATOR, expand=True)[1]
  # This is good for debug!
  # return temp_pd_df
  return temp_pd_df[[
      'patientId', 'birthDate', 'gender', 'code', 'num_obs', 'min_value',
      'max_value', 'min_date', 'max_date', 'first_value', 'last_value',
      'first_value_code', 'last_value_code'
  ]]
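# Illustrative helper (not part of the original module): shows how the view
# returned by `get_patient_obs_view` is typically consumed downstream, using
# only the columns listed in the return statement above. The helper name and
# its parameters are hypothetical.
def _example_count_patients_on_art(obs_view: pandas.DataFrame,
                                   arv_plan_code: str,
                                   art_plan_answers: List[str]) -> int:
  """Counts distinct patients whose latest ARV plan answer indicates ART."""
  arv_rows = obs_view[obs_view['code'] == arv_plan_code]
  on_art = arv_rows[arv_rows['last_value_code'].isin(art_plan_answers)]
  return on_art['patientId'].nunique()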
def _make_sure_encounter(self):
  if not self._enc_df:
    self._enc_df = self._spark.read.parquet(self._file_root + '/Encounter')
    common.custom_log('Number of Encounter resources= {}'.format(
        self._enc_df.count()))