Example #1
 def _make_sure_patient(self):
     if self._patient_df is None:
         # Loading the Parquet files happens only once.
         self._patient_df = self._spark.read.parquet(self._file_root +
                                                     '/Patient')
         # TODO create inspection functions
         common.custom_log('Number of Patient resources= {}'.format(
             self._patient_df.count()))
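The guard above makes the Parquet read and the row count happen at most once per object. A minimal self-contained sketch of the same memoized-load pattern (the SparkSession wiring and the `file_root` layout are assumptions, not part of the snippet above):

import pyspark.sql

class LazyParquetLoader:
    """Loads a Parquet directory once and reuses the DataFrame afterwards."""

    def __init__(self, spark: pyspark.sql.SparkSession, file_root: str):
        self._spark = spark
        self._file_root = file_root
        self._patient_df = None

    def patient_df(self) -> pyspark.sql.DataFrame:
        # Compare to None explicitly; relying on a DataFrame's truthiness
        # is fragile and unidiomatic.
        if self._patient_df is None:
            self._patient_df = self._spark.read.parquet(
                self._file_root + '/Patient')
        return self._patient_df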
Example #2
 def _make_sure_obs(self):
     if self._obs_df is None:
         self._obs_df = self._spark.read.parquet(self._file_root +
                                                 '/Observation')
         common.custom_log('Number of Observation resources= {}'.format(
             self._obs_df.count()))
     if self._flat_obs is None:
         self._flat_obs = _SparkPatientQuery._flatten_obs(
             self._obs_df, self._code_system)
         common.custom_log('Number of flattened obs rows = {}'.format(
             self._flat_obs.count()))
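What `_flatten_obs` actually does is not shown here; the sketch below is only a guess at the general shape of such a flattening, assuming the usual FHIR layout in which each Observation carries a repeated `code.coding` field. All column names here are hypothetical.

import pyspark.sql.functions as F

def flatten_obs_sketch(obs_df, code_system):
    # One output row per (observation, coding) pair, restricted to a single
    # code system. `subject.patientId` is an assumed column, not necessarily
    # what the real schema provides.
    return obs_df.select(
        obs_df.subject.patientId.alias('patientId'),
        F.explode(obs_df.code.coding).alias('coding')).where(
            F.col('coding.system') == code_system)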
Example #3
def calc_TX_TB(patient_agg_obs: pd.DataFrame, TX_TB_plan: str, ARV_plan: str,
               TX_TB_plan_answer: List[str], ART_plan_answer: List[str],
               TB_screening: str, YES_CODE: str,
               end_date_str: Optional[str] = None) -> pd.DataFrame:
  """Calculates TX_TB indicator with its corresponding disaggregations.

  TX_TB indicator counts the number of ART patients screened for TB in
  the semiannual reporting period who start TB treatment.

  Args:
    patient_agg_obs: A DataFrame generated by
      `patient_query.find_patient_aggregates()`.
    TX_TB_plan: The concept question code for TB TREATMENT PLAN.
    ARV_plan: The concept question code for ANTIRETROVIRAL PLAN.
    TX_TB_plan_answer: The concept answer codes for START DRUG, CONTINUE
      REGIMEN, and REFILLED.
    ART_plan_answer: The concept answer codes for START DRUG, CONTINUE
      REGIMEN, and REFILLED.
    TB_screening: The concept question code for TB screening.
    YES_CODE: The concept answer code for YES.
    end_date_str: The string representation of the last date of the reporting
      period; defaults to today.
  Returns:
    The aggregated DataFrame with age/gender buckets.
  """
  end_date = datetime.today()
  if end_date_str:
    end_date = date_parser.parse(end_date_str)
  # Check for TB TREATMENT PLAN (if START/RELAPSE it means diagnosis was done)
  tb_tx_df = patient_agg_obs[(patient_agg_obs['code'] == TX_TB_plan)].copy()
  tb_tx_df['TX_TB_status'] = (tb_tx_df['last_value_code'].isin(TX_TB_plan_answer))

  # Check whether the patient is on ART.
  art_tx_df = patient_agg_obs[(patient_agg_obs['code'] == ARV_plan)].copy()
  art_tx_df['ART_TX'] = (art_tx_df['last_value_code'].isin(ART_plan_answer))

  # Check whether the patient was screened for TB.
  tb_screen_df = patient_agg_obs[(patient_agg_obs['code'] == TB_screening)].copy()
  tb_screen_df['TB_screening'] = (tb_screen_df['last_value_code'].isin([YES_CODE]))

  # Join the three frames on patientId.
  temp_df = tb_tx_df.merge(
      art_tx_df[['patientId', 'ART_TX']], on='patientId').merge(
          tb_screen_df[['patientId', 'TB_screening']], on='patientId')
  # A patient counts towards TX_TB only if all three flags hold.
  temp_df['TX_TB'] = (temp_df['ART_TX'] & temp_df['TX_TB_status'] &
                      temp_df['TB_screening'])
  common.custom_log('Number of rows in TX_TB temp_df= {}'.format(
      temp_df.index.size))
  temp_df = _gen_counts_and_ratio(temp_df, end_date, 'TX_TB')
  return temp_df
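A hedged usage sketch for calc_TX_TB: every concept code below is a made-up placeholder (real deployments would use their own concept dictionary), and `patient_query` is assumed to be an already-built query object exposing the `find_patient_aggregates()` method mentioned in the docstring.

# Illustrative only; all codes are hypothetical placeholders.
agg_df = patient_query.find_patient_aggregates()
tx_tb_df = calc_TX_TB(
    agg_df,
    TX_TB_plan='TB_PLAN_CODE',
    ARV_plan='ARV_PLAN_CODE',
    TX_TB_plan_answer=['START_DRUG', 'CONTINUE_REGIMEN', 'REFILLED'],
    ART_plan_answer=['START_DRUG', 'CONTINUE_REGIMEN', 'REFILLED'],
    TB_screening='TB_SCREEN_CODE',
    YES_CODE='YES_CODE',
    end_date_str='2021-06-30')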
Example #4
 def get_patient_obs_view(self,
                          sample_count: tp.Optional[int] = None
                          ) -> pandas.DataFrame:
     """See super-class doc."""
     self._make_sure_spark()
     self._make_sure_patient()
     self._make_sure_obs()
     self._make_sure_encounter()
     base_patient_url = "Patient/"
     # Recalculating the rest is needed since the constraints can be updated.
     flat_enc = self._flatten_encounter("Encounter/",
                                        force_location_type_columns=False)
     # TODO figure out where `context` comes from and why.
     join_df = self._flat_obs.join(
         flat_enc,
         flat_enc.encounterId == self._flat_obs.encounterId).where(
             self._all_constraints_sql())
     agg_obs_df = SparkPatientQuery._aggregate_patient_codes(join_df)
     common.custom_log("Number of aggregated obs= {}".format(
         agg_obs_df.count()))
     self._patient_agg_obs_df = SparkPatientQuery._join_patients_agg_obs(
         self._patient_df, agg_obs_df, base_patient_url)
     common.custom_log("Number of joined patient_agg_obs= {}".format(
         self._patient_agg_obs_df.count()))
     # Spark is supposed to automatically cache DFs after shuffle but it seems
     # this is not happening!
     self._patient_agg_obs_df.cache()
     temp_pd_df = self._patient_agg_obs_df.toPandas()
     common.custom_log("patient_obs_view size= {}".format(
         temp_pd_df.index.size))
     temp_pd_df["last_value"] = temp_pd_df.max_date_value.str.split(
         DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df["first_value"] = temp_pd_df.min_date_value.str.split(
         DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df[
         "last_value_code"] = temp_pd_df.max_date_value_code.str.split(
             DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df[
         "first_value_code"] = temp_pd_df.min_date_value_code.str.split(
             DATE_VALUE_SEPARATOR, expand=True)[1]
     # For debugging, it can help to return the full DataFrame here:
     # return temp_pd_df
     return temp_pd_df[[
         "patientId",
         "birthDate",
         "gender",
         "code",
         "num_obs",
         "min_value",
         "max_value",
         "min_date",
         "max_date",
         "first_value",
         "last_value",
         "first_value_code",
         "last_value_code",
     ]]
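The four `str.split` assignments above rely on each `*_date_value*` column packing a date and a value into a single string joined by DATE_VALUE_SEPARATOR; column [1] of the expanded split is the value part. A minimal pandas illustration (the separator literal here is an assumption; the real constant is defined elsewhere in the module):

import pandas as pd

DATE_VALUE_SEPARATOR = '_SeP_'  # assumed literal for illustration only
df = pd.DataFrame(
    {'max_date_value': ['2021-03-01_SeP_120', '2021-04-09_SeP_95']})
# expand=True yields a two-column frame: [0] is the date, [1] is the value.
df['last_value'] = df.max_date_value.str.split(
    DATE_VALUE_SEPARATOR, expand=True)[1]
print(df['last_value'].tolist())  # ['120', '95']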
Example #5
 def get_patient_obs_view(self, base_url: str) -> pandas.DataFrame:
     """See super-class doc."""
     self._make_sure_spark()
     self._make_sure_patient()
     self._make_sure_obs()
     self._make_sure_encounter()
     base_patient_url = base_url + 'Patient/'
     # Recalculating the rest is needed since the constraints can be updated.
     flat_enc = self._flatten_encounter(base_url + 'Encounter/',
                                        force_location_type_columns=False)
     # TODO figure out where `context` comes from and why.
     join_df = self._flat_obs.join(
         flat_enc,
         flat_enc.encounterId == self._flat_obs.encounterId).where(
             self.all_constraints_sql())
     agg_obs_df = _SparkPatientQuery._aggregate_patient_codes(join_df)
     common.custom_log('Number of aggregated obs= {}'.format(
         agg_obs_df.count()))
     self._patient_agg_obs_df = _SparkPatientQuery._join_patients_agg_obs(
         self._patient_df, agg_obs_df, base_patient_url)
     common.custom_log('Number of joined patient_agg_obs= {}'.format(
         self._patient_agg_obs_df.count()))
     # Spark is supposed to automatically cache DFs after shuffle but it seems
     # this is not happening!
     self._patient_agg_obs_df.cache()
     temp_pd_df = self._patient_agg_obs_df.toPandas()
     common.custom_log('patient_obs_view size= {}'.format(
         temp_pd_df.index.size))
     temp_pd_df['last_value'] = temp_pd_df.max_date_value.str.split(
         DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df['first_value'] = temp_pd_df.min_date_value.str.split(
         DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df[
         'last_value_code'] = temp_pd_df.max_date_value_code.str.split(
             DATE_VALUE_SEPARATOR, expand=True)[1]
     temp_pd_df[
         'first_value_code'] = temp_pd_df.min_date_value_code.str.split(
             DATE_VALUE_SEPARATOR, expand=True)[1]
     # For debugging, it can help to return the full DataFrame here:
     # return temp_pd_df
     return temp_pd_df[[
         'patientId', 'birthDate', 'gender', 'code', 'num_obs', 'min_value',
         'max_value', 'min_date', 'max_date', 'first_value', 'last_value',
         'first_value_code', 'last_value_code'
     ]]
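A hedged usage sketch for this variant: the constructor arguments and the base URL below are assumptions (the _SparkPatientQuery definition is not shown), but they illustrate how `base_url` threads into the Patient/ and Encounter/ reference prefixes.

# Hypothetical construction; the real _SparkPatientQuery signature may differ.
query = _SparkPatientQuery(file_root='/dwh/parquet', code_system=None)
obs_view = query.get_patient_obs_view('http://example.org/fhir/')
print(obs_view[['patientId', 'code', 'last_value_code']].head())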
Example #6
 def _make_sure_encounter(self):
     if self._enc_df is None:
         self._enc_df = self._spark.read.parquet(self._file_root +
                                                 '/Encounter')
         common.custom_log('Number of Encounter resources= {}'.format(
             self._enc_df.count()))