def load_procedures(self, procedures):
    fp = (CompassFile & 'type = "procedure"').fetch1('file')
    encounters = np.unique(cohort.Cohort.Encounter().fetch('encounter_id'))
    tab = csv.read_csv(fp)
    tab = tab.filter(
        pc.is_in(
            tab['order_name'],
            options=pc.SetLookupOptions(value_set=pa.array(procedures))))
    tab = tab.filter(
        pc.is_in(
            tab['encounter_id'],
            options=pc.SetLookupOptions(value_set=pa.array(encounters))))
    return tab
def test_is_in():
    arr = pa.array([1, 2, None, 1, 2, 3])

    result = pc.is_in(arr, value_set=pa.array([1, 3, None]))
    assert result.to_pylist() == [True, False, True, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True)
    assert result.to_pylist() == [True, False, False, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3]))
    assert result.to_pylist() == [True, False, False, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
    assert result.to_pylist() == [True, False, False, True, False, True]
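The test above exercises the null-handling behaviour of pc.is_in directly; most of the snippets below use the same call to build a boolean mask and filter a pyarrow Table. A minimal, self-contained sketch of that pattern (the column name and ID values here are made up for illustration):

import pyarrow as pa
import pyarrow.compute as pc

# Toy table and lookup set; real code would read these from a file or database.
table = pa.table({"encounter_id": [1, 2, 3, 4], "value": ["a", "b", "c", "d"]})
wanted = pa.array([2, 4])

# Build a boolean mask and keep only rows whose encounter_id is in `wanted`.
mask = pc.is_in(table["encounter_id"], value_set=wanted)
filtered = table.filter(mask)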
def load_procedures(self, procedures=None, person_id=None):
    procedures = procedures or self.Procedure.fetch('procedure')
    fp = (CompassFile & 'type = "procedure"').fetch1('file')
    tab = csv.read_csv(fp)
    tab = tab.filter(
        pc.is_in(
            tab['order_name'],
            options=pc.SetLookupOptions(value_set=pa.array(procedures))))
    if person_id is not None:
        tab = tab.filter(
            pc.is_in(
                tab['person_id'],
                options=pc.SetLookupOptions(value_set=pa.array(person_id))))
    return tab
def dates(
    self,
    resampling_frequency: Optional[Frequency],
    realizations: Optional[Sequence[int]] = None,
) -> List[datetime.datetime]:
    if resampling_frequency is not None:
        raise ValueError("Resampling is not supported by this provider")

    timer = PerfTimer()

    table = self._get_or_read_table(["DATE", "REAL"])
    et_read_ms = timer.lap_ms()

    if realizations:
        mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    intersected_dates = find_intersected_dates_between_realizations(table)
    et_find_unique_ms = timer.lap_ms()

    LOGGER.debug(
        f"dates() took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"find_unique={et_find_unique_ms}ms)")

    return intersected_dates.astype(datetime.datetime).tolist()
def load_batches(self, key):
    k = Cohort.Alignment & key
    k_df = pd.DataFrame(k.fetch(as_dict=True))
    f = (CompassFile & 'type = "flowsheet"' & key).fetch1('file')
    dat = ds.dataset('./compass/flowsheet', format='parquet').to_table()
    tab = dat.filter(
        pc.is_in(
            dat['encounter_id'],
            options=pc.SetLookupOptions(
                value_set=pa.array(k_df.encounter_id.unique()))))
    for b in tab.to_batches(max_chunksize=200):
        df = b.to_pandas().rename(
            columns={
                'flowsheet_time': 'time',
                'flowsheet_value': 'value',
                'flowsheet_days_since_birth': 'days_from_dob'
            })
        df['cohort_id'] = key['cohort_id']
        df['procedure'] = key['procedure']
        df.days_from_dob = pd.to_numeric(df.days_from_dob, errors='coerce')
        df['value'] = df['value'].replace({
            'Not delirious- CAM-': 'N',
            'Unable to assess': 'U',
            'Delirious- CAM+': 'Y',
            '': np.nan
        })
        yield pd.merge(df.dropna(),
                       k_df,
                       on=['cohort_id', 'encounter_id', 'procedure'])
def isin(self, values):
    if pa_version_under2p0:
        return super().isin(values)

    value_set = [
        pa_scalar.as_py()
        for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
        if pa_scalar.type in (pa.string(), pa.null())
    ]

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
    # for null values, so we short-circuit to return all False array.
    if not len(value_set):
        return np.zeros(len(self), dtype=bool)

    kwargs = {}
    if pa_version_under3p0:
        # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
        # with unexpected keyword argument in pyarrow 3.0.0+
        kwargs["skip_null"] = True

    result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
    # to False
    return np.array(result, dtype=np.bool_)
def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    per_real_tables: Dict[int, pa.Table] = {}
    unique_reals = table.column("REAL").unique().to_pylist()

    for real in unique_reals:
        # pylint: disable=no-member
        mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        real_table = table.filter(mask).drop(["REAL"])
        per_real_tables[real] = real_table

    return per_real_tables
def get_vectors_for_date_df(
    self,
    date: datetime.datetime,
    vector_names: Sequence[str],
    realizations: Optional[Sequence[int]] = None,
) -> pd.DataFrame:
    timer = PerfTimer()

    columns_to_get = ["DATE", "REAL"]
    columns_to_get.extend(vector_names)
    table = self._get_or_read_table(columns_to_get)
    et_read_ms = timer.lap_ms()

    # Note that we use MS here to be aligned with storage type in arrow file
    lookup_date = pa.scalar(date, type=pa.timestamp("ms"))
    mask = pc.equal(table["DATE"], lookup_date)

    if realizations:
        real_mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        mask = pc.and_(mask, real_mask)

    table = table.drop(["DATE"])

    # table = table.filter(mask).combine_chunks()
    table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    df = table.to_pandas()
    # df = table.to_pandas(split_blocks=True, zero_copy_only=True)
    # del table  # not necessary, but a good practice
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"to_pandas={et_to_pandas_ms}ms), "
        f"#vecs={len(vector_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

    return df
def get_vectors_for_date_df(
    self,
    date: datetime.datetime,
    vector_names: Sequence[str],
    realizations: Optional[Sequence[int]] = None,
) -> pd.DataFrame:
    if not vector_names:
        raise ValueError("List of requested vector names is empty")

    timer = PerfTimer()

    columns_to_get = ["DATE", "REAL"]
    columns_to_get.extend(vector_names)
    table = self._get_or_read_table(columns_to_get)
    et_read_ms = timer.lap_ms()

    if realizations:
        real_mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(real_mask)
    et_filter_ms = timer.lap_ms()

    np_lookup_date = np.datetime64(date, "ms")
    table = sample_segmented_multi_real_table_at_date(table, np_lookup_date)
    et_resample_ms = timer.lap_ms()

    table = table.drop(["DATE"])
    df = table.to_pandas()
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"resample={et_resample_ms}ms, "
        f"to_pandas={et_to_pandas_ms}ms), "
        f"#vecs={len(vector_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
    )

    return df
def demographics(self):
    cid = self.cohort_id
    enc_df = (Cohort.Encounter() & {
        'cohort_id': cid
    }).fetch(format='frame').reset_index()
    fp = (CompassFile & {'type': 'encounter'}).fetch1('file')
    tab = csv.read_csv(fp)
    tab = tab.filter(
        pc.is_in(
            tab['encounter_id'],
            options=pc.SetLookupOptions(
                value_set=pa.array(enc_df.encounter_id.unique()))))
    enc_df = tab.to_pandas()
    cols = ['gender', 'age', 'death_during_encounter']
    categorical = ['gender', 'death_during_encounter']
    return TableOne(enc_df,
                    cols,
                    categorical,
                    nonnormal=['age'],
                    missing=False)
def binary_col(op, l, r):
    """Interpreter for executing binary operator expressions."""
    if op == "+":
        return compute.add_checked(l, r)
    if op == "*":
        return compute.multiply_checked(l, r)
    if op == '-':
        return compute.subtract_checked(l, r)
    if op == "=":
        return compute.equal(l, r)
    if op == "<>":
        return compute.not_equal(l, r)
    if op == "!=":
        return compute.not_equal(l, r)
    if op == "or":
        return compute.or_(l, r)
    if op == "<":
        return compute.less(l, r)
    if op == ">":
        return compute.greater(l, r)
    if op == "/":
        return compute.divide_checked(l, r)
    if op == "and":
        return compute.and_(l, r)
    if op == "in":
        return compute.is_in(l, r)
    if op == "==":
        return compute.equal(l, r)
    if op == "<=":
        return compute.less_equal(l, r)
    if op == ">=":
        return compute.greater_equal(l, r)
    raise Exception("binary op not implemented")
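A hedged usage sketch for the interpreter above, assuming binary_col is in scope and pyarrow.compute is imported as compute; the arrays and expected results are illustrative only:

import pyarrow as pa

left = pa.array([1, 2, 3, 4])
right = pa.array([10, 2, 30, 4])

# Dispatch on the operator string to the corresponding pyarrow compute kernel.
print(binary_col("+", left, right).to_pylist())  # [11, 4, 33, 8]
print(binary_col("=", left, right).to_pylist())  # [False, True, False, True]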
def get_vectors_df(
    self,
    vector_names: Sequence[str],
    resampling_frequency: Optional[Frequency],
    realizations: Optional[Sequence[int]] = None,
) -> pd.DataFrame:
    if not vector_names:
        raise ValueError("List of requested vector names is empty")

    timer = PerfTimer()

    columns_to_get = ["DATE", "REAL"]
    columns_to_get.extend(vector_names)
    table = self._get_or_read_table(columns_to_get)
    et_read_ms = timer.lap_ms()

    if realizations:
        mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    if resampling_frequency is not None:
        table = resample_segmented_multi_real_table(table, resampling_frequency)
    et_resample_ms = timer.lap_ms()

    df = table.to_pandas(timestamp_as_object=True)
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_vectors_df({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"resample={et_resample_ms}ms, "
        f"to_pandas={et_to_pandas_ms}ms), "
        f"#vecs={len(vector_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
    )

    return df
def isin(self, values) -> npt.NDArray[np.bool_]:
    if pa_version_under2p0:
        fallback_performancewarning(version="2")
        return super().isin(values)

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
    # for null values, so we short-circuit to return all False array.
    if not len(values):
        return np.zeros(len(self), dtype=bool)

    kwargs = {}
    if pa_version_under3p0:
        # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
        # with unexpected keyword argument in pyarrow 3.0.0+
        kwargs["skip_null"] = True

    result = pc.is_in(self._data,
                      value_set=pa.array(values, from_pandas=True),
                      **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
    # to False
    return np.array(result, dtype=np.bool_)
def get_column_data(
    self, column_names: Sequence[str], realizations: Optional[Sequence[int]] = None
) -> pd.DataFrame:
    timer = PerfTimer()

    # For now, guard against requesting the same column multiple times since that
    # will cause the conversion to pandas below to throw.
    # This should probably raise an exception instead?
    if len(set(column_names)) != len(column_names):
        LOGGER.warning("The column_names argument contains duplicate names")
        column_names = list(dict.fromkeys(column_names))

    # We always want to include the REAL column, but watch out in case it is
    # already included in the column_names list
    columns_to_get = (
        ["REAL", *column_names] if "REAL" not in column_names else column_names
    )
    table = self._cached_reader.read_all().select(columns_to_get)
    et_read_ms = timer.lap_ms()

    if realizations:
        mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    df = table.to_pandas(ignore_metadata=True)
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_column_data() took: {timer.elapsed_ms()}ms "
        f"(read={et_read_ms}ms, filter={et_filter_ms}ms, to_pandas={et_to_pandas_ms}ms), "
        f"#cols={len(column_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
    )

    return df
def tb_compare_values(tb, tb_cmp, skip_null=True):
    col_names = tb.column_names
    comp_col_names = tb_cmp.column_names

    row_indices = tb.index.index_values
    row_indices_cmp = tb_cmp.index.index_values

    col_comp_res = compare_array_like_values(
        l_org_ar=pa.array(col_names), l_cmp_ar=pa.array(comp_col_names))
    row_comp_res = compare_array_like_values(
        l_org_ar=pa.array(row_indices), l_cmp_ar=pa.array(row_indices_cmp))

    bcast_col_comp_res = broadcast(ar=col_comp_res, broadcast_coefficient=rows)
    row_col_comp = compare_row_and_column(row=row_comp_res,
                                          columns=bcast_col_comp_res)

    tb_ar = tb.to_arrow().combine_chunks()
    tb_cmp_ar = tb_cmp.to_arrow().combine_chunks()

    col_data_map = {}
    for col_name, validity, row_col_validity in zip(col_names, col_comp_res,
                                                    row_col_comp):
        if validity.as_py():
            chunk_ar_org = tb_ar.column(col_name)
            chunk_ar_cmp = tb_cmp_ar.column(col_name)
            data_cmp_res = a_compute.is_in(chunk_ar_org,
                                           value_set=chunk_ar_cmp,
                                           skip_nulls=skip_null)
            print(data_cmp_res, row_col_validity)
            col_data_map[col_name] = compare_two_arrays(data_cmp_res,
                                                        row_col_validity)
        else:
            col_data_map[col_name] = pa.array(
                populate_column_with_single_value(False, tb.row_count))

    is_in_values = list(col_data_map.values())
    return cn.Table.from_list(tb.context, col_names, is_in_values)
def get_vectors_df(
    self,
    vector_names: Sequence[str],
    resampling_frequency: Optional[Frequency],
    realizations: Optional[Sequence[int]] = None,
) -> pd.DataFrame:
    if resampling_frequency is not None:
        raise ValueError("Resampling is not supported by this provider")

    timer = PerfTimer()

    columns_to_get = ["DATE", "REAL"]
    columns_to_get.extend(vector_names)
    table = self._get_or_read_table(columns_to_get)
    et_read_ms = timer.lap_ms()

    if realizations:
        mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    df = table.to_pandas(timestamp_as_object=True)
    # df = table.to_pandas(split_blocks=True, self_destruct=True)
    # del table  # not necessary, but a good practice
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_vectors_df() took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"to_pandas={et_to_pandas_ms}ms), "
        f"#vecs={len(vector_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

    return df
def dates(
    self,
    resampling_frequency: Optional[Frequency],
    realizations: Optional[Sequence[int]] = None,
) -> List[datetime.datetime]:
    timer = PerfTimer()

    table = self._get_or_read_table(["DATE", "REAL"])
    et_read_ms = timer.lap_ms()

    if realizations:
        mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    if resampling_frequency is not None:
        unique_dates_np = table.column("DATE").unique().to_numpy()
        min_raw_date = np.min(unique_dates_np)
        max_raw_date = np.max(unique_dates_np)
        intersected_dates = generate_normalized_sample_dates(
            min_raw_date, max_raw_date, resampling_frequency
        )
    else:
        intersected_dates = find_intersected_dates_between_realizations(table)
    et_find_unique_ms = timer.lap_ms()

    LOGGER.debug(
        f"dates({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"find_unique={et_find_unique_ms}ms)"
    )

    return intersected_dates.astype(datetime.datetime).tolist()
def find_intersected_dates_between_realizations(table: pa.Table) -> np.ndarray:
    """Find the intersection of dates present in all the realizations.

    The input table must contain both REAL and DATE columns, but this function
    makes no assumptions about sorting of either column.
    """
    unique_reals = table.column("REAL").unique().to_numpy()

    date_intersection = None
    for real in unique_reals:
        # pylint: disable=no-member
        real_mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        dates_in_real = table.filter(real_mask).column("DATE").unique().to_numpy()
        if date_intersection is None:
            date_intersection = dates_in_real
        else:
            date_intersection = np.intersect1d(date_intersection,
                                               dates_in_real,
                                               assume_unique=True)

    if date_intersection is not None:
        return date_intersection

    return np.empty(0, dtype=np.datetime64)
def compare_array_like_values(l_org_ar, l_cmp_ar, skip_null=True):
    return a_compute.is_in(l_org_ar, value_set=l_cmp_ar, skip_nulls=skip_null)