Example #1
    def load_procedures(self, procedures):
        fp = (CompassFile & 'type = "procedure"').fetch1('file')
        encounters = np.unique(cohort.Cohort.Encounter().fetch('encounter_id'))
        tab = csv.read_csv(fp)
        tab = tab.filter(
            pc.is_in(
                tab['order_name'],
                options=pc.SetLookupOptions(value_set=pa.array(procedures))))
        tab = tab.filter(
            pc.is_in(
                tab['encounter_id'],
                options=pc.SetLookupOptions(value_set=pa.array(encounters))))
        return tab
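
The filter-by-membership pattern above can be tried in isolation. Below is a minimal sketch with made-up data (the column names mirror the snippet but are illustrative only; it assumes the usual pyarrow imports):

import pyarrow as pa
import pyarrow.compute as pc

# Illustrative table; the data is invented, not taken from the project above.
tab = pa.table({
    'order_name': ['CT HEAD', 'MRI BRAIN', 'CT HEAD', 'XR CHEST'],
    'encounter_id': [1, 1, 2, 3],
})
procedures = ['CT HEAD', 'MRI BRAIN']

# Keep only the rows whose order_name appears in the procedures list.
mask = pc.is_in(tab['order_name'],
                options=pc.SetLookupOptions(value_set=pa.array(procedures)))
print(tab.filter(mask).to_pydict())
# {'order_name': ['CT HEAD', 'MRI BRAIN', 'CT HEAD'], 'encounter_id': [1, 1, 2]}
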
Example #2
def test_is_in():
    arr = pa.array([1, 2, None, 1, 2, 3])

    result = pc.is_in(arr, value_set=pa.array([1, 3, None]))
    assert result.to_pylist() == [True, False, True, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True)
    assert result.to_pylist() == [True, False, False, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3]))
    assert result.to_pylist() == [True, False, False, True, False, True]

    result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True)
    assert result.to_pylist() == [True, False, False, True, False, True]
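
Note that the value_set keyword used in this test and the explicit pc.SetLookupOptions object used in the other examples are two spellings of the same call. A short sketch, assuming the same pa/pc imports:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, None, 1, 2, 3])
# Passing value_set directly and wrapping it in SetLookupOptions give the same result.
opts = pc.SetLookupOptions(value_set=pa.array([1, 3]))
assert (pc.is_in(arr, options=opts).to_pylist()
        == pc.is_in(arr, value_set=pa.array([1, 3])).to_pylist())
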
Example #3
    def load_procedures(self, procedures=None, person_id=None):
        procedures = procedures or self.Procedure.fetch('procedure')

        fp = (CompassFile & 'type = "procedure"').fetch1('file')
        tab = csv.read_csv(fp)
        tab = tab.filter(
            pc.is_in(
                tab['order_name'],
                options=pc.SetLookupOptions(value_set=pa.array(procedures))))
        if person_id is not None:
            tab = tab.filter(
                pc.is_in(tab['person_id'],
                         options=pc.SetLookupOptions(
                             value_set=pa.array(person_id))))
        return tab
    def dates(
        self,
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> List[datetime.datetime]:

        if resampling_frequency is not None:
            raise ValueError("Resampling is not supported by this provider")

        timer = PerfTimer()

        table = self._get_or_read_table(["DATE", "REAL"])
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        intersected_dates = find_intersected_dates_between_realizations(table)
        et_find_unique_ms = timer.lap_ms()

        LOGGER.debug(f"dates() took: {timer.elapsed_ms()}ms ("
                     f"read={et_read_ms}ms, "
                     f"filter={et_filter_ms}ms, "
                     f"find_unique={et_find_unique_ms}ms)")

        return intersected_dates.astype(datetime.datetime).tolist()
Example #5
    def load_batches(self, key):
        k = Cohort.Alignment & key
        k_df = pd.DataFrame(k.fetch(as_dict=True))
        f = (CompassFile & 'type = "flowsheet"' & key).fetch1('file')
        dat = ds.dataset('./compass/flowsheet', format='parquet').to_table()
        tab = dat.filter(
            pc.is_in(dat['encounter_id'],
                     options=pc.SetLookupOptions(
                         value_set=pa.array(k_df.encounter_id.unique()))))
        for b in tab.to_batches(max_chunksize=200):
            df = b.to_pandas().rename(
                columns={
                    'flowsheet_time': 'time',
                    'flowsheet_value': 'value',
                    'flowsheet_days_since_birth': 'days_from_dob'
                })
            df['cohort_id'] = key['cohort_id']
            df['procedure'] = key['procedure']
            df.days_from_dob = pd.to_numeric(df.days_from_dob, errors='coerce')
            df['value'] = df['value'].replace({
                'Not delirious- CAM-': 'N',
                'Unable to assess': 'U',
                'Delirious- CAM+': 'Y',
                '': np.nan
            })

            yield pd.merge(df.dropna(),
                           k_df,
                           on=['cohort_id', 'encounter_id', 'procedure'])
Example #6
    def isin(self, values):
        if pa_version_under2p0:
            return super().isin(values)

        value_set = [
            pa_scalar.as_py() for pa_scalar in
            [pa.scalar(value, from_pandas=True) for value in values]
            if pa_scalar.type in (pa.string(), pa.null())
        ]

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(value_set):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
def _split_into_per_realization_tables(table: pa.Table) -> Dict[int, pa.Table]:
    per_real_tables: Dict[int, pa.Table] = {}
    unique_reals = table.column("REAL").unique().to_pylist()
    for real in unique_reals:
        # pylint: disable=no-member
        mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        real_table = table.filter(mask).drop(["REAL"])
        per_real_tables[real] = real_table

    return per_real_tables
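
Since each mask above tests membership in a one-element value_set, a plain equality comparison is an equivalent formulation. A sketch of the same split using pc.equal (not the original author's code, just an alternative for non-null REAL values):

from typing import Dict

import pyarrow as pa
import pyarrow.compute as pc

def split_by_real_with_equal(table: pa.Table) -> Dict[int, pa.Table]:
    # Same behaviour as the function above, using equality instead of is_in.
    per_real_tables: Dict[int, pa.Table] = {}
    for real in table.column("REAL").unique().to_pylist():
        mask = pc.equal(table["REAL"], pa.scalar(real))
        per_real_tables[real] = table.filter(mask).drop(["REAL"])
    return per_real_tables
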
    def get_vectors_for_date_df(
        self,
        date: datetime.datetime,
        vector_names: Sequence[str],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        # Note that we use MS here to be aligned with storage type in arrow file
        lookup_date = pa.scalar(date, type=pa.timestamp("ms"))
        mask = pc.equal(table["DATE"], lookup_date)

        if realizations:
            real_mask = pc.is_in(table["REAL"],
                                 value_set=pa.array(realizations))
            mask = pc.and_(mask, real_mask)

        table = table.drop(["DATE"])

        # table = table.filter(mask).combine_chunks()
        table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas()
        # df = table.to_pandas(split_blocks=True, zero_copy_only=True)
        # del table  # not necessary, but a good practice
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

        return df
    def get_vectors_for_date_df(
        self,
        date: datetime.datetime,
        vector_names: Sequence[str],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if not vector_names:
            raise ValueError("List of requested vector names is empty")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            real_mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(real_mask)
        et_filter_ms = timer.lap_ms()

        np_lookup_date = np.datetime64(date, "ms")
        table = sample_segmented_multi_real_table_at_date(table, np_lookup_date)

        et_resample_ms = timer.lap_ms()
        table = table.drop(["DATE"])

        df = table.to_pandas()
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"resample={et_resample_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
        )

        return df
Example #10
    def demographics(self):
        cid = self.cohort_id
        enc_df = (Cohort.Encounter() & {
            'cohort_id': cid
        }).fetch(format='frame').reset_index()
        fp = (CompassFile & {'type': 'encounter'}).fetch1('file')
        tab = csv.read_csv(fp)
        tab = tab.filter(
            pc.is_in(tab['encounter_id'],
                     options=pc.SetLookupOptions(
                         value_set=pa.array(enc_df.encounter_id.unique()))))
        enc_df = tab.to_pandas()
        cols = ['gender', 'age', 'death_during_encounter']
        categorical = ['gender', 'death_during_encounter']
        return TableOne(enc_df,
                        cols,
                        categorical,
                        nonnormal=['age'],
                        missing=False)
Example #11
def binary_col(op, l, r):
    """
    Interpreter for executing binary operator expressions
    """
    if op == "+": return compute.add_checked(l, r)
    if op == "*": return compute.multiply_checked(l, r)
    if op == '-': return compute.subtract_checked(l, r)
    if op == "=": return compute.equal(l, r)
    if op == "<>": return compute.not_equal(l, r)
    if op == "!=": return compute.not_equal(l, r)
    if op == "or": return compute.or_(l, r)
    if op == "<": return compute.less(l, r)
    if op == ">": return compute.greater(l, r)
    if op == "/": return compute.divide_checked(l, r)
    if op == "and": return compute.and_(l, r)
    if op == "in": return compute.is_in(l, r)
    if op == "==": return compute.equal(l, r)
    if op == "<=": return compute.less_equal(l, r)
    if op == ">=": return compute.greater_equal(l, r)
    raise Exception("binary op not implemented")
    def get_vectors_df(
        self,
        vector_names: Sequence[str],
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if not vector_names:
            raise ValueError("List of requested vector names is empty")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        if resampling_frequency is not None:
            table = resample_segmented_multi_real_table(table, resampling_frequency)
        et_resample_ms = timer.lap_ms()

        df = table.to_pandas(timestamp_as_object=True)
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_df({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"resample={et_resample_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
        )

        return df
Example #13
    def isin(self, values) -> npt.NDArray[np.bool_]:
        if pa_version_under2p0:
            fallback_performancewarning(version="2")
            return super().isin(values)

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(self._data,
                          value_set=pa.array(values, from_pandas=True),
                          **kwargs)
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
    def get_column_data(
        self, column_names: Sequence[str], realizations: Optional[Sequence[int]] = None
    ) -> pd.DataFrame:

        timer = PerfTimer()

        # For now guard against requesting the same column multiple times since that
        # will cause the conversion to pandas below to throw
        # This should probably raise an exception instead?
        if len(set(column_names)) != len(column_names):
            LOGGER.warning("The column_names argument contains duplicate names")
            column_names = list(dict.fromkeys(column_names))

        # We always want to include the REAL column but watch out in case it is
        # already included in the column_names list
        columns_to_get = (
            ["REAL", *column_names] if "REAL" not in column_names else column_names
        )

        table = self._cached_reader.read_all().select(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas(ignore_metadata=True)
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_column_data() took: {timer.elapsed_ms()}ms "
            f"(read={et_read_ms}ms, filter={et_filter_ms}ms, to_pandas={et_to_pandas_ms}ms), "
            f"#cols={len(column_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}"
        )

        return df
    def tb_compare_values(tb, tb_cmp, skip_null=True):

        # row count used below as the broadcast coefficient for the
        # per-column comparison result
        rows = tb.row_count

        col_names = tb.column_names
        comp_col_names = tb_cmp.column_names

        row_indices = tb.index.index_values
        row_indices_cmp = tb_cmp.index.index_values

        col_comp_res = compare_array_like_values(
            l_org_ar=pa.array(col_names), l_cmp_ar=pa.array(comp_col_names))
        row_comp_res = compare_array_like_values(
            l_org_ar=pa.array(row_indices), l_cmp_ar=pa.array(row_indices_cmp))
        bcast_col_comp_res = broadcast(ar=col_comp_res,
                                       broadcast_coefficient=rows)
        row_col_comp = compare_row_and_column(row=row_comp_res,
                                              columns=bcast_col_comp_res)

        tb_ar = tb.to_arrow().combine_chunks()
        tb_cmp_ar = tb_cmp.to_arrow().combine_chunks()

        col_data_map = {}
        for col_name, validity, row_col_validity in zip(
                col_names, col_comp_res, row_col_comp):
            if validity.as_py():
                chunk_ar_org = tb_ar.column(col_name)
                chunk_ar_cmp = tb_cmp_ar.column(col_name)
                data_cmp_res = a_compute.is_in(chunk_ar_org,
                                               value_set=chunk_ar_cmp,
                                               skip_nulls=skip_null)
                print(data_cmp_res, row_col_validity)
                col_data_map[col_name] = compare_two_arrays(
                    data_cmp_res, row_col_validity)
            else:
                col_data_map[col_name] = pa.array(
                    populate_column_with_single_value(False, tb.row_count))

        is_in_values = list(col_data_map.values())
        return cn.Table.from_list(tb.context, col_names, is_in_values)
    def get_vectors_df(
        self,
        vector_names: Sequence[str],
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> pd.DataFrame:

        if resampling_frequency is not None:
            raise ValueError("Resampling is not supported by this provider")

        timer = PerfTimer()

        columns_to_get = ["DATE", "REAL"]
        columns_to_get.extend(vector_names)
        table = self._get_or_read_table(columns_to_get)
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        df = table.to_pandas(timestamp_as_object=True)
        # df = table.to_pandas(split_blocks=True, self_destruct=True)
        # del table  # not necessary, but a good practice
        et_to_pandas_ms = timer.lap_ms()

        LOGGER.debug(
            f"get_vectors_df() took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"to_pandas={et_to_pandas_ms}ms), "
            f"#vecs={len(vector_names)}, "
            f"#real={len(realizations) if realizations else 'all'}, "
            f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

        return df
    def dates(
        self,
        resampling_frequency: Optional[Frequency],
        realizations: Optional[Sequence[int]] = None,
    ) -> List[datetime.datetime]:

        timer = PerfTimer()

        table = self._get_or_read_table(["DATE", "REAL"])
        et_read_ms = timer.lap_ms()

        if realizations:
            mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
            table = table.filter(mask)
        et_filter_ms = timer.lap_ms()

        if resampling_frequency is not None:
            unique_dates_np = table.column("DATE").unique().to_numpy()
            min_raw_date = np.min(unique_dates_np)
            max_raw_date = np.max(unique_dates_np)
            intersected_dates = generate_normalized_sample_dates(
                min_raw_date, max_raw_date, resampling_frequency
            )
        else:
            intersected_dates = find_intersected_dates_between_realizations(table)

        et_find_unique_ms = timer.lap_ms()

        LOGGER.debug(
            f"dates({resampling_frequency}) took: {timer.elapsed_ms()}ms ("
            f"read={et_read_ms}ms, "
            f"filter={et_filter_ms}ms, "
            f"find_unique={et_find_unique_ms}ms)"
        )

        return intersected_dates.astype(datetime.datetime).tolist()
def find_intersected_dates_between_realizations(table: pa.Table) -> np.ndarray:
    """Find the intersection of dates present in all the realizations
    The input table must contain both REAL and DATE columns, but this function makes
    no assumptions about sorting of either column"""

    unique_reals = table.column("REAL").unique().to_numpy()

    date_intersection = None
    for real in unique_reals:
        # pylint: disable=no-member
        real_mask = pc.is_in(table["REAL"], value_set=pa.array([real]))
        dates_in_real = table.filter(real_mask).column(
            "DATE").unique().to_numpy()
        if date_intersection is None:
            date_intersection = dates_in_real
        else:
            date_intersection = np.intersect1d(date_intersection,
                                               dates_in_real,
                                               assume_unique=True)

    if date_intersection is not None:
        return date_intersection

    return np.empty(0, dtype=np.datetime64)
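
A toy usage sketch with invented data; only the date shared by both realizations survives the intersection (assumes the pa/pc/np imports used by the function above are in scope):

import datetime
import pyarrow as pa

table = pa.table({
    "REAL": [0, 0, 1],
    "DATE": pa.array(
        [datetime.datetime(2020, 1, 1),
         datetime.datetime(2020, 2, 1),
         datetime.datetime(2020, 1, 1)],
        type=pa.timestamp("ms")),
})
print(find_intersected_dates_between_realizations(table))
# ['2020-01-01T00:00:00.000']
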
def compare_array_like_values(l_org_ar, l_cmp_ar, skip_null=True):
    return a_compute.is_in(l_org_ar,
                           value_set=l_cmp_ar,
                           skip_nulls=skip_null)