Example #1
class TestDataFrameEvalWithFrame(object):

    def setup_method(self, method):
        self.frame = DataFrame(randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self, parser, engine):
        res = self.frame.eval('a + b', engine=engine, parser=parser)
        expect = self.frame.a + self.frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self, parser, engine):
        res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(res, expect)

    def test_invalid_type_for_operator_raises(self, parser, engine):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        ops = '+', '-', '*', '/'
        for op in ops:
            with tm.assert_raises_regex(TypeError,
                                        r"unsupported operand type\(s\) "
                                        "for .+: '.+' and '.+'"):
                df.eval('a {0} b'.format(op), engine=engine, parser=parser)
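These tests take parser and engine as pytest fixture arguments. A minimal conftest.py sketch of how such fixtures might be parametrized (the fixture names mirror the tests above; the numexpr skip is an assumption, standing in for tm.skip_if_no_ne):

import pytest

@pytest.fixture(params=["python", "numexpr"])
def engine(request):
    if request.param == "numexpr":
        # Skip numexpr-engine runs when the optional dependency is missing.
        pytest.importorskip("numexpr")
    return request.param

@pytest.fixture(params=["pandas", "python"])
def parser(request):
    return request.param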
Example #2
class TestDataFrameEvalNumExprPandas(tm.TestCase):

    @classmethod
    def setUpClass(cls):
        super(TestDataFrameEvalNumExprPandas, cls).setUpClass()
        cls.engine = 'numexpr'
        cls.parser = 'pandas'
        tm.skip_if_no_ne()

    def setUp(self):
        self.frame = DataFrame(randn(10, 3), columns=list('abc'))

    def tearDown(self):
        del self.frame

    def test_simple_expr(self):
        res = self.frame.eval('a + b', engine=self.engine, parser=self.parser)
        expect = self.frame.a + self.frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self):
        res = self.frame.eval('a[a < 1] + b', engine=self.engine,
                              parser=self.parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(res, expect)

    def test_invalid_type_for_operator_raises(self):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        ops = '+', '-', '*', '/'
        for op in ops:
            with tm.assertRaisesRegexp(TypeError,
                                       r"unsupported operand type\(s\) for "
                                       r".+: '.+' and '.+'"):
                df.eval('a {0} b'.format(op), engine=self.engine,
                        parser=self.parser)
Example #3
class TestDataFrameEvalNumExprPandas(object):
    @classmethod
    def setup_class(cls):
        cls.engine = 'numexpr'
        cls.parser = 'pandas'
        tm.skip_if_no_ne()

    def setup_method(self, method):
        self.frame = DataFrame(randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self):
        res = self.frame.eval('a + b', engine=self.engine, parser=self.parser)
        expect = self.frame.a + self.frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self):
        res = self.frame.eval('a[a < 1] + b',
                              engine=self.engine,
                              parser=self.parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(res, expect)

    def test_invalid_type_for_operator_raises(self):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        ops = '+', '-', '*', '/'
        for op in ops:
            with tm.assert_raises_regex(
                    TypeError, r"unsupported operand type\(s\) "
                    r"for .+: '.+' and '.+'"):
                df.eval('a {0} b'.format(op),
                        engine=self.engine,
                        parser=self.parser)
Example #4
class TestDataFrameEvalWithFrame(object):

    def setup_method(self, method):
        self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self, parser, engine):
        res = self.frame.eval('a + b', engine=engine, parser=parser)
        expect = self.frame.a + self.frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self, parser, engine):
        res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(res, expect)

    @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval('a {0} b'.format(op), engine=engine, parser=parser)
Example #6
 def test_invalid_type_for_operator_raises(self, parser, engine):
     df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
     ops = '+', '-', '*', '/'
     for op in ops:
         with tm.assert_raises_regex(
                 TypeError, r"unsupported operand type\(s\) "
                 "for .+: '.+' and '.+'"):
             df.eval('a {0} b'.format(op), engine=engine, parser=parser)
Example #7
 def test_invalid_type_for_operator_raises(self, parser, engine):
     df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
     ops = '+', '-', '*', '/'
     for op in ops:
         with tm.assert_raises_regex(TypeError,
                                     r"unsupported operand type\(s\) "
                                     "for .+: '.+' and '.+'"):
             df.eval('a {0} b'.format(op), engine=engine, parser=parser)
Example #8
 def test_invalid_type_for_operator_raises(self):
     df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
     ops = '+', '-', '*', '/'
     for op in ops:
         with tm.assertRaisesRegexp(TypeError,
                                    r"unsupported operand type\(s\) for "
                                    r".+: '.+' and '.+'"):
             df.eval('a {0} b'.format(op), engine=self.engine,
                     parser=self.parser)
Example #10
def eval_formula(df: DataFrame, formula: str) -> DataFrame:
    try:
        result = df.eval(formula)
    except Exception:
        # for all cases not handled by NumExpr
        result = df.eval(formula, engine='python')

    try:
        # eval can introduce Infinity values (when dividing by 0),
        # which do not have a JSON representation.
        # Let's replace them by NaN:
        return result.replace([np.inf, -np.inf], np.nan)
    except Exception:
        # `result` is not a Series
        return result
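A quick usage sketch for eval_formula (the column names are invented for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [0.0, 4.0]})
# 'a / b' divides by zero in the first row; the resulting inf is
# replaced by NaN so the output stays JSON-serializable.
print(eval_formula(df, "a / b"))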
Example #11
def aggregate_match_sims(simdf: DataFrame, agg_func: str):
    """Aggregate similarities using a numexpr aggregation function.

    Extra functions available: ``@max(*a)``, ``@min(*a)``, ``@mean(*a)``, ``@pow(a,b)``.

    See also:
        `Pandas eval <https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#supported-syntax>`_
        `Numexpr <https://numexpr.readthedocs.io/>`_

    Args:
        simdf: DataFrame of similarities, where columns are matcher names.
        agg_func: Numexpr-style function.
    """

    import warnings

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        funcs = {
            "max": lambda *args: np.nanmax(args, axis=0),
            "min": lambda *args: np.nanmin(args, axis=0),
            "mean": lambda *args: np.nanmean(args, axis=0),
            "pow": lambda a, b: a**b,
        }
        if agg_func in funcs:
            agg = funcs[agg_func](*(simdf[c] for c in simdf))  # type: ignore
        else:
            agg = simdf.eval(agg_func, local_dict=funcs, engine="python")
        return pd.Series(agg, index=simdf.index, name=0)
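Usage might look like the following (matcher column names are invented); both the built-in reducer path and a free-form expression evaluated by DataFrame.eval are shown:

import numpy as np
import pandas as pd

simdf = pd.DataFrame({"jaccard": [0.2, np.nan], "levenshtein": [0.4, 0.9]})

# Built-in reducer: row-wise nan-aware mean over all matcher columns.
print(aggregate_match_sims(simdf, "mean"))

# Free-form expression over the matcher columns; per the docstring, the extra
# functions would be referenced as @max(...), @mean(...), and so on.
print(aggregate_match_sims(simdf, "(jaccard + levenshtein) / 2"))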
Example #12
    def apply_to_dataframe(
        self,
        df: pd.DataFrame,
        column_name: str = "unnamed_response",
        do_query: bool = False,
    ) -> None:
        """Apply trained models to an arbitrary dataframe.

        This function will augment the dataframe with a new column
        (with a name given by the ``column_name`` argument) if it
        doesn't already exist. If the dataframe is empty this function
        does nothing.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe to read and augment.
        column_name : str
            Name to give the BDT response variable.
        do_query : bool
            Perform a query on the dataframe to select events
            belonging to the region associated with training result;
            necessary if the dataframe hasn't been pre-filtered.

        Examples
        --------
        >>> from tdub.apply import FoldedTrainSummary
        >>> from tdub.frames import raw_dataframe
        >>> df = raw_dataframe("/path/to/file.root")
        >>> fr_1j1b = FoldedTrainSummary("/path/to/folded_training_1j1b")
        >>> fr_1j1b.apply_to_dataframe(df, do_query=True)

        """
        if df.shape[0] == 0:
            log.info("Dataframe is empty, doing nothing")
            return None

        if column_name not in df.columns:
            log.info(f"Creating {column_name} column")
            df[column_name] = -9999.0

        if do_query:
            log.info(f"applying selection filter '{self.selection_used}'")
            mask = df.eval(self.selection_used)
            X = df[self.features].to_numpy()[mask]
        else:
            X = df[self.features].to_numpy()

        if X.shape[0] == 0:
            return None

        y0 = self.model0.predict_proba(X)[:, 1]
        y1 = self.model1.predict_proba(X)[:, 1]
        y2 = self.model2.predict_proba(X)[:, 1]
        y = np.mean([y0, y1, y2], axis=0)

        if do_query:
            df.loc[mask, column_name] = y
        else:
            df[column_name] = y
Example #13
def filter_records(df: pd.DataFrame, criteria: str) -> pd.DataFrame:
    try:
        result = df[df.eval(criteria)]
        return result
    except Exception as e:
        logger.log_error(e)
        return df
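filter_records turns the boolean result of df.eval(criteria) into a row mask, and falls back to returning the input unchanged on any error. A usage sketch with invented columns:

import pandas as pd

df = pd.DataFrame({"age": [25, 40], "country": ["DE", "US"]})
# Keeps only the rows where the evaluated condition is True.
subset = filter_records(df, "age > 30 and country == 'US'")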
Example #14
    def expr(data: pd.DataFrame, step: dict):
        # aliases
        op = step['operation']
        k = step['column'] if 'column' in step else None
        k_new = k if 'new-column' not in step else step['new-column']
        c_expr = step['expression']

        if op == 'text-transform':
            f_expr = eval('lambda value: %s' % c_expr)
            data[k_new] = data[k].apply(f_expr)

        elif op == 'categorize':
            params = dict(data=data, col_name=k, categories=eval(c_expr))
            params.update({'new_col_name': k_new} if 'new-column' in
                          step else {})
            categorize(**params)

        elif op == 'fill-na':
            fill = c_expr
            if c_expr in ['mean', 'max', 'min', 'median']:
                fill = data.eval('%s.%s()' % (k, c_expr))
            data[k].fillna(fill, inplace=True)

        elif op == 'drop-na':
            params = eval(c_expr)
            dropna(data, **params)

        elif op == 'drop-unique':
            params = eval(c_expr)
            drop_columns_with_unique_values(data, **params)

        return data
Example #15
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    result = (df.eval(
        "team_1_score = team_1_powers * 15 + team_1_tens * 10 - team_1_negs * 5 + team_1_bonus_points"
    ).eval(
        "team_2_score = team_2_powers * 15 + team_2_tens * 10 - team_2_negs * 5 + team_2_bonus_points"
    ).eval("point_diff = team_1_score - team_2_score"))
    return result
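The three chained eval calls could also be collapsed into one, since DataFrame.eval accepts a multi-line expression with several assignments, later lines referring to columns assigned earlier; a sketch of the equivalent single call:

import pandas as pd

def add_features_single_eval(df: pd.DataFrame) -> pd.DataFrame:
    # One eval call; point_diff references the two columns assigned above it.
    return df.eval(
        """
team_1_score = team_1_powers * 15 + team_1_tens * 10 - team_1_negs * 5 + team_1_bonus_points
team_2_score = team_2_powers * 15 + team_2_tens * 10 - team_2_negs * 5 + team_2_bonus_points
point_diff = team_1_score - team_2_score
"""
    )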
Example #16
 def test_eval_resolvers_as_list(self):
     # GH 14095
     df = DataFrame(np.random.randn(10, 2), columns=list("ab"))
     dict1 = {"a": 1}
     dict2 = {"b": 2}
     assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
     assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
Example #17
def make_binary_array(
    data: pd.DataFrame,
    aggregation_level: Optional[AggregationLevel] = None,
    country=None,
    fips=None,
    state=None,
    states=None,
    on=None,
    after=None,
    before=None,
):
    """Create a binary array selecting rows in `data` matching the given parameters."""
    query_parts = []
    # aggregation_level is almost always set. The exception is `DatasetFilter` which is used to
    # get all data in the USA, at all aggregation levels.
    if aggregation_level:
        query_parts.append(f'aggregate_level == "{aggregation_level.value}"')
    if country:
        query_parts.append("country == @country")
    if state:
        query_parts.append("state == @state")
    if fips:
        query_parts.append("fips == @fips")
    if states:
        query_parts.append("state in @states")
    if on:
        query_parts.append("date == @on")
    if after:
        query_parts.append("date > @after")
    if before:
        query_parts.append("date < @before")
    return data.eval(" and ".join(query_parts))
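The @name tokens in the query parts resolve against local variables in make_binary_array's scope when data.eval runs, so callers pass plain Python values. A hypothetical call (AggregationLevel.COUNTY is assumed from the surrounding codebase):

import pandas as pd

mask = make_binary_array(
    data,
    aggregation_level=AggregationLevel.COUNTY,  # assumed enum member
    state="CA",
    after=pd.Timestamp("2020-03-01"),
)
filtered = data.loc[mask]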
Example #18
 def test_eval_resolvers_as_list(self):
     # GH 14095
     df = DataFrame(randn(10, 2), columns=list('ab'))
     dict1 = {'a': 1}
     dict2 = {'b': 2}
     assert (df.eval('a + b', resolvers=[dict1,
                                         dict2]) == dict1['a'] + dict2['b'])
     assert (pd.eval('a + b', resolvers=[dict1,
                                         dict2]) == dict1['a'] + dict2['b'])
Example #19
 def test_eval_resolvers_as_list(self):
     # GH 14095
     df = DataFrame(np.random.randn(10, 2), columns=list('ab'))
     dict1 = {'a': 1}
     dict2 = {'b': 2}
     assert (df.eval('a + b', resolvers=[dict1, dict2]) ==
             dict1['a'] + dict2['b'])
     assert (pd.eval('a + b', resolvers=[dict1, dict2]) ==
             dict1['a'] + dict2['b'])
Example #20
    def test_eval_resolvers_combined(self):
        # GH 34966
        df = DataFrame(np.random.randn(10, 2), columns=list("ab"))
        dict1 = {"c": 2}

        # Both input and default index/column resolvers should be usable
        result = df.eval("a + b * c", resolvers=[dict1])

        expected = df["a"] + df["b"] * dict1["c"]
        tm.assert_series_equal(result, expected)
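Resolvers are not specific to DataFrame.eval; as Examples #16, #18 and #19 show, the module-level pd.eval accepts them too, supplying name lookups without any frame involved:

import pandas as pd

scope = {"x": 3, "y": 4}
assert pd.eval("x * y", resolvers=[scope]) == 12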
Example #21
def get_transition_probability(df_reference: pd.DataFrame,
                               df_future: pd.DataFrame, source_group: str,
                               target_group: str) -> pd.DataFrame:
    """
    Obtains transition probabilities from a given reference patient group to
    other patient groups at any visit.

    Args:
        df_reference: The reference patient groups.
        df_future: The future patient groups.
        source_group: The reference patient group.
        target_group: The target patient group.

    Returns:
        The transition probability.
    """

    df_reference = df_reference.query('classification == @source_group').drop(
        'visit_id', axis=1).set_index('subject_id')

    df_future = df_future.set_index('subject_id')

    df_future = df_future.loc[df_reference.index & df_future.index]

    if df_future.shape[0] < 1:

        return None

    df_future.eval('is_target = (classification == @target_group)',
                   inplace=True)

    future_merged = df_future.groupby('subject_id')['is_target'].max()

    return pd.DataFrame(
        {
            'source': source_group,
            'target': target_group,
            'probability': future_merged.mean(),
            'count': future_merged.sum()
        },
        index=[0])
Example #22
def filter_records(df: pd.DataFrame, criteria: str) -> pd.DataFrame:
    """
    :param df: Data Set
    :param criteria: python condition
    :return: new Data Set
    """
    try:
        result = df[df.eval(criteria)]
        return result
    except Exception as e:
        logger.log_error(e)
        return df
Example #23
def make_rows_key(
    data: pd.DataFrame,
    aggregation_level: Optional[AggregationLevel] = None,
    country=None,
    fips=None,
    state=None,
    states=None,
    on=None,
    after=None,
    before=None,
    location_id_matches: Optional[str] = None,
    exclude_county_999: bool = False,
    exclude_fips_prefix: Optional[str] = None,
):
    """Create a binary array or slice selecting rows in `data` matching the given parameters."""
    query_parts = []
    # aggregation_level is almost always set. The exception is `DatasetFilter` which is used to
    # get all data in the USA, at all aggregation levels.
    if aggregation_level:
        query_parts.append(f'aggregate_level == "{aggregation_level.value}"')
    if country:
        query_parts.append("country == @country")
    if state:
        query_parts.append("state == @state")
    if fips:
        query_parts.append("fips == @fips")
    if states:
        query_parts.append("state in @states")
    if on:
        query_parts.append("date == @on")
    if after:
        query_parts.append("date > @after")
    if before:
        query_parts.append("date < @before")
    if exclude_county_999:
        # I don't think it is possible to use the default fast eval to match a substring. Instead
        # create a binary Series here and refer to it from the query.
        not_county_999 = data[CommonFields.FIPS].str[-3:] != "999"
        query_parts.append("@not_county_999")
    if location_id_matches:
        location_id_match_mask = data.index.get_level_values(
            CommonFields.LOCATION_ID).str.match(location_id_matches)
        query_parts.append("@location_id_match_mask")
    if exclude_fips_prefix:
        not_fips_prefix = data[
            CommonFields.FIPS].str[0:2] != exclude_fips_prefix
        query_parts.append("@not_fips_prefix")

    if query_parts:
        return data.eval(" and ".join(query_parts))
    else:
        # Select all rows
        return slice(None, None, None)
Example #24
def order_cols(df: pd.DataFrame,
               cols: Mapping[str, str] = None) -> pd.DataFrame:
    """

    At first adds special column 'i': row index, if it is used in cols.values
    :param df:
    :param cols: mapping out col names to expressions for pd.DataFrame.eval() (using input col names) or just input col names
    :return:
    """
    df = df.copy()

    # Add a row index column so expressions can reference it
    def i_term_is_used() -> bool:
        for in_col in cols.values():
            for term in in_col.split():
                if 'i' in term:
                    return True
        return False

    if i_term_is_used():
        df['i'] = np.arange(
            df.shape[0])  # pd.RangeIndex(df.shape[0], name='rec_num') has the same effect

    df_out = pd.DataFrame(index=df.index)
    #cols_use = omegaconf.OmegaConf.to_container(cols)  # make editable copy
    # if cols_use.pop('rec_num', None):  # 'rec_num' in df_out
    #     df_out['rec_num'] = df['rec_num']

    dict_rename = {}
    for out_col, in_col in cols.items():
        if in_col.isidentifier() and in_col not in dict_rename:
            if in_col not in df.columns:
                df[in_col] = None
            dict_rename[in_col] = out_col
        else:
            df_out[out_col] = df.eval(in_col)

    df_to_rename = df[dict_rename.keys()]
    # removing index if exists because df.rename() renames only columns
    col_index = dict_rename.pop('index',
                                None)  # index will be placed in this column
    if col_index:
        df_out[col_index] = df_out.index
    df_out = df_out.join(df_to_rename.rename(columns=dict_rename, copy=False))

    cols_iter = iter(cols.items())
    index_name, in_1st_col = next(cols_iter)
    if 'index' not in in_1st_col:  # original index is not the 1st column, so replace it
        df_out.set_index(index_name, inplace=True)
        return df_out[[k for k, v in cols_iter]]
    #df_out['DATE'] = df_out['DATE'].dt.tz_convert(None)
    return df_out[cols.keys()]
Example #25
    def _construct_bands(self, quotes: pd.DataFrame) -> pd.DataFrame:
        # Standard Bolling Bands Algorithm
        quotes['TP'] = quotes.eval("(high + low + close) / 3")

        quotes['std_dev'] = quotes['TP'].rolling(self.num_periods).std()

        quotes['band_center'] = self._get_band_center(quotes)
        quotes['band_upper'] = quotes['band_center'] + self.deviations * quotes['std_dev']
        quotes['band_lower'] = quotes['band_center'] - self.deviations * quotes['std_dev']

        # Long term rolling standard deviation, used to evaluate "consolidation periods".
        quotes['long_term_std'] = quotes['TP'].rolling(self.long_periods).std()

        return quotes
Example #26
    def __init__(
        self,
        quotes: pd.DataFrame,
        num_periods: int,
        deviations: float,
        long_periods: int = 60,
        pace: Optional[float] = None
    ) -> None:
        if pace is None:
            signal = quotes.eval("(high + low + close) / 3").values
            pace = signal.var() / np.correlate(signal, signal, 'valid')
        self.pace = pace

        super().__init__(quotes, num_periods, deviations, long_periods)
Example #27
    def __init__(self,
                 quotes: pd.DataFrame,
                 short_periods: int,
                 long_periods: int,
                 signal_periods: int,
                 tolerance: float = 2e-1,
                 pace: Optional[float] = None) -> None:
        if pace is None:
            signal = quotes.eval("(high + low + close) / 3").values
            pace = signal.var() / np.correlate(signal, signal, 'valid')
        self.pace = pace

        super().__init__(quotes, short_periods, long_periods, signal_periods,
                         tolerance)
Example #28
class TestDataFrameEvalWithFrame:
    def setup_method(self):
        self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc"))

    def teardown_method(self):
        del self.frame

    def test_simple_expr(self, parser, engine):
        res = self.frame.eval("a + b", engine=engine, parser=parser)
        expect = self.frame.a + self.frame.b
        tm.assert_series_equal(res, expect)

    def test_bool_arith_expr(self, parser, engine):
        res = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        tm.assert_series_equal(res, expect)

    @pytest.mark.parametrize("op", ["+", "-", "*", "/"])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval(f"a {op} b", engine=engine, parser=parser)
Example #29
def plot_error_residuals(predictions: pd.DataFrame) -> None:
    points = (alt.Chart(
        predictions.eval("Residuals = predicted - real")).mark_circle(
            size=100).encode(
                alt.X("predicted",
                      title="Predicted",
                      scale=alt.Scale(zero=False)),
                alt.Y("Residuals", title="Residuals"),
                alt.Color("target"),
            ))

    rule = alt.Chart(pd.DataFrame([{
        "zero": 0
    }])).mark_rule().encode(alt.Y("zero"))

    st.altair_chart(points + rule, use_container_width=True)
Example #30
def enrich_etf_summary(clean_etf_summary: pd.DataFrame,
                       buy_params: Dict) -> pd.DataFrame:
    '''Create computed columns which can be used directly for determining whether to sell or buy'''

    vf = buy_params['volatility_factor']

    summary = (clean_etf_summary.eval(
        'pct_over_yhat_upper = (day_high - yhat_upper) / yhat_upper'
    ).eval('pct_below_yhat_lower = (yhat_lower - day_low) / yhat_lower').eval(
        'sell_flag = day_high > yhat_upper'
    ).eval('buy_flag = day_low < yhat_lower').assign(
        dividend_decimal=lambda df: df.dividend_yield.str.strip('%').astype(
            float).fillna(0) / 100
    ).eval(
        f'expected_return = 100 * {vf} * volatility + 100 * dividend_decimal'))

    return summary
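The volatility_factor is interpolated with an f-string above; it could equally be passed through eval's local-variable syntax, which avoids string formatting. A self-contained sketch of that step alone:

import pandas as pd

df = pd.DataFrame({"volatility": [0.1, 0.2], "dividend_decimal": [0.01, 0.0]})
vf = 1.5
# '@vf' resolves to the local Python variable at evaluation time.
df = df.eval("expected_return = 100 * @vf * volatility + 100 * dividend_decimal")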
Example #31
def interaction_sums(labeled_interactions: pd.DataFrame) -> pd.DataFrame:
    '''Sum interaction strengths by type across the region.'''
    within_region = labeled_interactions.eval('region_from == region_to')
    across_regions = ~within_region
    labeled_interactions = labeled_interactions.assign(
        **{
            TYPE_PREFIX + 'any_from': True,
            TYPE_PREFIX + 'any_to': True,
        })
    unit_types = [
        utype[:-5] if utype.endswith('_from') else utype[:-3]
        for utype in _get_available_types(labeled_interactions)
    ]
    regions = np.sort(
        np.array(
            list(
                set(labeled_interactions['region_from'].unique().tolist() +
                    labeled_interactions['region_to'].unique().tolist()))))
    measures = pd.DataFrame([], index=regions)
    for from_unit_type in unit_types:
        for to_unit_type in unit_types:
            measure_name = _intersum_measure_name(from_unit_type, to_unit_type)
            measures[measure_name] = _sum_interactions(labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{from_unit_type}_from']
                & labeled_interactions[f'{TYPE_PREFIX}{to_unit_type}_to']
                & within_region]).reindex(regions, fill_value=0)
    for unit_type in unit_types:
        measures[_intersum_measure_name(unit_type, 'out')] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_from']
                & across_regions]).reindex(regions, fill_value=0)
        measures[_intersum_measure_name(unit_type, 'all')] = _sum_interactions(
            labeled_interactions[labeled_interactions[
                f'{TYPE_PREFIX}{unit_type}_from']]).reindex(regions,
                                                            fill_value=0)
        measures[_intersum_measure_name('out', unit_type)] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_to']
                & across_regions],
            key='region_to').reindex(regions, fill_value=0)
        measures[_intersum_measure_name('all', unit_type)] = _sum_interactions(
            labeled_interactions[
                labeled_interactions[f'{TYPE_PREFIX}{unit_type}_to']],
            key='region_to').reindex(regions, fill_value=0)
    return measures
Example #32
def apply_filter_alert_by_epiweek(df: pd.DataFrame,
                                  view_name: str,
                                  epiweek: int = None):
    """

    :param df:
    :param view_name:
    :param epiweek:
    :return:
    """
    if epiweek is not None:
        mask = df.eval('epiweek=={}'.format(epiweek))
    else:
        mask = df.keys()

    df_alert = df[mask].copy().reset_index()

    return df_alert
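The str.format call above can likewise be replaced with @-resolution of the local name, so no value is spliced into the expression text; a sketch:

import pandas as pd

df = pd.DataFrame({"epiweek": [202001, 202002], "cases": [5, 7]})
epiweek = 202002
# '@epiweek' looks up the local variable instead of formatting it into the string.
mask = df.eval("epiweek == @epiweek")
df_alert = df[mask].copy().reset_index()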
Example #33
def get_transition_probability(
    X_reference: pd.DataFrame,
    X_future: pd.DataFrame,
    source_group: str,
    target_group: str,
) -> pd.DataFrame:
    """
    Obtains transition probabilities from a given reference patient group to
    other patient groups at any visit.

    Args:
        X_reference: the reference patient groups
        X_future: the future patient groups
        source_group: the reference patient group
        target_group: the target patient group
    """

    X_reference = (X_reference.query("classification == @source_group").drop(
        "visit_id", axis=1).set_index("subject_id"))

    X_future = X_future.set_index("subject_id").join(X_reference[[]],
                                                     how="inner")

    if X_future.shape[0] < 1:

        return None

    X_future = X_future.eval("is_target = (classification == @target_group)")

    future_merged = X_future.groupby("subject_id")["is_target"].max()

    return pd.DataFrame(
        {
            "source": source_group,
            "target": target_group,
            "probability": future_merged.mean(),
            "count": future_merged.sum(),
        },
        index=[0],
    )
Example #34
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval('a {0} b'.format(op), engine=engine, parser=parser)
Example #35
 def test_eval_object_dtype_binop(self):
     # GH#24883
     df = DataFrame({"a1": ["Y", "N"]})
     res = df.eval("c = ((a1 == 'Y') & True)")
     expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
     tm.assert_frame_equal(res, expected)
Example #36
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval(f"a {op} b", engine=engine, parser=parser)
Example #37
def integrities(interaction_sums: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame({
        name: interaction_sums.eval(expr)
        for name, expr in INTEGRITY_EXPRS.items()
    })
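INTEGRITY_EXPRS is a module-level mapping defined elsewhere in that codebase; a self-contained sketch with an invented expression shows the pattern of evaluating a dict of named expressions into the columns of a new frame:

import pandas as pd

interaction_sums = pd.DataFrame({"within": [10, 2], "outgoing": [5, 8]})
INTEGRITY_EXPRS = {  # invented expression for illustration
    "share_within": "within / (within + outgoing)",
}
print(pd.DataFrame({
    name: interaction_sums.eval(expr)
    for name, expr in INTEGRITY_EXPRS.items()
}))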