def test_not_aggregated(sort_by, sort_sets_by):
    """Pre-aggregated data and all-ones raw data must process identically."""
    # FIXME: this is not testing if aggregation used is count or sum
    params = {'sort_by': sort_by, 'sort_sets_by': sort_sets_by}

    aggregated = generate_data(aggregated=True)
    inter_agg, totals_agg = _process_data(aggregated, **params)

    unaggregated = generate_data()
    unaggregated.loc[:] = 1  # force every record to weight 1
    inter_unagg, totals_unagg = _process_data(unaggregated, **params)

    assert_series_equal(inter_agg, inter_unagg, check_dtype=False)
    assert_series_equal(totals_agg, totals_unagg, check_dtype=False)
def test_not_aggregated(sort_by, sort_sets_by):
    """Pre-aggregated data and all-ones raw data must process identically."""
    # FIXME: this is not testing if aggregation used is count or sum
    params = {'sort_by': sort_by, 'sort_sets_by': sort_sets_by,
              'sum_over': None}

    aggregated = generate_data(aggregated=True)
    df_agg, inter_agg, totals_agg = _process_data(aggregated, **params)

    unaggregated = generate_data()
    unaggregated.loc[:] = 1  # force every record to weight 1
    df_unagg, inter_unagg, totals_unagg = _process_data(unaggregated,
                                                        **params)

    assert_series_equal(inter_agg, inter_unagg, check_dtype=False)
    assert_series_equal(totals_agg, totals_unagg, check_dtype=False)

    # both per-record frames expose the internal value/bin columns
    assert set(df_agg.columns) == {'_value', '_bin'}
    assert set(df_unagg.columns) == {'_value', '_bin'}
    # one row per unaggregated record; one bin per intersection
    assert len(df_unagg) == len(unaggregated)
    assert df_unagg['_bin'].nunique() == len(inter_unagg)
def test_process_data_series(x, sort_by, sort_categories_by):
    """Series input: sum_over must be None; output ordering and shape."""
    assert x.name == 'value'

    # sum_over is rejected for Series input regardless of subset_size
    for subset_size in ['auto', 'sum', 'count']:
        for sum_over in ['abc', False]:
            with pytest.raises(ValueError,
                               match='sum_over is not applicable'):
                _process_data(x, sort_by=sort_by,
                              sort_categories_by=sort_categories_by,
                              subset_size=subset_size, sum_over=sum_over)

    total, df, intersections, totals = _process_data(
        x, subset_size='auto', sort_by=sort_by,
        sort_categories_by=sort_categories_by, sum_over=None)
    assert total == x.sum()
    assert intersections.name == 'value'

    # reordering x to the output index must reproduce intersections
    reordered = (x.reorder_levels(intersections.index.names)
                 .reindex(index=intersections.index))
    assert len(x) == len(reordered)
    assert reordered.index.is_unique
    assert_series_equal(reordered, intersections, check_dtype=False)

    if sort_by == 'cardinality':
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        assert is_ascending(intersections.index.to_frame().sum(axis=1))
        # TODO: within a same-degree group, the tuple of active names
        # should be in sort-order
    if sort_categories_by:
        assert is_ascending(totals.values[::-1])

    assert np.all(totals.index.values == intersections.index.names)
    assert np.all(df.index.names == intersections.index.names)
    assert set(df.columns) == {'_value', '_bin'}
    assert_index_equal(df['_value'].reorder_levels(x.index.names).index,
                       x.index)
    assert_array_equal(df['_value'], x)
    assert_index_equal(intersections.iloc[df['_bin']].index, df.index)
    assert len(df) == len(x)
def test_not_unique(sort_by, sort_categories_by):
    """Summing counts equals counting all-ones samples with dup indices."""
    params = {'sort_by': sort_by,
              'sort_categories_by': sort_categories_by,
              'subset_size': 'sum',
              'sum_over': None}

    counts = generate_counts()
    df_counts, inter_counts, totals_counts = _process_data(counts, **params)

    samples = generate_samples()['value']
    samples.loc[:] = 1  # force every sample to weight 1
    df_samples, inter_samples, totals_samples = _process_data(samples,
                                                              **params)

    assert_series_equal(inter_counts, inter_samples, check_dtype=False)
    assert_series_equal(totals_counts, totals_samples, check_dtype=False)

    assert set(df_counts.columns) == {'_value', '_bin'}
    assert set(df_samples.columns) == {'_value', '_bin'}
    # one row per sample; one bin per intersection
    assert len(df_samples) == len(samples)
    assert df_samples['_bin'].nunique() == len(inter_samples)
def test_subset_size_series(x):
    """subset_size semantics for Series: 'sum', 'count' and 'auto'."""
    params = {'sort_by': 'cardinality',
              'sort_categories_by': 'cardinality',
              'sum_over': None}

    total, df_sum, inter_sum, totals_sum = _process_data(
        x, subset_size='sum', **params)
    assert total == inter_sum.sum()

    if x.index.is_unique:
        # 'auto' behaves like 'sum' when the index is unique
        total, df, inter, totals = _process_data(
            x, subset_size='auto', **params)
        assert total == inter.sum()
        assert_frame_equal(df, df_sum)
        assert_series_equal(inter, inter_sum)
        assert_series_equal(totals, totals_sum)
    else:
        # 'auto' refuses ambiguous (duplicated) indices
        with pytest.raises(ValueError):
            _process_data(x, subset_size='auto', **params)

    total, df_count, inter_count, totals_count = _process_data(
        x, subset_size='count', **params)
    assert total == inter_count.sum()

    # counting records is equivalent to summing pre-computed group sizes
    total, df, inter, totals = _process_data(
        x.groupby(level=list(range(len(x.index.levels)))).count(),
        subset_size='sum', **params)
    assert total == inter.sum()
    assert_series_equal(inter, inter_count, check_names=False)
    assert_series_equal(totals, totals_count)
def test_process_data_series(X, sort_by, sort_sets_by):
    """Series input: sum_over must be None; output ordering and shape."""
    # sum_over is rejected for Series input
    with pytest.raises(ValueError, match='sum_over is not applicable'):
        _process_data(X, sort_by=sort_by, sort_sets_by=sort_sets_by,
                      sum_over=False)

    df, intersections, totals = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over=None)
    assert intersections.name == 'value'

    # reordering X to the output index must reproduce intersections
    reordered = (X.reorder_levels(intersections.index.names)
                 .reindex(index=intersections.index))
    assert len(X) == len(reordered)
    assert reordered.index.is_unique
    assert_series_equal(reordered, intersections, check_dtype=False)

    if sort_by == 'cardinality':
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        assert is_ascending(intersections.index.to_frame().sum(axis=1))
        # TODO: within a same-degree group, the tuple of active names
        # should be in sort-order
    if sort_sets_by:
        assert is_ascending(totals.values[::-1])

    assert np.all(totals.index.values == intersections.index.names)
    assert np.all(df.index.names == intersections.index.names)
    assert set(df.columns) == {'_value', '_bin'}
    assert_index_equal(df['_value'].reorder_levels(X.index.names).index,
                       X.index)
    assert_array_equal(df['_value'], X)
    assert_index_equal(intersections.iloc[df['_bin']].index, df.index)
    assert len(df) == len(X)
def test_process_data_frame(x, sort_by, sort_categories_by):
    """DataFrame input: sum_over selects the value column; extra columns
    are carried through; subset_size='count' matches sum over ones."""
    import warnings

    X = pd.DataFrame({'a': x})
    # pytest.warns(None) raises TypeError since pytest 7.0; capture
    # warnings without asserting on them instead (same semantics as the
    # old idiom, which only recorded warnings).
    with warnings.catch_warnings(record=True):
        total, df, intersections, totals = _process_data(
            X, sort_by=sort_by, sort_categories_by=sort_categories_by,
            sum_over='a', subset_size='auto')
    assert df is not X  # input frame must not be returned/mutated in place
    assert total == intersections.sum()

    # check equivalence to Series
    total1, df1, intersections1, totals1 = _process_data(
        x, sort_by=sort_by, sort_categories_by=sort_categories_by,
        subset_size='sum', sum_over=None)
    assert intersections.name == 'a'
    assert_frame_equal(df, df1.rename(columns={'_value': 'a'}))
    assert_series_equal(intersections, intersections1, check_names=False)
    assert_series_equal(totals, totals1)

    # check effect of extra column
    X = pd.DataFrame({'a': x, 'b': np.arange(len(x))})
    total2, df2, intersections2, totals2 = _process_data(
        X, sort_by=sort_by, sort_categories_by=sort_categories_by,
        sum_over='a', subset_size='auto')
    assert total2 == intersections2.sum()
    assert_series_equal(intersections, intersections2)
    assert_series_equal(totals, totals2)
    assert_frame_equal(df, df2.drop('b', axis=1))
    assert_array_equal(df2['b'], X['b'])  # disregard levels, tested above

    # check effect not dependent on order/name
    X = pd.DataFrame({'b': np.arange(len(x)), 'c': x})
    total3, df3, intersections3, totals3 = _process_data(
        X, sort_by=sort_by, sort_categories_by=sort_categories_by,
        sum_over='c', subset_size='auto')
    assert total3 == intersections3.sum()
    assert_series_equal(intersections, intersections3, check_names=False)
    assert intersections.name == 'a'
    assert intersections3.name == 'c'
    assert_series_equal(totals, totals3)
    assert_frame_equal(df.rename(columns={'a': 'c'}), df3.drop('b', axis=1))
    assert_array_equal(df3['b'], X['b'])

    # check subset_size='count': equals summing a column of ones
    X = pd.DataFrame({'b': np.ones(len(x), dtype=int), 'c': x})
    total4, df4, intersections4, totals4 = _process_data(
        X, sort_by=sort_by, sort_categories_by=sort_categories_by,
        sum_over='b', subset_size='auto')
    total5, df5, intersections5, totals5 = _process_data(
        X, sort_by=sort_by, sort_categories_by=sort_categories_by,
        subset_size='count', sum_over=None)
    assert total5 == intersections5.sum()
    assert_series_equal(intersections4, intersections5, check_names=False)
    assert intersections4.name == 'b'
    assert intersections5.name == 'size'
    assert_series_equal(totals4, totals5)
    assert_frame_equal(df4, df5)
def test_process_data(X, sort_by, sort_sets_by):
    """Basic processing: output reproduces input values in sorted order."""
    intersections, totals = _process_data(X, sort_by=sort_by,
                                          sort_sets_by=sort_sets_by)

    # reordering X to the output index must reproduce intersections
    reordered = (X.reorder_levels(intersections.index.names)
                 .reindex(index=intersections.index))
    assert len(X) == len(reordered)
    assert reordered.index.is_unique
    assert_series_equal(reordered, intersections, check_dtype=False)

    if sort_by == 'cardinality':
        assert is_ascending(intersections.values[::-1])
    else:
        # check degree order
        assert is_ascending(intersections.index.to_frame().sum(axis=1))
        # TODO: within a same-degree group, the tuple of active names
        # should be in sort-order
    if sort_sets_by:
        assert is_ascending(totals.values[::-1])

    assert np.all(totals.index.values == intersections.index.names)
def test_subset_size_series(x):
    """subset_size semantics for Series: 'sum', 'legacy', 'auto', 'count'."""
    import warnings

    kw = {'sort_by': 'cardinality',
          'sort_categories_by': 'cardinality',
          'sum_over': None}
    df_sum, intersections_sum, totals_sum = _process_data(
        x, subset_size='sum', **kw)

    # 'legacy' warns (FutureWarning) only when the index has duplicates.
    # pytest.warns(None) raises TypeError since pytest 7.0, so pick the
    # context explicitly: plain capture when no warning is expected.
    if x.index.is_unique:
        ctx = warnings.catch_warnings()
    else:
        ctx = pytest.warns(FutureWarning)
    with ctx:
        df, intersections, totals = _process_data(
            x, subset_size='legacy', **kw)
    assert_frame_equal(df, df_sum)
    assert_series_equal(intersections, intersections_sum)
    assert_series_equal(totals, totals_sum)

    if x.index.is_unique:
        # 'auto' behaves like 'sum' when the index is unique
        df, intersections, totals = _process_data(
            x, subset_size='auto', **kw)
        assert_frame_equal(df, df_sum)
        assert_series_equal(intersections, intersections_sum)
        assert_series_equal(totals, totals_sum)
    else:
        # 'auto' refuses ambiguous (duplicated) indices
        with pytest.raises(ValueError):
            _process_data(x, subset_size='auto', **kw)

    df_count, intersections_count, totals_count = _process_data(
        x, subset_size='count', **kw)
    # counting records is equivalent to summing pre-computed group sizes
    df, intersections, totals = _process_data(
        x.groupby(level=list(range(len(x.index.levels)))).count(),
        subset_size='sum', **kw)
    assert_series_equal(intersections, intersections_count,
                        check_names=False)
    assert_series_equal(totals, totals_count)
def test_process_data_frame(x, sort_by, sort_sets_by):
    """DataFrame input: sum_over selects the value column; extra columns
    are carried through; sum_over=False counts records."""
    X = pd.DataFrame({'a': x})
    # a DataFrame requires sum_over to name a column or be False
    with pytest.raises(ValueError, match='sum_over must be False or '):
        _process_data(X, sort_by=sort_by, sort_sets_by=sort_sets_by,
                      sum_over=None)

    df, intersections, totals = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over='a')
    assert df is not X  # input frame must not be returned in place

    # check equivalence to Series
    df1, intersections1, totals1 = _process_data(
        x, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over=None)
    assert intersections.name == 'a'
    assert_frame_equal(df, df1.rename(columns={'_value': 'a'}))
    assert_series_equal(intersections, intersections1, check_names=False)
    assert_series_equal(totals, totals1)

    # check effect of extra column
    X = pd.DataFrame({'a': x, 'b': np.arange(len(x))})
    df2, intersections2, totals2 = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over='a')
    assert_series_equal(intersections, intersections2)
    assert_series_equal(totals, totals2)
    assert_frame_equal(df, df2.drop('b', axis=1))
    assert_array_equal(df2['b'], X['b'])  # disregard levels, tested above

    # check effect not dependent on order/name
    X = pd.DataFrame({'b': np.arange(len(x)), 'c': x})
    df3, intersections3, totals3 = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over='c')
    assert_series_equal(intersections, intersections3, check_names=False)
    assert intersections.name == 'a'
    assert intersections3.name == 'c'
    assert_series_equal(totals, totals3)
    assert_frame_equal(df.rename(columns={'a': 'c'}), df3.drop('b', axis=1))
    assert_array_equal(df3['b'], X['b'])

    # check sum_over=False: equals summing a column of ones
    X = pd.DataFrame({'b': np.ones(len(x), dtype=int), 'c': x})
    df4, intersections4, totals4 = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over='b')
    df5, intersections5, totals5 = _process_data(
        X, sort_by=sort_by, sort_sets_by=sort_sets_by, sum_over=False)
    assert_series_equal(intersections4, intersections5, check_names=False)
    assert intersections4.name == 'b'
    assert intersections5.name == 'size'
    assert_series_equal(totals4, totals5)
    assert_frame_equal(df4, df5)
def test_subset_size_frame(x):
    """subset_size/sum_over interaction for DataFrame input."""
    kw = {'sort_by': 'cardinality', 'sort_categories_by': 'cardinality'}
    X = pd.DataFrame({'x': x})
    df_sum, intersections_sum, totals_sum = _process_data(
        X, subset_size='sum', sum_over='x', **kw)
    df_count, intersections_count, totals_count = _process_data(
        X, subset_size='count', sum_over=None, **kw)

    # error cases: sum_over=False is rejected for every subset_size
    # (the original issued this identical check twice per iteration;
    # once suffices)
    for subset_size in ['auto', 'sum', 'count']:
        with pytest.raises(ValueError, match='sum_over'):
            _process_data(X, subset_size=subset_size, sum_over=False, **kw)

    # error cases: sum_over incompatible with subset_size
    with pytest.raises(ValueError, match='sum_over should be a field'):
        _process_data(X, subset_size='sum', sum_over=None, **kw)
    with pytest.raises(ValueError, match='sum_over cannot be set'):
        _process_data(X, subset_size='count', sum_over='x', **kw)

    # check subset_size='auto' or 'legacy' with sum_over=str => sum
    for subset_size in ['auto', 'legacy']:
        df, intersections, totals = _process_data(
            X, subset_size=subset_size, sum_over='x', **kw)
        assert_frame_equal(df, df_sum)
        assert_series_equal(intersections, intersections_sum)
        assert_series_equal(totals, totals_sum)

    # check subset_size='auto' with sum_over=None => count
    df, intersections, totals = _process_data(
        X, subset_size='auto', sum_over=None, **kw)
    assert_frame_equal(df, df_count)
    assert_series_equal(intersections, intersections_count)
    assert_series_equal(totals, totals_count)

    # check legacy use of sum_over=False
    with pytest.warns(DeprecationWarning, match='sum_over=False'):
        df, intersections, totals = _process_data(
            X, subset_size='legacy', sum_over=False, **kw)
    assert_frame_equal(df, df_count)
    assert_series_equal(intersections, intersections_count)
    assert_series_equal(totals, totals_count)