def t(NA_types, datas, exp_finish_fast, exp_levels, exp_contrast=None):
    """Run a fresh CategoricalSniffer over ``datas`` and verify the outcome.

    :arg NA_types: Forwarded as ``NAAction(NA_types=...)`` when building the
      sniffer.
    :arg datas: Iterable of data chunks, each handed to ``sniffer.sniff`` in
      order.
    :arg exp_finish_fast: Whether ``sniff`` is expected to report that it is
      done. A chunk for which ``sniff`` returns a false value requires this
      to be false; the first chunk for which it returns a true value
      requires this to be true and ends the feeding loop.
    :arg exp_levels: Expected first element of ``sniffer.levels_contrast()``.
    :arg exp_contrast: Expected second element of
      ``sniffer.levels_contrast()``.
    """
    sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
    for chunk in datas:
        if not sniffer.sniff(chunk):
            # Sniffer wants more data, so the caller must not have expected
            # an early finish.
            assert not exp_finish_fast
            continue
        assert exp_finish_fast
        break
    expected = (exp_levels, exp_contrast)
    assert sniffer.levels_contrast() == expected
def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
    """Classify each factor as numeric or categorical by evaluating it on data.

    Chunks produced by ``data_iter_maker()`` are consumed one at a time; every
    still-undecided factor is evaluated on the chunk via
    ``factor.eval(factor_states[factor], chunk)``. Iteration stops as soon as
    no factor is left undecided.

    :arg factors: Iterable of factor objects (used as dict keys, so they must
      be hashable).
    :arg factor_states: Mapping from factor to the state passed to its
      ``eval`` method.
    :arg data_iter_maker: Zero-argument callable returning an iterable of
      data chunks.
    :arg NA_action: Passed through to each ``CategoricalSniffer``.
    :returns: A tuple ``(num_column_counts, cat_levels_contrasts)``. The first
      maps each numeric factor to its column count; the second maps each
      factor that received a sniffer to that sniffer's ``levels_contrast()``.
    """
    column_counts = {}
    sniffers = {}
    pending = set(factors)
    for chunk in data_iter_maker():
        # Walk a snapshot, because factors are removed from `pending` while
        # we loop.
        for factor in tuple(pending):
            value = factor.eval(factor_states[factor], chunk)
            if factor not in sniffers and not guess_categorical(value):
                # Numeric: a single chunk is enough to decide the column
                # count.
                value = atleast_2d_column_default(value)
                _max_allowed_dim(2, value, factor)
                column_counts[factor] = value.shape[1]
                pending.remove(factor)
                continue
            # Categorical: once a factor has a sniffer it stays categorical
            # for all later chunks.
            if factor not in sniffers:
                sniffers[factor] = CategoricalSniffer(NA_action, factor.origin)
            if sniffers[factor].sniff(value):
                pending.remove(factor)
        if not pending:
            break
    # Pull out the levels
    levels_contrasts = dict(
        (factor, sniffer.levels_contrast())
        for factor, sniffer in six.iteritems(sniffers)
    )
    return (column_counts, levels_contrasts)
def test_CategoricalSniffer():
    """Check the levels/contrast that CategoricalSniffer reports.

    Covers pandas Categorical inputs, ``C(...)`` wrapped inputs with and
    without explicit levels, NA handling, the boolean special case, tuple
    levels, contrast propagation, and the error raised for unhashable levels.
    """
    patch_patsy()
    from patsy.categorical import CategoricalSniffer

    def check(NA_types, datas, exp_finish_fast, exp_levels,
              exp_contrast=None):
        # Feed the chunks to a fresh sniffer, checking at each step whether
        # it finished as early as expected, then compare the final result.
        sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
        for chunk in datas:
            if not sniffer.sniff(chunk):
                assert not exp_finish_fast
                continue
            assert exp_finish_fast
            break
        expected = (exp_levels, exp_contrast)
        assert sniffer.levels_contrast() == expected

    both_na = ["None", "NaN"]

    check([], [pd.Categorical.from_array([1, 2, None])], True, (1, 2))

    # Level order of a Categorical must be preserved as given.
    check([], [pd.Categorical([1, 0], ["a", "b"])], True, ("a", "b"))
    check([], [pd.Categorical([1, 0], ["b", "a"])], True, ("b", "a"))

    # A .contrast attribute attached to a Categorical object should be
    # picked up.
    with_contrast = pd.Categorical.from_array(["a", "b"])
    with_contrast.contrast = "CONTRAST"
    check([], [with_contrast], True, ("a", "b"), "CONTRAST")

    check([], [C([1, 2]), C([3, 2])], False, (1, 2, 3))

    # Explicit levels are kept in the order supplied.
    check([], [C([1, 2], levels=[1, 2, 3]), C([4, 2])], True, (1, 2, 3))
    check([], [C([1, 2], levels=[3, 2, 1]), C([4, 2])], True, (3, 2, 1))

    # Real sniffing with NA values mixed in.
    check(both_na, [C([1, np.nan]), C([10, None])], False, (1, 10))

    # 'None' can still be a level when it is not configured to mean NA.
    none_is_level = CategoricalSniffer(NAAction(NA_types=["NaN"]))
    none_is_level.sniff(C([1, np.nan, None]))
    # The level order differs between py2 and py3 because mixed-type values
    # cannot be sorted consistently on both, so only membership is compared.
    # People probably shouldn't rely on this, but there is no obvious way to
    # give a sensible error.
    levels, _ = none_is_level.levels_contrast()
    assert set(levels) == {None, 1}

    # Boolean special case.
    check(both_na, [C([True, np.nan, None])], True, (False, True))
    check([], [C([10, 20]), C([False]), C([30, 40])], False,
          (False, True, 10, 20, 30, 40))

    # Tuples work as levels too.
    tuple_data = C([("b", 2), None, ("a", 1), np.nan, ("c", None)])
    check(both_na, [tuple_data], False,
          (("a", 1), ("b", 2), ("c", None)))

    # Contrasts given through C(...) are reported.
    check([], [C([10, 20], contrast="FOO")], False, (10, 20), "FOO")

    # Unhashable levels must raise PatsyError.
    unhashable_sniffer = CategoricalSniffer(NAAction())
    with pytest.raises(PatsyError):
        unhashable_sniffer.sniff([{}])