def test_data_frame_to_numeric():
    """Check ``categorical_to_numeric`` on a frame with mixed column types.

    The input frame holds a four-level string column, a two-level string
    column and a float column.  The expected frame replaces both string
    columns by ``int64`` codes (large=0, medium=1, small=2, tiny=3 and
    no=0, yes=1) and keeps the float column as is.
    """
    # Run lengths of each label; both columns end up with 31 entries.
    category_counts = [10, 5, 13, 3]
    binary_counts = [8, 23]

    categories = numpy.repeat(["large", "small", "tiny", "medium"], category_counts)
    binaries = numpy.repeat(["yes", "no"], binary_counts)
    numbers = numpy.random.RandomState(0).randn(len(categories))

    input_df = pandas.DataFrame({
        "a_category": categories,
        "a_binary": binaries,
        "a_number": numbers.copy(),
    })

    category_codes = numpy.repeat([0, 2, 3, 1], category_counts).astype(numpy.int64)
    binary_codes = numpy.repeat([1, 0], binary_counts).astype(numpy.int64)
    expected = pandas.DataFrame({
        "a_category": category_codes,
        "a_binary": binary_codes,
        "a_number": numbers.copy(),
    })

    actual = column.categorical_to_numeric(input_df)

    tm.assert_frame_equal(actual, expected, check_exact=True)
def setUp(self):
    """Load the WHAS500 data and store it as a plain array plus column names.

    The features are passed through ``column.standardize`` with
    ``with_std=False`` and then through ``column.categorical_to_numeric``.
    The resulting values go to ``self.x``, the column labels to
    ``self.columns`` and the second return value of ``load_whas500`` to
    ``self.y``.
    """
    features, self.y = load_whas500()
    centered = column.standardize(features, with_std=False)
    encoded = column.categorical_to_numeric(centered)

    self.x = encoded.values
    self.columns = encoded.columns.tolist()
def _make_whas500(with_mean=True, with_std=True, to_numeric=False):
    """Build a ``DataSetWithNames`` from the WHAS500 data.

    Parameters
    ----------
    with_mean : bool
        If true, pass the features through ``standardize``.
    with_std : bool
        Forwarded to ``standardize`` as ``with_std``; it has no effect
        when ``with_mean`` is false because ``standardize`` is not called.
    to_numeric : bool
        If true, pass the features through ``categorical_to_numeric``
        (after the optional standardization).

    Returns
    -------
    DataSetWithNames
        ``x`` holds the feature values, ``y`` the outcome returned by
        ``load_whas500``, ``names`` the column labels prefixed with
        ``'(Intercept)'`` and ``x_data_frame`` the processed frame itself.
    """
    features, outcome = load_whas500()

    if with_mean:
        features = standardize(features, with_std=with_std)
    if to_numeric:
        features = categorical_to_numeric(features)

    column_names = ['(Intercept)', *features.columns.tolist()]
    return DataSetWithNames(
        x=features.values,
        y=outcome,
        names=column_names,
        x_data_frame=features,
    )
def test_bool_series(self):
    """A boolean Series must come back as 1/0 with its name and index intact."""
    labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu", "Zeta"]
    flags = [True, True, False, False, True, False, True]

    input_series = pandas.Series(flags, name="human", index=labels)
    # True is expected to map to 1 and False to 0, position by position.
    expected = pandas.Series([1, 1, 0, 0, 1, 0, 1], name="human", index=labels)

    actual = column.categorical_to_numeric(input_series)

    tm.assert_series_equal(actual, expected, check_exact=True)
def test_series(self):
    """A string Series must come back as integer codes with name and index intact."""
    labels = ["Alpha", "Beta", "Gamma", "Delta", "Eta", "Mu"]
    letters = ["a", "a", "b", "b", "b", "c"]

    input_series = pandas.Series(letters, name="Thr33", index=labels)
    # Expected codes: "a" -> 0, "b" -> 1, "c" -> 2.
    expected = pandas.Series([0, 0, 1, 1, 1, 2], name="Thr33", index=labels)

    actual = column.categorical_to_numeric(input_series)

    tm.assert_series_equal(actual, expected, check_exact=True)
def whas500_sparse_data():
    """Return the non-float WHAS500 columns in dense and sparse form.

    The float64 columns of the frame returned by ``load_whas500`` are
    dropped and the remainder is passed through ``categorical_to_numeric``.
    A COO matrix is then built that stores a ``1`` at every position where
    the dense frame is nonzero.

    NOTE(review): every nonzero entry is stored as 1, so the sparse matrix
    equals the dense one only if the encoded columns are 0/1 valued -
    presumably the case for this data; confirm against the data set.

    Returns
    -------
    SparseDataSet
        With ``x_dense`` (the encoded data frame), ``x_sparse`` (the COO
        matrix of the same shape) and ``y`` (as returned by
        ``load_whas500``).
    """
    x, y = load_whas500()
    # numpy.float_ was removed in NumPy 2.0; numpy.float64 is the same dtype
    # and is available in every NumPy version.
    x_dense = categorical_to_numeric(x.select_dtypes(exclude=[numpy.float64]))

    data = []
    index_i = []
    index_j = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that exists in old and new versions alike.
    for j, (_, col) in enumerate(x_dense.items()):
        idx = numpy.flatnonzero(col.values)
        data.extend([1] * len(idx))
        index_i.extend(idx)
        index_j.extend([j] * len(idx))

    # Pass the shape explicitly: without it coo_matrix infers the shape from
    # the largest row/column index, which drops trailing all-zero rows or
    # columns and fails outright when there are no nonzero entries.
    x_sparse = coo_matrix((data, (index_i, index_j)), shape=x_dense.shape)
    return SparseDataSet(x_dense=x_dense, x_sparse=x_sparse, y=y)
def setUp(self):
    """Prepare dense and sparse versions of the non-float WHAS500 columns.

    The float64 columns of the frame returned by ``load_whas500`` are
    dropped and the remainder is passed through
    ``column.categorical_to_numeric`` and stored as ``self.x_dense``.
    ``self.x_sparse`` is a COO matrix holding a ``1`` wherever the dense
    frame is nonzero.  The final assertion verifies that both
    representations contain the same values.
    """
    x, self.y = load_whas500()
    # numpy.float_ was removed in NumPy 2.0; numpy.float64 is the same dtype
    # and is available in every NumPy version.
    self.x_dense = column.categorical_to_numeric(
        x.select_dtypes(exclude=[numpy.float64]))

    data = []
    index_i = []
    index_j = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that exists in old and new versions alike.
    for j, (_, col) in enumerate(self.x_dense.items()):
        idx = numpy.flatnonzero(col.values)
        data.extend([1] * len(idx))
        index_i.extend(idx)
        index_j.extend([j] * len(idx))

    # Pass the shape explicitly: without it coo_matrix infers the shape from
    # the largest row/column index, which drops trailing all-zero rows or
    # columns and fails outright when there are no nonzero entries.
    self.x_sparse = coo_matrix((data, (index_i, index_j)),
                               shape=self.x_dense.shape)

    # Guard against fixtures whose nonzero entries are not all equal to 1.
    assert_array_equal(self.x_dense.values, self.x_sparse.toarray())
def setUp(self):
    """Load the WHAS500 data and keep the numerically encoded features.

    ``self.y`` receives the second value returned by ``load_whas500`` and
    ``self.x`` the first one after ``categorical_to_numeric``.
    """
    features, outcome = load_whas500()
    # Assign the outcome first, matching the original order of side effects.
    self.y = outcome
    self.x = categorical_to_numeric(features)