Ejemplo n.º 1
0
    def test_sort_index_categorical_index(self):

        df = DataFrame({
            "A":
            np.arange(6, dtype="int64"),
            "B":
            Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
        }).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        tm.assert_frame_equal(result, expected)
Ejemplo n.º 2
0
    def test_unique_index_series(self, ordered):
        # GH38140
        dtype = CategoricalDtype([3, 2, 1], ordered=ordered)

        c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
        # Categorical.unique sorts categories by appearance order
        # if ordered=False
        exp = Categorical([3, 1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)

        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)

        c = Categorical([1, 1, 2, 2], dtype=dtype)
        exp = Categorical([1, 2], dtype=dtype)
        tm.assert_categorical_equal(c.unique(), exp)
        tm.assert_index_equal(Index(c).unique(), Index(exp))
        tm.assert_categorical_equal(Series(c).unique(), exp)
    def X_feature_label_encode(self, dataframe: DataFrame) -> DataFrame:
        """Label-encode the configured columns and add ``<column>_Numeric``.

        ``self.params['X_feature_label_encode']`` maps a comma-separated label
        string to the list of columns that use those labels. Each listed
        column is cast to an ordered categorical over the labels, missing
        values are replaced by the first label, and the encoder output is
        stored in a new ``<column>_Numeric`` column. The encoded source
        columns are finally appended to ``self.params['X_feature_exclude']``.

        :param dataframe: frame to encode; it is modified in place.
        :return: the same ``dataframe`` object.
        """
        label_encode_spec = self.params['X_feature_label_encode']
        for label_string, fieldnames in label_encode_spec.items():
            labels = label_string.split(',')
            category_dtype = CategoricalDtype(categories=labels, ordered=True)

            encoder = LabelEncoder()
            encoder.fit(labels)
            for fieldname in fieldnames:
                # Casting turns every value outside `labels` into NaN; NaN is
                # then replaced with the first label (presumably 'NA' in the
                # configured label strings) because encoder.transform()
                # raises on unseen values.
                # The result is assigned back instead of calling
                # fillna(inplace=True) on dataframe[fieldname]: that form
                # operates on a temporary selection and leaves the frame
                # untouched when pandas Copy-on-Write is active.
                dataframe[fieldname] = dataframe[fieldname].astype(
                    category_dtype).fillna(labels[0])
                dataframe[f"{fieldname}_Numeric"] = encoder.transform(
                    dataframe[fieldname])

        self.params['X_feature_exclude'] += list(
            flatten(label_encode_spec.values()))
        return dataframe
Ejemplo n.º 4
0
def clean_nsr(df):
    """Turn the nursery data set columns into ordered categoricals.

    The incoming frame's columns are renamed in place to the fixed nursery
    variable names. Underscores are then stripped from every string value,
    and each column except ``finance`` is converted to a categorical whose
    categories follow the order given by ``developer[column]``. The
    ``finance`` column is carried over with its original, unmodified values.

    :param df: raw frame with nine columns; its column labels are overwritten.
    :return: a new frame with ordered categorical columns plus the original
        ``finance`` column appended last.
    :raises ValueError: from ``reorder_categories`` when a column's observed
        categories do not match ``developer[column]`` exactly.
    """
    od = developer
    nsr_var = [
        'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
        'social', 'health', 'target'
    ]
    df.columns = nsr_var
    # Keep the untouched finance values; only this column is needed from the
    # raw data, so there is no reason to copy the whole frame.
    raw_finance = df['finance'].copy()

    df = df.replace('_', '', regex=True)
    df = df.drop(columns=['finance'])
    for column in df.columns:
        # Impose the category order listed in `developer` for this column.
        df[column] = df[column].astype('category').cat.reorder_categories(
            od[column], ordered=True)

    df['finance'] = raw_finance
    return df
Ejemplo n.º 5
0
    def test_unique(self, ordered):
        # GH38140
        dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)

        # categories are reordered based on value when ordered=False
        cat = Categorical(["a", "b", "c"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, cat)

        cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))

        cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)

        # nan must be removed
        cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
        res = cat.unique()
        exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
        tm.assert_categorical_equal(res, exp_cat)
Ejemplo n.º 6
0
def load_dframe(sujb_num):
    """Load one sheet of ``prodroma.xlsx`` as a cleaned diary frame.

    The sheet is read from columns B:CR after skipping ten rows and is then
    transposed. Column labels are renamed to ASCII identifiers, the frame is
    indexed by ``(date, TP)``, the yes/no strings are replaced by booleans,
    missing values in the boolean columns become ``False``, and the 1-5
    rating columns are cast to an ordered categorical.

    :param sujb_num: sheet selector forwarded to ``pd.read_excel`` as
        ``sheet_name``. The misspelled parameter name is kept so existing
        keyword callers keep working. Earlier revisions ignored this argument
        and always read sheet 0.
    :return: the cleaned ``DataFrame``.
    """
    fname = "prodroma.xlsx"
    dframe = pd.read_excel(fname,
                           sheet_name=sujb_num,
                           skiprows=10,
                           usecols="B:CR",
                           index_col=0).T

    dframe = dframe.rename(
        columns={
            "день": "day",
            "Время заполнения ТП": "fillin_time",
            "ГБ новая": "ha_new",
            "ГБ продолжение": "ha_cont",
            "Начало боли": "ha_start",
            "Окончание боли": "ha_stop",
            "Обезболивающее": "painkiller",
            "Название": "painkiller_name",
            "аура": "aura",
            "Боль сейчас": "ha_now",
            "ВАШ макс": "your_max",
            "односторонняя": "onesided",
            "пульсация": "pulsation",
            "усиление движением": "intens_by_mov",
            "тошнота": "vomiting",
            "чувствительность к свету": "light_sens_bin",
            "чувствительность к звуку": "noise_sens_bin",
            "чувствительность к запахам": "smell_sens_bin",
            "заметил провокатор": "noticed_trigger",
            "какой триггер": "which_trigger",
            "Продолжительность сна": "sleep_duration",
            "Качество сна": "sleep_quality",
            "Свежесть после сна": "sleep_freshness",
            "Больше света, чем обычно": "a_lot_light",
            "Чувствительность к свету": "light_sens_cat",
            "Больше звука чем обычно": "a_lot_noise",
            "Чувствительность к звуку": "noise_sens_cat",
            "Были резкие запахи?": "strong_smells",
            "Чувствительность к запахам": "smell_sens_cat",
            "Пропуск приема пищи": "meal_skip",
            "Чувство голода": "hunger",
            "Воды достаточно?": "hydration",
            "Жажда": "thirst",
            "Алкоголь": "alcohol",
            "кофеин": "caffeine",
            "сыр, шоко, цитрус": "cheese_choco_citrus",
            "Хотелось шоколада": "wanted_choco",
            "Чувство усталости": "tiredness",
            "Сложность концентрации": "focus_difficulty",
            "Тревога": "anxiety",
            "Депрессия": "depression",
            "Работоспособность": "productivity",
            # Misspelled variant of the previous label; both map to the
            # same target column.
            "Работосособность": "productivity",
            "Сонливость": "sleepiness",
            "Зевания": "yawning",
            "Напряжение глаз": "eye_strain",
            "боль в шее": "neck_pain",
            "Чувствит кожи головы": "scalp_sens",
            "Физическая ативность": "exercise",
            "какой день": "which_day",
            "Перелеты": "flights",
            "1 день менструации": "pms_1st_day",
            "подташнивает": "nausea",
            "вегетатика": "vegetatics",
            "мочеиспускание": "urination",
            "% заполнения дневника": "journal_completion_percentage",
            "комментарий": "comment",
            "дата": "date",
            "ТП": "TP",
        })

    dframe = dframe.set_index(["date", "TP"])

    # Drop the name of the column axis.
    dframe.columns.rename(None, inplace=True)

    dframe["fillin_time"] = pd.to_datetime(dframe["fillin_time"])

    # Yes/no answers become booleans everywhere in the frame.
    dframe.replace(to_replace="да", value=True, inplace=True)
    dframe.replace(to_replace="нет", value=False, inplace=True)

    for col in ("ha_new", "ha_cont"):
        dframe[col] = dframe[col].fillna(False)
    # The renamed "ha_now" column is overwritten: it is true whenever a new
    # or a continuing headache is reported.
    dframe["ha_now"] = dframe["ha_new"] | dframe["ha_cont"]

    # Remaining boolean columns: a missing answer counts as False.
    for col in (
            "painkiller",
            "vomiting",
            "intens_by_mov",
            "pulsation",
            "light_sens_bin",
            "noise_sens_bin",
            "smell_sens_bin",
            "flights",
            "pms_1st_day",
    ):
        dframe[col] = dframe[col].fillna(False)

    # Rating columns share one ordered 1..5 scale; values outside the scale
    # become NaN in the cast.
    cat_type = CategoricalDtype([1, 2, 3, 4, 5], ordered=True)
    for col in [
            "anxiety",
            "depression",
            "tiredness",
            "productivity",
            "sleepiness",
            "light_sens_cat",
            "smell_sens_cat",
            "noise_sens_cat",
            "sleep_quality",
            "sleep_freshness",
            "hunger",
    ]:
        dframe[col] = dframe[col].astype(cat_type)
    return dframe
Ejemplo n.º 7
0
    def test_unique(self, data, categories, expected_data, ordered):
        dtype = CategoricalDtype(categories, ordered=ordered)

        idx = CategoricalIndex(data, dtype=dtype)
        expected = CategoricalIndex(expected_data, dtype=dtype)
        tm.assert_index_equal(idx.unique(), expected)
Ejemplo n.º 8
0
                  columns=list('ABCDEF'))
# Attach the metadata accessor to the frame (filled in at the end of this block).
initMetadata(df)
MARKERS = ['hex', 'circle_x', 'triangle', 'square']
# Marker and colour mappings keyed on the string-valued 'DDC' column.
# NOTE(review): five factor labels are listed but MARKERS has four entries - confirm.
markerFactor = factor_mark('DDC', MARKERS, ["A0", "A1", "A2", "A3", "A4"])
colorFactor = factor_cmap('DDC', 'Category10_6',
                          ["A0", "A1", "A2", "A3", "A4"])

# Integer bin -> label used for the 'DDC' column below.
mapDDC = {0: "A0", 1: "A1", 2: "A2", 3: "A3", 4: "A4"}
# Boolean helper columns; they are derived before rounding, so they use the
# unrounded values of A, B and C.
df.eval("Bool=A>0.5", inplace=True)
df.eval("BoolB=B>0.5", inplace=True)
df.eval("BoolC=C>0.1", inplace=True)
df["A"] = df["A"].round(3)
df["B"] = df["B"].round(3)
df["C"] = df["C"].round(3)
df["D"] = df["D"].round(3)
# Binned copies of the rounded columns: AA as an ordered categorical (no
# explicit categories are passed), CC/DD as integers, DDC as the mapped label,
# EE left as a rounded float.
df["AA"] = ((df.A * 10).round(0)).astype(CategoricalDtype(ordered=True))
df["CC"] = ((df.C * 5).round(0)).astype(int)
df["DD"] = ((df.D * 4).round(0)).astype(int)
df["DDC"] = ((df.D * 4).round(0)).astype(int).map(mapDDC)
df["EE"] = (df.E * 4).round(0)
df['errY'] = df.A * 0.02 + 0.02
df.head(10)  # the result is not stored or printed here
# Axis titles keyed as '<column>.AxisTitle'.
df.meta.metaData = {
    'A.AxisTitle': "A (cm)",
    'B.AxisTitle': "B (cm/s)",
    'C.AxisTitle': "C (s)",
    'D.AxisTitle': "D (a.u.)",
    'Bool.AxisTitle': "A>half",
    'E.AxisTitle': "Category"
}
Ejemplo n.º 9
0
def tree2Panda(tree, include, selection, **kwargs):
    r"""
    Convert selected items from the tree into a pandas DataFrame.

    TODO:
        * consult with uproot
            * currently not able to work with friend trees
        * check the latest version of RDataFrame (in AliRoot latest v16.16.00)
        * add filter on metadata - e.g. class of variables
    :param tree:            input tree
    :param include:         array of regular expressions - processing Tree+Friends, branches, aliases
    :param selection:       tree selection
    :param kwargs:
        * exclude           exclude array
        * firstEntry        first entry to convert
        * nEntries          number of entries to convert
        * columnMask        list of [pattern, replacement] pairs applied to the column names
        * category          if > 0, columns with at most this many unique values become ordered categoricals
        * verbose           stored in the options; not read in this function
    :return:                pandas data frame with ``df.meta.metaData`` filled from the tree's "metaTable"
    """
    # Defaults; any keyword argument overrides the entry of the same name.
    options = {
        "exclude": [],
        "firstEntry": 0,
        "nEntries": 100000000,
        "columnMask": [[".fX$", "_X"], [".fY$", "_y"], [".fElements", ""]],
        "category":0,
        "verbose": 0
    }
    options.update(kwargs)
    if not hasattr(tree, 'anyTree'):
        treeToAnyTree(tree)          # expand tree/aliases/variables - if not done before
    anyTree = tree.anyTree
    # check regular expressions in anyTree
    variablesTree = findSelectedBranches(anyTree, include, options["exclude"])
    # Colon-separated expression list handed to tree.Draw below.
    variables = ""

    for var in variablesTree:
        # if var.length<2: continue
        var = var.replace("/", ".")
        variables += var + ":"
    # check if valid TTree formula
    # Entries of `include` without ".*" are also tried as formulas and kept
    # when GetNdim() reports a positive dimension.
    for var in include:
        if ".*" in var:
            continue
        formula=    ROOT.TTreeFormula('test', var, tree)
        if (formula.GetNdim()>0):
            variables += var + ":"
    variables = variables[0:-1]  # drop the trailing ":"

    entries = tree.Draw(str(variables), selection, "goffpara", options["nEntries"], options["firstEntry"])  # query data
    columns = variables.split(":")
    for i, column in enumerate(columns):
        columns[i] = column.replace(".", "_")
    # replace column names
    #    1.) pandas does not allow dots in names
    #    2.) user can specify their own column mask
    # NOTE(review): str.replace is a literal substitution, and the dots were
    # already replaced above, so the default masks (written like regular
    # expressions, e.g. ".fX$") may never match - confirm whether re.sub was intended.
    for i, column in enumerate(columns):
        for mask in options["columnMask"]:
            columns[i] = columns[i].replace(mask[0], mask[1])

    # Every drawn expression is read back as a float buffer of `entries` values.
    ex_dict = {}
    for i, a in enumerate(columns):
        val = tree.GetVal(i)
        ex_dict[a] = np.frombuffer(val, dtype=float, count=entries)
    df = pd.DataFrame(ex_dict, columns=columns)
    for i, a in enumerate(columns):
        # Cast back to an integer/bool dtype when a leaf with this column name exists.
        # NOTE(review): in ROOT, TLeafB is the 8-bit integer leaf and TLeafO the
        # boolean one, while TLeafC holds strings - confirm this mapping.
        if (tree.GetLeaf(a)):
              if (tree.GetLeaf(a).ClassName() == 'TLeafC'): df[a]=df[a].astype(np.int8)
              if (tree.GetLeaf(a).ClassName() == 'TLeafS'): df[a]=df[a].astype(np.int16)
              if (tree.GetLeaf(a).ClassName() == 'TLeafI'): df[a]=df[a].astype(np.int32)
              if (tree.GetLeaf(a).ClassName() == 'TLeafL'): df[a]=df[a].astype(np.int64)
              if (tree.GetLeaf(a).ClassName() == 'TLeafB'): df[a] = df[a].astype(bool)
        # Optional: low-cardinality columns become ordered categoricals.
        if (options["category"]>0):
            dfUniq=df[a].unique()
            if dfUniq.shape[0]<=options["category"] :
                df[a]=df[a].astype(CategoricalDtype(ordered=True))



    # Copy name/title pairs of the tree's "metaTable" user-info object, if any.
    initMetadata(df)
    metaData = tree.GetUserInfo().FindObject("metaTable")
    if metaData:
        for key in metaData:
            df.meta.metaData[key.GetName()] = key.GetTitle()
    return df
Ejemplo n.º 10
0
    def test_getitem_bool_mask_categorical_index(self):
        """Boolean masks built from CategoricalIndex comparisons select rows."""

        def _frame(ordered):
            # Same data both times; only the `ordered` flag of the index differs.
            return DataFrame(
                {
                    "A": np.arange(6, dtype="int64"),
                },
                index=CategoricalIndex(
                    [1, 1, 2, 1, 3, 2],
                    dtype=CategoricalDtype([3, 2, 1], ordered=ordered),
                    name="B",
                ),
            )

        ordered_df = _frame(True)
        unordered_df = _frame(False)

        # Equality works for both; a value outside the categories selects nothing.
        for frame in (ordered_df, unordered_df):
            tm.assert_frame_equal(frame[frame.index == "a"], frame.iloc[[]])

        for frame in (ordered_df, unordered_df):
            tm.assert_frame_equal(
                frame[frame.index == 1], frame.iloc[[0, 1, 3]]
            )

        # Ordered categorical with categories [3, 2, 1]: 3 sorts lowest and
        # 1 highest, so "< 2" matches the 3 and "> 1" matches nothing.
        below_two = ordered_df[ordered_df.index < 2]
        tm.assert_frame_equal(below_two, ordered_df.iloc[[4]])

        above_one = ordered_df[ordered_df.index > 1]
        tm.assert_frame_equal(above_one, ordered_df.iloc[[]])

        # Unordered categoricals reject inequality comparisons.
        msg = "Unordered Categoricals can only compare equality or not"
        with pytest.raises(TypeError, match=msg):
            unordered_df[unordered_df.index < 2]
        with pytest.raises(TypeError, match=msg):
            unordered_df[unordered_df.index > 1]
Ejemplo n.º 11
0
from . import loader

# Short region slug -> human-readable region name.
REGIONS = {
    "north-america": "North America",
    "south-asia": "South Asia",
    "sub-saharan-africa": "Sub-Saharan Africa",
    "europe": "Europe & Central Asia",
    "latin-america": "Latin America & Caribbean",
    "middle-east": "Middle East & North Africa",
    "east-asia": "East Asia & Pacific",
}
# Short income-group slug -> human-readable name, listed from low to high;
# IncomeGroup below is ordered, so this insertion order matters.
INCOME_GROUPS = {
    "low": "Low income",
    "lower-middle": "Lower middle income",
    "upper-middle": "Upper middle income",
    "high": "High income",
}

# Categorical dtypes used by the loaders in this module.
# NOTE(review): the dicts themselves are passed as `categories`; presumably
# pandas iterates them and uses the keys (the slugs) as categories - confirm
# that the stored data uses the slugs and not the display names.
IncomeGroup = CategoricalDtype(categories=INCOME_GROUPS, ordered=True)
Region = CategoricalDtype(categories=REGIONS, ordered=False)


@loader.filtering_from_data(["region"])
def load_region():
    """Return the ``un.pkl.gz`` database cast to the ``Region`` dtype."""
    database = loader.load_database("un.pkl.gz")
    return database.astype(Region)


@loader.filtering_from_data(["income_group"])
def load_income_group():
    """Return the ``un.pkl.gz`` database cast to the ``IncomeGroup`` dtype."""
    database = loader.load_database("un.pkl.gz")
    return database.astype(IncomeGroup)
Ejemplo n.º 12
0
class TestUpdate:
    """Tests for ``Series.update``."""

    def test_update(self):
        """Non-NaN values of ``other`` overwrite the caller; NaN leaves it alone."""
        s = Series([1.5, np.nan, 3.0, 4.0, np.nan])
        s2 = Series([np.nan, 3.5, np.nan, 5.0])
        s.update(s2)

        expected = Series([1.5, 3.5, 3.0, 5.0, np.nan])
        tm.assert_series_equal(s, expected)

        # GH 3217
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df["c"] = np.nan

        # Updating the column obtained through df["c"] is expected to be
        # visible in df itself.
        df["c"].update(Series(["foo"], index=[0]))
        expected = DataFrame(
            [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"]
        )
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "other, dtype, expected",
        [
            # other is int
            ([61, 63], "int32", Series([10, 61, 12], dtype="int32")),
            ([61, 63], "int64", Series([10, 61, 12])),
            ([61, 63], float, Series([10.0, 61.0, 12.0])),
            ([61, 63], object, Series([10, 61, 12], dtype=object)),
            # other is float, but can be cast to int
            ([61.0, 63.0], "int32", Series([10, 61, 12], dtype="int32")),
            ([61.0, 63.0], "int64", Series([10, 61, 12])),
            ([61.0, 63.0], float, Series([10.0, 61.0, 12.0])),
            ([61.0, 63.0], object, Series([10, 61.0, 12], dtype=object)),
            # others is float, cannot be cast to int
            ([61.1, 63.1], "int32", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], "int64", Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], float, Series([10.0, 61.1, 12.0])),
            ([61.1, 63.1], object, Series([10, 61.1, 12], dtype=object)),
            # other is object, cannot be cast
            ([(61,), (63,)], "int32", Series([10, (61,), 12])),
            ([(61,), (63,)], "int64", Series([10, (61,), 12])),
            ([(61,), (63,)], float, Series([10.0, (61,), 12.0])),
            ([(61,), (63,)], object, Series([10, (61,), 12])),
        ],
    )
    def test_update_dtypes(self, other, dtype, expected):
        """Resulting dtype after update() for each caller/other dtype pairing."""

        ser = Series([10, 11, 12], dtype=dtype)
        # `other` is indexed [1, 3]; only label 1 exists in `ser`, so only
        # the first value of `other` is applied.
        other = Series(other, index=[1, 3])
        ser.update(other)

        tm.assert_series_equal(ser, expected)

    @pytest.mark.parametrize(
        "series, other, expected",
        [
            # update by key
            (
                Series({"a": 1, "b": 2, "c": 3, "d": 4}),
                {"b": 5, "c": np.nan},
                Series({"a": 1, "b": 5, "c": 3, "d": 4}),
            ),
            # update by position
            (Series([1, 2, 3, 4]), [np.nan, 5, 1], Series([1, 5, 1, 4])),
        ],
    )
    def test_update_from_non_series(self, series, other, expected):
        """update() accepts a dict (matched by key) or a list (matched by position)."""
        # GH 33215
        series.update(other)
        tm.assert_series_equal(series, expected)

    @pytest.mark.parametrize(
        "data, other, expected, dtype",
        [
            (["a", None], [None, "b"], ["a", "b"], "string"),
            pytest.param(
                ["a", None],
                [None, "b"],
                ["a", "b"],
                "arrow_string",
                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
            ),
            ([1, None], [None, 2], [1, 2], "Int64"),
            ([True, None], [None, False], [True, False], "boolean"),
            (
                ["a", None],
                [None, "b"],
                ["a", "b"],
                CategoricalDtype(categories=["a", "b"]),
            ),
            (
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London"), NaT],
                [NaT, Timestamp(year=2020, month=1, day=1, tz="Europe/London")],
                [Timestamp(year=2020, month=1, day=1, tz="Europe/London")] * 2,
                "datetime64[ns, Europe/London]",
            ),
        ],
    )
    def test_update_extension_array_series(self, data, other, expected, dtype):
        """update() fills the missing slot and keeps the extension dtype."""
        # Caller, other and expectation are all built with the same dtype.
        result = Series(data, dtype=dtype)
        other = Series(other, dtype=dtype)
        expected = Series(expected, dtype=dtype)

        result.update(other)
        tm.assert_series_equal(result, expected)

    def test_update_with_categorical_type(self):
        """update() between Series sharing one categorical dtype keeps that dtype."""
        # GH 25744
        dtype = CategoricalDtype(["a", "b", "c", "d"])
        s1 = Series(["a", "b", "c"], index=[1, 2, 3], dtype=dtype)
        s2 = Series(["b", "a"], index=[1, 2], dtype=dtype)
        s1.update(s2)
        result = s1
        expected = Series(["b", "a", "c"], index=[1, 2, 3], dtype=dtype)
        tm.assert_series_equal(result, expected)
    # Suffix of the coordinate columns to use: "kn" reuses the "tsne"
    # coordinates, every other kind uses its own columns. The original
    # assigned `subkind` in the "kn" branch, leaving `sub_kind` unset there.
    sub_kind = "tsne" if kind == "kn" else kind

    subset = cl_df[[c + "_" + sub_kind for c in ['x', 'y', 'z']]]
    print(subset[:10])

    points = [list(x) for x in subset.to_numpy()]
    print(points[:10])
    print(len(points))

    # Pairwise Euclidean distances between all points, then the visiting
    # order returned by make_path without its last element.
    arr = np.array(points)
    dist = Y = cdist(arr, arr, 'euclidean')
    new_path = make_path(np.array(points), dist)[:-1]
    print(new_path)

    cl_df[['cl_%s' % k for k in things]] = cl_cols

    # Sort the rows along the path: the 'cl' values are temporarily cast to an
    # ordered categorical whose category order is the path, the frame is
    # sorted on it, and the helper column is then overwritten with plain
    # int32 'cl' values.
    path_order_categories = CategoricalDtype(categories=new_path, ordered=True)
    cl_df['cl_%s' % kind] = cl_df['cl'].astype(path_order_categories)

    cl_df.sort_values(['cl_%s' % kind], inplace=True)
    cl_df['cl_%s' % kind] = cl_df['cl'].astype('int32')

    cl_df.to_csv('%s_clusters_mean_points.csv' % kind,
                 sep='\t',
                 header=True,
                 index=False)
    print(kind + " " + str(new_path))
Ejemplo n.º 14
0

def test_numpy_transpose(index_or_series_obj):
    """np.transpose returns an equal object and rejects the ``axes`` argument."""
    obj = index_or_series_obj

    transposed = np.transpose(obj)
    tm.assert_equal(transposed, obj)

    with pytest.raises(
        ValueError, match="the 'axes' parameter is not supported"
    ):
        np.transpose(obj, axes=1)


# Each case: (data, transposed_data, index, columns, dtype). At least one of
# index/columns repeats a label, and every shape is tried with int and with
# a categorical dtype.
@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    # GH 42380
    # NOTE(review): only the frame construction is present; `transposed_data`
    # is never used and nothing is asserted - the remainder of this test
    # appears to be missing.
    df = DataFrame(data, index=index, columns=columns, dtype=dtype)
def test_output_attributes(scraper_output):
    results = scraper_output
    exp_cols = [
        "Place (Overall)",
        "Place (Gender)",
        "Place (Category)",
        "Name",
        "Sex",
        "Club",
        "Running Number",
        "Category",
        "Finish",
        "Year",
        "Country",
        "FirstName",
        "LastName",
        "DSQ",
        "Finish (Total Seconds)",
    ]

    exp_dtypes = pd.Series({
        "Place (Overall)":
        Int64Dtype(),
        "Place (Gender)":
        Int64Dtype(),
        "Place (Category)":
        dtype("float64"),
        "Name":
        dtype("O"),
        "Sex":
        dtype("O"),
        "Club":
        dtype("O"),
        "Running Number":
        dtype("O"),
        "Category":
        CategoricalDtype(
            categories=[
                "18-39",
                "40-44",
                "45-49",
                "50-54",
                "55-59",
                "60-64",
                "65-69",
                "70+",
                "70-74",
                "75-79",
                "80-84",
                "85+",
                "80+",
                "Unknown",
            ],
            ordered=False,
        ),
        "Finish":
        dtype("<m8[ns]"),
        "Year":
        Int64Dtype(),
        "Country":
        dtype("O"),
        "FirstName":
        dtype("O"),
        "LastName":
        dtype("O"),
        "DSQ":
        dtype("bool"),
        "Finish (Total Seconds)":
        dtype("float64"),
    })

    exp_rows_min = 1000  # One sex for one year should give at least this many

    assert exp_cols == list(results.columns), "Expected columns not found"
    assert exp_rows_min <= results.shape[
        0], "Less than minimum expected number of rows"

    assert exp_dtypes.values.tolist() == results.dtypes.values.tolist()
Ejemplo n.º 16
0
import numpy as np
import pandas as pd
from pandas import CategoricalDtype

# Load the raw data set; fields are separated by ';'.
df = pd.read_csv('D:\\Study\\ML\\Final_Project\\dataset-har-PUC-Rio-ugulino\\Full_Data.csv', delimiter=';')

# Both columns arrive as text with a comma in the number: the height has the
# comma removed and is parsed as an integer, the body mass index has the comma
# replaced by a decimal point and is parsed as a float.
df['how_tall_in_meters'] = df['how_tall_in_meters'].apply(
    lambda text: int(text.replace(',', '')))
df['body_mass_index'] = df['body_mass_index'].apply(
    lambda text: float(text.replace(',', '.')))

# One-hot encode the categorical columns over a fixed list of levels, so the
# generated dummy columns do not depend on which values occur in the data.
one_hot_levels = {
    'user': ['debora', 'katia', 'wallace', 'jose_carlos'],
    'gender': ['Woman', 'Man'],
    'class': ['sitting', 'sittingdown', 'standing', 'standingup', 'walking'],
}
for column, levels in one_hot_levels.items():
    df[column] = df[column].astype(CategoricalDtype(levels))
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)

# The source columns are dropped only after all dummy columns were appended.
for column in one_hot_levels:
    df.drop([column], axis=1, inplace=True)

# Shuffle the rows and split 80/20 into train and test sets.
array = df.to_numpy()
np.random.shuffle(array)
split_at = int(len(array) * 0.8)
train_data = array[:split_at]
test_data = array[split_at:]
pd.DataFrame(train_data).to_csv(
    "D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Train_data.csv",
    header=df.columns,
    index=False)
pd.DataFrame(test_data).to_csv(
    "D:\\Study\\ML\\Final_Project\\Sources\\Datasets\\Test_data.csv",
    header=df.columns,
    index=False)
Ejemplo n.º 17
0
def preprocess_features(fp_processed, only_label=True):
    """Encode and normalise the train/valid/test CSV files.

    Reads ``train.csv``, ``valid.csv`` and ``test.csv`` from the
    ``fp_processed`` prefix, processes them as one frame and writes the three
    parts back as ``train_.csv``, ``valid.csv`` and ``test.csv``.

    :param fp_processed: path prefix (including the trailing separator) that
        the file names are appended to.
    :param only_label: when true, only the ``label`` column is replaced by
        its category code. When false, ``country``, ``place``, ``verified``
        and the neighbour labels ``l_1``..``l_20`` are encoded as well, the
        continuous columns are standardised per column, and the distance
        columns ``d_1``..``d_20`` are standardised with one shared mean and
        standard deviation.
    :return: ``None``; the results are written to disk.
    """
    # Load and merge the datasets
    train = pd.read_csv(fp_processed + 'train.csv', index_col=0)
    valid = pd.read_csv(fp_processed + 'valid.csv', index_col=0)
    test = pd.read_csv(fp_processed + 'test.csv', index_col=0)
    # Tag every row with its origin so the merged frame can be split again.
    train['dataset'] = 'train'
    valid['dataset'] = 'valid'
    test['dataset'] = 'test'
    tvt = pd.concat([train, valid, test])

    labels = [
        'concrete_cement', 'healthy_metal', 'incomplete', 'irregular_metal',
        'other'
    ]
    countries = ['colombia', 'guatemala', 'st_lucia']
    places = [
        'borde_rural', 'borde_soacha', 'castries', 'dennery', 'gros_islet',
        'mixco_1_and_ebenezer', 'mixco_3'
    ]

    countries_cat_type = CategoricalDtype(categories=countries, ordered=True)
    places_cat_type = CategoricalDtype(categories=places, ordered=True)
    labels_cat_type = CategoricalDtype(
        categories=labels + ['unknown'],
        ordered=True)  # + ['unknown'] for the NaNs in neighbour labels

    # Encode labels. Columns are replaced with plain assignment rather than
    # `tvt.loc[:, col] = ...`: the encoded values have a different dtype than
    # the column they replace, and a whole-column .loc write tries to store
    # them in the existing column's dtype.
    tvt['label'] = tvt['label'].astype(labels_cat_type).cat.codes

    if not only_label:
        # Encode categories
        #   NaN is filled first because the category code of NaN is -1, which
        #   breaks the embedding lookup (index out of range: -1).
        #   NOTE(review): 'unknown' is not a category of the country/place
        #   dtypes, so a missing country or place still ends up as -1.
        tvt = tvt.fillna('unknown')

        tvt['country'] = tvt['country'].astype(str).astype(
            countries_cat_type).cat.codes
        tvt['place'] = tvt['place'].astype(places_cat_type).cat.codes
        tvt['verified'] = tvt['verified'].astype(int)
        for i in range(1, 21):
            tvt[f'l_{i}'] = tvt[f'l_{i}'].astype(labels_cat_type).cat.codes

        # Normalize continuous features, each with its own mean and std.
        continuous_cols = [
            'area', 'complexity', 'z_min', 'z_max', 'z_median', 'z_count',
            'z_majority', 'z_minority', 'z_unique', 'z_range', 'z_sum'
        ]
        for col in continuous_cols:
            mu = tvt[col].mean()
            sigma = tvt[col].std()
            tvt[col] = (tvt[col] - mu) / sigma

        # Normalize distances with one mean/std shared by all distance
        # columns. The statistics are taken over exactly the columns being
        # normalised (d_1..d_20); the former label slice 'd_1':'d_19' left
        # d_20 out and depended on the column order of the file.
        distance_cols = [f'd_{i}' for i in range(1, 21)]
        distances = tvt[distance_cols].values
        mu = distances.mean()
        sigma = distances.std()
        for col in distance_cols:
            tvt[col] = (tvt[col] - mu) / sigma

    # split and save
    # NOTE(review): the train part is written to 'train_.csv' while the valid
    # and test parts overwrite their input files - confirm which is intended.
    train = tvt[tvt['dataset'] == 'train']
    valid = tvt[tvt['dataset'] == 'valid']
    test = tvt[tvt['dataset'] == 'test']
    train.to_csv(fp_processed + 'train_.csv')
    valid.to_csv(fp_processed + 'valid.csv')
    test.to_csv(fp_processed + 'test.csv')
Ejemplo n.º 18
0
 def test_impl():
     names = ['C1', 'C2', 'C3']
     ct_dtype = CategoricalDtype(['A', 'B', 'C'])
     dtypes = {'C1': np.int, 'C2': ct_dtype, 'C3': str}
     df = pd.read_csv("csv_data_cat1.csv", names=names, dtype=dtypes)
     return df
Ejemplo n.º 19
0
class TestDataFrameToRecords:
    def test_to_records_dt64(self):
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

    def test_to_records_dt64tz_column(self):
        # GH#32535 dont less tz in to_records
        df = DataFrame(
            {"A": date_range("2012-01-01", "2012-01-02", tz="US/Eastern")})

        result = df.to_records()

        assert result.dtype["A"] == object
        val = result[0][1]
        assert isinstance(val, Timestamp)
        assert val == df.loc[0, "A"]

    def test_to_records_with_multindex(self):
        # GH#3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        # GH#13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue GH#11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        # GH#8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                },
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        # see GH#18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "int32",
                         1: "int8"
                     }),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={
                    0: "<U1",
                    2: "float32"
                },
                     index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")],
                                                   names=list("ab")),
                    index=MultiIndex.from_tuples([("d", -4), ("d", -5),
                                                  ("f", -6)],
                                                 names=list("cd")),
                ),
                dict(column_dtypes="float64",
                     index_dtypes={
                         0: "<U2",
                         1: "int8"
                     }),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        # see GH#18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    def test_to_records_dict_like(self):
        # see GH#18146
        class DictLike:
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key) -> bool:
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{
                "A": np.int8,
                "B": np.float32
            }),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        # GH#13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)
Ejemplo n.º 20
0
class TestDataFrameConvertTo(TestData):

    def test_to_dict_timestamp(self):
        """``to_dict`` returns ``Timestamp`` objects for datetime columns.

        Checked for the 'records', 'series' and 'split' orients, on an
        all-datetime frame and on a frame mixing datetimes with integers.
        """
        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only
        tsmp = Timestamp('20130101')
        all_dt = DataFrame({'A': [tsmp, tsmp], 'B': [tsmp, tsmp]})
        mixed = DataFrame({'A': [tsmp, tsmp], 'B': [1, 2]})

        # orient='records'
        assert all_dt.to_dict(orient='records') == [
            {'A': tsmp, 'B': tsmp},
            {'A': tsmp, 'B': tsmp},
        ]
        assert mixed.to_dict(orient='records') == [
            {'A': tsmp, 'B': 1},
            {'A': tsmp, 'B': 2},
        ]

        # orient='series'
        tm.assert_dict_equal(
            all_dt.to_dict(orient='series'),
            {
                'A': Series([tsmp, tsmp], name='A'),
                'B': Series([tsmp, tsmp], name='B'),
            },
        )
        tm.assert_dict_equal(
            mixed.to_dict(orient='series'),
            {
                'A': Series([tsmp, tsmp], name='A'),
                'B': Series([1, 2], name='B'),
            },
        )

        # orient='split'
        tm.assert_dict_equal(
            all_dt.to_dict(orient='split'),
            {
                'index': [0, 1],
                'data': [[tsmp, tsmp], [tsmp, tsmp]],
                'columns': ['A', 'B'],
            },
        )
        tm.assert_dict_equal(
            mixed.to_dict(orient='split'),
            {
                'index': [0, 1],
                'data': [[tsmp, 1], [tsmp, 2]],
                'columns': ['A', 'B'],
            },
        )

    def test_to_dict_index_not_unique_with_index_orient(self):
        """``orient='index'`` with a duplicated index raises ``ValueError``."""
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        frame = DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
        with pytest.raises(
                ValueError,
                match="DataFrame index must be unique for orient='index'"):
            frame.to_dict(orient='index')

    def test_to_dict_invalid_orient(self):
        """An unknown ``orient`` value raises ``ValueError``."""
        frame = DataFrame({'A': [0, 1]})
        with pytest.raises(ValueError,
                           match="orient 'xinvalid' not understood"):
            frame.to_dict(orient='xinvalid')

    def test_to_records_dt64(self):
        """Check the ``index`` field, with and without ``convert_datetime64``.

        Passing ``convert_datetime64`` explicitly (True or False) is expected
        to emit a ``FutureWarning``.
        """
        frame = DataFrame([["one", "two", "three"],
                           ["four", "five", "six"]],
                          index=date_range("2012-01-01", "2012-01-02"))
        raw_first = frame.index.values[0]

        # convert_datetime64 defaults to None
        assert raw_first == frame.to_records()['index'][0]

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            unconverted = frame.to_records(convert_datetime64=False)
            assert raw_first == unconverted['index'][0]

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            boxed_first = frame.index[0]
            converted = frame.to_records(convert_datetime64=True)
            assert boxed_first == converted['index'][0]

    def test_to_records_with_multindex(self):
        """The ``level_0`` field holds the first index level only."""
        # GH3189
        outer = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
        inner = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
        frame = DataFrame(np.zeros((8, 4)), index=[outer, inner])

        level_0 = frame.to_records(index=True)['level_0']
        assert 'bar' in level_0
        assert 'one' not in level_0

    def test_to_records_with_Mapping_type(self):
        """``DataFrame.from_records`` accepts records registered as Mappings.

        ``email.message.Message`` is registered as a virtual subclass of
        ``abc.Mapping`` and a parsed message is passed as the only record;
        its header names must show up as columns of the resulting frame.
        """
        # Import Message explicitly rather than relying on ``import email``
        # having loaded the ``email.message`` submodule as a side effect.
        from email.message import Message
        from email.parser import Parser

        abc.Mapping.register(Message)

        headers = Parser().parsestr('From: <*****@*****.**>\n'
                                    'To: <*****@*****.**>\n'
                                    'Subject: Test message\n'
                                    '\n'
                                    'Body would go here\n')

        frame = DataFrame.from_records([headers])
        # The result of ``all(...)`` used to be discarded, so nothing was
        # verified. Assert on the headers the message really contains
        # (the previously listed 'Type' is not one of them).
        assert all(x in frame for x in ['To', 'Subject', 'From'])

    def test_to_records_floats(self):
        """Smoke test: ``to_records`` on an all-float frame must not raise."""
        DataFrame(np.random.rand(10, 10)).to_records()

    def test_to_records_index_name(self):
        """Check the record field name used for named and unnamed indexes."""
        named = DataFrame(np.random.randn(3, 3))
        named.index.name = 'X'
        assert 'X' in named.to_records().dtype.fields

        unnamed = DataFrame(np.random.randn(3, 3))
        assert 'index' in unnamed.to_records().dtype.fields

        # MultiIndex with one unnamed level
        unnamed.index = MultiIndex.from_tuples(
            [('a', 'x'), ('a', 'y'), ('b', 'z')])
        unnamed.index.names = ['A', None]
        assert 'level_0' in unnamed.to_records().dtype.fields

    def test_to_records_with_unicode_index(self):
        """A string index and a string column both become object fields."""
        # GH13172
        # unicode_literals conflict with to_records
        frame = DataFrame([{'a': 'x', 'b': 'y'}]).set_index('a')
        records = frame.to_records()
        expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')])
        tm.assert_almost_equal(records, expected)

    def test_to_records_with_unicode_column_names(self):
        """A non-ASCII column name is kept as the record field name."""
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        records = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # numpy accepts unicode field names, but then the dtype has to be
        # given as a dictionary rather than as a list of tuples.
        field_spec = {
            "names": ["index", "accented_name_é"],
            "formats": ['=i8', '=f8'],
        }
        expected = np.rec.array([(0, 1.0)], dtype=field_spec)
        tm.assert_almost_equal(records, expected)

    def test_to_records_with_categorical(self):
        """Categorical frames build correctly and convert to object records."""
        # GH8626

        # construction from a dict
        from_dict = DataFrame({'A': list('abc')}, dtype='category')
        tm.assert_series_equal(
            from_dict['A'], Series(list('abc'), dtype='category', name='A'))

        # construction from a list-like
        from_list = DataFrame(list('abc'), dtype='category')
        tm.assert_series_equal(
            from_list[0], Series(list('abc'), dtype='category', name=0))

        # converting to a record array coerces the categorical values
        records = from_list.to_records()
        expected = np.rec.array(
            [(0, 'a'), (1, 'b'), (2, 'c')],
            dtype=[('index', '=i8'), ('0', 'O')],
        )
        tm.assert_almost_equal(records, expected)

    @pytest.mark.parametrize("kwargs,expected", [
        # No dtypes --> default to array dtypes.
        (dict(),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Should have no effect in this case.
        (dict(index=True),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "<i8"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Column dtype applied across the board. Index unaffected.
        (dict(column_dtypes="<U4"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U4"),
                             ("B", "<U4"), ("C", "<U4")])),

        # Index dtype applied across the board. Columns unaffected.
        (dict(index_dtypes="<U1"),
         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                      dtype=[("index", "<U1"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Pass in a type instance.
        # NOTE(review): ``np.unicode`` was removed in NumPy 1.24. It is kept
        # here because this class presumably still targets Python 2 (it uses
        # ``compat.iteritems``), where ``str`` would mean bytes -- confirm
        # the supported versions before replacing it.
        (dict(column_dtypes=np.unicode),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dtype instance.
        (dict(column_dtypes=np.dtype('unicode')),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "<U"),
                             ("B", "<U"), ("C", "<U")])),

        # Pass in a dictionary (name-only).
        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "<U2")])),

        # Pass in a dictionary (indices-only).
        (dict(index_dtypes={0: "int16"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Ignore index mappings if index is not True.
        (dict(index=False, index_dtypes="<U2"),
         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),

        # Non-existent names / indices in mapping should not error.
        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                      dtype=[("index", "i2"), ("A", "<i8"),
                             ("B", "<f8"), ("C", "O")])),

        # Names / indices not in mapping default to array dtype.
        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Names / indices not in dtype mapping default to array dtype.
        (dict(column_dtypes={"A": np.dtype('int8'), "B": np.dtype('float32')}),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<i8"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Mixture of everything.
        (dict(column_dtypes={"A": np.int8, "B": np.float32},
              index_dtypes="<U2"),
         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                      dtype=[("index", "<U2"), ("A", "i1"),
                             ("B", "<f4"), ("C", "O")])),

        # Invalid dtype values.
        (dict(index=False, column_dtypes=list()),
         (ValueError, "Invalid dtype \\[\\] specified for column A")),

        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
         (ValueError, "Invalid dtype 5 specified for column B")),

        # Numpy can't handle EA types, so check error is raised
        (dict(index=False, column_dtypes={"A": "int32",
                                          "B": CategoricalDtype(['a', 'b'])}),
         (ValueError, 'Invalid dtype category specified for column B')),

        # Check that bad types raise
        # The pattern accepts either quote style around the type name, so it
        # does not depend on the exact wording of NumPy's error message.
        (dict(index=False, column_dtypes={"A": "int32", "B": "foo"}),
         (TypeError, "data type [\"']foo[\"'] not understood")),
    ])
    def test_to_records_dtype(self, kwargs, expected):
        """Check ``to_records`` for each ``column_dtypes``/``index_dtypes`` case.

        ``kwargs`` is passed straight to ``DataFrame.to_records``.
        ``expected`` is either the record array the call must produce, or an
        ``(exception type, message regex)`` pair the call must raise.
        """
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            # error case: expected == (exception class, match pattern)
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("df,kwargs,expected", [
        # Frame whose index is a MultiIndex.
        (
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      columns=list("abc")).set_index(["a", "b"]),
            {"column_dtypes": "float64",
             "index_dtypes": {0: "int32", 1: "int8"}},
            np.rec.array(
                [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")]),
        ),

        # Frame whose columns are a MultiIndex.
        (
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      columns=MultiIndex.from_tuples(
                          [("a", "d"), ("b", "e"), ("c", "f")])),
            {"column_dtypes": {0: "<U1", 2: "float32"},
             "index_dtypes": "float32"},
            np.rec.array(
                [(0.0, "1", 2, 3.0),
                 (1.0, "4", 5, 6.0),
                 (2.0, "7", 8, 9.0)],
                dtype=[("index", "<f4"),
                       ("('a', 'd')", "<U1"),
                       ("('b', 'e')", "<i8"),
                       ("('c', 'f')", "<f4")]),
        ),

        # MultiIndex on both the columns and the index.
        (
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                      columns=MultiIndex.from_tuples(
                          [("a", "d"), ("b", "e"), ("c", "f")],
                          names=list("ab")),
                      index=MultiIndex.from_tuples(
                          [("d", -4), ("d", -5), ("f", -6)],
                          names=list("cd"))),
            {"column_dtypes": "float64",
             "index_dtypes": {0: "<U2", 1: "int8"}},
            np.rec.array(
                [("d", -4, 1.0, 2.0, 3.0),
                 ("d", -5, 4.0, 5.0, 6.0),
                 ("f", -6, 7, 8, 9.0)],
                dtype=[("c", "<U2"),
                       ("d", "i1"),
                       ("('a', 'd')", "<f8"),
                       ("('b', 'e')", "<f8"),
                       ("('c', 'f')", "<f8")]),
        ),
    ])
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        """``to_records`` dtype mappings applied to frames with MultiIndex axes."""
        # see gh-18146
        records = df.to_records(**kwargs)
        tm.assert_almost_equal(records, expected)

    def test_to_records_dict_like(self):
        """A hand-rolled dict-like object works as the ``column_dtypes`` mapping."""
        # see gh-18146
        class DictLike(object):
            # Wraps a plain dict and exposes only __getitem__, __contains__
            # and keys().
            def __init__(self, **kwargs):
                self.d = dict(kwargs)

            def __getitem__(self, key):
                return self.d[key]

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        frame = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
        column_mapping = DictLike(A=np.int8, B=np.float32)

        records = frame.to_records(column_dtypes=column_mapping,
                                   index_dtypes="<U2")

        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")])
        tm.assert_almost_equal(records, expected)

    @pytest.mark.parametrize('mapping', [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        """Round-trip nested dict data through every ``to_dict`` orient."""
        test_data = {
            'A': {'1': 1, '2': 2},
            'B': {'1': '1', '2': '2', '3': '3'},
        }

        def assert_cells(source, lookup):
            # every (column, row) cell of ``source`` must equal
            # lookup(column, row)
            for col, cells in compat.iteritems(source):
                for row, value in compat.iteritems(cells):
                    assert value == lookup(col, row)

        # GH16122
        recons = DataFrame(test_data).to_dict(into=mapping)
        assert_cells(test_data, lambda col, row: recons[col][row])

        recons = DataFrame(test_data).to_dict("l", mapping)
        assert_cells(test_data, lambda col, row: recons[col][int(row) - 1])

        recons = DataFrame(test_data).to_dict("s", mapping)
        assert_cells(test_data, lambda col, row: recons[col][row])

        recons = DataFrame(test_data).to_dict("sp", mapping)
        tm.assert_dict_equal(recons, {
            'columns': ['A', 'B'],
            'index': ['1', '2', '3'],
            'data': [[1.0, '1'], [2.0, '2'], [np.nan, '3']],
        })

        recons = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {'A': 1.0, 'B': '1'},
            {'A': 2.0, 'B': '2'},
            {'A': np.nan, 'B': '3'},
        ]
        assert isinstance(recons, list)
        assert len(recons) == 3
        for got, want in zip(recons, expected_records):
            tm.assert_dict_equal(got, want)

        # GH10844
        recons = DataFrame(test_data).to_dict("i")
        assert_cells(test_data, lambda col, row: recons[row][col])

        # same check with an extra column duplicating the first one
        frame = DataFrame(test_data)
        frame['duped'] = frame[frame.columns[0]]
        recons = frame.to_dict("i")
        comp_data = test_data.copy()
        comp_data['duped'] = comp_data[frame.columns[0]]
        assert_cells(comp_data, lambda col, row: recons[row][col])

    @pytest.mark.parametrize('mapping', [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        """``to_dict`` raises ``TypeError`` for each parametrized ``into``."""
        # GH16122
        values = np.random.randn(3, 3)
        frame = DataFrame(values)
        with pytest.raises(TypeError):
            frame.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize('tz', ['UTC', 'GMT', 'US/Eastern'])
    def test_to_records_datetimeindex_with_tz(self, tz):
        """``to_records`` output matches that of the frame converted to UTC."""
        # GH13937
        stamps = date_range('2016-01-01', periods=10, freq='S', tz=tz)
        frame = DataFrame({'datetime': stamps}, index=stamps)

        expected = frame.to_records()
        utc_frame = frame.tz_convert("UTC")
        result = utc_frame.to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize('orient,item_getter', [
        ('dict', lambda out, col, row: out[col][row]),
        ('records', lambda out, col, row: out[row][col]),
        ('list', lambda out, col, row: out[col][row]),
        ('split',
         lambda out, col, row: out['data'][row][out['columns'].index(col)]),
        ('index', lambda out, col, row: out[row][col])
    ])
    def test_to_dict_box_scalars(self, orient, item_getter):
        """Values pulled out of ``to_dict`` output are ``int`` / ``float``."""
        # 14216, 23753
        # make sure that we are boxing properly
        frame = DataFrame({'a': [1, 2], 'b': [.1, .2]})
        converted = frame.to_dict(orient=orient)

        int_value = item_getter(converted, 'a', 0)
        assert isinstance(int_value, int)

        float_value = item_getter(converted, 'b', 0)
        assert isinstance(float_value, float)

    def test_frame_to_dict_tz(self):
        """Records built from a tz-aware datetime column hold ``Timestamp``s."""
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        stamps = [
            datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),
            datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),
        ]
        # One single-element row per timestamp.
        rows = [(stamp,) for stamp in stamps]
        df = DataFrame(rows, columns=["d"])

        records = df.to_dict(orient='records')
        expected = [
            {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
            {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
        ]
        for position, wanted in enumerate(expected):
            tm.assert_dict_equal(records[position], wanted)

    @pytest.mark.parametrize('into, expected', [
        (
            dict,
            {
                0: {'int_col': 1, 'float_col': 1.0},
                1: {'int_col': 2, 'float_col': 2.0},
                2: {'int_col': 3, 'float_col': 3.0},
            },
        ),
        (
            OrderedDict,
            OrderedDict([
                (0, {'int_col': 1, 'float_col': 1.0}),
                (1, {'int_col': 2, 'float_col': 2.0}),
                (2, {'int_col': 3, 'float_col': 3.0}),
            ]),
        ),
        (
            defaultdict(list),
            defaultdict(list, {
                0: {'int_col': 1, 'float_col': 1.0},
                1: {'int_col': 2, 'float_col': 2.0},
                2: {'int_col': 3, 'float_col': 3.0},
            }),
        ),
    ])
    def test_to_dict_index_dtypes(self, into, expected):
        """Column dtypes survive a ``to_dict(orient='index')`` round trip."""
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float
        column_order = ['int_col', 'float_col']
        source = DataFrame({'int_col': [1, 2, 3],
                            'float_col': [1.0, 2.0, 3.0]})

        as_mapping = source.to_dict(orient='index', into=into)

        # Rebuild frames from both mappings so dtypes can be compared.
        result_frame = DataFrame.from_dict(as_mapping, orient='index')
        result_frame = result_frame[column_order]
        expected_frame = DataFrame.from_dict(expected, orient='index')
        expected_frame = expected_frame[column_order]
        tm.assert_frame_equal(result_frame, expected_frame)

    def test_to_dict_numeric_names(self):
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict('records')[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({('A_{:d}'.format(i)): [i] for i in range(256)})
        result = df.to_dict('records')[0]
        expected = {'A_{:d}'.format(i): i for i in range(256)}
        assert result == expected
Ejemplo n.º 21
0
class TestDataFrameConvertTo:
    def test_to_dict_timestamp(self):
        """``to_dict`` keeps ``Timestamp`` values for records/series/split."""
        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        stamp = Timestamp("20130101")
        stamps = [stamp, stamp]
        all_dates = DataFrame({"A": stamps, "B": stamps})
        mixed = DataFrame({"A": stamps, "B": [1, 2]})

        # orient="records"
        assert all_dates.to_dict(orient="records") == [
            {"A": stamp, "B": stamp},
            {"A": stamp, "B": stamp},
        ]
        assert mixed.to_dict(orient="records") == [
            {"A": stamp, "B": 1},
            {"A": stamp, "B": 2},
        ]

        # orient="series"
        series_all_dates = {
            "A": Series(stamps, name="A"),
            "B": Series(stamps, name="B"),
        }
        tm.assert_dict_equal(all_dates.to_dict(orient="series"),
                             series_all_dates)
        series_mixed = {
            "A": Series(stamps, name="A"),
            "B": Series([1, 2], name="B"),
        }
        tm.assert_dict_equal(mixed.to_dict(orient="series"), series_mixed)

        # orient="split"
        split_all_dates = {
            "index": [0, 1],
            "data": [[stamp, stamp], [stamp, stamp]],
            "columns": ["A", "B"],
        }
        tm.assert_dict_equal(all_dates.to_dict(orient="split"),
                             split_all_dates)
        split_mixed = {
            "index": [0, 1],
            "data": [[stamp, 1], [stamp, 2]],
            "columns": ["A", "B"],
        }
        tm.assert_dict_equal(mixed.to_dict(orient="split"), split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        """``orient='index'`` raises ``ValueError`` on a duplicated index."""
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        duplicated_index = ["A", "A"]
        frame = DataFrame({"a": [1, 2], "b": [0.5, 0.75]},
                          index=duplicated_index)
        expected_message = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=expected_message):
            frame.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        """An unrecognised ``orient`` string raises ``ValueError``."""
        frame = DataFrame({"A": [0, 1]})
        expected_message = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=expected_message):
            frame.to_dict(orient="xinvalid")

    def test_to_records_dt64(self):
        """The first "index" record equals the first raw index value."""
        rows = [["one", "two", "three"], ["four", "five", "six"]]
        days = date_range("2012-01-01", "2012-01-02")
        frame = DataFrame(rows, index=days)

        expected = frame.index.values[0]
        records = frame.to_records()
        result = records["index"][0]
        assert expected == result

    def test_to_records_with_multindex(self):
        """The "level_0" field holds first-level labels, not second-level."""
        # GH3189
        outer = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
        inner = ["one", "two", "one", "two", "one", "two", "one", "two"]
        frame = DataFrame(np.zeros((8, 4)), index=[outer, inner])

        records = frame.to_records(index=True)
        level_zero = records["level_0"]
        assert "bar" in level_zero
        assert "one" not in level_zero

    def test_to_records_with_Mapping_type(self):
        """``from_records`` accepts objects registered as ``abc.Mapping``.

        An ``email.message.Message`` is registered as a virtual ``Mapping``
        subclass and passed to ``DataFrame.from_records``; the frame must
        expose each message header as a column.
        """
        import email
        from email.parser import Parser

        # NOTE: this registration is process-wide and is not undone afterwards.
        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        # The original expression was evaluated but never asserted (and looked
        # for a "Type" header the message does not define), so the test could
        # not fail.  Check the headers that are actually present.
        assert all(x in frame for x in ["From", "To", "Subject"])

    def test_to_records_floats(self):
        """Smoke test: ``to_records`` runs on a 10x10 frame of random floats."""
        values = np.random.rand(10, 10)
        frame = DataFrame(values)
        frame.to_records()

    def test_to_records_index_name(self):
        """Check the record field name used for the index in three cases."""
        # Named index: the name becomes the field name.
        named = DataFrame(np.random.randn(3, 3))
        named.index.name = "X"
        fields = named.to_records().dtype.fields
        assert "X" in fields

        # Unnamed index: the field is called "index".
        unnamed = DataFrame(np.random.randn(3, 3))
        fields = unnamed.to_records().dtype.fields
        assert "index" in fields

        # MultiIndex whose second level has no name: "level_0" must be a field.
        tuples = [("a", "x"), ("a", "y"), ("b", "z")]
        unnamed.index = MultiIndex.from_tuples(tuples)
        unnamed.index.names = ["A", None]
        fields = unnamed.to_records().dtype.fields
        assert "level_0" in fields

    def test_to_records_with_unicode_index(self):
        """``to_records`` works when a string column is used as the index."""
        # GH13172
        # unicode_literals conflict with to_records
        frame = DataFrame([{"a": "x", "b": "y"}])
        result = frame.set_index("a").to_records()

        field_types = [("a", "O"), ("b", "O")]
        expected = np.rec.array([("x", "y")], dtype=field_types)
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        """``to_records`` keeps a non-ASCII column label as the field name."""
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        frame = DataFrame(data={"accented_name_é": [1.0]})
        result = frame.to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        field_spec = {
            "names": ["index", "accented_name_é"],
            "formats": ["=i8", "=f8"]
        }
        expected = np.rec.array([(0, 1.0)], dtype=field_spec)
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        """Categorical frames build correctly and coerce in ``to_records``."""
        # GH8626
        letters = list("abc")

        # dict creation
        from_dict = DataFrame({"A": letters}, dtype="category")
        tm.assert_series_equal(
            from_dict["A"],
            Series(letters, dtype="category", name="A"),
        )

        # list-like creation
        from_list = DataFrame(letters, dtype="category")
        tm.assert_series_equal(
            from_list[0],
            Series(letters, dtype="category", name=0),
        )

        # to record array
        # this coerces
        records = from_list.to_records()
        field_types = [("index", "=i8"), ("0", "O")]
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=field_types)
        tm.assert_almost_equal(records, expected)

    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            # (``np.unicode`` was only an alias of the builtin ``str`` and has
            # been removed from NumPy, so use ``str`` directly.)
            (
                dict(column_dtypes=str),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            # (Built from ``np.str_`` rather than the legacy "unicode" string
            # alias, which is not accepted by every NumPy version.)
            (
                dict(column_dtypes=np.dtype(np.str_)),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                },
                     index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dtype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            # (NumPy quotes the offending name with either double or single
            # quotes depending on its version; accept both.)
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, "data type [\"']foo[\"'] not understood"),
            ),
        ],
    )
    def test_to_records_dtype(self, kwargs, expected):
        """Check ``to_records`` with ``column_dtypes`` / ``index_dtypes``.

        Parameters
        ----------
        kwargs : dict
            Keyword arguments forwarded to ``DataFrame.to_records``.
        expected : np.recarray or tuple
            Either the record array the call must produce, or an
            ``(exception_type, message_regex)`` pair the call must raise.
        """
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        if not isinstance(expected, np.recarray):
            # Error case: ``expected`` is (exception type, message pattern).
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=list("abc"),
                ).set_index(["a", "b"]),
                {
                    "column_dtypes": "float64",
                    "index_dtypes": {0: "int32", 1: "int8"},
                },
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")]
                    ),
                ),
                {
                    "column_dtypes": {0: "<U1", 2: "float32"},
                    "index_dtypes": "float32",
                },
                np.rec.array(
                    [
                        (0.0, "1", 2, 3.0),
                        (1.0, "4", 5, 6.0),
                        (2.0, "7", 8, 9.0),
                    ],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples(
                        [("a", "d"), ("b", "e"), ("c", "f")],
                        names=list("ab"),
                    ),
                    index=MultiIndex.from_tuples(
                        [("d", -4), ("d", -5), ("f", -6)],
                        names=list("cd"),
                    ),
                ),
                {
                    "column_dtypes": "float64",
                    "index_dtypes": {0: "<U2", 1: "int8"},
                },
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        """Check ``to_records`` dtype mappings on MultiIndex-ed frames."""
        # see gh-18146
        records = df.to_records(**kwargs)
        tm.assert_almost_equal(records, expected)

    def test_to_records_dict_like(self):
        """``column_dtypes`` may be a dict-like object that is not a ``dict``."""
        # see gh-18146
        class DictLike:
            # Minimal mapping look-alike: item access, membership and keys().
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d[key]

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        frame = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        column_mapping = DictLike(A=np.int8, B=np.float32)
        records = frame.to_records(column_dtypes=column_mapping,
                                   index_dtypes="<U2")

        field_types = [("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                       ("C", "O")]
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=field_types,
        )
        tm.assert_almost_equal(records, expected)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        """Compare ``to_dict`` output with the source data for each orient."""

        def cells(nested):
            # Flatten {column: {row: value}} into (column, row, value) triples.
            for column, rows in nested.items():
                for row, value in rows.items():
                    yield column, row, value

        test_data = {
            "A": {"1": 1, "2": 2},
            "B": {"1": "1", "2": "2", "3": "3"},
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)
        for column, row, value in cells(test_data):
            assert value == recons_data[column][row]

        recons_data = DataFrame(test_data).to_dict("l", mapping)
        for column, row, value in cells(test_data):
            # Row labels "1".."3" sit at list positions 0..2.
            assert value == recons_data[column][int(row) - 1]

        recons_data = DataFrame(test_data).to_dict("s", mapping)
        for column, row, value in cells(test_data):
            assert value == recons_data[column][row]

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {"A": 1.0, "B": "1"},
            {"A": 2.0, "B": "2"},
            {"A": np.nan, "B": "3"},
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for actual, wanted in zip(recons_data, expected_records):
            tm.assert_dict_equal(actual, wanted)

        # GH10844
        recons_data = DataFrame(test_data).to_dict("i")
        for column, row, value in cells(test_data):
            assert value == recons_data[row][column]

        # Same check after adding a column that duplicates the first one.
        df = DataFrame(test_data)
        first_column = df.columns[0]
        df["duped"] = df[first_column]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[first_column]
        for column, row, value in cells(comp_data):
            assert value == recons_data[row][column]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        """``to_dict`` raises ``TypeError`` for each parametrized ``into``."""
        # GH16122
        values = np.random.randn(3, 3)
        frame = DataFrame(values)
        with pytest.raises(TypeError):
            frame.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        """Converting a frame with duplicated column labels must warn."""
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        labels = ["a", "a", "b"]
        frame = DataFrame([[1, 2, 3]], columns=labels)
        with tm.assert_produces_warning(UserWarning):
            frame.to_dict()

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        """``to_records`` output matches that of the frame converted to UTC."""
        # GH13937
        stamps = date_range("2016-01-01", periods=10, freq="S", tz=tz)
        frame = DataFrame({"datetime": stamps}, index=stamps)

        expected = frame.to_records()
        utc_frame = frame.tz_convert("UTC")
        result = utc_frame.to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda out, col, row: out[col][row]),
            ("records", lambda out, col, row: out[row][col]),
            ("list", lambda out, col, row: out[col][row]),
            (
                "split",
                lambda out, col, row: out["data"][row][
                    out["columns"].index(col)],
            ),
            ("index", lambda out, col, row: out[row][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        """Values pulled out of ``to_dict`` output are ``int`` / ``float``."""
        # 14216, 23753
        # make sure that we are boxing properly
        frame = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        converted = frame.to_dict(orient=orient)

        int_value = item_getter(converted, "a", 0)
        assert isinstance(int_value, int)

        float_value = item_getter(converted, "b", 0)
        assert isinstance(float_value, float)

    def test_frame_to_dict_tz(self):
        """Records built from a tz-aware datetime column hold ``Timestamp``s."""
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        stamps = [
            datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),
            datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),
        ]
        # One single-element row per timestamp.
        rows = [(stamp, ) for stamp in stamps]
        df = DataFrame(rows, columns=["d"])

        records = df.to_dict(orient="records")
        expected = [
            {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)},
            {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)},
        ]
        for position, wanted in enumerate(expected):
            tm.assert_dict_equal(records[position], wanted)

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {"int_col": 1, "float_col": 1.0},
                    1: {"int_col": 2, "float_col": 2.0},
                    2: {"int_col": 3, "float_col": 3.0},
                },
            ),
            (
                OrderedDict,
                OrderedDict([
                    (0, {"int_col": 1, "float_col": 1.0}),
                    (1, {"int_col": 2, "float_col": 2.0}),
                    (2, {"int_col": 3, "float_col": 3.0}),
                ]),
            ),
            (
                defaultdict(dict),
                defaultdict(
                    dict,
                    {
                        0: {"int_col": 1, "float_col": 1.0},
                        1: {"int_col": 2, "float_col": 2.0},
                        2: {"int_col": 3, "float_col": 3.0},
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        """Column dtypes survive a ``to_dict(orient='index')`` round trip."""
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float
        column_order = ["int_col", "float_col"]
        source = DataFrame({"int_col": [1, 2, 3],
                            "float_col": [1.0, 2.0, 3.0]})

        as_mapping = source.to_dict(orient="index", into=into)

        # Rebuild frames from both mappings so dtypes can be compared.
        result_frame = DataFrame.from_dict(as_mapping, orient="index")
        result_frame = result_frame[column_order]
        expected_frame = DataFrame.from_dict(expected, orient="index")
        expected_frame = expected_frame[column_order]
        tm.assert_frame_equal(result_frame, expected_frame)

    def test_to_dict_numeric_names(self):
        """Record keys equal the column labels when labels are digit strings."""
        # https://github.com/pandas-dev/pandas/issues/24940
        frame = DataFrame({str(number): [number] for number in range(5)})
        first_record = frame.to_dict("records")[0]
        assert set(first_record.keys()) == set(frame.columns)

    def test_to_dict_wide(self):
        """A 256-column frame converts to a complete single record."""
        # https://github.com/pandas-dev/pandas/issues/24939
        width = 256
        names = ["A_{:d}".format(i) for i in range(width)]
        frame = DataFrame({name: [i] for i, name in enumerate(names)})

        first_record = frame.to_dict("records")[0]
        wanted = dict(zip(names, range(width)))
        assert first_record == wanted

    def test_to_dict_orient_dtype(self):
        """Each record from ``to_dict("records")`` holds int, float and str."""
        # https://github.com/pandas-dev/pandas/issues/22620
        frame = DataFrame({
            "a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0],
            "c": ["X", "Y", "Z"],
        })
        # Python type expected for the value under each key.
        expected = {"a": int, "b": float, "c": str}

        for record in frame.to_dict("records"):
            observed = {key: type(record[key]) for key in ("a", "b", "c")}
            assert observed == expected
Ejemplo n.º 22
0
def test_london_cleaner():
    """Run ``london_cleaner`` on two raw rows and compare with a hand-built frame."""
    raw_columns = {
        "Place (Overall)": [12547, 34146],
        "Place (Gender)": [9390, 20833],
        "Place (Category)": [4345, 3132],
        "Name": ["»A Smith, Matthew (GBR) \n", "»Aalders, Jennifer (GBR) \n"],
        "Sex": ["M", "W"],
        "Club": ["Lymm Runners", "Tynny Trotters"],
        "Running Number": ["Runner Number40546", "Runner Number23235"],
        "Category": ["18-39", pd.NA],
        "Finish": ["0 days 03:59:33", "0 days 06:22:20"],
        "Year": [2021, 2021],
    }
    unclean_input = pd.DataFrame.from_dict(raw_columns)

    # Categories of the expected "Category" column, in declaration order.
    age_bands = [
        "18-39",
        "40-44",
        "45-49",
        "50-54",
        "55-59",
        "60-64",
        "65-69",
        "70+",
        "70-74",
        "75-79",
        "80-84",
        "85+",
        "80+",
        "Unknown",
    ]

    cleaned_columns = {
        "Place (Overall)": [12547, 34146],
        "Place (Gender)": [9390, 20833],
        "Place (Category)": [4345, 3132],
        "Name": ["A Smith Matthew", "Aalders Jennifer"],
        "Sex": ["M", "F"],
        "Club": ["Lymm Runners", "Tynny Trotters"],
        "Running Number": ["40546", "23235"],
        "Category": ["18-39", "Unknown"],
        "Finish": [
            pd.Timedelta("0 days 03:59:33"),
            pd.Timedelta("0 days 06:22:20"),
        ],
        "Year": [2021, 2021],
        "Country": ["GBR", "GBR"],
        "FirstName": ["Matthew", "Jennifer"],
        "LastName": ["A Smith", "Aalders"],
        "DSQ": [False, False],
        "Finish (Total Seconds)": [14373.0, 22940.0],
    }
    cleaned_dtypes = {
        "Place (Overall)": Int64Dtype(),
        "Place (Gender)": Int64Dtype(),
        "Place (Category)": Int64Dtype(),
        "Name": dtype("O"),
        "Sex": dtype("O"),
        "Club": dtype("O"),
        "Running Number": dtype("O"),
        "Category": CategoricalDtype(categories=age_bands, ordered=False),
        "Finish": dtype("<m8[ns]"),
        "Year": Int64Dtype(),
        "Country": dtype("O"),
        "FirstName": dtype("O"),
        "LastName": dtype("O"),
        "DSQ": dtype("bool"),
        "Finish (Total Seconds)": dtype("float64"),
    }
    exp_output = pd.DataFrame.from_dict(cleaned_columns).astype(cleaned_dtypes)

    actual_output = london_cleaner(unclean_input)

    # check_categorical=False relaxes the comparison of the categorical column.
    pd.testing.assert_frame_equal(
        actual_output, exp_output, check_categorical=False
    )
Ejemplo n.º 23
0
    def get_feature_df(self) -> Tuple[pd.DataFrame, List[Any]]:
        """
        Transform incoming data into pandas dataframe.

        For each entry of ``self.feature_source`` the rows of the matching
        source model are aggregated per (target id, source value) pair, chunk
        by chunk, into ``np.uint16`` sparse matrices; the per-source tables are
        then outer-joined into a single dataframe.

        :return: tuple(features pandas.DataFrame, unqualified item id list);
            the list holds the sorted ids of the target queryset that are
            missing from the dataframe index
        :raises EmptyDataSetError: if no feature source produced any data
        """
        def empty_data_set_error() -> Exception:
            # Single place that builds the "nothing found" error.
            msg = 'No features of chosen "feature_source" options {} detected. ' \
                  'Empty Data Set.'.format(str(self.feature_source))
            return EmptyDataSetError(msg, feature_source=self.feature_source)

        # prepare features dataframe
        target_qs = self.get_queryset()
        all_sample_ids = list(target_qs.values_list('id', flat=True))
        # TODO: all documents ref. by all_sample_ids should be in feature_table
        feature_table: Optional[pd.DataFrame] = None
        counter = 'counter'

        for feature_source_item in self.feature_source:
            msg = f'Get "{feature_source_item}" feature data:'
            self.log_message(msg)
            self.log_message('_' * len(msg))

            # get aggregation queryset parameters for .annotate function
            source_model = self.source_models[feature_source_item]
            source_field = self.source_fields[feature_source_item]
            target_id_field = self.target_id_field
            aggregation = {counter: self.aggregation_function}

            # try to decrease memory usage iterating over chunks and using sparse dataframes
            # Note: pivot_table takes extra memory so use lower memory limits
            source_qs = source_model.objects.filter(**{target_id_field + '__in': all_sample_ids})

            if hasattr(source_model, 'text_unit'):
                source_qs = source_qs.filter(**{self.unit_type_filter: self.unit_type})

            ids = sorted(source_qs.order_by(target_id_field).values_list(target_id_field, flat=True).distinct())
            terms = sorted(source_qs.order_by(source_field).values_list(source_field, flat=True).distinct())
            id_count = len(ids)
            term_count = len(terms)

            self.log_message(f'{self.source_item}s containing "{feature_source_item}": {id_count}')
            self.log_message(f'unique "{feature_source_item}" items: {term_count}')

            if not term_count:
                self.log_message(f'WARN: there are no "{feature_source_item}" entities found')
                continue

            from_mem_chunk_size = self.get_chunk_size(term_count * 2)    # np.uint16 - 2 bytes
            chunk_size = min([self.max_chunk_size, from_mem_chunk_size])
            self.log_message(f'chunk_size from_mem/min/final: {from_mem_chunk_size}/{self.max_chunk_size}/{chunk_size}')

            # TODO: we stopped using pd.SparseDataFrame as there's no such class anymore
            single_feature_table = SparseSingleFeatureTable(feature_source_item)

            # The term categories do not depend on the chunk, so the date
            # conversion and the dtype are built once per feature source.
            # TODO: fix for date features: pandas can't compare dates, but datetimes only
            if terms and isinstance(terms[0], datetime.date):
                terms = [datetime.datetime.combine(d, datetime.datetime.min.time()) for d in terms]
            term_cat = CategoricalDtype(terms, ordered=True)

            for step in range(0, id_count, chunk_size):
                self.log_message(f'...process "{feature_source_item}" feature: "{self.source_item}s" range: {step}-{step + chunk_size}')
                sample_ids = ids[step:step + chunk_size]

                chunk_qs = source_qs \
                    .filter(**{target_id_field + '__in': sample_ids}) \
                    .order_by(target_id_field, source_field) \
                    .values(target_id_field, source_field) \
                    .annotate(**aggregation)

                df_src = list(chunk_qs)
                chunk_df = pd.DataFrame.from_records(df_src)
                del chunk_qs
                gc.collect()  # try to free up memory

                # Row/column positions are the categorical codes of the ids
                # and terms; the aggregated counter is the cell value.
                doc_cat = CategoricalDtype(sample_ids, ordered=True)

                row = [] if chunk_df.empty else chunk_df[self.target_id_field].astype(doc_cat).cat.codes
                col = [] if chunk_df.empty else chunk_df[source_field].astype(term_cat).cat.codes
                val = [] if chunk_df.empty else chunk_df[counter]
                sparse_matrix = scp.csr_matrix(
                    (val, (row, col)),
                    shape=(len(sample_ids), term_cat.categories.size),
                    dtype=np.uint16)
                single_feature_table.join(sparse_matrix)
                del chunk_df
                gc.collect()  # try to free up memory
                mem = psutil.virtual_memory()
                self.log_message(f'......available memory: {get_mb(mem.available)}M ({mem.percent}%)')

            # join feature_source_item-specific dataframe into results dataframe
            gc.collect()    # try to free up memory
            single_feature_df_src = SparseAllFeaturesTable(ids)
            single_feature_df_src.add_feature_table(single_feature_table, terms)

            if feature_table is None:
                feature_table = single_feature_df_src.to_dataframe()
            else:
                feature_table = feature_table.join(single_feature_df_src.to_dataframe(), how='outer')
            del single_feature_table
            del single_feature_df_src
            gc.collect()    # try to free up memory
            # end of "for feature_source_item in self.feature_source"

        if feature_table is None:
            # Every feature source was skipped (or there were none): without
            # this guard the code below fails with AttributeError on None
            # instead of reporting the empty data set.
            raise empty_data_set_error()

        df = feature_table
        if self.drop_empty_columns:
            df.dropna(axis=1, how='all', inplace=True)

        self.log_message(f'df: {get_df_info(df)}')
        mem = psutil.virtual_memory()
        self.log_message(f'available memory: {get_mb(mem.available)}M ({mem.percent}%)')

        if df.empty:
            raise empty_data_set_error()

        # item ids not included in feature df which don't have features at all
        # (reuses the ids fetched at the top instead of querying them again)
        initial_id_set = set(all_sample_ids)
        feature_id_set = set(df.index.tolist())
        unqualified_item_ids = sorted(initial_id_set.difference(feature_id_set))

        self.log_message('count unqualified_item_ids: {}'.format(len(unqualified_item_ids)))

        if not self.drop_empty_rows and unqualified_item_ids:
            # Add an all-zero row for every item that has no features.
            unqualified_items_df = pd.DataFrame(index=unqualified_item_ids, columns=df.columns).fillna(0)

            self.log_message('unqualified_items_df shape: {} size: {}'.format(
                unqualified_items_df.shape, unqualified_items_df.memory_usage().sum()))

            df = pd.concat([df, unqualified_items_df]).fillna(0).astype(np.uint16)

            self.log_message(f'df: {get_df_info(df)}')

        return df, unqualified_item_ids
Ejemplo n.º 24
0
class TestAstype:
    def test_astype_float(self, float_frame):
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        casted = float_frame.astype(np.int32)
        expected = DataFrame(
            float_frame.values.astype(np.int32),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

        float_frame["foo"] = "5"
        casted = float_frame.astype(int)
        expected = DataFrame(
            float_frame.values.astype(int),
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(casted, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Columns A and B can be cast to float32 and to float16."""
        # mixed casting
        for float_dtype in ("float32", "float16"):
            subset = mixed_float_frame.reindex(columns=["A", "B"])
            _check_cast(subset.astype(float_dtype), float_dtype)

    def test_astype_mixed_type(self, mixed_type_frame):
        """Cast the numeric part of a mixed frame to several dtypes in turn."""
        # mixed casting
        numeric = mixed_type_frame._get_numeric_data().copy()
        numeric["little_float"] = np.array(12345.0, dtype="float16")
        numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

        for target in ("float64", "int64"):
            _check_cast(numeric.astype(target), target)

        only_little = numeric.reindex(columns=["little_float"])
        _check_cast(only_little.astype("float16"), "float16")

        for target in ("float32", "int32"):
            _check_cast(numeric.astype(target), target)

        # to object
        _check_cast(numeric.astype("O"), "object")

    def test_astype_with_exclude_string(self, float_frame):
        df = float_frame.copy()
        expected = float_frame.astype(int)
        df["string"] = "foo"
        casted = df.astype(int, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

        df = float_frame.copy()
        expected = float_frame.astype(np.int32)
        df["string"] = "foo"
        casted = df.astype(np.int32, errors="ignore")

        expected["string"] = "foo"
        tm.assert_frame_equal(casted, expected)

    def test_astype_with_view_float(self, float_frame):

        # this is the only real reason to do it this way
        tf = np.round(float_frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = float_frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    def test_astype_with_view_mixed_float(self, mixed_float_frame):

        tf = mixed_float_frame.reindex(columns=["A", "B", "C"])

        casted = tf.astype(np.int64)
        casted = tf.astype(np.float32)  # noqa

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        # see GH#14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        # see GH#9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame({
            "a":
            list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b":
            list(map(str, map(Timestamp, b._values))),
            "c":
            list(map(lambda x: Timedelta(x)._repr_base(), c._values)),
            "d":
            list(map(str, d._values)),
            "e":
            list(map(str, e._values)),
        })

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Floats, including NaN, are rendered by ``astype(str)``.

        See GH#11302.
        """
        # Use ``np.nan``: the ``np.NaN`` alias no longer exists in NumPy 2.0,
        # while the lowercase spelling is available in every NumPy release.
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        # NumPy < 1.14 truncates
        # NumPy >= 1.14 preserves the full repr
        val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # GH7271 & GH16717
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(["1.0", "2", "3.14", "4", "5.4"])
        df = DataFrame({"a": a, "b": b, "c": c, "d": d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({"b": "str", "d": "float32"})
        result = df.astype(dt1)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series(["0", "1", "2", "3", "4"]),
            "c":
            c,
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            "a":
            a,
            "b":
            Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c":
            Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d":
            Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        })
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(df.astype(dt3), df.astype(str))
        tm.assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({"b": str, 2: str})
        dt5 = dtype_class({"e": str})
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        tm.assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({}) if dtype_class is dict else dtype_class(
            {}, dtype=object)
        equiv = df.astype(dt7)
        tm.assert_frame_equal(df, equiv)
        tm.assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        a1 = Series([1, 2, 3, 4, 5], name="a")
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        a2 = Series([0, 1, 2, 3, 4], name="a")
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"],
                       dtype=str,
                       name="b")
        a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
        expected = concat([a1_str, b_str, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.astype({"a": "str"})
        expected = concat([a1_str, b, a2_str], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        # GH#18099
        d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        df = DataFrame({"A": ["a", "a", "b", "c"]})
        xpr = f"Expected an instance of {cls.__name__}"
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df["A"].astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Cast a two-column float frame to integer extension dtypes.

        See GH#22578.
        """
        float_rows = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]

        df = DataFrame(float_rows, columns=["a", "b"])
        expected1 = DataFrame({
            "a": integer_array([1, 3, 5], dtype=dtype),
            "b": integer_array([2, 4, 6], dtype=dtype),
        })
        for start in (df, df.astype("int64")):
            tm.assert_frame_equal(start.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype("float64"), df)

        # Same casts starting from a frame whose "b" column was cast already.
        df = DataFrame(float_rows, columns=["a", "b"])
        df["b"] = df["b"].astype(dtype)
        expected2 = DataFrame({
            "a": [1.0, 3.0, 5.0],
            "b": integer_array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(df, expected2)

        for start in (df, df.astype("int64")):
            tm.assert_frame_equal(start.astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Cast a single-column float frame to integer extension dtypes.

        See GH#22578.
        """
        df = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected1 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        for start in (df, df.astype("int64")):
            tm.assert_frame_equal(start.astype(dtype), expected1)

        # Same casts starting from a frame whose column was cast already.
        df = DataFrame({"a": [1.0, 2.0, 3.0]})
        df["a"] = df["a"].astype(dtype)
        expected2 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        for start in (df, df.astype("int64")):
            tm.assert_frame_equal(start.astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        # GH#24704
        a1 = Series([0, np.nan, 4], name="a")
        a2 = Series([np.nan, 3, 5], name="a")
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        """The columns index compares equal before and after ``astype``.

        See GH#19920.
        """
        labels = UInt64Index([100, 200, 300], name="foo")
        frame = DataFrame(np.arange(15).reshape(5, 3), columns=labels)
        tm.assert_index_equal(frame.astype(dtype).columns, labels)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        # tests astype to object dtype
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith("M8"):
            assert result.iloc[0, 0] == Timestamp(1, unit=unit)
        else:
            assert result.iloc[0, 0] == Timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        dtype = f"{dtype}[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        # tests all units from datetime origination
        # GH#19223
        dtype = f"M8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        # preserver the timedelta conversion
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        # coerce to float
        # GH#19223
        dtype = f"m8[{unit}]"
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        dtype = f"M8[{unit}]"
        other = f"m8[{unit}]"

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to "
               fr"\[timedelta64\[{unit}\]\]")
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to "
               fr"\[datetime64\[{unit}\]\]")
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_astype_arg_for_errors(self):
        # GH#14878

        df = DataFrame([1, 2, 3])

        msg = ("Expected value of kwarg 'errors' to be one of "
               "['raise', 'ignore']. Supplied value is 'True'")
        with pytest.raises(ValueError, match=re.escape(msg)):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        # GH#25905
        df = DataFrame([
            {
                "a": "1",
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": "2.2",
                "b": "15.3",
                "c": "another_test"
            },
        ])
        expected = DataFrame([
            {
                "a": 1.0,
                "b": "16.5%",
                "c": "test"
            },
            {
                "a": 2.2,
                "b": "15.3",
                "c": "another_test"
            },
        ])
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = df.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        # astype
        expected = np.array(
            [
                [
                    Timestamp("2013-01-01 00:00:00"),
                    Timestamp("2013-01-02 00:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
                ],
                [
                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
                    NaT,
                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
                ],
            ],
            dtype=object,
        ).T
        expected = DataFrame(
            expected,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        result = timezone_frame.astype(object)
        tm.assert_frame_equal(result, expected)

        result = timezone_frame.astype("datetime64[ns]")
        expected = DataFrame({
            "A":
            date_range("20130101", periods=3),
            "B":
            (date_range("20130101", periods=3,
                        tz="US/Eastern").tz_convert("UTC").tz_localize(None)),
            "C": (date_range("20130101", periods=3,
                             tz="CET").tz_convert("UTC").tz_localize(None)),
        })
        expected.iloc[1, 1] = NaT
        expected.iloc[1, 2] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        # str formatting
        result = timezone_frame.astype(str)
        expected = DataFrame(
            [
                [
                    "2013-01-01",
                    "2013-01-01 00:00:00-05:00",
                    "2013-01-01 00:00:00+01:00",
                ],
                ["2013-01-02", "NaT", "NaT"],
                [
                    "2013-01-03",
                    "2013-01-03 00:00:00-05:00",
                    "2013-01-03 00:00:00+01:00",
                ],
            ],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

        with option_context("display.max_columns", 20):
            result = str(timezone_frame)
            assert (
                "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
            ) in result
            assert (
                "1 2013-01-02                       NaT                       NaT"
            ) in result
            assert (
                "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
            ) in result
Ejemplo n.º 25
0
    def replot_single(self, i):
        """Redraw the histogram for column ``i`` of ``self.joint_data``.

        Two bar series are drawn inside ``self.outboxes[i]``: the first from
        all rows, the second from the rows where every column of
        ``self.joint_filters`` is true.  Columns whose scope dtype is
        ``'cat'`` or ``'bool'`` get one bar per label; any other column is
        histogrammed with 20 bins, and the second series reuses the bin edges
        of the first.
        """
        col = self.joint_data.columns[i]
        outbox = self.outboxes[i]
        outbox.clear_output(wait=True)
        with outbox:
            plt.clf()
            fig = plt.gcf()
            fig.set_figwidth(3)
            fig.set_figheight(1)

            kind = self.scope.get_dtype(col)
            if kind in ('cat', 'bool'):
                if kind == 'cat':
                    bar_labels = self.scope.get_cat_values(col)
                else:
                    bar_labels = [False, True]

                label_dtype = CategoricalDtype(categories=bar_labels,
                                               ordered=False)
                # One unit-wide bin per category code.
                code_edges = numpy.arange(0, len(bar_labels) + 1)
                bar_x = numpy.arange(0, len(bar_labels))

                all_codes = self.joint_data[col].astype(label_dtype).cat.codes
                bar_heights, _ = numpy.histogram(all_codes, bins=code_edges)
                plt.bar(bar_x, bar_heights, 0.8, align='edge')

                passing = self.joint_filters.all(axis=1)
                kept_codes = self.joint_data.loc[passing, col].astype(
                    label_dtype).cat.codes
                bar_heights, _ = numpy.histogram(kept_codes, bins=code_edges)
                plt.bar(bar_x, bar_heights, 0.8, align='edge')

                # Bars are 0.8 wide and edge-aligned, so +0.4 centres the tick.
                plt.xticks(bar_x + 0.4, [str(label) for label in bar_labels])
                plt.show()
            else:
                # Both branches currently give 20; kept as written.
                bins = 20 if col not in self.data.strategy_names else 20

                bar_heights, bar_x = numpy.histogram(self.joint_data[col],
                                                     bins=bins)
                plt.bar(bar_x[:-1],
                        bar_heights,
                        bar_x[1:] - bar_x[:-1],
                        align='edge')

                passing = self.joint_filters.all(axis=1)
                bar_heights, bar_x = numpy.histogram(
                    self.joint_data.loc[passing, col], bins=bar_x)
                plt.bar(bar_x[:-1],
                        bar_heights,
                        bar_x[1:] - bar_x[:-1],
                        align='edge')
                plt.show()
Ejemplo n.º 26
0
class TestAstype:
    def test_astype_float(self, float_frame):
        """Integer casts of a float frame match casting its raw values."""
        for target in (int, np.int32):
            result = float_frame.astype(target)
            values = float_frame.values.astype(target)
            expected = DataFrame(
                values,
                index=float_frame.index,
                columns=float_frame.columns,
            )
            tm.assert_frame_equal(result, expected)

        # Add a column of numeric strings and cast the whole frame again.
        float_frame["foo"] = "5"
        result = float_frame.astype(int)
        values = float_frame.values.astype(int)
        expected = DataFrame(
            values,
            index=float_frame.index,
            columns=float_frame.columns,
        )
        tm.assert_frame_equal(result, expected)

    def test_astype_mixed_float(self, mixed_float_frame):
        """Cast the "A"/"B" columns of a mixed-float frame to narrower floats."""
        # mixed casting
        for target in ("float32", "float16"):
            subset = mixed_float_frame.reindex(columns=["A", "B"])
            _check_cast(subset.astype(target), target)

    def test_astype_mixed_type(self, mixed_type_frame):
        """Cast the numeric part of a mixed-type frame to several dtypes."""
        # mixed casting
        numeric = mixed_type_frame._get_numeric_data().copy()
        numeric["little_float"] = np.array(12345.0, dtype="float16")
        numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

        for target in ("float64", "int64"):
            _check_cast(numeric.astype(target), target)

        # float16 is only applied to the single float16-sized column
        little_only = numeric.reindex(columns=["little_float"])
        _check_cast(little_only.astype("float16"), "float16")

        for target in ("float32", "int32"):
            _check_cast(numeric.astype(target), target)

        # to object
        _check_cast(numeric.astype("O"), "object")

    @td.skip_array_manager_not_yet_implemented
    def test_astype_with_exclude_string(self, float_frame):
        """With errors="ignore" a string column is left as it was."""
        for target in (int, np.int32):
            with_string = float_frame.copy()
            expected = float_frame.astype(target)
            with_string["string"] = "foo"
            result = with_string.astype(target, errors="ignore")

            # The string column is expected to come through unchanged.
            expected["string"] = "foo"
            tm.assert_frame_equal(result, expected)

    def test_astype_with_view_float(self, float_frame):
        """Smoke test: astype(..., copy=False) runs; nothing is asserted."""

        # this is the only real reason to do it this way
        rounded = np.round(float_frame).astype(np.int32)
        rounded.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        as_float64 = float_frame.astype(np.float64)
        as_float64.astype(np.int64, copy=False)

    def test_astype_with_view_mixed_float(self, mixed_float_frame):
        """Smoke test: casting the "A"/"B"/"C" subset runs; nothing is asserted."""

        subset = mixed_float_frame.reindex(columns=["A", "B", "C"])

        for target in (np.int64, np.float32):
            subset.astype(target)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype raises ValueError."""
        # see GH#14265
        non_finite = DataFrame([val])
        expected_msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"

        with pytest.raises(ValueError, match=expected_msg):
            non_finite.astype(dtype)

    def test_astype_str(self):
        """Cast datetime, tz-aware datetime, timedelta, int and float columns to str."""
        # see GH#9757
        naive = Series(date_range("2010-01-04", periods=5))
        aware = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        deltas = Series([Timedelta(x, unit="d") for x in range(5)])
        ints = Series(range(5))
        floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        frame = DataFrame({
            "a": naive,
            "b": aware,
            "c": deltas,
            "d": ints,
            "e": floats,
        })

        # Datetime-like
        result = frame.astype(str)

        # Build the expected strings element by element from each column.
        expected = DataFrame({
            "a": [str(Timestamp(x)._date_repr) for x in naive._values],
            "b": [str(Timestamp(x)) for x in aware._values],
            "c": [Timedelta(x)._repr_base() for x in deltas._values],
            "d": [str(x) for x in ints._values],
            "e": [str(x) for x in floats._values],
        })

        tm.assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """Floats cast to str render NaN as "nan" and keep full repr precision."""
        # see GH#11302
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
        # AttributeError there.
        result = DataFrame([np.nan]).astype(str)
        expected = DataFrame(["nan"])

        tm.assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        # The literal above has more digits than a float64 holds; the string
        # form is expected to be the value below.
        val = "1.1234567890123457"
        expected = DataFrame([val])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """astype accepts a dict or Series that maps column labels to dtypes."""
        # GH7271 & GH16717
        dates = Series(date_range("2010-01-04", periods=5))
        ints = Series(range(5))
        floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        strings = Series(["1.0", "2", "3.14", "4", "5.4"])
        frame = DataFrame({"a": dates, "b": ints, "c": floats, "d": strings})
        # Snapshot used after every cast to check the input was not modified.
        untouched = frame.copy(deep=True)

        # change type of a subset of columns
        mapping = dtype_class({"b": "str", "d": "float32"})
        expected = DataFrame({
            "a": dates,
            "b": Series(["0", "1", "2", "3", "4"]),
            "c": floats,
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        })
        tm.assert_frame_equal(frame.astype(mapping), expected)
        tm.assert_frame_equal(frame, untouched)

        mapping = dtype_class({
            "b": np.float32,
            "c": "float32",
            "d": np.float64,
        })
        expected = DataFrame({
            "a": dates,
            "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        })
        tm.assert_frame_equal(frame.astype(mapping), expected)
        tm.assert_frame_equal(frame, untouched)

        # change all columns
        mapping = dtype_class({"a": str, "b": str, "c": str, "d": str})
        tm.assert_frame_equal(frame.astype(mapping), frame.astype(str))
        tm.assert_frame_equal(frame, untouched)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        bad_mappings = [
            dtype_class({"b": str, 2: str}),
            dtype_class({"e": str}),
        ]
        msg = "Only a column name can be used for the key in a dtype mappings argument"
        for bad in bad_mappings:
            with pytest.raises(KeyError, match=msg):
                frame.astype(bad)
        tm.assert_frame_equal(frame, untouched)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        mapping = dtype_class({col: frame[col].dtype for col in frame.columns})
        tm.assert_frame_equal(frame, frame.astype(mapping))
        tm.assert_frame_equal(frame, untouched)

        # GH#16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        if dtype_class is dict:
            mapping = dtype_class({})
        else:
            mapping = dtype_class({}, dtype=object)
        tm.assert_frame_equal(frame, frame.astype(mapping))
        tm.assert_frame_equal(frame, untouched)

    def test_astype_duplicate_col(self):
        """astype handles a frame that has two columns both named "a"."""
        first_a = Series([1, 2, 3, 4, 5], name="a")
        middle_b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
        second_a = Series([0, 1, 2, 3, 4], name="a")
        frame = concat([first_a, middle_b, second_a], axis=1)

        first_a_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
        middle_b_str = Series(
            ["0.1", "0.2", "0.4", "0.6", "0.8"],
            dtype=str,
            name="b",
        )
        second_a_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")

        # Cast every column.
        all_str = concat([first_a_str, middle_b_str, second_a_str], axis=1)
        tm.assert_frame_equal(frame.astype(str), all_str)

        # Cast by label: both "a" columns change, "b" stays float.
        only_a_str = concat([first_a_str, middle_b, second_a_str], axis=1)
        tm.assert_frame_equal(frame.astype({"a": "str"}), only_a_str)

    @pytest.mark.parametrize(
        "dtype",
        [
            "category",
            CategoricalDtype(),
            CategoricalDtype(ordered=True),
            CategoricalDtype(ordered=False),
            CategoricalDtype(categories=list("abcdef")),
            CategoricalDtype(categories=list("edba"), ordered=False),
            CategoricalDtype(categories=list("edcb"), ordered=True),
        ],
        ids=repr,
    )
    def test_astype_categorical(self, dtype):
        """A frame-wide categorical cast equals building each column as Categorical."""
        # GH#18099
        data = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
        result = DataFrame(data).astype(dtype)
        expected = DataFrame({
            label: Categorical(values, dtype=dtype)
            for label, values in data.items()
        })
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class rather than an instance raises TypeError."""
        frame = DataFrame({"A": ["a", "a", "b", "c"]})
        expected_msg = f"Expected an instance of {cls.__name__}"

        # Frame-level cast through a column mapping.
        with pytest.raises(TypeError, match=expected_msg):
            frame.astype({"A": cls})

        # Series-level cast.
        column = frame["A"]
        with pytest.raises(TypeError, match=expected_msg):
            column.astype(cls)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes(self, dtype):
        """Casts between float64, int64 and the given extension integer dtype."""
        # GH#22578
        rows = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
        frame = DataFrame(rows, columns=["a", "b"])

        expected_all = DataFrame({
            "a": pd.array([1, 3, 5], dtype=dtype),
            "b": pd.array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(frame.astype(dtype), expected_all)
        tm.assert_frame_equal(frame.astype("int64").astype(dtype), expected_all)
        tm.assert_frame_equal(frame.astype(dtype).astype("float64"), frame)

        # Start from a fresh frame and convert only column "b".
        frame = DataFrame(rows, columns=["a", "b"])
        frame["b"] = frame["b"].astype(dtype)
        expected_mixed = DataFrame({
            "a": [1.0, 3.0, 5.0],
            "b": pd.array([2, 4, 6], dtype=dtype),
        })
        tm.assert_frame_equal(frame, expected_mixed)

        # The mixed frame still casts fully to the extension dtype.
        tm.assert_frame_equal(frame.astype(dtype), expected_all)
        tm.assert_frame_equal(frame.astype("int64").astype(dtype), expected_all)

    @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
    def test_astype_extension_dtypes_1d(self, dtype):
        """Single-column variant of the extension integer dtype casts."""
        # GH#22578
        frame = DataFrame({"a": [1.0, 2.0, 3.0]})

        expected_cast = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(frame.astype(dtype), expected_cast)
        tm.assert_frame_equal(frame.astype("int64").astype(dtype), expected_cast)

        # Convert the column by assignment on a fresh frame.
        frame = DataFrame({"a": [1.0, 2.0, 3.0]})
        frame["a"] = frame["a"].astype(dtype)
        expected_assigned = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(frame, expected_assigned)

        tm.assert_frame_equal(frame.astype(dtype), expected_cast)
        tm.assert_frame_equal(frame.astype("int64").astype(dtype), expected_cast)

    @pytest.mark.parametrize("dtype", ["category", "Int64"])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """A frame-wide cast with duplicate labels matches casting each Series."""
        # GH#24704
        left = Series([0, np.nan, 4], name="a")
        right = Series([np.nan, 3, 5], name="a")
        frame = concat([left, right], axis=1)

        result = frame.astype(dtype)
        per_column = [left.astype(dtype), right.astype(dtype)]
        expected = concat(per_column, axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype", [{
        100: "float64",
        200: "uint64"
    }, "category", "float64"])
    def test_astype_column_metadata(self, dtype):
        """The columns index (dtype and name) is unchanged by astype."""
        # GH#19920
        columns = UInt64Index([100, 200, 300], name="foo")
        values = np.arange(15).reshape(5, 3)
        casted = DataFrame(values, columns=columns).astype(dtype)
        tm.assert_index_equal(casted.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_from_datetimelike_to_object(self, dtype, unit):
        """Datetime/timedelta data of any unit casts to object-dtype scalars."""
        # tests astype to object dtype
        # GH#19223 / GH#12425
        full_dtype = f"{dtype}[{unit}]"
        values = np.array([[1, 2, 3]], dtype=full_dtype)
        result = DataFrame(values).astype(object)
        assert (result.dtypes == object).all()

        # "M8" is datetime64 data, anything else here is timedelta64 data.
        scalar_type = Timestamp if full_dtype.startswith("M8") else Timedelta
        assert result.iloc[0, 0] == scalar_type(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric data cast to a datetime-like unit matches the NumPy cast."""
        # tests all units from numeric origination
        # GH#19223 / GH#12425
        target = f"{dtype}[{unit}]"
        numbers = np.array([[1, 2, 3]], dtype=arr_dtype)
        result = DataFrame(numbers).astype(target)
        expected = DataFrame(numbers.astype(target))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_datetime_unit(self, unit):
        """datetime64 data cast to its own unit matches the NumPy cast."""
        # tests all units from datetime origination
        # GH#19223
        target = f"M8[{unit}]"
        stamps = np.array([[1, 2, 3]], dtype=target)
        result = DataFrame(stamps).astype(target)
        expected = DataFrame(stamps.astype(target))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns"])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Nanosecond timedelta64 data cast to its own dtype matches the NumPy cast."""
        # preserve the timedelta conversion
        # GH#19223
        target = f"m8[{unit}]"
        deltas = np.array([[1, 2, 3]], dtype=target)
        result = DataFrame(deltas).astype(target)
        expected = DataFrame(deltas.astype(target))

        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
    def test_astype_to_timedelta_unit(self, unit):
        """Non-nanosecond timedelta casts are expected to produce float values."""
        # coerce to float
        # GH#19223
        target = f"m8[{unit}]"
        frame = DataFrame(np.array([[1, 2, 3]], dtype=target))
        result = frame.astype(target)
        as_float = frame.values.astype(target).astype(float)
        expected = DataFrame(as_float)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting datetime64 to timedelta64, or the reverse, raises TypeError."""
        # trying to astype a m to a M, or vice-versa
        # GH#19224
        datetime_dtype = f"M8[{unit}]"
        timedelta_dtype = f"m8[{unit}]"

        # (source dtype, target dtype, expected error message)
        cases = [
            (
                datetime_dtype,
                timedelta_dtype,
                fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]",
            ),
            (
                timedelta_dtype,
                datetime_dtype,
                fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]",
            ),
        ]
        for source, target, msg in cases:
            frame = DataFrame(np.array([[1, 2, 3]], dtype=source))
            with pytest.raises(TypeError, match=msg):
                frame.astype(target)

    @td.skip_array_manager_not_yet_implemented
    def test_astype_arg_for_errors(self):
        """An invalid ``errors`` value raises ValueError; "ignore" is accepted."""
        # GH#14878

        frame = DataFrame([1, 2, 3])

        expected_msg = ("Expected value of kwarg 'errors' to be one of "
                        "['raise', 'ignore']. Supplied value is 'True'")
        # The message contains regex metacharacters, so escape it for match=.
        pattern = re.escape(expected_msg)
        with pytest.raises(ValueError, match=pattern):
            frame.astype(np.float64, errors=True)

        # A valid value must not raise.
        frame.astype(np.int8, errors="ignore")

    def test_astype_arg_for_errors_dictlist(self):
        """With errors="ignore", a column that cannot be cast stays unchanged."""
        # GH#25905
        raw_records = [
            {"a": "1", "b": "16.5%", "c": "test"},
            {"a": "2.2", "b": "15.3", "c": "another_test"},
        ]
        # Only "a" is expected to become float; "b" holds "16.5%" and stays str.
        expected_records = [
            {"a": 1.0, "b": "16.5%", "c": "test"},
            {"a": 2.2, "b": "15.3", "c": "another_test"},
        ]
        frame = DataFrame(raw_records)
        expected = DataFrame(expected_records)
        type_dict = {"a": "float64", "b": "float64", "c": "object"}

        result = frame.astype(dtype=type_dict, errors="ignore")

        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz(self, timezone_frame):
        """Cast a frame of naive/tz-aware datetimes to object and to datetime64[ns]."""
        # astype
        naive_col = [
            Timestamp("2013-01-01 00:00:00"),
            Timestamp("2013-01-02 00:00:00"),
            Timestamp("2013-01-03 00:00:00"),
        ]
        eastern_col = [
            Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
            NaT,
            Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
        ]
        cet_col = [
            Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
            NaT,
            Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
        ]
        # Each list is one column, so transpose the row-stacked array.
        object_values = np.array(
            [naive_col, eastern_col, cet_col],
            dtype=object,
        ).T
        expected = DataFrame(
            object_values,
            index=timezone_frame.index,
            columns=timezone_frame.columns,
            dtype=object,
        )
        tm.assert_frame_equal(timezone_frame.astype(object), expected)

        with tm.assert_produces_warning(FutureWarning):
            # dt64tz->dt64 deprecated
            result = timezone_frame.astype("datetime64[ns]")

        eastern_as_utc = date_range("20130101", periods=3, tz="US/Eastern")
        eastern_as_utc = eastern_as_utc.tz_convert("UTC").tz_localize(None)
        cet_as_utc = date_range("20130101", periods=3, tz="CET")
        cet_as_utc = cet_as_utc.tz_convert("UTC").tz_localize(None)
        expected = DataFrame({
            "A": date_range("20130101", periods=3),
            "B": eastern_as_utc,
            "C": cet_as_utc,
        })
        # The middle row of the two tz-aware columns is missing.
        for col_pos in (1, 2):
            expected.iloc[1, col_pos] = NaT
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64tz_to_str(self, timezone_frame):
        """Check astype(str) values and the str() rendering of the timezone frame."""
        # str formatting
        first_row = [
            "2013-01-01",
            "2013-01-01 00:00:00-05:00",
            "2013-01-01 00:00:00+01:00",
        ]
        second_row = ["2013-01-02", "NaT", "NaT"]
        third_row = [
            "2013-01-03",
            "2013-01-03 00:00:00-05:00",
            "2013-01-03 00:00:00+01:00",
        ]
        expected = DataFrame(
            [first_row, second_row, third_row],
            columns=timezone_frame.columns,
        )
        tm.assert_frame_equal(timezone_frame.astype(str), expected)

        # Every data row must appear verbatim in the printed frame.
        display_rows = (
            "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00",
            "1 2013-01-02                       NaT                       NaT",
            "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00",
        )
        with option_context("display.max_columns", 20):
            rendered = str(timezone_frame)
            for row in display_rows:
                assert row in rendered

    def test_astype_empty_dtype_dict(self):
        """An empty dtype mapping returns an equal but distinct frame."""
        # issue mentioned further down in the following issue's thread
        # https://github.com/pandas-dev/pandas/issues/33113
        empty = DataFrame()
        casted = empty.astype({})
        tm.assert_frame_equal(casted, empty)
        # astype must hand back a new object, not the input itself
        assert casted is not empty

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) ignore keyword
    @pytest.mark.parametrize(
        "df",
        [
            DataFrame(Series(["x", "y", "z"], dtype="string")),
            DataFrame(Series(["x", "y", "z"], dtype="category")),
            DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
            DataFrame(Series(3 * [Interval(0, 1)])),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
        """A failing float cast raises, or returns the frame unchanged for "ignore"."""
        # https://github.com/pandas-dev/pandas/issues/35471
        if errors != "ignore":
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                df.astype(float, errors=errors)
            return

        # With errors="ignore" the result must equal the input frame.
        tm.assert_frame_equal(df.astype(float, errors=errors), df)

    def test_astype_tz_conversion(self):
        """Casting a tz-aware column to another timezone equals dt.tz_convert."""
        # GH 35973
        london = date_range(
            "2020-08-30",
            freq="d",
            periods=2,
            tz="Europe/London",
        )
        frame = DataFrame({"tz": london})
        result = frame.astype({"tz": "datetime64[ns, Europe/Berlin]"})

        # The source frame is converted in place (after the cast above) and
        # then serves as the expected value.
        frame["tz"] = frame["tz"].dt.tz_convert("Europe/Berlin")
        tm.assert_frame_equal(result, frame)

    @pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
    def test_astype_tz_object_conversion(self, tz):
        """Object-dtype timestamps cast back to the construction timezone."""
        # GH 35973
        london = date_range(
            "2020-08-30",
            freq="d",
            periods=2,
            tz="Europe/London",
        )
        expected = DataFrame({"tz": london})

        # convert expected to object dtype from other tz str (independently tested)
        in_other_tz = expected.astype({"tz": f"datetime64[ns, {tz}]"})
        as_object = in_other_tz.astype({"tz": "object"})

        # do real test: object dtype to a specified tz, different from construction tz.
        result = as_object.astype({"tz": "datetime64[ns, Europe/London]"})
        tm.assert_frame_equal(result, expected)

    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture,
                                   request):
        """Casting datetimes to "string" matches DatetimeArray.astype and keeps NA."""
        tz = tz_naive_fixture
        if tz is None:
            # The tz-naive case is marked as an expected failure.
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=
                    "GH#36153 uses ndarray formatting instead of DTA formatting"
                ))

        dta = date_range("2016-01-01", periods=3, tz=tz)._data
        dta[0] = NaT

        obj = frame_or_series(dta)
        result = obj.astype("string")

        # Check that Series/DataFrame.astype matches DatetimeArray.astype
        expected = frame_or_series(dta.astype("string"))
        tm.assert_equal(result, expected)

        # The missing first value must come back as pd.NA.
        first = result.iloc[0]
        if frame_or_series is DataFrame:
            first = first.iloc[0]
        assert first is pd.NA

        # For non-NA values, we should match what we get for non-EA str
        plain_str = obj.astype(str)
        assert np.all(plain_str.iloc[1:] == result.iloc[1:])

    def test_astype_bytes(self):
        """Casting three-character strings to bytes gives dtype "S3"."""
        # GH#39474
        words = DataFrame(["foo", "bar", "baz"])
        casted = words.astype(bytes)
        assert casted.dtypes[0] == np.dtype("S3")
Ejemplo n.º 27
0
def _build_feature_frame(df, tf_idf, column_dummies):
    """Turn one raw traffic frame into the feature frame handed to the scaler.

    The work is done on a copy, so the frame passed in by the caller is not
    modified.

    Args:
        df: DataFrame with at least the columns ``date_time``, ``holiday``,
            ``weather_description`` and ``weather_main``.
        tf_idf: Vectorizer whose ``transform`` is applied to
            ``weather_description``.
        column_dummies: Mapping of column name to the category values that
            column is cast to before one-hot encoding.

    Returns:
        A new DataFrame with the TF-IDF ``tag_*`` columns and the one-hot
        encoded weather/hour/weekday/day/month columns, without the raw
        text, calendar and ``date_time`` columns.
    """
    df = df.copy()
    df['date_time'] = pd.to_datetime(df['date_time'])
    # Holiday becomes a 0/1 flag: any value other than the string 'None' is 1.
    df['holiday'] = (df['holiday'] != 'None').astype(int)

    # Replace the free-text weather description by its TF-IDF columns.
    tags = pd.DataFrame(
        data=tf_idf.transform(df['weather_description']).toarray(),
        index=df.index,
    ).add_prefix('tag_')
    df = pd.concat([df, tags], axis=1)
    df = df.drop(columns='weather_description')

    # Calendar features derived from the timestamp.
    df['hour'] = df['date_time'].dt.hour
    df['weekday'] = df['date_time'].dt.day_name()
    df['day'] = df['date_time'].dt.day
    df['month'] = df['date_time'].dt.month_name()

    # Casting to a fixed category list makes get_dummies emit one column per
    # listed value, even for values absent from this frame.
    for col, values in column_dummies.items():
        df[col] = df[col].astype(CategoricalDtype(values))

    # (source column, dummy-column prefix); the order fixes the column order.
    encoded_columns = (
        ('weather_main', 'weather'),
        ('hour', 'hour'),
        ('weekday', 'weekday'),
        ('day', 'day'),
        ('month', 'month'),
    )
    for col, prefix in encoded_columns:
        df = df.join(pd.get_dummies(df[col], prefix=prefix))

    return df.drop(columns=[
        'weather_main', 'hour', 'weekday', 'day', 'month', 'date_time'
    ])


def predict(model: keras.Model, standard_scaler: CustomStandardScaler,
            tf_idf: TfidfVectorizer, column_dummies, df_past, df_future):
    """Predict traffic volume from a past and a future feature window.

    Both input frames go through the same feature engineering (see
    ``_build_feature_frame``) and are left unmodified, so the same frames can
    safely be passed to this function again.

    Args:
        model: Keras model; ``model.predict`` is called with the tuple
            ``(past, future)`` of scaled inputs.
        standard_scaler: Scaler whose ``transform`` is called with the list
            ``[past, future, traffic]`` and whose ``ss[2]`` is used to
            inverse-transform the prediction.
        tf_idf: Vectorizer applied to the ``weather_description`` column.
        column_dummies: Mapping of column name to the category values used
            for one-hot encoding.
        df_past: Raw past observations; must contain ``traffic_volume``.
        df_future: Raw future observations.

    Returns:
        The model output after ``standard_scaler.ss[2].inverse_transform``.
    """
    df_past = _build_feature_frame(df_past, tf_idf, column_dummies)
    df_future = _build_feature_frame(df_future, tf_idf, column_dummies)

    # Target history as a single-column 2-D array.
    traffic = df_past['traffic_volume'].values.reshape(-1, 1)

    df_past, df_future, traffic = standard_scaler.transform(
        [df_past, df_future, traffic])

    # Add a leading axis of length 1 to each input.
    # NOTE(review): presumably the batch axis of a single sample, and
    # transform() presumably returns arrays rather than DataFrames - confirm.
    df_future = df_future[np.newaxis, :]
    df_past = df_past[np.newaxis, :]

    y = model.predict((df_past, df_future))
    y = tf.squeeze(y)
    y = y.numpy()
    # NOTE(review): ss[2] is presumably the scaler fitted on traffic_volume
    # (third entry, matching the position of `traffic` above) - confirm.
    return standard_scaler.ss[2].inverse_transform(y)