Esempio n. 1
0
def gen_int_data(ncols, nrows, rand_low, rand_high):
    """Return a mapping of ``ncols`` random integer columns of length ``nrows``.

    Columns are named ``col0`` .. ``col{ncols-1}`` and each one is drawn with
    ``random_state.randint(rand_low, rand_high, size=nrows)``.
    # NOTE(review): if ``random_state`` is a numpy RandomState, ``rand_high``
    # is exclusive - confirm against its definition.

    Results are memoized in the module-level ``data_cache`` under a key built
    from the arguments. The first call for a key returns the freshly built
    plain ``dict``; later calls return the ``weakdict`` wrapper that was
    stored in the cache.
    """
    key = ("int", ncols, nrows, rand_low, rand_high)
    if key not in data_cache:
        logging.info(
            "Generating int data {} rows and {} columns [{}-{}]".format(nrows, ncols, rand_low, rand_high)
        )
        names = ["col{}".format(idx) for idx in range(ncols)]
        # One independent draw per column, in column order.
        columns = {
            name: random_state.randint(rand_low, rand_high, size=nrows)
            for name in names
        }
        data_cache[key] = weakdict(columns)
        return columns

    return data_cache[key]
Esempio n. 2
0
def gen_str_int_data(ncols, nrows, rand_low, rand_high):
    """Return integer columns plus a string ``gb_col`` column.

    The integer columns come from a shallow copy of
    ``gen_int_data(ncols, nrows, rand_low, rand_high)``. The extra ``gb_col``
    entry is a list of ``nrows`` strings of the form ``str_<n>``, where each
    ``<n>`` is a separate ``random_state.randint(rand_low, rand_high)`` draw.

    Results are memoized in the module-level ``data_cache``. The first call
    for a key returns the plain ``dict`` that was built; later calls return
    the ``weakdict`` wrapper stored in the cache.
    """
    key = ("str_int", ncols, nrows, rand_low, rand_high)
    if key in data_cache:
        return data_cache[key]

    message = "Generating str_int data {} rows and {} columns [{}-{}]".format(
        nrows, ncols, rand_low, rand_high
    )
    logging.info(message)

    # Copy so that adding "gb_col" does not alter the cached int-only mapping.
    columns = gen_int_data(ncols, nrows, rand_low, rand_high).copy()

    # Scalar draws are taken one at a time, lazily, in row order.
    draws = (random_state.randint(rand_low, rand_high) for _ in range(nrows))
    columns["gb_col"] = ["str_{}".format(value) for value in draws]

    data_cache[key] = weakdict(columns)
    return columns
Esempio n. 3
0
def test___repr__():
    """Check that ``repr`` of a Modin DataFrame matches pandas.

    Covers random integer frames of several shapes, an empty frame that only
    has column labels, and a small CSV sample taken from issue #1705.
    """
    # __repr__ takes a different code path depending on whether the number of
    # rows is > 60, and another depending on whether the number of columns is
    # > 20. The shapes below exercise all four combinations.
    shapes = [
        # rows > 60, cols > 20 (a few widths around 100 columns)
        (1000, 100),
        (1000, 99),
        (1000, 101),
        (1000, 102),
        # rows <= 60, cols > 20
        (10, 100),
        # rows <= 60, cols <= 20
        (10, 10),
        # rows > 60, cols <= 20
        (100, 10),
    ]
    for shape in shapes:
        values = random_state.randint(RAND_LOW, RAND_HIGH, size=shape)
        expected_df = pandas.DataFrame(values)
        actual_df = pd.DataFrame(values)
        assert repr(expected_df) == repr(actual_df)

    # Empty frame: column labels only, no rows.
    column_names = ["col{}".format(idx) for idx in range(100)]
    expected_df = pandas.DataFrame(columns=column_names)
    actual_df = pd.DataFrame(columns=column_names)
    assert repr(expected_df) == repr(actual_df)

    # From Issue #1705
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf"
"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6
"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    # Each reader gets its own buffer so neither sees a consumed stream.
    expected_df = pandas.read_csv(io.StringIO(string_data))
    actual_df = pd.read_csv(io.StringIO(string_data))
    assert repr(expected_df) == repr(actual_df)
Esempio n. 4
0
def inter_df_math_helper(modin_df, pandas_df, op):
    """Check that binary method ``op`` behaves the same on Modin and pandas.

    ``op`` is the name of a DataFrame method (looked up with ``getattr``).
    It is applied to ``pandas_df`` and ``modin_df`` against a range of
    right-hand operands: the frame itself, scalars, a differently shaped
    frame, lists and Series along both axes. For every case pandas is the
    reference: if pandas raises, Modin must raise the same exception type;
    otherwise the two results are compared with ``df_equals``.

    Finally the operation is run with ``level=1`` on a copy of ``modin_df``
    that carries a three-level MultiIndex, and a ``UserWarning`` is expected.
    """

    def check(run_pandas, run_modin):
        # Both arguments are zero-argument callables so that every piece of
        # the expression (attribute lookup, ``.T``, the call itself) is
        # evaluated inside the guarded region.
        try:
            expected = run_pandas()
        except Exception as err:
            with pytest.raises(type(err)):
                run_modin()
        else:
            df_equals(run_modin(), expected)

    # DataFrame against itself
    check(
        lambda: getattr(pandas_df, op)(pandas_df),
        lambda: getattr(modin_df, op)(modin_df),
    )

    # DataFrame against an int scalar
    check(
        lambda: getattr(pandas_df, op)(4),
        lambda: getattr(modin_df, op)(4),
    )

    # DataFrame against a float scalar
    check(
        lambda: getattr(pandas_df, op)(4.0),
        lambda: getattr(modin_df, op)(4.0),
    )

    # Transposed DataFrame against a float scalar
    check(
        lambda: getattr(pandas_df.T, op)(4.0),
        lambda: getattr(modin_df.T, op)(4.0),
    )

    # A two-row frame sharing the first two column labels plus one new label.
    first_label = modin_df.columns[0]
    second_label = modin_df.columns[1]
    other_data = {
        "{}_other".format(first_label): [0, 2],
        first_label: [0, 19],
        second_label: [1, 1],
    }
    modin_other = pd.DataFrame(other_data)
    pandas_other = pandas.DataFrame(other_data)

    # DataFrame against a DataFrame of a different shape
    check(
        lambda: getattr(pandas_df, op)(pandas_other),
        lambda: getattr(modin_df, op)(modin_other),
    )

    # Same operands, with fill_value
    check(
        lambda: getattr(pandas_df, op)(pandas_other, fill_value=0),
        lambda: getattr(modin_df, op)(modin_other, fill_value=0),
    )

    # DataFrame against random integers, one per column, axis=1
    random_row = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[1]))
    check(
        lambda: getattr(pandas_df, op)(random_row, axis=1),
        lambda: getattr(modin_df, op)(random_row, axis=1),
    )

    # DataFrame against its first column as a Series, axis=0
    modin_column = modin_df[modin_df.columns[0]]
    pandas_column = pandas_df[pandas_df.columns[0]]
    check(
        lambda: getattr(pandas_df, op)(pandas_column, axis=0),
        lambda: getattr(modin_df, op)(modin_column, axis=0),
    )

    # DataFrame against its first row as a Series, axis=1
    modin_row = modin_df.iloc[0]
    pandas_row = pandas_df.iloc[0]
    check(
        lambda: getattr(pandas_df, op)(pandas_row, axis=1),
        lambda: getattr(modin_df, op)(modin_row, axis=1),
    )

    # DataFrame against the first row as a plain list, axis=1
    # (the same list object is handed to both libraries)
    row_values = list(pandas_df.iloc[0])
    check(
        lambda: getattr(pandas_df, op)(row_values, axis=1),
        lambda: getattr(modin_df, op)(row_values, axis=1),
    )

    # DataFrame against the first column as a plain list, axis=0
    column_values = list(pandas_df[pandas_df.columns[0]])
    check(
        lambda: getattr(pandas_df, op)(column_values, axis=0),
        lambda: getattr(modin_df, op)(column_values, axis=0),
    )

    # DataFrame against a row Series missing its last two labels
    modin_short_row = modin_df.iloc[0, :-2]
    pandas_short_row = pandas_df.iloc[0, :-2]
    check(
        lambda: getattr(pandas_df, op)(pandas_short_row, axis=1),
        lambda: getattr(modin_df, op)(modin_short_row, axis=1),
    )

    # DataFrame against a column Series whose index has been reset
    modin_reindexed = modin_df[modin_df.columns[0]].reset_index(drop=True)
    pandas_reindexed = pandas_df[pandas_df.columns[0]].reset_index(drop=True)
    check(
        lambda: getattr(pandas_df, op)(pandas_reindexed, axis=0),
        lambda: getattr(modin_df, op)(modin_reindexed, axis=0),
    )

    # Level test: give a copy of the Modin frame a three-level MultiIndex
    # derived from its current index values.
    multi_index = pandas.MultiIndex.from_tuples(
        [(label // 4, label // 2, label) for label in modin_df.index]
    )
    modin_multi = modin_df.copy()
    modin_multi.index = multi_index
    # Per the original note this path defaults to pandas, hence the warning.
    with pytest.warns(UserWarning):
        # Operation against self for sanity check
        getattr(modin_multi, op)(modin_multi, axis=0, level=1)