def gen_int_data(ncols, nrows, rand_low, rand_high):
    """Return a dict of random integer columns, memoised in ``data_cache``.

    Parameters
    ----------
    ncols : int
        Number of columns to generate; keys are ``"col0"``, ``"col1"``, ...
    nrows : int
        Passed as ``size`` to ``random_state.randint`` for every column.
    rand_low, rand_high : int
        Bounds forwarded unchanged to ``random_state.randint``.

    Returns
    -------
    dict
        On a cache miss, the freshly built plain dict. On a cache hit, the
        object stored in ``data_cache`` (a ``weakdict`` built from the
        original dict).
    """
    key = ("int", ncols, nrows, rand_low, rand_high)
    if key not in data_cache:
        logging.info(
            "Generating int data {} rows and {} columns [{}-{}]".format(
                nrows, ncols, rand_low, rand_high
            )
        )
        columns = {}
        for col_idx in range(ncols):
            columns["col{}".format(col_idx)] = random_state.randint(
                rand_low, rand_high, size=(nrows)
            )
        # The cache keeps its own weakdict wrapper; the caller of the miss
        # path receives the plain dict that was just built.
        data_cache[key] = weakdict(columns)
        return columns
    return data_cache[key]
def gen_str_int_data(ncols, nrows, rand_low, rand_high):
    """Return integer columns plus a string ``"gb_col"`` column, memoised.

    The integer columns come from ``gen_int_data`` (copied so the extra key
    is not added to that function's dict); ``"gb_col"`` is a list of
    ``nrows`` strings of the form ``"str_<n>"`` where ``n`` is drawn from
    ``random_state.randint(rand_low, rand_high)``.

    Parameters
    ----------
    ncols : int
        Number of integer columns requested from ``gen_int_data``.
    nrows : int
        Number of rows per column and length of ``"gb_col"``.
    rand_low, rand_high : int
        Bounds forwarded unchanged to ``random_state.randint``.

    Returns
    -------
    dict
        On a cache miss, the freshly built plain dict. On a cache hit, the
        ``weakdict`` stored in ``data_cache`` under the ``"str_int"`` key.
    """
    key = ("str_int", ncols, nrows, rand_low, rand_high)
    if key not in data_cache:
        logging.info(
            "Generating str_int data {} rows and {} columns [{}-{}]".format(
                nrows, ncols, rand_low, rand_high
            )
        )
        columns = gen_int_data(ncols, nrows, rand_low, rand_high).copy()
        labels = []
        for _ in range(nrows):
            labels.append(
                "str_{}".format(random_state.randint(rand_low, rand_high))
            )
        columns["gb_col"] = labels
        data_cache[key] = weakdict(columns)
        return columns
    return data_cache[key]
def test___repr__():
    """Check that ``repr`` of a modin DataFrame matches pandas' output.

    Covers random integer frames of several shapes, an empty frame that has
    only column labels, and the CSV sample from Issue #1705.
    """
    # ___repr___ takes a different code path depending on whether the
    # number of rows is >60, and another depending on whether the number of
    # columns is >20, so every combination is exercised below.
    shapes = [
        # rows > 60, cols > 20 (column counts at and around 100)
        (1000, 100),
        (1000, 99),
        (1000, 101),
        (1000, 102),
        # rows <= 60, cols > 20
        (10, 100),
        # rows <= 60, cols <= 20
        (10, 10),
        # rows > 60, cols <= 20
        (100, 10),
    ]
    for shape in shapes:
        values = random_state.randint(RAND_LOW, RAND_HIGH, size=shape)
        expected_df = pandas.DataFrame(values)
        actual_df = pd.DataFrame(values)
        assert repr(expected_df) == repr(actual_df)

    # Empty
    column_names = ["col{}".format(i) for i in range(100)]
    expected_df = pandas.DataFrame(columns=column_names)
    actual_df = pd.DataFrame(columns=column_names)
    assert repr(expected_df) == repr(actual_df)

    # From Issue #1705
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf"
"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6
"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    expected_df = pandas.read_csv(io.StringIO(string_data))
    actual_df = pd.read_csv(io.StringIO(string_data))
    assert repr(expected_df) == repr(actual_df)
def inter_df_math_helper(modin_df, pandas_df, op):
    """Compare a binary DataFrame method between modin and pandas.

    For a range of right-hand operands (the frame itself, scalars, a frame
    of a different shape, lists, and Series along either axis) the method
    named ``op`` is called on ``pandas_df``. If pandas raises, the matching
    modin call must raise the same exception type; otherwise the modin
    result must satisfy ``df_equals`` against the pandas result.

    Parameters
    ----------
    modin_df : modin DataFrame
        Frame under test.
    pandas_df : pandas.DataFrame
        Reference frame; the operands are built on the assumption that it
        holds the same data as ``modin_df``.
    op : str
        Name of the method looked up with ``getattr`` on both frames.
    """

    def compare(pandas_call, modin_call):
        # Run the pandas thunk; mirror its outcome (exception type or value)
        # against the modin thunk.
        try:
            expected = pandas_call()
        except Exception as err:
            with pytest.raises(type(err)):
                modin_call()
        else:
            actual = modin_call()
            df_equals(actual, expected)

    def compare_op(pandas_operand, modin_operand, **kwargs):
        # Apply `op` on the two frames under test with matching operands.
        compare(
            lambda: getattr(pandas_df, op)(pandas_operand, **kwargs),
            lambda: getattr(modin_df, op)(modin_operand, **kwargs),
        )

    # Test dataframe to dataframe
    compare_op(pandas_df, modin_df)

    # Test dataframe to int
    compare_op(4, 4)

    # Test dataframe to float
    compare_op(4.0, 4.0)

    # Test transposed dataframes to float
    # (the transpose is taken inside the thunk so that it is covered by the
    # same try/except as the operation itself)
    compare(
        lambda: getattr(pandas_df.T, op)(4.0),
        lambda: getattr(modin_df.T, op)(4.0),
    )

    first_col = modin_df.columns[0]
    other_shape_data = {
        "{}_other".format(first_col): [0, 2],
        first_col: [0, 19],
        modin_df.columns[1]: [1, 1],
    }
    modin_df2 = pd.DataFrame(other_shape_data)
    pandas_df2 = pandas.DataFrame(other_shape_data)

    # Test dataframe to different dataframe shape
    compare_op(pandas_df2, modin_df2)

    # Test dataframe fill value
    compare_op(pandas_df2, modin_df2, fill_value=0)

    # Test dataframe to list
    list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[1]))
    compare_op(list_test, list_test, axis=1)

    # Test dataframe to series axis=0
    modin_operand = modin_df[modin_df.columns[0]]
    pandas_operand = pandas_df[pandas_df.columns[0]]
    compare_op(pandas_operand, modin_operand, axis=0)

    # Test dataframe to series axis=1
    modin_operand = modin_df.iloc[0]
    pandas_operand = pandas_df.iloc[0]
    compare_op(pandas_operand, modin_operand, axis=1)

    # Test dataframe to list axis=1
    row_as_list = list(pandas_df.iloc[0])
    compare_op(row_as_list, row_as_list, axis=1)

    # Test dataframe to list axis=0
    column_as_list = list(pandas_df[pandas_df.columns[0]])
    compare_op(column_as_list, column_as_list, axis=0)

    # Test dataframe to series missing values
    modin_operand = modin_df.iloc[0, :-2]
    pandas_operand = pandas_df.iloc[0, :-2]
    compare_op(pandas_operand, modin_operand, axis=1)

    # Test dataframe to series with different index
    modin_operand = modin_df[modin_df.columns[0]].reset_index(drop=True)
    pandas_operand = pandas_df[pandas_df.columns[0]].reset_index(drop=True)
    compare_op(pandas_operand, modin_operand, axis=0)

    # Level test
    level_index = pandas.MultiIndex.from_tuples(
        [(i // 4, i // 2, i) for i in modin_df.index]
    )
    multi_level_df = modin_df.copy()
    multi_level_df.index = level_index

    # Defaults to pandas
    with pytest.warns(UserWarning):
        # Operation against self for sanity check
        getattr(multi_level_df, op)(multi_level_df, axis=0, level=1)