def run_benchmark(benchmark, nrows, limit=None):
    """Run one named benchmark on Pandas, pandaSQL and Dask and print timings.

    Args:
        benchmark: name of a module-level benchmark function, looked up in
            globals(); called as func(engine, authors, books, n=..., top_authors=...).
        nrows: number of rows to read from books.csv.
        limit: optional row limit forwarded to the benchmark as n=limit.

    Side effects: reads authors.csv / books.csv (and top_authors.csv for the
    triple_join benchmark) from the working directory, prints per-engine
    read/run timings, and finally prints the collected stats as JSON.
    """
    func = globals()[benchmark]
    # Only the triple_join benchmark needs the extra top_authors table.
    top_authors_needed = benchmark == 'triple_join'
    print(f"Reading {nrows} CSV rows, running {benchmark} with limit={limit}")
    stats = {"pandas": {}, "pandaSQL": {}, "dask": {},
             "nrows": nrows, "benchmark": benchmark, "limit": limit}

    # --- Pandas: read phase ---
    start = time.time()
    authors_df = pandas.read_csv('authors.csv')
    books_df = pandas.read_csv('books.csv', nrows=nrows)
    if top_authors_needed:
        top_authors_df = pandas.read_csv('top_authors.csv')
    else:
        top_authors_df = None
        top_authors = None  # also passed through to the pandaSQL/Dask runs below
    time_taken = time.time() - start
    stats['pandas']['read_time'] = time_taken
    print("[Pandas] Time taken to read: {:0.3f} seconds".format(time_taken))

    # --- Pandas: run phase ---
    start = time.time()
    func(pandas, authors_df, books_df, n=limit, top_authors=top_authors_df)
    time_taken = time.time() - start
    stats['pandas']['run_time'] = time_taken
    print("[Pandas] Time taken to run: {:0.3f} seconds".format(time_taken))

    # --- pandaSQL: "read" phase wraps the frames already read above ---
    start = time.time()
    # authors = pandasql.read_csv('authors.csv')
    # books = pandasql.read_csv('books.csv')
    authors = pandasql.DataFrame(authors_df)
    books = pandasql.DataFrame(books_df)
    if top_authors_needed:
        top_authors = pandasql.DataFrame(top_authors_df)
    time_taken = time.time() - start
    stats['pandaSQL']['read_time'] = time_taken
    print("[PandaSQL] Time taken to read: {:0.3f} seconds".format(time_taken))

    # --- pandaSQL: run phase ---
    start = time.time()
    func(pandasql, authors, books, n=limit, top_authors=top_authors)
    time_taken = time.time() - start
    stats['pandaSQL']['run_time'] = time_taken
    print("[PandaSQL] Time taken to run: {:0.3f} seconds".format(time_taken))

    # --- Dask: "read" phase converts to single-partition dask frames ---
    start = time.time()
    authors = dd.from_pandas(authors_df, npartitions=1)
    books = dd.from_pandas(books_df, npartitions=1)
    if top_authors_needed:
        top_authors = dd.from_pandas(top_authors_df, npartitions=1)
    time_taken = time.time() - start
    stats['dask']['read_time'] = time_taken
    print("[Dask] Time taken to read: {:0.3f} seconds".format(time_taken))

    # --- Dask: run phase ---
    start = time.time()
    func(dd, authors, books, n=limit,
         top_authors=top_authors)
    time_taken = time.time() - start
    stats['dask']['run_time'] = time_taken
    print("[Dask] Time taken to run: {:0.3f} seconds".format(time_taken))

    print(json.dumps(stats, indent=4))
def test_complex_read_query(self):
    """A merge -> groupby-sum -> sort -> head pipeline matches pandas,
    and computing the head caches results on every upstream node.
    """
    base_df_1 = pd.DataFrame([{
        'a': str(i), 'b': str(j), 'c': 100 * i, 'd': -j
    } for i in range(3) for j in range(3)])
    base_df_2 = pd.DataFrame([{
        'a': str(i), 'b': str(j), 'e': 50 * i, 'f': j
    } for i in range(3) for j in range(3)])
    df_1 = ps.DataFrame(base_df_1)
    df_2 = ps.DataFrame(base_df_2)
    key = ['a', 'b']

    # Reference pipeline, run eagerly on pandas.
    base_merged = base_df_1.merge(base_df_2, on=key)
    base_agg = base_merged.groupby(key, as_index=False)[['c', 'f']].sum()
    base_ordered = base_agg.sort_values(by=key, ascending=False)
    base_limit = base_ordered.head(3)

    # Same pipeline on pandaSQL DataFrames (lazy until computed).
    # NOTE(review): the ps groupby is compared against as_index=False —
    # presumably its default; confirm against ps.DataFrame.groupby.
    merged = df_1.merge(df_2, on=key)
    agg = merged.groupby(key)[['c', 'f']].sum()
    ordered = agg.sort_values(by=key, ascending=False)
    limit = ordered.head(3)

    # This should trigger computation
    self.assertEqual(str(limit), str(base_limit))

    # All dependencies should also have cached results
    pd.testing.assert_frame_equal(merged.result, base_merged)
    pd.testing.assert_frame_equal(agg.result, base_agg)
    pd.testing.assert_frame_equal(ordered.result, base_ordered)
    pd.testing.assert_frame_equal(limit.result, base_limit)
def test_union(self):
    """Concatenating three ps.DataFrames matches pd.concat on the bases."""
    ranges = [range(8), range(4, 12), range(8, 16)]
    pandas_frames = [pd.DataFrame([{'n': i, 's': str(i)} for i in r])
                     for r in ranges]
    ps_frames = [ps.DataFrame(f) for f in pandas_frames]
    union = ps.concat(ps_frames)
    expected = pd.concat(pandas_frames)
    assertDataFrameEqualsPandas(union, expected)
def test_topological_sort(self):
    """A join's two inputs must precede the join in topological order."""
    left = ps.DataFrame(pd.DataFrame(
        [{'n': i, 's1': str(i*2)} for i in range(10)]))
    right = ps.DataFrame(pd.DataFrame(
        [{'n': i, 's2': str(i*2)} for i in range(10)]))
    merged = left.merge(right, on='n')
    ordered = _topological_sort(_get_dependency_graph(merged))
    # Sources first, join last.
    self.assertEqual(ordered[0].name, left.name)
    self.assertEqual(ordered[1].name, right.name)
    self.assertEqual(ordered[2].name, merged.name)
def test_run_with_missing_dependencies_sqlite(self):
    """Under ALWAYS offloading, computation requires offloaded source data."""
    ps.offloading_strategy('ALWAYS')
    base_df = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    expected = base_df[base_df['n'] >= 5]

    # Offloaded source: the selection can run on SQLite.
    offloaded = ps.DataFrame(base_df, offload=True)
    assertDataFrameEqualsPandas(offloaded[offloaded['n'] >= 5], expected)

    # Non-offloaded source: SQLite has no data, so compute must fail.
    local = ps.DataFrame(base_df, offload=False)
    selection = local[local['n'] >= 5]
    self.assertRaises(RuntimeError, lambda: selection.compute())
def test_result_out_of_memory(self):
    """A result larger than the memory threshold raises MemoryError,
    while a small result (head()) is still brought back successfully.
    """
    ps.offloading_strategy('ALWAYS')
    size = 10**4
    base_df = pd.DataFrame([{
        'n': i, 's': str(i * 2)
    } for i in range(size)])
    base_selection = base_df[base_df['n'] >= 5]
    base_limit = base_selection.head()
    df = ps.DataFrame(base_df)
    # Shrink the global safety factor so only ~memory_thresh bytes may be
    # transferred back from the database.
    memory_thresh = 10**4
    new_factor = memory_thresh / psutil.virtual_memory().available
    old_factor = ps.memory_utils.SAFETY_FACTOR
    ps.memory_utils.SAFETY_FACTOR = new_factor
    # Should fail since the result is too big to be brought back
    selection = df[df['n'] >= 5]
    self.assertRaises(MemoryError, lambda: selection.compute())
    # Should run since the result is small enough to be brought back
    limit = selection.head()
    assertDataFrameEqualsPandas(limit, base_limit)
    # Restore the global so later tests are unaffected.
    ps.memory_utils.SAFETY_FACTOR = old_factor
def test_groupby(self):
    """Groupby agrees with pandas across as_index flags and projections."""
    base_df = pd.DataFrame([{'a': str(i), 'b': str(j), 'c': 100 * i, 'd': -j}
                            for i in range(3) for j in range(3)])
    df = ps.DataFrame(base_df)

    cases = [
        # (group keys, as_index, projected column or None, aggregate name)
        (['a', 'b'], False, None, 'sum'),   # regular groupby
        ('a', True, None, 'prod'),          # group names in index
        (['a', 'b'], False, 'c', 'count'),  # projection before aggregation
        (['a', 'b'], True, 'c', 'all'),     # projection + names in index
    ]
    for keys, as_index, proj, agg in cases:
        grouped = df.groupby(keys, as_index=as_index)
        base_grouped = base_df.groupby(keys, as_index=as_index)
        if proj is not None:
            grouped = grouped[proj]
            base_grouped = base_grouped[proj]
        assertDataFrameEqualsPandas(getattr(grouped, agg)(),
                                    getattr(base_grouped, agg)())
def test_get_dependency_graph(self):
    """The dependency graph maps a join to exactly its two sources."""
    left = ps.DataFrame(pd.DataFrame(
        [{'n': i, 's1': str(i*2)} for i in range(10)]))
    right = ps.DataFrame(pd.DataFrame(
        [{'n': i, 's2': str(i*2)} for i in range(10)]))
    merged = left.merge(right, on='n')
    graph = _get_dependency_graph(merged)
    # All three nodes appear in the graph.
    for node in (left, right, merged):
        self.assertIn(node, graph)
    # The join depends on both sources and nothing else.
    merged_deps = set(graph[merged])
    self.assertIn(left, merged_deps)
    self.assertIn(right, merged_deps)
    self.assertEqual(len(graph[merged]), 2)
    # The sources have no dependencies.
    self.assertEqual(len(graph[left]), 0)
    self.assertEqual(len(graph[right]), 0)
def test_merge_on_different_columns(self):
    """Joins on differently named key columns via left_on/right_on."""
    base_left = pd.DataFrame([{'n': i, 's1': str(i * 2)} for i in range(10)])
    base_right = pd.DataFrame([{'m': i, 's2': str(i * 2)} for i in range(10)])
    left = ps.DataFrame(base_left)
    right = ps.DataFrame(base_right)
    expected = pd.merge(base_left, base_right, left_on='n', right_on='m')
    # Method form and top-level form must both agree with pandas.
    assertDataFrameEqualsPandas(
        left.merge(right, left_on='n', right_on='m'), expected)
    assertDataFrameEqualsPandas(
        ps.merge(left, right, left_on='n', right_on='m'), expected)
def test_merge(self):
    """Joins on a shared column name via on=."""
    base_left = pd.DataFrame([{'n': i, 's1': str(i * 2)} for i in range(10)])
    base_right = pd.DataFrame([{'n': i, 's2': str(i * 2)} for i in range(10)])
    left = ps.DataFrame(base_left)
    right = ps.DataFrame(base_right)
    expected = pd.merge(base_left, base_right, on='n')
    # Method form and top-level form must both agree with pandas.
    assertDataFrameEqualsPandas(left.merge(right, on='n'), expected)
    assertDataFrameEqualsPandas(ps.merge(left, right, on='n'), expected)
def test_offloading_rule_join_then_restrict(self):
    """Restricted results of a join are offloaded; bare sources are not."""
    left = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(100)])
    right = ps.DataFrame([{'n': i, 't': str(i * 4)} for i in range(100)])
    join = left.merge(right, on='n')
    filtered = join[join['s'] + join['t'] < 50]
    limit = join[:20]
    # Plain source frames stay in memory...
    self.assertFalse(COST_MODEL.should_offload(left))
    self.assertFalse(COST_MODEL.should_offload(right))
    # ...but a filter or limit over the join should be offloaded.
    self.assertTrue(COST_MODEL.should_offload(filtered))
    self.assertTrue(COST_MODEL.should_offload(limit))
    limit.compute()
    selection = limit['s']
    # No pending join-limit operations for selection
    self.assertFalse(COST_MODEL.should_offload(selection))
def test_limit_after_selection(self):
    """[:5] and head(5) on a filtered frame both equal pandas head()."""
    pdf = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf)
    sliced = frame[frame['n'] != 0][:5]
    headed = frame[frame['n'] != 0].head(5)
    expected = pdf[pdf['n'] != 0].head()
    assertDataFrameEqualsPandas(sliced, expected)
    assertDataFrameEqualsPandas(headed, expected)
def test_offloading_rule_limit_output(self):
    """Only the limited result of a filter should be offloaded."""
    frame = ps.DataFrame([{'n': i, 's': str(i % 2)} for i in range(100)])
    filtered = frame[frame['n'] > 25]
    limited = filtered.head(5)
    self.assertFalse(COST_MODEL.should_offload(filtered))
    self.assertTrue(COST_MODEL.should_offload(limited))
def test_run_fallback_on_sqlite(self):
    """A fallback-only operation cannot execute when forced onto SQLite."""
    ps.offloading_strategy('ALWAYS')
    source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(30)])
    frame = ps.DataFrame(source)
    largest = frame.nlargest(n=10, columns='n')
    self.assertRaises(RuntimeError, lambda: largest.compute())
def test_drop_duplicates_projection(self):
    """drop_duplicates on a single projected column matches pandas."""
    pdf = pd.DataFrame([{'n': int(i / 2), 's': 0} for i in range(10)])
    frame = ps.DataFrame(pdf)
    expected = pdf['n'].drop_duplicates()
    deduped = frame['n'].drop_duplicates()
    # The pandas result is a Series; wrap it for frame comparison.
    assertDataFrameEqualsPandas(deduped, pd.DataFrame(expected))
def test_selection(self):
    """Row filters — equality and an OR of comparisons — match pandas."""
    pdf = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf)
    assertDataFrameEqualsPandas(frame[frame['n'] == 5],
                                pdf[pdf['n'] == 5])
    assertDataFrameEqualsPandas(frame[(frame['n'] < 2) | (frame['n'] > 6)],
                                pdf[(pdf['n'] < 2) | (pdf['n'] > 6)])
def test_order_by(self):
    """sort_values on one column and on two with mixed ascending flags."""
    pdf = pd.DataFrame([{'x': i // 2, 'y': i % 2} for i in range(10)])
    frame = ps.DataFrame(pdf)
    single = frame.sort_values('x', ascending=False)
    assertDataFrameEqualsPandas(single,
                                pdf.sort_values('x', ascending=False))
    multi = frame.sort_values(['x', 'y'], ascending=[True, False])
    assertDataFrameEqualsPandas(
        multi, pdf.sort_values(['x', 'y'], ascending=[True, False]))
def test_get_and_set_database_file(self):
    """Switching the database file removes the old one, and storing a
    DataFrame grows the new one."""
    previous = ps.get_database_file()
    self.assertTrue(os.path.exists(previous))
    replacement = NamedTemporaryFile().name
    ps.set_database_file(replacement, delete=True)
    # Old file deleted; new one exists but is still empty.
    self.assertFalse(os.path.exists(previous))
    self.assertTrue(os.path.exists(replacement))
    self.assertEqual(os.path.getsize(replacement), 0)
    # Creating a DataFrame writes into the new database file.
    _ = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    self.assertGreater(os.path.getsize(replacement), 0)
def test_complex_write_query(self):
    """A pipeline with column writes (merge -> derived columns ->
    groupby-sum -> derived column -> sort) matches pandas, and computing
    the final node caches results on every upstream node.
    """
    base_df_1 = pd.DataFrame([{
        'a': i, 'b': j, 'c': 100 * i, 'd': -j
    } for i in range(3) for j in range(3)])
    base_df_2 = pd.DataFrame([{
        'a': i, 'b': j, 'e': 50 * i, 'f': j
    } for i in range(3) for j in range(3)])
    df_1 = ps.DataFrame(base_df_1)
    df_2 = ps.DataFrame(base_df_2)

    # Reference pipeline, run eagerly on pandas.
    base_merged = base_df_1.merge(base_df_2, on=['a', 'b'])
    base_merged['diff'] = base_merged['c'] - base_merged['e']
    base_merged['key'] = base_merged['diff'] * \
        (base_merged['d'] - base_merged['f'])
    base_agg = base_merged.groupby('key', as_index=False)[['a', 'b']].sum()
    base_agg['sum'] = base_agg['a'] + base_agg['b']
    base_ordered = base_agg.sort_values(by='sum')

    # Same pipeline on pandaSQL DataFrames (lazy until computed).
    # NOTE(review): the ps groupby is compared against as_index=False —
    # presumably its default; confirm against ps.DataFrame.groupby.
    merged = df_1.merge(df_2, on=['a', 'b'])
    merged['diff'] = merged['c'] - merged['e']
    merged['key'] = merged['diff'] * \
        (merged['d'] - merged['f'])
    agg = merged.groupby('key')[['a', 'b']].sum()
    agg['sum'] = agg['a'] + agg['b']
    ordered = agg.sort_values(by='sum')

    # This should trigger computation
    self.assertEqual(str(ordered), str(base_ordered))

    # All dependencies should also have cached results
    pd.testing.assert_frame_equal(merged.result, base_merged)
    pd.testing.assert_frame_equal(agg.result, base_agg)
    pd.testing.assert_frame_equal(ordered.result, base_ordered)
def test_offloading_fallback_operation(self):
    """A pending fallback operation blocks offloading until computed."""
    ps.offloading_strategy('BEST')
    frame = ps.DataFrame([{'n': i, 's': str(i % 2)} for i in range(100)])
    largest = frame.nlargest(10, 'n')
    limited = largest[:3]
    # Neither node may be offloaded while the fallback is pending.
    self.assertFalse(COST_MODEL.should_offload(largest))
    self.assertFalse(COST_MODEL.should_offload(limited))
    largest.compute()
    # No more pending fallback operations now
    self.assertTrue(COST_MODEL.should_offload(limited))
def test_criterion(self):
    """Each comparison and boolean combinator matches pandas."""
    base = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(base)
    # Apply each predicate to both frames and compare the results.
    predicates = [
        lambda d: d['n'] == 5,
        lambda d: d['n'] != 5,
        lambda d: d['n'] >= 5,
        lambda d: d['n'] > 5,
        lambda d: d['n'] <= 5,
        lambda d: ~(d['n'] <= 5),
        lambda d: (d['n'] < 2) | (d['n'] > 6),
        lambda d: (d['n'] > 2) & (d['n'] < 6),
    ]
    for predicate in predicates:
        assertDataFrameEqualsPandas(predicate(frame), predicate(base))
def test_arithmetic(self):
    """Arithmetic and bitwise column expressions match pandas."""
    base = pd.DataFrame([{'n': i, 'm': 10 - i} for i in range(10)])
    frame = ps.DataFrame(base)
    # Apply each expression to both frames and compare the results.
    expressions = [
        lambda d: d['n'] + 2 * d['m'],
        lambda d: (d['n'] - 1) // (2**(d['m'] % 3)),
        lambda d: abs(d['n']) // 5 & d['m'],
        lambda d: d['n'] | 0 ^ ~d['m'],
    ]
    for expression in expressions:
        assertDataFrameEqualsPandas(expression(frame), expression(base))
def test_write_on_downstream_dataframe(self):
    """Adding a constant column to a derived selection matches pandas."""
    pdf = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf)
    # Write a new column onto the filtered frame.
    selection = frame[frame['a'] != '4']
    selection['b'] = 10
    # Make new copy to avoid Pandas warning about writing to a slice
    expected = pd.DataFrame(pdf[pdf['a'] != '4'])
    expected['b'] = 10
    pd.testing.assert_index_equal(selection.columns, expected.columns)
    assertDataFrameEqualsPandas(selection, expected)
def test_run_with_missing_dependencies_pandas(self):
    """Under NEVER offloading, computation requires in-memory source data."""
    ps.offloading_strategy('NEVER')
    pdf = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    expected = pdf[pdf['n'] >= 5]

    # With the base data cached, the selection runs on Pandas.
    frame = ps.DataFrame(pdf)
    assertDataFrameEqualsPandas(frame[frame['n'] >= 5], expected)

    # Dropping the cached base data makes a fresh selection fail.
    frame._cached_result = None
    orphan = frame[frame['n'] >= 5]
    self.assertRaises(RuntimeError, lambda: orphan.compute())
def test_nlargest_nsmallest(self):
    """nlargest/nsmallest are FallbackOperations and match pandas."""
    ps.offloading_strategy('NEVER')
    pdf = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(30)])
    frame = ps.DataFrame(pdf)

    largest = frame.nlargest(n=10, columns='n')
    self.assertIsInstance(largest, ps.core.FallbackOperation)
    assertDataFrameEqualsPandas(largest, pdf.nlargest(n=10, columns='n'))

    smallest = frame.nsmallest(n=5, columns='n')
    self.assertIsInstance(smallest, ps.core.FallbackOperation)
    assertDataFrameEqualsPandas(smallest, pdf.nsmallest(n=5, columns='n'))
def test_offloading_rule_deep_dependency_graph(self):
    """The tail of a deep chain of selections should be offloaded."""
    depth = 10
    size = 10**2
    step = size // depth
    frame = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(size)])
    # Build a chain of ever-narrower selections off the base frame.
    chain = [frame]
    for threshold in range(step, size, step):
        tail = chain[-1]
        chain.append(tail[tail['n'] > threshold])
    self.assertFalse(COST_MODEL.should_offload(frame))
    self.assertTrue(COST_MODEL.should_offload(chain[-1]))
def test_projection_after_selection(self):
    """A projection over a filter computes lazily and caches both nodes."""
    pdf = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf)
    filtered = frame[frame['n'] != 5]
    projected = filtered['s']
    expected = pdf[pdf['n'] != 5][['s']]
    # Nothing has been computed yet.
    self.assertIsNone(filtered.result)
    self.assertIsNone(projected.result)
    assertDataFrameEqualsPandas(projected, expected)
    # The comparison triggered computation; both nodes are now cached.
    self.assertIsNotNone(filtered.result)
    self.assertIsNotNone(projected.result)
def test_old_dependents_after_write(self):
    """Overwriting a column must not mutate earlier-derived projections."""
    pdf = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf, deep_copy=True)
    stale_projection = frame['a']
    stale_expected = pdf[['a']]
    # Overwrite column 'a' on both sides.
    frame['a'] = frame['n']
    pdf['a'] = pdf['n']
    # The frame itself reflects the update...
    pd.testing.assert_index_equal(frame.columns, pdf.columns)
    assertDataFrameEqualsPandas(frame, pdf)
    # ...but the projection taken before the write keeps the old values.
    assertDataFrameEqualsPandas(stale_projection, stale_expected)
def test_string_operations(self):
    """isin, str.contains, startswith/endswith filters match pandas.

    This is the same test as in TestDataFrame, run on Pandas this time.
    """
    pdf = pd.DataFrame([{'n': str(i), 'm': chr(97 + i)} for i in range(26)])
    frame = ps.DataFrame(pdf)

    result = frame[frame['n'].isin(['1', '5', '8'])]
    assertDataFrameEqualsPandas(result, pdf[pdf['n'].isin(['1', '5', '8'])])

    # The ps filter is compared against pandas contains with regex=False.
    result = frame[frame['m'].str.contains('g')]
    assertDataFrameEqualsPandas(
        result, pdf[pdf['m'].str.contains('g', regex=False)])

    result = frame[frame['n'].str.startswith('1') |
                   frame['n'].str.endswith('3')]
    assertDataFrameEqualsPandas(
        result, pdf[pdf['n'].str.startswith('1') |
                    pdf['n'].str.endswith('3')])
def test_write_column(self):
    """Copied, derived, and constant column writes match pandas."""
    pdf = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
    frame = ps.DataFrame(pdf)
    # Duplicate a column, then derive a new one from the duplicate.
    frame['b'] = frame['n']
    pdf['b'] = pdf['n']
    frame['c'] = frame['b'] * 2
    pdf['c'] = pdf['b'] * 2
    pd.testing.assert_index_equal(frame.columns, pdf.columns)
    assertDataFrameEqualsPandas(frame, pdf)
    # Write constant columns, both numeric and string.
    frame['d'] = 10
    frame['e'] = 'dummy'
    pdf['d'] = 10
    pdf['e'] = 'dummy'
    pd.testing.assert_index_equal(frame.columns, pdf.columns)
    assertDataFrameEqualsPandas(frame, pdf)