def run_benchmark(benchmark, nrows, limit=None):
    """Time one benchmark against Pandas, pandaSQL, and Dask.

    Looks up the benchmark function by name in the module globals, loads the
    CSV inputs with Pandas once, then measures (a) per-engine data
    preparation and (b) per-engine execution, printing each timing and a
    final JSON summary.

    Args:
        benchmark: Name of a module-level benchmark function.  It is called
            as ``func(engine, authors, books, n=limit, top_authors=...)``.
        nrows: Number of rows to read from ``books.csv``.
        limit: Optional row limit forwarded to the benchmark as ``n``.

    Returns:
        dict: The collected timings per engine (``read_time``/``run_time``)
        plus the benchmark parameters.  Previously this was only printed;
        returning it lets callers aggregate results programmatically.
    """
    func = globals()[benchmark]
    # Only the triple_join benchmark needs the extra top_authors table.
    top_authors_needed = benchmark == 'triple_join'
    print(f"Reading {nrows} CSV rows, running {benchmark} with limit={limit}")

    stats = {"pandas": {}, "pandaSQL": {}, "dask": {},
             "nrows": nrows, "benchmark": benchmark, "limit": limit}

    # --- Pandas: load the CSV inputs (shared by all engines below) ---
    start = time.time()
    authors_df = pandas.read_csv('authors.csv')
    books_df = pandas.read_csv('books.csv', nrows=nrows)
    if top_authors_needed:
        top_authors_df = pandas.read_csv('top_authors.csv')
    else:
        top_authors_df = None
        top_authors = None
    time_taken = time.time() - start
    stats['pandas']['read_time'] = time_taken
    print("[Pandas]   Time taken to read: {:0.3f} seconds".format(time_taken))

    # --- Pandas: run the benchmark directly on the loaded frames ---
    start = time.time()
    func(pandas, authors_df, books_df, n=limit, top_authors=top_authors_df)
    time_taken = time.time() - start
    stats['pandas']['run_time'] = time_taken
    print("[Pandas]   Time taken to run:  {:0.3f} seconds".format(time_taken))

    # --- pandaSQL: wrap the already-loaded frames ---
    start = time.time()
    authors = pandasql.DataFrame(authors_df)
    books = pandasql.DataFrame(books_df)
    if top_authors_needed:
        top_authors = pandasql.DataFrame(top_authors_df)
    time_taken = time.time() - start
    stats['pandaSQL']['read_time'] = time_taken
    print("[PandaSQL] Time taken to read: {:0.3f} seconds".format(time_taken))

    start = time.time()
    func(pandasql, authors, books, n=limit, top_authors=top_authors)
    time_taken = time.time() - start
    stats['pandaSQL']['run_time'] = time_taken
    print("[PandaSQL] Time taken to run:  {:0.3f} seconds".format(time_taken))

    # --- Dask: single-partition conversion keeps the comparison fair ---
    start = time.time()
    authors = dd.from_pandas(authors_df, npartitions=1)
    books = dd.from_pandas(books_df, npartitions=1)
    if top_authors_needed:
        top_authors = dd.from_pandas(top_authors_df, npartitions=1)
    time_taken = time.time() - start
    stats['dask']['read_time'] = time_taken
    print("[Dask]     Time taken to read: {:0.3f} seconds".format(time_taken))

    start = time.time()
    func(dd, authors, books, n=limit, top_authors=top_authors)
    time_taken = time.time() - start
    stats['dask']['run_time'] = time_taken
    print("[Dask]     Time taken to run:  {:0.3f} seconds".format(time_taken))

    print(json.dumps(stats, indent=4))
    return stats
Exemple #2
0
    def test_complex_read_query(self):
        """A merge→groupby→sort→limit chain matches Pandas and caches
        every intermediate result once computation is triggered.
        """
        left_pd = pd.DataFrame([{
            'a': str(i),
            'b': str(j),
            'c': 100 * i,
            'd': -j
        } for i in range(3) for j in range(3)])
        right_pd = pd.DataFrame([{
            'a': str(i),
            'b': str(j),
            'e': 50 * i,
            'f': j
        } for i in range(3) for j in range(3)])
        left = ps.DataFrame(left_pd)
        right = ps.DataFrame(right_pd)

        key = ['a', 'b']

        # Reference computation done eagerly in plain Pandas.
        expected_merge = left_pd.merge(right_pd, on=key)
        expected_agg = expected_merge.groupby(
            key, as_index=False)[['c', 'f']].sum()
        expected_sort = expected_agg.sort_values(by=key, ascending=False)
        expected_head = expected_sort.head(3)

        # Same chain through pandaSQL (lazy until str() below).
        merged = left.merge(right, on=key)
        agg = merged.groupby(key)[['c', 'f']].sum()
        ordered = agg.sort_values(by=key, ascending=False)
        limit = ordered.head(3)

        # str() on the final node should trigger computation.
        self.assertEqual(str(limit), str(expected_head))

        # Every dependency should now hold a cached result.
        pd.testing.assert_frame_equal(merged.result, expected_merge)
        pd.testing.assert_frame_equal(agg.result, expected_agg)
        pd.testing.assert_frame_equal(ordered.result, expected_sort)
        pd.testing.assert_frame_equal(limit.result, expected_head)
Exemple #3
0
    def test_union(self):
        """ps.concat of three frames equals pd.concat of the originals."""
        frames_pd = [
            pd.DataFrame([{'n': i, 's': str(i)} for i in range(8)]),
            pd.DataFrame([{'n': i, 's': str(i)} for i in range(4, 12)]),
            pd.DataFrame([{'n': i, 's': str(i)} for i in range(8, 16)]),
        ]
        frames_ps = [ps.DataFrame(f) for f in frames_pd]

        union = ps.concat(frames_ps)
        expected = pd.concat(frames_pd)
        assertDataFrameEqualsPandas(union, expected)
Exemple #4
0
    def test_topological_sort(self):
        """Both merge inputs precede the merge node in topological order."""
        left_pd = pd.DataFrame([{'n': i, 's1': str(i*2)} for i in range(10)])
        left = ps.DataFrame(left_pd)
        right_pd = pd.DataFrame([{'n': i, 's2': str(i*2)} for i in range(10)])
        right = ps.DataFrame(right_pd)
        joined = left.merge(right, on='n')

        ordering = _topological_sort(_get_dependency_graph(joined))
        # Sources first (in construction order), the dependent join last.
        for node, expected in zip(ordering, (left, right, joined)):
            self.assertEqual(node.name, expected.name)
    def test_run_with_missing_dependencies_sqlite(self):
        """SQLite execution requires the base data to have been offloaded."""
        ps.offloading_strategy('ALWAYS')

        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        expected = source[source['n'] >= 5]

        # Offloaded base data: the selection can run on SQLite.
        offloaded = ps.DataFrame(source, offload=True)
        assertDataFrameEqualsPandas(offloaded[offloaded['n'] >= 5], expected)

        # Non-offloaded base data: SQLite has nothing to read, so it fails.
        in_memory = ps.DataFrame(source, offload=False)
        filtered = in_memory[in_memory['n'] >= 5]
        self.assertRaises(RuntimeError, lambda: filtered.compute())
    def test_result_out_of_memory(self):
        """A too-large result raises MemoryError; a small head() succeeds.

        Temporarily shrinks the module-global SAFETY_FACTOR so that the
        full selection result looks too large to bring back from SQLite,
        while a head() of it still fits.  The factor is restored in a
        ``finally`` block — previously a failing assertion would leave the
        global mutated and poison every later test in the run.
        """
        ps.offloading_strategy('ALWAYS')

        size = 10**4

        base_df = pd.DataFrame([{
            'n': i,
            's': str(i * 2)
        } for i in range(size)])
        base_selection = base_df[base_df['n'] >= 5]
        base_limit = base_selection.head()

        df = ps.DataFrame(base_df)

        # Scale the safety factor so the effective memory budget is tiny.
        memory_thresh = 10**4
        new_factor = memory_thresh / psutil.virtual_memory().available
        old_factor = ps.memory_utils.SAFETY_FACTOR
        ps.memory_utils.SAFETY_FACTOR = new_factor
        try:
            # Should fail since the result is too big to be brought back
            selection = df[df['n'] >= 5]
            self.assertRaises(MemoryError, lambda: selection.compute())

            # Should run since the result is small enough to be brought back
            limit = selection.head()
            assertDataFrameEqualsPandas(limit, base_limit)
        finally:
            # Always restore the global, even if an assertion above failed.
            ps.memory_utils.SAFETY_FACTOR = old_factor
Exemple #7
0
    def test_groupby(self):
        """groupby with and without as_index, with and without a projected
        column, matches Pandas for several aggregation functions.
        """
        base_df = pd.DataFrame([{
            'a': str(i),
            'b': str(j),
            'c': 100 * i,
            'd': -j
        } for i in range(3) for j in range(3)])
        df = ps.DataFrame(base_df)

        # (group keys, as_index, projected column or None, aggregation name)
        cases = [
            (['a', 'b'], False, None, 'sum'),    # regular groupby
            ('a', True, None, 'prod'),           # group names in index
            (['a', 'b'], False, 'c', 'count'),   # projection before agg
            (['a', 'b'], True, 'c', 'all'),      # projection + index groups
        ]
        for keys, as_index, column, agg_name in cases:
            grouped = df.groupby(keys, as_index=as_index)
            base_grouped = base_df.groupby(keys, as_index=as_index)
            if column is not None:
                grouped = grouped[column]
                base_grouped = base_grouped[column]
            res = getattr(grouped, agg_name)()
            base_res = getattr(base_grouped, agg_name)()
            assertDataFrameEqualsPandas(res, base_res)
Exemple #8
0
    def test_get_dependency_graph(self):
        """The graph maps a merge node to exactly its two source frames."""
        left_pd = pd.DataFrame([{'n': i, 's1': str(i*2)} for i in range(10)])
        left = ps.DataFrame(left_pd)
        right_pd = pd.DataFrame([{'n': i, 's2': str(i*2)} for i in range(10)])
        right = ps.DataFrame(right_pd)
        joined = left.merge(right, on='n')

        graph = _get_dependency_graph(joined)
        for node in (left, right, joined):
            self.assertIn(node, graph)
        # The join lists both inputs as dependencies; sources list none.
        self.assertIn(left, set(graph[joined]))
        self.assertIn(right, set(graph[joined]))
        self.assertEqual(len(graph[joined]), 2)
        self.assertEqual(len(graph[left]), 0)
        self.assertEqual(len(graph[right]), 0)
Exemple #9
0
    def test_merge_on_different_columns(self):
        """Merging via left_on/right_on matches pd.merge, for both the
        method form and the module-level function form.
        """
        left_pd = pd.DataFrame([{'n': i, 's1': str(i * 2)}
                                for i in range(10)])
        left = ps.DataFrame(left_pd)
        right_pd = pd.DataFrame([{'m': i, 's2': str(i * 2)}
                                 for i in range(10)])
        right = ps.DataFrame(right_pd)

        expected = pd.merge(left_pd, right_pd, left_on='n', right_on='m')
        assertDataFrameEqualsPandas(
            left.merge(right, left_on='n', right_on='m'), expected)
        assertDataFrameEqualsPandas(
            ps.merge(left, right, left_on='n', right_on='m'), expected)
Exemple #10
0
    def test_merge(self):
        """Merging on a shared column matches pd.merge, for both the
        method form and the module-level function form.
        """
        left_pd = pd.DataFrame([{'n': i, 's1': str(i * 2)}
                                for i in range(10)])
        left = ps.DataFrame(left_pd)
        right_pd = pd.DataFrame([{'n': i, 's2': str(i * 2)}
                                 for i in range(10)])
        right = ps.DataFrame(right_pd)

        expected = pd.merge(left_pd, right_pd, on='n')
        assertDataFrameEqualsPandas(left.merge(right, on='n'), expected)
        assertDataFrameEqualsPandas(ps.merge(left, right, on='n'), expected)
Exemple #11
0
    def test_offloading_rule_join_then_restrict(self):
        """Join inputs stay local; restrictions of the join are offloaded,
        and computing the restriction clears the pending-offload state.
        """
        lhs = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(100)])
        rhs = ps.DataFrame([{'n': i, 't': str(i * 4)} for i in range(100)])

        joined = lhs.merge(rhs, on='n')
        restricted = joined[joined['s'] + joined['t'] < 50]
        first_20 = joined[:20]

        self.assertFalse(COST_MODEL.should_offload(lhs))
        self.assertFalse(COST_MODEL.should_offload(rhs))
        self.assertTrue(COST_MODEL.should_offload(restricted))
        self.assertTrue(COST_MODEL.should_offload(first_20))

        first_20.compute()
        projection = first_20['s']
        # No pending join-limit operations remain for the projection.
        self.assertFalse(COST_MODEL.should_offload(projection))
Exemple #12
0
 def test_limit_after_selection(self):
     """[:5] and head(5) after a selection both equal Pandas head()."""
     source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
     frame = ps.DataFrame(source)
     sliced = frame[frame['n'] != 0][:5]
     headed = frame[frame['n'] != 0].head(5)
     expected = source[source['n'] != 0].head()
     assertDataFrameEqualsPandas(sliced, expected)
     assertDataFrameEqualsPandas(headed, expected)
Exemple #13
0
    def test_offloading_rule_limit_output(self):
        """A small head() of a filter is offloaded; the filter itself is not."""
        frame = ps.DataFrame([{'n': i, 's': str(i % 2)} for i in range(100)])

        above_25 = frame[frame['n'] > 25]
        first_five = above_25.head(5)

        self.assertFalse(COST_MODEL.should_offload(above_25))
        self.assertTrue(COST_MODEL.should_offload(first_five))
    def test_run_fallback_on_sqlite(self):
        """Fallback-only operations cannot run when forced onto SQLite."""
        ps.offloading_strategy('ALWAYS')

        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(30)])
        frame = ps.DataFrame(source)

        # nlargest has no SQL translation, so computing it must fail.
        top_ten = frame.nlargest(n=10, columns='n')
        self.assertRaises(RuntimeError, lambda: top_ten.compute())
Exemple #15
0
    def test_drop_duplicates_projection(self):
        """drop_duplicates on a projected column matches Pandas."""
        source = pd.DataFrame([{'n': int(i / 2), 's': 0} for i in range(10)])
        frame = ps.DataFrame(source)

        expected = source['n'].drop_duplicates()
        deduped = frame['n'].drop_duplicates()

        assertDataFrameEqualsPandas(deduped, pd.DataFrame(expected))
Exemple #16
0
    def test_selection(self):
        """Equality and disjunctive boolean selections match Pandas."""
        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source)

        assertDataFrameEqualsPandas(frame[frame['n'] == 5],
                                    source[source['n'] == 5])

        low_or_high = (frame['n'] < 2) | (frame['n'] > 6)
        expected_mask = (source['n'] < 2) | (source['n'] > 6)
        assertDataFrameEqualsPandas(frame[low_or_high],
                                    source[expected_mask])
Exemple #17
0
    def test_order_by(self):
        """sort_values on one key and on two keys matches Pandas."""
        source = pd.DataFrame([{'x': i // 2, 'y': i % 2} for i in range(10)])
        frame = ps.DataFrame(source)

        for by, ascending in (('x', False), (['x', 'y'], [True, False])):
            assertDataFrameEqualsPandas(
                frame.sort_values(by, ascending=ascending),
                source.sort_values(by, ascending=ascending))
    def test_get_and_set_database_file(self):
        """Switching database files removes the old one and data lands in
        the new one.
        """
        previous = ps.get_database_file()
        self.assertTrue(os.path.exists(previous))

        replacement = NamedTemporaryFile().name
        ps.set_database_file(replacement, delete=True)
        self.assertFalse(os.path.exists(previous))
        self.assertTrue(os.path.exists(replacement))

        # Creating a frame should write its rows into the new, empty file.
        self.assertEqual(os.path.getsize(replacement), 0)
        _ = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        self.assertGreater(os.path.getsize(replacement), 0)
Exemple #19
0
    def test_complex_write_query(self):
        """A chain of merge, in-place column writes, groupby, and sort
        matches Pandas, and each intermediate node caches its result once
        str() triggers computation.
        """
        base_df_1 = pd.DataFrame([{
            'a': i,
            'b': j,
            'c': 100 * i,
            'd': -j
        } for i in range(3) for j in range(3)])
        base_df_2 = pd.DataFrame([{
            'a': i,
            'b': j,
            'e': 50 * i,
            'f': j
        } for i in range(3) for j in range(3)])
        df_1 = ps.DataFrame(base_df_1)
        df_2 = ps.DataFrame(base_df_2)

        # Reference computation in plain Pandas: merge, derive two columns
        # in place, aggregate by the derived key, derive again, then sort.
        base_merged = base_df_1.merge(base_df_2, on=['a', 'b'])
        base_merged['diff'] = base_merged['c'] - base_merged['e']
        base_merged['key'] = base_merged['diff'] * \
            (base_merged['d'] - base_merged['f'])
        base_agg = base_merged.groupby('key', as_index=False)[['a', 'b']].sum()
        base_agg['sum'] = base_agg['a'] + base_agg['b']
        base_ordered = base_agg.sort_values(by='sum')

        # The same sequence of writes through pandaSQL (lazy until str()).
        merged = df_1.merge(df_2, on=['a', 'b'])
        merged['diff'] = merged['c'] - merged['e']
        merged['key'] = merged['diff'] * \
            (merged['d'] - merged['f'])
        agg = merged.groupby('key')[['a', 'b']].sum()
        agg['sum'] = agg['a'] + agg['b']
        ordered = agg.sort_values(by='sum')

        # This should trigger computation
        self.assertEqual(str(ordered), str(base_ordered))

        # All dependencies should also have cached results
        pd.testing.assert_frame_equal(merged.result, base_merged)
        pd.testing.assert_frame_equal(agg.result, base_agg)
        pd.testing.assert_frame_equal(ordered.result, base_ordered)
Exemple #20
0
    def test_offloading_fallback_operation(self):
        """Pending fallback operations block offloading until computed."""
        ps.offloading_strategy('BEST')

        frame = ps.DataFrame([{'n': i, 's': str(i % 2)} for i in range(100)])

        top_ten = frame.nlargest(10, 'n')
        top_three = top_ten[:3]

        # While the fallback op is pending, neither node may be offloaded.
        self.assertFalse(COST_MODEL.should_offload(top_ten))
        self.assertFalse(COST_MODEL.should_offload(top_three))

        top_ten.compute()
        # With the fallback resolved, the limit becomes offloadable.
        self.assertTrue(COST_MODEL.should_offload(top_three))
Exemple #21
0
    def test_criterion(self):
        """Every comparison and logical combinator on a column matches
        the equivalent Pandas expression.
        """
        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source)
        col = frame['n']
        ref = source['n']

        assertDataFrameEqualsPandas(col == 5, ref == 5)
        assertDataFrameEqualsPandas(col != 5, ref != 5)
        assertDataFrameEqualsPandas(col >= 5, ref >= 5)
        assertDataFrameEqualsPandas(col > 5, ref > 5)
        assertDataFrameEqualsPandas(col <= 5, ref <= 5)
        assertDataFrameEqualsPandas(~(col <= 5), ~(ref <= 5))
        assertDataFrameEqualsPandas((col < 2) | (col > 6),
                                    (ref < 2) | (ref > 6))
        assertDataFrameEqualsPandas((col > 2) & (col < 6),
                                    (ref > 2) & (ref < 6))
Exemple #22
0
    def test_arithmetic(self):
        """Arithmetic and bitwise expressions on columns match Pandas."""
        source = pd.DataFrame([{'n': i, 'm': 10 - i} for i in range(10)])
        frame = ps.DataFrame(source)
        n, m = frame['n'], frame['m']
        ref_n, ref_m = source['n'], source['m']

        assertDataFrameEqualsPandas(n + 2 * m, ref_n + 2 * ref_m)
        assertDataFrameEqualsPandas((n - 1) // (2**(m % 3)),
                                    (ref_n - 1) // (2**(ref_m % 3)))
        assertDataFrameEqualsPandas(abs(n) // 5 & m,
                                    abs(ref_n) // 5 & ref_m)
        # Note: | binds looser than ^ and ~, so this is n | (0 ^ ~m).
        assertDataFrameEqualsPandas(n | 0 ^ ~m, ref_n | 0 ^ ~ref_m)
Exemple #23
0
    def test_write_on_downstream_dataframe(self):
        """Adding a new column to a selection result matches Pandas."""
        source = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source)

        # Write a new constant column onto a downstream (selected) frame.
        subset = frame[frame['a'] != '4']
        subset['b'] = 10

        # Copy first to avoid the Pandas warning about writing to a slice.
        expected = pd.DataFrame(source[source['a'] != '4'])
        expected['b'] = 10

        pd.testing.assert_index_equal(subset.columns, expected.columns)
        assertDataFrameEqualsPandas(subset, expected)
    def test_run_with_missing_dependencies_pandas(self):
        """Pandas execution requires the cached base data to exist."""
        ps.offloading_strategy('NEVER')

        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        expected = source[source['n'] >= 5]

        # With the base data cached, the selection runs on Pandas.
        frame = ps.DataFrame(source)
        assertDataFrameEqualsPandas(frame[frame['n'] >= 5], expected)

        # Drop the cached base data; the selection can no longer run.
        frame._cached_result = None
        orphaned = frame[frame['n'] >= 5]
        self.assertRaises(RuntimeError, lambda: orphaned.compute())
    def test_nlargest_nsmallest(self):
        """nlargest/nsmallest become fallback operations and match Pandas."""
        ps.offloading_strategy('NEVER')

        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(30)])
        frame = ps.DataFrame(source)

        for method, count in (('nlargest', 10), ('nsmallest', 5)):
            expected = getattr(source, method)(n=count, columns='n')
            result = getattr(frame, method)(n=count, columns='n')
            # Neither op has a SQL translation, so both fall back to Pandas.
            self.assertIsInstance(result, ps.core.FallbackOperation)
            assertDataFrameEqualsPandas(result, expected)
Exemple #26
0
    def test_offloading_rule_deep_dependency_graph(self):
        """The root of a deep selection chain stays local while the
        deepest descendant is offloaded.
        """
        depth = 10
        size = 10**2
        step = size // depth

        root = ps.DataFrame([{'n': i, 's': str(i * 2)} for i in range(size)])

        # Build a chain of selections, each filtering its parent further.
        chain = [root]
        for threshold in range(step, size, step):
            tail = chain[-1]
            chain.append(tail[tail['n'] > threshold])

        self.assertFalse(COST_MODEL.should_offload(root))
        self.assertTrue(COST_MODEL.should_offload(chain[-1]))
Exemple #27
0
    def test_projection_after_selection(self):
        """Computing a projection also caches its selection dependency."""
        source = pd.DataFrame([{'n': i, 's': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source)

        filtered = frame[frame['n'] != 5]
        expected_filtered = source[source['n'] != 5]
        projected = filtered['s']
        expected_projected = expected_filtered[['s']]

        # Nothing computed yet: both nodes are still lazy.
        self.assertIsNone(filtered.result)
        self.assertIsNone(projected.result)

        assertDataFrameEqualsPandas(projected, expected_projected)

        # The computation triggered above also caches the dependency.
        self.assertIsNotNone(filtered.result)
        self.assertIsNotNone(projected.result)
Exemple #28
0
    def test_old_dependents_after_write(self):
        """Writing to a column does not retroactively change projections
        taken before the write.
        """
        source = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source, deep_copy=True)

        # Capture a projection (and its expected values) before mutating.
        stale_projection = frame['a']
        expected_stale = source[['a']]

        # Overwrite the column in both frames.
        frame['a'] = frame['n']
        source['a'] = source['n']

        # The frame itself reflects the updated column.
        pd.testing.assert_index_equal(frame.columns, source.columns)
        assertDataFrameEqualsPandas(frame, source)

        # The earlier projection keeps the pre-write values.
        assertDataFrameEqualsPandas(stale_projection, expected_stale)
Exemple #29
0
    def test_string_operations(self):
        """isin, str.contains, and startswith/endswith match Pandas."""
        # This is the same test as in TestDataFrame, run on Pandas this time
        source = pd.DataFrame([{
            'n': str(i),
            'm': chr(97 + i)
        } for i in range(26)])
        frame = ps.DataFrame(source)

        assertDataFrameEqualsPandas(
            frame[frame['n'].isin(['1', '5', '8'])],
            source[source['n'].isin(['1', '5', '8'])])

        # pandaSQL contains is literal; mirror it with regex=False.
        assertDataFrameEqualsPandas(
            frame[frame['m'].str.contains('g')],
            source[source['m'].str.contains('g', regex=False)])

        assertDataFrameEqualsPandas(
            frame[frame['n'].str.startswith('1') |
                  frame['n'].str.endswith('3')],
            source[source['n'].str.startswith('1')
                   | source['n'].str.endswith('3')])
Exemple #30
0
    def test_write_column(self):
        """Column writes — copies, derived columns, constants — match
        Pandas in both columns and values.
        """
        source = pd.DataFrame([{'n': i, 'a': str(i * 2)} for i in range(10)])
        frame = ps.DataFrame(source)

        # Duplicate a column, then derive another from the duplicate.
        frame['b'] = frame['n']
        source['b'] = source['n']
        frame['c'] = frame['b'] * 2
        source['c'] = source['b'] * 2

        pd.testing.assert_index_equal(frame.columns, source.columns)
        assertDataFrameEqualsPandas(frame, source)

        # Write constant columns (numeric and string).
        frame['d'] = 10
        frame['e'] = 'dummy'
        source['d'] = 10
        source['e'] = 'dummy'

        pd.testing.assert_index_equal(frame.columns, source.columns)
        assertDataFrameEqualsPandas(frame, source)