def test_SO_13576164_depth3_simplified(self):
    """Build benchmark b_9: inner merge of a flattened multi-index frame with a second frame."""
    left_index = pd.MultiIndex.from_arrays(
        [[1, 1, 2, 2], ['a', 'b', 'a', 'b']], names=['id1', 'id2'])
    left = pd.DataFrame(
        columns=['col1', 'to_merge_on'],
        index=left_index,
        data=[[1, 2], [3, 4], [1, 2], [3, 4]],
    ).reset_index()
    right = pd.DataFrame(
        columns=['col2', 'to_merge_on'],
        index=[0, 1, 2],
        data=[[1, 1], [2, 3], [3, 4]],
    )
    frames = [left, right]

    expected = left.merge(right, how='inner')
    plan = Skeleton([('pd.merge', [-1, -2])])
    timing = {'autopandas_time': 339.25}
    return AutoPandasBenchmark("b_9", frames, [expected], expected, "",
                               plan, {}, stats=timing)
def test_SO_13807758_depth2(self):
    """Build benchmark b_13: drop the NaN rows of a single-column frame."""
    frame = pd.DataFrame([[10], [11], [12], [14], [16], [18]], columns=['A'])
    # Blank out every third row (positions 0 and 3) so dropna has rows to remove.
    frame[::3] = np.nan

    expected = frame.dropna()
    plan = Skeleton([('pd.dropna', [-1])])
    choices = {
        'dropna_how': ['any'],
        'dropna_inspect_cols': [['A']],
    }
    timing = {'autopandas_time': 7.21}
    return AutoPandasBenchmark("b_13", [frame], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_11941492_depth1(self):
    """Build benchmark b_20: select the rows with group1 == 'a' and drop the group1 column."""
    frame = pd.DataFrame({
        'group1': ['a', 'a', 'a', 'b', 'b', 'b'],
        'group2': ['c', 'c', 'd', 'd', 'd', 'e'],
        'value1': [1.1, 2, 3, 4, 5, 6],
        'value2': [7.1, 8, 9, 10, 11, 12],
    })
    exprs = ['`group1` == "a"']

    indexed = frame.set_index(['group1', 'group2'])
    expected = indexed.xs('a', level=0).reset_index()

    plan = Skeleton([('pd.filtering_expr', [-1]), ('pd.drop_columns', [1])])
    choices = {
        'filtering_expr_expression': [exprs[0]],
        "drop_columns_cols": [['group1']],
    }
    timing = {'autopandas_time': 12.55}
    return AutoPandasBenchmark("b_20", [frame], [expected], expected, "",
                               plan, choices, constants=exprs, stats=timing)
def test_SO_23321300_depth3(self):
    """Build benchmark b_4: keep rows with d > 0, then average per (a, b) group."""
    frame = pd.DataFrame({
        "a": [1, 1, 1, 1, 1, 1, 1, 1, 1],
        "b": [1, 1, 1, 1, 1, 2, 2, 2, 3],
        "d": [0, 200, 300, 0, 600, 0, 100, 200, 0],
    })

    positive = frame.query('d > 0')
    expected = positive.groupby(['a', 'b'], as_index=False).mean()

    plan = Skeleton([('pd.filtering_expr', [-1]), ('pd.groupby_agg', [1])])
    choices = {
        'filtering_expr_expression': ['d > 0'],
        'groupby_agg_by_cols': [['a', 'b']],
        'groupby_agg_op': ['mean'],
    }
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_4", [frame], [expected], expected, "",
                               plan, choices, constants=['d > 0'], stats=timing)
def test_SO_49987108_depth2(self):
    """Build benchmark b_14: forward-fill the NaN gaps of a column indexed by ID.

    Returns:
        The AutoPandasBenchmark for this task, with a single ``pd.fillna`` step
        applied to the only external input.
    """
    b_id = "b_14"
    inputs = [
        pd.DataFrame({
            'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
            'COL': [
                23, np.nan, np.nan, np.nan, np.nan, 21,
                np.nan, np.nan, np.nan, 25, np.nan, np.nan,
            ],
        }).set_index('ID')
    ]
    # ffill() is the direct equivalent of fillna(method='pad'); the `method`
    # keyword of fillna is deprecated in recent pandas releases.
    output = inputs[0].ffill()
    intermediates = [output]
    skeleton = Skeleton([('pd.fillna', [-1])])
    # The replay map still records the 'pad' choice the fillna component expects.
    replay_map = {
        'fillna_mode': ['method'],
        'fillna_axis': ['index'],
        'fillna_method': ['pad'],
    }
    stats = {'autopandas_time': 0}
    return AutoPandasBenchmark(b_id, inputs, intermediates, output, "",
                               skeleton, replay_map, stats=stats)
def test_SO_11418192_depth2(self):
    """Build benchmark b_16: keep the rows whose column `a` is greater than 1."""
    frame = pd.DataFrame(
        data=[[5, 7], [6, 8], [-1, 9], [-2, 10]],
        columns=['a', 'b'],
    )
    exprs = ['`a` > 1']

    expected = frame.query("`a` > 1")
    plan = Skeleton([('pd.filtering_expr', [-1])])
    choices = {'filtering_expr_expression': [exprs[0]]}
    timing = {'autopandas_time': 0.71}
    return AutoPandasBenchmark("b_16", [frame], [expected], expected, "",
                               plan, choices, constants=exprs, stats=timing)
def test_SO_12860421_depth1(self):
    """Build benchmark b_3: count distinct X values per (Y, Z) cell as a pivot table."""
    frame = pd.DataFrame(
        columns=['X', 'Y', 'Z'],
        data=[
            ['X1', 'Y2', 'Z3'],
            ['X1', 'Y1', 'Z1'],
            ['X1', 'Y1', 'Z1'],
            ['X1', 'Y1', 'Z2'],
        ],
    )

    pivoted = frame.pivot_table(values='X', index='Y', columns='Z',
                                aggfunc=pd.Series.nunique)
    expected = pivoted.reset_index()

    plan = Skeleton([('pd.groupby_agg', [-1]), ('pd.pivot_table', [1])])
    choices = {
        'groupby_agg_by_cols': [['Z', 'Y']],
        'groupby_agg_op': ['AGG_nunique'],
        'pivot_columns': ['Z'],
        'pivot_values': ['X'],
        'pivot_index': [['Y']],
    }
    timing = {'autopandas_time': 3.3}
    return AutoPandasBenchmark("b_3", [frame], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_13659881_depth2(self):
    """Build benchmark b_1: count rows per (ip, useragent) pair into a 'size' column.

    Returns:
        The AutoPandasBenchmark for this task, with a single ``pd.groupby_agg``
        step applied to the only external input.
    """
    b_id = "b_1"
    inputs = [
        pd.DataFrame(columns=['ip', 'useragent'],
                     index=[0, 1, 2, 3],
                     data=[['192.168.0.1', 'a'],
                           ['192.168.0.1', 'a'],
                           ['192.168.0.1', 'b'],
                           ['192.168.0.2', 'b']])
    ]
    # Group WITHOUT as_index=False: size() then returns a Series on every
    # pandas version, so reset_index(name=...) is valid. With as_index=False,
    # newer pandas returns a DataFrame from size(), and DataFrame.reset_index
    # does not accept `name`.
    output = inputs[0].groupby(['ip', 'useragent']).size().reset_index(name='size')
    intermediates = [output]
    skeleton = Skeleton([('pd.groupby_agg', [-1])])
    replay_map = {
        'groupby_agg_by_cols': [['ip', 'useragent']],
        'groupby_agg_op': ['size'],
        'groupby_agg_size_new_col': ['size'],
    }
    stats = {'autopandas_time': 1.38}
    return AutoPandasBenchmark(b_id, inputs, intermediates, output, "",
                               skeleton, replay_map, stats=stats)
def test_SO_13793321_depth1(self):
    """Build benchmark b_17: merge two frames on their shared integer column label 10."""
    left = pd.DataFrame([[11, 12, 13]], columns=[10, 1, 2])
    right = pd.DataFrame([[11, 37, 38], [34, 19, 39]], columns=[10, 3, 4])
    frames = [left, right]

    expected = left.merge(right, on=10)
    plan = Skeleton([('pd.merge', [-1, -2])])
    timing = {'autopandas_time': 4.16}
    return AutoPandasBenchmark("b_17", frames, [expected], expected, "",
                               plan, {}, stats=timing)
def _get_canonical_query_plans(self, sequence: List[str], transformation: Transformation) -> Dict[Skeleton, Set[QueryPlan]]:
    """Enumerate query plans for `transformation`, grouped by the skeletons they apply to.

    For every blueprint item list obtained for `sequence`, one QueryPlan is
    assembled and registered under each Skeleton formed by combining the
    per-component argument orderings.

    Args:
        sequence: Component names; its length selects the canonical
            transformation and is passed as `_d` to `_get_blueprint_item_lists`.
        transformation: Key into `self._meta_plans` and the transformation the
            resulting query plans are built for.

    Returns:
        A `collections.defaultdict(set)` mapping each Skeleton to the set of
        QueryPlan objects registered for it.

    NOTE(review): `next(...)` is called without a default, so this presumably
    raises StopIteration when no subgraph mapping exists — confirm against
    `get_subgraph_mappings`.
    NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
    source; statement order is unchanged, but block membership should be
    confirmed against the upstream file.
    """
    meta_plan = self._meta_plans[transformation]
    blueprint_item_lists = self._get_blueprint_item_lists(sequence, meta_plan, _d=len(sequence))
    canonical_transformation = meta_plan.canonical_transformations[len(sequence)]
    # Only the first mapping produced by get_subgraph_mappings is used.
    mapping = next(canonical_transformation.get_subgraph_mappings(transformation))

    skeletons_to_plans: Dict[Skeleton, Set[QueryPlan]] = collections.defaultdict(set)
    for blueprint_item_list in blueprint_item_lists:
        # Break down the overall transformation in terms of the unit plans contained in the blueprint items.
        # Store the connections between them as a graph mapping.
        connections = GraphMapping()
        connections.update(mapping)
        # `graph` accumulates the unit transformations but is not read again in this method.
        graph = Graph()
        for item in blueprint_item_list:
            graph.merge(item.unit.transformation)
            connections = connections.apply_mapping(item.canonical_mapping, only_keys=True)
            if item.border_mapping:
                connections.update(item.border_mapping)

        # NOTE(review): assumed to run once after the loop above — confirm.
        connections = connections.apply_mapping(connections, only_values=True)

        # Assemble the query plan; strengthenings are looked up by the component name
        # paired positionally with each blueprint item.
        query_plan = QueryPlan(transformation,
                               units=[item.unit.transformation for item in blueprint_item_list],
                               all_connections=connections,
                               strengthenings=[item.unit.strengthenings[component_name]
                                               for component_name, item in zip(sequence, blueprint_item_list)])

        # Obtain the skeletons for which this query plan would work.
        # External inputs are negative integers. See gauss.synthesis.skeleton for details.
        ent_to_idx = {ent: -idx for idx, ent in enumerate(transformation.get_input_entities(), 1)}
        possible_arg_ints_lists = []
        for component_name, (idx, item) in zip(sequence, enumerate(blueprint_item_list, 1)):
            # Get the mapped entities to the inputs of this unit's transformation, and look up their idx values.
            arg_ints = [ent_to_idx[connections.m_ent[ent]]
                        for ent in item.unit.transformation.get_input_entities()]

            # Get all the permutations as well.
            arg_ints_list = [arg_num_mapping.apply_list(arg_ints)
                             for arg_num_mapping in item.unit.component_entries[component_name].argument_mappings]
            possible_arg_ints_lists.append(arg_ints_list)
            # This unit's output becomes addressable by later units through its 1-based position.
            ent_to_idx[item.unit.transformation.get_output_entity()] = idx

        # The skeletons are then simply all the combinations of the per-component argument lists.
        for arg_ints_list in itertools.product(*possible_arg_ints_lists):
            skeleton = Skeleton(list(zip(sequence, arg_ints_list)))
            skeletons_to_plans[skeleton].add(query_plan)

    return skeletons_to_plans
def test_SO_21982987_depth3(self):
    """Build benchmark b_6: per-Name mean of the rate columns, with Month dropped."""
    frame = pd.DataFrame({
        "Name": ["Aira", "Aira", "Ben", "Ben", "Cat", "Cat"],
        "Month": [1, 2, 1, 2, 1, 2],
        "Rate1": [12, 18, 53, 22, 22, 27],
        "Rate2": [23, 73, 19, 87, 87, 43],
    })

    # The expected frame is spelled out literally rather than computed.
    expected = pd.DataFrame({
        'Name': dict(enumerate(['Aira', 'Ben', 'Cat'])),
        'Rate1': dict(enumerate([15.0, 37.5, 24.5])),
        'Rate2': dict(enumerate([48.0, 53.0, 65.0])),
    })

    plan = Skeleton([('pd.groupby_agg', [-1]), ('pd.drop_columns', [1])])
    choices = {
        'groupby_agg_by_cols': [['Name']],
        'groupby_agg_op': ['mean'],
        'drop_columns_cols': [['Month']],
    }
    timing = {'autopandas_time': 30.80}
    return AutoPandasBenchmark("b_6", [frame], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_10982266_depth3(self):
    """Build benchmark b_11: average price per (time, volume) and drop expiry/strike."""
    # Only the time and price vary between quote rows; every other field is constant.
    ticks = [
        ('08:01:08', 57.8),
        ('08:01:11', 58.4),
        ('08:01:12', 58.0),
        ('08:01:16', 58.4),
        ('08:01:16', 58.0),
        ('08:01:21', 58.4),
        ('08:01:21', 58.0),
    ]
    quotes = pd.DataFrame(
        [[stamp, 'C', 'PXA', 20100101, 4000, 'A', price, 60] for stamp, price in ticks],
        columns=['time', 'contract', 'ticker', 'expiry', 'strike', 'quote', 'price', 'volume'],
        index=[0, 1, 2, 3, 4, 5, 6],
    )

    # The expected frame is spelled out literally rather than computed.
    expected = pd.DataFrame(
        [
            ['08:01:08', 57.8, 60],
            ['08:01:11', 58.4, 60],
            ['08:01:12', 58.0, 60],
            ['08:01:16', 58.2, 60],
            ['08:01:21', 58.2, 60],
        ],
        columns=['time', 'price', 'volume'],
        index=[0, 1, 2, 3, 4],
    )

    plan = Skeleton([('pd.groupby_agg', [-1]), ('pd.drop_columns', [1])])
    choices = {
        'groupby_agg_by_cols': [['time', 'volume']],
        'groupby_agg_op': ['mean'],
        'drop_columns_cols': [['expiry', 'strike']],
    }
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_11", [quotes], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_53762029_depth3(self):
    """Build benchmark b_7: running per-speciality sum of doc_id, replacing doc_id.

    Returns:
        The AutoPandasBenchmark for this task: a ``pd.groupby_transform`` step
        on the external input followed by ``pd.drop_columns`` on its result.
    """
    b_id = "b_7"
    # One string per CSV record, joined with explicit newlines: the row
    # boundaries must not depend on how a multi-line literal is laid out in the
    # source, otherwise read_csv sees a header with no data rows.
    rows = [
        "doc_created_month doc_created_year speciality doc_id",
        "8 2016 Acupuncturist 1",
        "2 2017 Acupuncturist 1",
        "4 2017 Acupuncturist 1",
        "4 2017 Allergist 1",
        "5 2018 Allergist 1",
        "10 2018 Allergist 2",
    ]
    data = "\n".join(rows)
    df = pd.read_csv(StringIO(data), sep=r'\s+')
    inputs = [df]

    running_total = df.groupby(['speciality'], as_index=False)['doc_id'].transform('cumsum')
    output = df.assign(doc_id_count=running_total).drop(columns=['doc_id'])
    intermediates = [output]

    skeleton = Skeleton([('pd.groupby_transform', [-1]), ('pd.drop_columns', [1])])
    replay_map = {
        'groupby_transform_by_cols': [['speciality']],
        'groupby_transform_op': ['cumsum'],
        'groupby_transform_op_col': ['doc_id'],
        'groupby_transform_new_col': ['doc_id_count'],
        'drop_columns_cols': [['doc_id']],
    }
    stats = {'autopandas_time': 1.90}
    return AutoPandasBenchmark(b_id, inputs, intermediates, output, "",
                               skeleton, replay_map, stats=stats)
def test_SO_49583055_depth1(self):
    """Build benchmark b_21: combine_first of two timestamp-indexed 'value' frames."""
    stamp = pd.Timestamp
    primary = pd.DataFrame({
        'value': {
            stamp('2014-05-21 09:30:00'): 0.0,
            stamp('2014-05-21 10:00:00'): 10.0,
            stamp('2014-05-21 10:30:00'): 3.0,
            stamp('2017-07-10 22:30:00'): 18.3,
            stamp('2017-07-10 23:00:00'): 7.6,
            stamp('2017-07-10 23:30:00'): 2.0,
        }
    })
    fallback = pd.DataFrame({
        'value': {
            stamp('2014-05-21 09:00:00'): 1.0,
            stamp('2014-05-21 10:00:00'): 13.0,
            stamp('2017-07-10 21:00:00'): 1.6,
            stamp('2017-07-10 22:00:00'): 32.1,
            stamp('2017-07-10 23:00:00'): 7.7,
        }
    })
    frames = [primary, fallback]

    expected = primary.combine_first(fallback)
    plan = Skeleton([('pd.combine_first', [-1, -2])])
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_21", frames, [expected], expected, "",
                               plan, {}, stats=timing)
def test_SO_14023037_depth3_part2(self):
    """Build benchmark b_8_2: back-fill a pivoted frame, then drop rows still holding NaN.

    The benchmark input is itself the pivot table of the raw records (values
    'value', indexed by col1/col2, one column per col3 entry) with its index reset.

    Returns:
        The AutoPandasBenchmark for this task: a ``pd.fillna`` step on the
        external input followed by ``pd.dropna`` on its result.
    """
    b_id = "b_8_2"
    df = pd.DataFrame(
        {
            'id': [1, 2, 3, 4, 5, 6],
            'col1': ['A1', 'A1', 'A1', 'A1', 'A2', 'A2'],
            'col2': ['B1', 'B1', 'B2', 'B2', 'B1', 'B2'],
            'col3': ['before', 'after', 'before', 'after', 'before', 'after'],
            'value': [20, 13, 11, 21, 18, 22],
        },
        columns=['id', 'col1', 'col2', 'col3', 'value'])
    inputs = [
        df.pivot_table(values='value', index=['col1', 'col2'],
                       columns=['col3']).reset_index()
    ]
    # bfill() is the direct equivalent of fillna(method='backfill'); the
    # `method` keyword of fillna is deprecated in recent pandas releases.
    output = inputs[0].bfill().dropna()
    intermediates = [output]
    skeleton = Skeleton([('pd.fillna', [-1]), ('pd.dropna', [1])])
    # The replay map still records the 'backfill' choice the fillna component expects.
    replay_map = {
        'fillna_mode': ['method'],
        'fillna_axis': ['index'],
        'fillna_method': ['backfill'],
        'dropna_how': ['any'],
        'dropna_inspect_cols': [['before']],
    }
    stats = {'autopandas_time': 0}
    return AutoPandasBenchmark(b_id, inputs, intermediates, output, "",
                               skeleton, replay_map, stats=stats)
def test_SO_39656670_depth3(self):
    """Build benchmark b_5: melt Passes/Tackles into long form and average per variable."""
    players = pd.DataFrame({
        "Player": ["Abdoun", "Abe", "Abidal", "Abreu"],
        "Team": ["Algeria", "Japan", "France", "Uruguay"],
        "Shots": [0, 3, 0, 5],
        "Passes": [6, 101, 91, 15],
        "Tackles": [0, 14, 6, 0],
    })

    long_form = players.melt(value_vars=["Passes", "Tackles"],
                             var_name="Var", value_name="Mean")
    expected = long_form.groupby("Var", as_index=False).mean()

    plan = Skeleton([('pd.melt', [-1]), ('pd.groupby_agg', [1])])
    choices = {
        'melt_id_vars': [[]],
        'melt_value_vars': [['Passes', 'Tackles']],
        'melt_var_name': ['Var'],
        'melt_value_name': ['Mean'],
        'groupby_agg_by_cols': [['Var']],
        'groupby_agg_op': ['mean'],
    }
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_5", [players], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_14023037_depth3_part1(self):
    """Build benchmark b_8_1: pivot 'value' by (col1, col2) rows and col3 columns."""
    records = pd.DataFrame(
        {
            'id': [1, 2, 3, 4, 5, 6],
            'col1': ['A1', 'A1', 'A1', 'A1', 'A2', 'A2'],
            'col2': ['B1', 'B1', 'B2', 'B2', 'B1', 'B2'],
            'col3': ['before', 'after', 'before', 'after', 'before', 'after'],
            'value': [20, 13, 11, 21, 18, 22],
        },
        columns=['id', 'col1', 'col2', 'col3', 'value'],
    )

    pivoted = records.pivot_table(values='value', index=['col1', 'col2'],
                                  columns=['col3'])
    expected = pivoted.reset_index()

    plan = Skeleton([('pd.pivot_table', [-1])])
    choices = {
        'pivot_columns': ['col3'],
        'pivot_values': ['value'],
        'pivot_index': [['col1', 'col2']],
    }
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_8_1", [records], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_13261175_depth1_simplified(self):
    """Build benchmark b_18: pivot 'value' with one row per name and one column per date."""
    records = pd.DataFrame({
        'name': ['A', 'B', 'A', 'B'],
        'type': [11, 11, 12, 12],
        'date': ['2012-01-01', '2012-01-01', '2012-02-01', '2012-02-01'],
        'value': [4, 5, 6, 7],
    })

    pivoted = records.pivot_table(values='value', index='name', columns='date')
    expected = pivoted.reset_index()

    plan = Skeleton([('pd.pivot_table', [-1])])
    choices = {
        'pivot_columns': ['date'],
        'pivot_values': ['value'],
        'pivot_index': [['name']],
    }
    timing = {'autopandas_time': 300.20}
    return AutoPandasBenchmark("b_18", [records], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_18172851_depth1(self):
    """Build benchmark b_19: keep only the rows whose `line_race` is non-zero."""
    # Row labels shared by every column, in their original order.
    dates = [
        '2007-03-31', '2007-03-10', '2007-02-10', '2007-01-13', '2006-12-23',
        '2006-11-09', '2006-10-22', '2006-09-29', '2006-09-16', '2006-08-30',
        '2006-02-11', '2006-01-13', '2006-01-02', '2005-12-06', '2005-11-29',
        '2005-11-22', '2005-11-01', '2005-10-20', '2005-09-27', '2005-09-07',
        '2005-06-12', '2005-05-29', '2005-05-02', '2005-04-02', '2005-03-13',
        '2004-11-09',
    ]
    daysago = [
        62, 83, 111, 139, 160, 204, 222, 245, 258, 275,
        475, 504, 515, 542, 549, 556, 577, 589, 612, 632,
        719, 733, 760, 790, 810, 934,
    ]
    # The eleven most recent rows carry a race line; the remaining fifteen are 0.
    line_race = [111, 211, 29, 110, 210, 39, 28, 49, 311, 48, 45] + [0] * 15
    rating = range(2, 28)
    rw = [
        0.99999, 0.97, 0.9, 0.8806780000000001, 0.793033,
        0.636655, 0.581946, 0.518825, 0.48622600000000005, 0.446667,
        0.16459100000000002, 0.14240899999999998, 0.1348, 0.11780299999999999, 0.113758,
        0.10985199999999999, 0.098919, 0.093168, 0.083063, 0.075171,
        0.04869, 0.045404, 0.039679, 0.03416, 0.030914999999999998,
        0.016647,
    ]
    wrating = [
        1.99998, 2.91, 3.6, 4.40339, 4.758198,
        4.456585, 4.655568, 4.6694249999999995, 4.862260000000001, 4.913336999999999,
        1.975092, 1.8513169999999997, 1.8872, 1.767045, 1.820128,
        1.867484, 1.780542, 1.770192, 1.66126, 1.578591,
        1.07118, 1.044292, 0.952296, 0.8540000000000001, 0.80379,
        0.44946899999999995,
    ]
    frame = pd.DataFrame({
        'daysago': dict(zip(dates, daysago)),
        'line_race': dict(zip(dates, line_race)),
        'rating': dict(zip(dates, rating)),
        'rw': dict(zip(dates, rw)),
        'wrating': dict(zip(dates, wrating)),
    })
    exprs = ["`line_race` != 0"]

    expected = frame.query("`line_race` != 0")
    plan = Skeleton([('pd.filtering_expr', [-1])])
    choices = {'filtering_expr_expression': [exprs[0]]}
    timing = {'autopandas_time': 0}
    return AutoPandasBenchmark("b_19", [frame], [expected], expected, "",
                               plan, choices, constants=exprs, stats=timing)
def apply_skeleton(self, skeleton: Skeleton):
    """Return a new Skeleton whose argument ints are remapped through `self.mapping`.

    Each (k, v) pair of `self.mapping` rewrites argument int `-k - 1` to
    `-v - 1`; argument ints without an entry are kept unchanged.
    """
    remap = {}
    for src, dst in self.mapping.items():
        remap[-src - 1] = -dst - 1

    steps = []
    for func, arg_ints in skeleton:
        steps.append((func, [remap.get(arg, arg) for arg in arg_ints]))
    return Skeleton(steps)
def test_SO_49572546_depth1(self):
    """Build benchmark b_22: overlay the second frame's values on the first via combine_first."""
    base = pd.DataFrame({
        'C1': dict(enumerate([100, 102, 103, 104, 105, 106, 107], 1)),
        'C2': dict(enumerate([201, 202, 203, 204, 205, 206, 207], 1)),
        'C3': dict(enumerate([301, 302, 303, 304, 305, 306, 307], 1)),
    })
    overrides = pd.DataFrame({
        'C1': {2: '1002', 3: 'v1', 4: 'v4', 7: '1007'},
        'C2': {2: '2002', 3: 'v2', 4: 'v5', 7: '2007'},
        'C3': {2: '3002', 3: 'v3', 4: 'v6', 7: '3007'},
    })
    frames = [base, overrides]

    # The second input is the caller here, hence the (-2, -1) argument order.
    expected = overrides.combine_first(base)
    plan = Skeleton([('pd.combine_first', [-2, -1])])
    timing = {'autopandas_time': 1.1}
    return AutoPandasBenchmark("b_22", frames, [expected], expected, "",
                               plan, {}, stats=timing)
def test_SO_13647222_depth1(self):
    """Build benchmark b_2: pivot 'value' with one row per step and one column per series."""
    series_col = [
        'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C',
        'A', 'B', 'C', 'A', 'B', 'C',
    ]
    step_col = [
        '100', '100', '100', '101', '101', '101', '102', '102', '102',
        '103', '103', '103', '104', '104', '104',
    ]
    value_col = [
        '1000', '1001', '1002', '1003', '1004', '1005', '1006', '1007',
        '1008', '1009', '1010', '1011', '1012', '1013', '1014',
    ]
    # Each column is keyed by row position 0..14.
    frame = pd.DataFrame({
        'series': dict(enumerate(series_col)),
        'step': dict(enumerate(step_col)),
        'value': dict(enumerate(value_col)),
    })

    pivoted = frame.pivot(columns='series', values='value', index='step')
    expected = pivoted.reset_index()

    plan = Skeleton([('pd.pivot_table', [-1])])
    choices = {
        'pivot_columns': ['series'],
        'pivot_values': ['value'],
        'pivot_index': [['step']],
    }
    timing = {'autopandas_time': 3.32}
    return AutoPandasBenchmark("b_2", [frame], [expected], expected, "",
                               plan, choices, stats=timing)
def test_SO_49567723_depth2(self):
    """Build benchmark b_15: merge two id-keyed frames and keep rows where valueA != valueB."""
    # Each column is keyed by row position, starting at 0.
    table_a = pd.DataFrame({
        'id': dict(enumerate([255, 91, 347, 30, 68, 159, 32, 110, 225, 257])),
        'valueA': dict(enumerate([1141, 1130, 830, 757, 736, 715, 713, 683, 638, 616])),
    })
    table_b = pd.DataFrame({
        'id': dict(enumerate([
            255, 91, 5247, 347, 30, 68, 159, 32, 110, 225, 257, 917, 211, 25,
        ])),
        'valueB': dict(enumerate([
            1231, 1170, 954, 870, 757, 736, 734, 713, 683, 644, 616, 585, 575, 530,
        ])),
    })
    frames = [table_a, table_b]
    exprs = ['`valueA` != `valueB`']

    merged = table_a.merge(table_b)
    expected = merged.query('`valueA` != `valueB`')

    plan = Skeleton([('pd.merge', [-1, -2]), ('pd.filtering_expr', [1])])
    choices = {'filtering_expr_expression': [exprs[0]]}
    timing = {'autopandas_time': 753.10}
    return AutoPandasBenchmark("b_15", frames, [expected], expected, "",
                               plan, choices, constants=exprs, stats=timing)
def test_SO_34365578_depth2(self):
    """Build benchmark b_12: for Group 'A', sum Var2 per Var1 value as a pivot table.

    Returns:
        The AutoPandasBenchmark for this task: ``pd.filtering_expr`` on the
        external input, then ``pd.groupby_agg``, then ``pd.pivot_table``, each
        consuming the previous step's result.
    """
    b_id = "b_12"
    inputs = [
        pd.DataFrame({
            'Group': {0: 'A', 1: 'A', 2: 'A', 3: 'B', 4: 'B', 5: 'B'},
            'Id': {0: 11, 1: 12, 2: 13, 3: 14, 4: 15, 5: 16},
            'Var1': {0: 'good', 1: 'good', 2: 'bad', 3: 'good', 4: 'good', 5: 'bad'},
            'Var2': {0: 20, 1: 26, 2: 29, 3: 23, 4: 23, 5: 28},
        })
    ]
    constants = ["`Group` == 'A'"]
    output = inputs[0].query('Group == "A"').pivot_table(
        index='Group', columns='Var1', values='Var2', aggfunc='sum').reset_index()
    intermediates = [output]
    skeleton = Skeleton([('pd.filtering_expr', [-1]),
                         ('pd.groupby_agg', [1]),
                         ('pd.pivot_table', [2])])
    replay_map = {
        # Take the expression from `constants` so the replayed choice is exactly
        # the string handed to the benchmark; previously the two differed in
        # quote style.
        # NOTE(review): presumably the replay choice must be one of the
        # supplied constants — confirm against the replay engine.
        'filtering_expr_expression': [constants[0]],
        'groupby_agg_by_cols': [['Group', 'Var1']],
        'groupby_agg_op': ['sum'],
        'pivot_columns': ['Var1'],
        'pivot_values': ['Var2'],
        'pivot_index': [['Group']],
    }
    stats = {'autopandas_time': 0}
    return AutoPandasBenchmark(b_id, inputs, intermediates, output, "",
                               skeleton, replay_map, constants=constants, stats=stats)
def test_SO_12065885_depth3(self):
    """Build benchmark b_10: keep the rows whose STK_ID is in a given collection."""
    report_dates = [
        '1980-01-01', '1980-01-02', '1980-01-03', '1980-01-04', '1980-01-05',
        '1980-01-06', '1980-01-07', '1980-01-08', '1980-01-09', '1980-01-10',
    ]
    names = [
        'Arthur', 'Beate', 'Cecil', 'Dana', 'Eric',
        'Fidel', 'George', 'Hans', 'Ingrid', 'Jones',
    ]
    # Each column is keyed by row position 0..9.
    stocks = pd.DataFrame({
        'RPT_Date': dict(enumerate(report_dates)),
        'STK_ID': dict(enumerate(range(10))),
        'STK_Name': dict(enumerate(names)),
        'sales': dict(enumerate([0, 4, 2, 8, 4, 5, 4, 7, 7, 4])),
    })
    wanted = [[4, 2, 6]]

    keep_mask = stocks.STK_ID.isin(wanted[0])
    expected = stocks[keep_mask]

    plan = Skeleton([('pd.filtering_contains', [-1])])
    choices = {
        'filtering_contains_filter_col': ['STK_ID'],
        'filtering_contains_collection': [[4, 2, 6]],
    }
    timing = {'autopandas_time': 0.9}
    return AutoPandasBenchmark("b_10", [stocks], [expected], expected, "",
                               plan, choices, constants=wanted, stats=timing)