Example #1
0
def datagen_filter():
    """Generate a random frame and randomly chosen filter arguments.

    A mode is drawn at random. In ``"equality-inequality"`` mode any column
    may be chosen and the operator is ``==`` or ``!=``; otherwise (``"relop"``)
    only numeric columns are eligible and the operator is ``<`` or ``>``.
    In both modes the comparison value is drawn from the distinct values found
    in the chosen column.

    Generation is retried until one attempt completes without an error.

    Returns:
        A ``([df], kwargs)`` tuple, where ``kwargs`` maps argument names to
        single-element lists. The key names depend on the selected mode.
    """
    mode = random.choice(["equality-inequality", "relop"])
    is_relop = mode != "equality-inequality"

    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=2,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))

            if is_relop:
                # Relational operators are only offered on numeric columns.
                candidates = list(df.select_dtypes('number').columns)
                if not candidates:
                    continue
            else:
                candidates = list(df.columns)

            col = random.choice(candidates)
            value = random.choice(list(set(df.loc[:, col])))

            if is_relop:
                return [df], {
                    "filter_mode": [mode],
                    "filter_column_relop": [col],
                    "filter_value_relop": [value],
                    "filter_relop": [random.choice(["<", ">"])]
                }

            return [df], {
                "filter_mode": [mode],
                "filter_column_eq": [col],
                "filter_value_eq": [value],
                "filter_eq_op": [random.choice(["==", "!="])]
            }

        except Exception:
            # Retry on ordinary failures only. KeyboardInterrupt and
            # SystemExit are not subclasses of Exception, so they propagate
            # and the loop stays interruptible.
            continue
Example #2
0
def datagen_fillna(seed: int):
    """Generate a random frame with missing values plus fillna arguments.

    The seed deterministically selects the variant: ``seed % 2`` picks
    between ``"method"`` and ``"value"`` mode, and ``(seed // 2) % 4`` picks
    one of the four ``(axis, method)`` combinations used in method mode.

    Args:
        seed: Integer used to select the mode and the axis/method pair.

    Returns:
        In value mode, ``([df], [0], {"fillna_mode": ["value"]})``.
        In method mode, ``([df], [], kwargs)`` where ``kwargs`` carries the
        chosen ``fillna_method`` and ``fillna_axis``.
    """
    mode = ["method", "value"][seed % 2]
    axis, method = list(
        itertools.product(["index", "columns"],
                          ["backfill", "pad"]))[(seed // 2) % 4]

    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1,
                         nan_prob=0.5))

            if mode == "value":
                # The result is intentionally discarded: the call only checks
                # that filling with 0 succeeds on this frame before it is
                # returned.
                df.fillna(0)
                return [df], [0], {"fillna_mode": ["value"]}

            return [df], [], {
                "fillna_method": [method],
                "fillna_axis": [axis],
            }

        except Exception:
            # Retry on ordinary failures only; KeyboardInterrupt/SystemExit
            # propagate so the loop can still be interrupted.
            continue
Example #3
0
def datagen_combine_first(seed: int):
    """Generate two random frames that share columns, for ``combine_first``.

    A first frame is generated, a random subset of its columns is chosen as
    the shared columns, and a second frame is generated with some of its
    columns renamed to those shared names. The rows of the shared columns in
    both frames are then overwritten with row tuples sampled from the pooled
    rows of the two frames. Both frames are transposed before being returned.

    Attempts whose ``combine_first`` call fails, or whose combined result is
    empty along either dimension, are discarded and regenerated.

    Args:
        seed: Accepted for interface compatibility; not used by this function.

    Returns:
        ``([df1, df2], [], {})`` with both frames transposed.
    """
    while True:
        df1 = generate_random_dataframe(
            DfConfig(min_width=MIN_COLS,
                     max_width=MAX_COLS,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     nan_prob=0.5,
                     max_column_levels=1))
        on_columns = random.sample(list(df1.columns),
                                   random.randint(1, df1.shape[1]))
        df2_width = random.randint(len(on_columns), MAX_COLS)
        df2 = generate_random_dataframe(
            DfConfig(num_cols=df2_width,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     max_column_levels=1,
                     nan_prob=0.5,
                     col_prefix="DF2"))

        # Rename a random subset of df2's columns so both frames expose the
        # shared column names.
        replaced_cols = random.sample(list(df2.columns), len(on_columns))
        df2 = df2.rename(columns=dict(zip(replaced_cols, on_columns)))

        # Pool the shared-column rows of both frames and redistribute them.
        df1_items = [tuple(i) for i in df1.loc[:, on_columns].values]
        df2_items = [tuple(i) for i in df2.loc[:, on_columns].values]
        pooled_items = df1_items + df2_items

        new_df1_items = random.sample(pooled_items, df1.shape[0])
        new_df2_items = random.sample(pooled_items, df2.shape[0])

        # NOTE(review): label-based assignment with positions 0..n-1 assumes
        # the generated frames carry a default integer index - confirm.
        for idx, items in enumerate(new_df1_items):
            df1.loc[idx, on_columns] = items

        for idx, items in enumerate(new_df2_items):
            df2.loc[idx, on_columns] = items

        try:
            df1 = df1.T
            df2 = df2.T
            res = df1.combine_first(df2)
        except Exception:
            # Discard this attempt on ordinary failures only;
            # KeyboardInterrupt/SystemExit propagate so the loop can still be
            # interrupted.
            continue

        if res.shape[0] == 0 or res.shape[1] == 0:
            continue

        return [df1, df2], [], {}
Example #4
0
def datagen_default():
    """Generate one random frame with no extra arguments.

    Returns:
        ``([df], {})`` where ``df`` comes from ``generate_random_dataframe``
        using the module-level width/height bounds and a single index and
        column level.
    """
    config = DfConfig(min_width=MIN_COLS,
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1)
    frame = generate_random_dataframe(config)
    return [frame], {}
Example #5
0
def datagen_separate():
    """Generate a frame containing one ``"@"``-joined column to be separated.

    Two or three randomly chosen columns are stringified and joined row-wise
    with ``"@"`` into a new ``"NEW-VALS"`` column; the source columns are then
    dropped and the remaining columns are put into a random order.

    Returns:
        ``([df], {"separate_split_col": ["NEW-VALS"]})``.
    """
    df = generate_random_dataframe(
        DfConfig(min_width=3, max_index_levels=1, max_column_levels=1))
    unite_cols = random.sample(list(df.columns), random.choice([2, 3]))
    df["NEW-VALS"] = [
        "@".join(map(str, vals)) for vals in zip(*[df[c] for c in unite_cols])
    ]
    df.drop(columns=unite_cols, inplace=True)

    # Apply the shuffled order to the frame so that the united column does
    # not always end up last; previously the shuffled list was computed but
    # never used.
    new_cols = list(df.columns)
    random.shuffle(new_cols)
    df = df[new_cols]

    return [df], {"separate_split_col": ["NEW-VALS"]}
Example #6
0
def datagen_dropna(seed: int):
    """Generate one random frame configured with ``nan_prob=0.5``.

    Args:
        seed: Accepted for interface compatibility; not used by this function.

    Returns:
        ``([df], [], {})``.
    """
    config = DfConfig(min_width=MIN_COLS,
                      max_width=MAX_COLS,
                      min_height=MIN_ROWS,
                      max_height=MAX_ROWS,
                      max_index_levels=1,
                      max_column_levels=1,
                      nan_prob=0.5)
    frame = generate_random_dataframe(config)
    return [frame], [], {}
Example #7
0
def datagen_mutate():
    """Generate a random frame and arguments for a column mutation.

    The operation is drawn at random: ``"normalize"`` needs one numeric
    column, ``"div"`` needs two distinct numeric columns. Frames are
    regenerated until one has enough numeric columns for the operation.

    Returns:
        A ``([df], kwargs)`` tuple, where ``kwargs`` names the operation and
        the chosen column(s) as single-element lists.
    """
    operation = random.choice(["normalize", "div"])
    required_numeric = 1 if operation == 'normalize' else 2

    while True:
        df = generate_random_dataframe(
            DfConfig(min_width=2,
                     max_width=MAX_COLS,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     max_column_levels=1))

        # Materialise as a plain list before handing it to ``random``: a
        # pandas Index has no unambiguous truth value, which some
        # ``random.choice`` implementations test on their argument.
        numeric_cols = list(df.select_dtypes('number').columns)
        if len(numeric_cols) < required_numeric:
            continue

        if operation == 'normalize':
            return [df], {
                "mutate_operation": ["normalize"],
                "mutate_col_args_normalize": [random.choice(numeric_cols)]
            }

        col_args = random.sample(numeric_cols, 2)
        return [df], {
            "mutate_operation": ["div"],
            "mutate_col_arg1": [col_args[0]],
            "mutate_col_arg2": [col_args[1]],
        }
Example #8
0
def datagen_separate(seed: int):
    """Generate a frame containing one ``"_"``-joined column to be separated.

    At least two randomly chosen columns are concatenated row-wise with
    ``"_"`` via the string accessor into ``MY_NEW_COL``, and the source
    columns are dropped. Attempts that raise (for example when the string
    accessor is unavailable for the first chosen column) are retried with a
    fresh frame.

    Args:
        seed: Accepted for interface compatibility; not used by this function.

    Returns:
        ``([df], [], {"separate_col": ["MY_NEW_COL"]})``.
    """
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=2,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))

            unite_cols = random.sample(list(df.columns),
                                       random.randint(2, df.shape[1]))
            united = df[unite_cols[0]].str.cat(df[unite_cols[1:]], sep='_')
            df = df.drop(columns=unite_cols).assign(MY_NEW_COL=united)
            return [df], [], {"separate_col": ["MY_NEW_COL"]}

        except Exception:
            # Retry on ordinary failures only; KeyboardInterrupt/SystemExit
            # propagate so the loop can still be interrupted.
            continue
Example #9
0
def datagen_filtering_expr(seed: int):
    """Generate a random frame and a query expression that evaluates on it.

    The expression has the form ``"`<column>` <op> <repr(value)>"``, where
    the column is chosen at random, the value is drawn from that column, and
    the operator is selected by ``seed % 4`` from ``>``, ``<``, ``==``,
    ``!=``. The expression is test-run with ``df.query``; attempts that raise
    are retried with a fresh frame.

    Args:
        seed: Integer used to select the comparison operator.

    Returns:
        ``([df], [expr], {"filtering_expr_expression": [expr]})``.
    """
    op = [">", "<", "==", "!="][seed % 4]

    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))
            column = random.choice(list(df.columns))
            value = random.choice(list(df[column]))

            expr = f"`{column}` {op} {value!r}"
            # The result is discarded: the call only checks that the
            # expression can be evaluated against this frame.
            df.query(expr)

            return [df], [expr], {"filtering_expr_expression": [expr]}

        except Exception:
            # Retry on ordinary failures only; KeyboardInterrupt/SystemExit
            # propagate so the loop can still be interrupted.
            continue
Example #10
0
def datagen_groupby_transform(seed: int):
    """Generate a random frame plus group-by columns that have a repeated key.

    The seed selects the transform operation from
    ``candidates_groupby_transform_op`` and whether one or two grouping
    columns are used. Frames are regenerated until some combination of
    columns yields fewer groups than the frame has rows.

    Args:
        seed: Integer used to select the operation and the number of
            grouping columns.

    Returns:
        ``([df], [], kwargs)`` where ``kwargs`` carries the grouping columns
        and the chosen operation as single-element lists.
    """
    n_ops = len(candidates_groupby_transform_op)
    op = candidates_groupby_transform_op[seed % n_ops]
    num_group_cols = (seed // n_ops) % 2 + 1
    min_width = max(MIN_COLS, num_group_cols + 1)

    while True:
        df = generate_random_dataframe(
            DfConfig(min_width=min_width,
                     max_width=MAX_COLS,
                     min_height=MIN_ROWS,
                     max_height=MAX_ROWS,
                     max_index_levels=1,
                     max_column_levels=1))
        n_rows = df.shape[0]

        # Accept the first column combination whose group count is smaller
        # than the row count, i.e. at least one key occurs more than once.
        for combo in itertools.combinations(list(df.columns),
                                            num_group_cols):
            by_cols = list(combo)
            if len(df.groupby(by_cols).groups) >= n_rows:
                continue
            return [df], [], {
                'groupby_transform_by_cols': [by_cols],
                'groupby_transform_op': [op]
            }
Example #11
0
def datagen_filtering_contains(seed: int):
    """Generate a random frame and a set of values taken from one column.

    A column is chosen at random and a random-sized sample of its values is
    collected into a set. Attempts that raise are retried with a fresh frame.

    Args:
        seed: Accepted for interface compatibility; not used by this function.

    Returns:
        ``([df], [values], kwargs)`` where ``kwargs`` carries the chosen
        column and the value set as single-element lists.
    """
    while True:
        try:
            df = generate_random_dataframe(
                DfConfig(min_width=MIN_COLS,
                         max_width=MAX_COLS,
                         min_height=MIN_ROWS,
                         max_height=MAX_ROWS,
                         max_index_levels=1,
                         max_column_levels=1))
            column = random.choice(list(df.columns))

            # NOTE(review): the sample size is bounded by the column count
            # (df.shape[1] - 1) although the sample is drawn from a column's
            # row values. When the bound is below 1 or exceeds the row count
            # a ValueError is raised and the attempt is retried. Confirm
            # whether df.shape[0] was intended.
            sample_size = random.randint(1, df.shape[1] - 1)
            values = set(random.sample(list(df[column]), sample_size))

            return [df], [values], {
                "filtering_contains_filter_col": [column],
                "filtering_contains_collection": [values],
            }

        except Exception:
            # Retry on ordinary failures only; KeyboardInterrupt/SystemExit
            # propagate so the loop can still be interrupted.
            continue