Ejemplo n.º 1
0
    def add():
        # Argument-generation spec, presumably for `DataFrame.add` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame to pair with `v_self` as `other`.

            The frame is requested with `v_self`'s row count and column-level
            count, takes over `v_self`'s index, and ends up with column labels
            that are either exactly `v_self.columns` or a random draw from the
            union of both frames' labels.
            """
            (nr, nc) = v_self.shape
            # NOTE(review): `nc - 1` is 0 for a single-column frame; the
            # `align` spec clamps this with max(..., 1) - confirm that
            # defaultRandDf accepts num_columns=0.
            v_nc = random.choice([nc, nc - 1, nc + 1])
            val = next(
                defaultRandDf(num_rows=nr,
                              num_columns=v_nc,
                              column_levels=v_self.columns.nlevels,
                              col_prefix='i1_',
                              value_bags=[*ints_bags, *floats_bags]))
            val.index = v_self.index
            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                # random.sample() needs a sequence: passing a set is deprecated
                # since Python 3.9 and raises TypeError from 3.11 on, so the
                # de-duplicated label pool is materialised as a list first.
                label_pool = list(
                    set(list(v_self.columns) + list(val.columns)))
                picked = random.sample(label_pool, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked)
            yield val

        def arg_fill_value():
            """Yield one float drawn uniformly from [-100, 100]."""
            yield random.uniform((-100), 100)

        _self = RExt(DType(pd.DataFrame),
                     defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
        _fill_value = Chain(Default(None), RExt(DType(float),
                                                arg_fill_value()))
Ejemplo n.º 2
0
    def fillna():
        # Argument-generation spec, presumably for `DataFrame.fillna` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_value():
            """Offer every non-null cell of the expected output as a fill value.

            Only active when `_spec.depth` equals `_spec.max_depth`; each
            candidate is wrapped in an `AnnotatedVal` with cost 2.
            """
            if _spec.depth == _spec.max_depth:
                output: pd.DataFrame = _spec.output
                all_values = {
                    cell for col in output for cell in output[col]
                    if not pd.isnull(cell)
                }
                yield from map(lambda v: AnnotatedVal(v, cost=2),
                               Select(all_values))

        def arg_limit(v_self: pd.DataFrame):
            """Offer every integer from 1 to the larger dimension of `v_self`, cost 5 each."""
            upper = max(v_self.shape)
            yield from map(lambda v: AnnotatedVal(v, cost=5),
                           Select(range(1, upper + 1)))

        def rarg_limit(v_self: pd.DataFrame):
            """Yield one random limit bounded by the row or the column count."""
            (nr, nc) = v_self.shape
            # The coin decides which dimension bounds the draw; max(..., 2)
            # keeps the range non-empty for one-element dimensions.
            extent = nr if coin_flip() == 0 else nc
            yield random.choice(range(1, max(extent, 2)))

        def rarg_value():
            """Yield one float drawn uniformly from [-1000, 1000]."""
            yield random.uniform((-1000), 1000)

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
        _limit = Chain(Default(None), RExt(DType(int), rarg_limit(_self)),
                       arg_limit(_self))
        _value = Chain(Default(None), RExt(FType(np.isscalar), rarg_value()),
                       Ext(DType([dict, pd.Series, pd.DataFrame])),
                       arg_value())
Ejemplo n.º 3
0
    def corrwith():
        # Argument-generation spec, presumably for `DataFrame.corrwith`
        # (inferred from the function name - confirm where it is registered).
        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame to pair with `v_self` as `other`.

            The frame is requested with `v_self`'s row count and column-level
            count, takes over `v_self`'s index, and ends up with column labels
            that are either exactly `v_self.columns` or a random draw from the
            union of both frames' labels.
            """
            (nr, nc) = v_self.shape
            val = next(
                defaultRandDf(num_rows=nr,
                              column_levels=v_self.columns.nlevels,
                              col_prefix='i1_',
                              value_bags=[*ints_bags, *floats_bags]))
            val.index = v_self.index
            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                # random.sample() needs a sequence: passing a set is deprecated
                # since Python 3.9 and raises TypeError from 3.11 on, so the
                # de-duplicated label pool is materialised as a list first.
                label_pool = list(
                    set(list(v_self.columns) + list(val.columns)))
                picked = random.sample(label_pool, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked)
            yield val

        _self = RExt(DType(pd.DataFrame),
                     defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Ejemplo n.º 4
0
    def merge():
        # Argument-generation spec, presumably for `DataFrame.merge` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_right(v_self: pd.DataFrame):
            """Yield one random frame whose columns may share values and names with `v_self`.

            Columns of the two frames are paired up per dtype. For each pair a
            coin flip decides whether the right-hand column is refilled with a
            sample from both columns' values, and a second flip whether it is
            additionally renamed to the left-hand column's name.
            """
            new_df: pd.DataFrame = next(defaultRandDf(col_prefix='i1_'))

            # Group column names by dtype for both frames.
            left_by_dtype = collections.defaultdict(list)
            right_by_dtype = collections.defaultdict(list)
            for (name, dtype) in dict(v_self.dtypes).items():
                left_by_dtype[dtype].append(name)
            for (name, dtype) in dict(new_df.dtypes).items():
                right_by_dtype[dtype].append(name)

            shared_dtypes = set(left_by_dtype.keys()) & set(right_by_dtype.keys())
            for dtype in shared_dtypes:
                left_cols = list(left_by_dtype[dtype])
                right_cols = list(right_by_dtype[dtype])
                random.shuffle(left_cols)
                random.shuffle(right_cols)
                for (left_col, right_col) in zip(left_cols, right_cols):
                    if coin_flip() != 0:
                        continue
                    # Mix the values of both columns into the right-hand one.
                    mixed = list(new_df[right_col]) + list(v_self[left_col])
                    new_df[right_col] = random.sample(mixed, new_df.shape[0])
                    # Only rename when it would not create a duplicate label.
                    if (coin_flip() == 0) and (left_col not in new_df.columns):
                        new_df = new_df.rename({right_col: left_col}, axis=1)
            yield new_df

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _right = RExt(DType(pd.DataFrame), arg_right(_self))
Ejemplo n.º 5
0
    def diff():
        # Argument-generation spec, presumably for `DataFrame.diff` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_periods(v_self: pd.DataFrame):
            """Yield one random integer in [-(rows - 1), rows - 1]."""
            row_count = v_self.shape[0]
            candidates = range(1 - row_count, row_count)
            yield random.choice(candidates)

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
        _periods = Chain(Default(1), RExt(DType(int), arg_periods(_self)))
Ejemplo n.º 6
0
    def round():
        # Argument-generation spec, presumably for `DataFrame.round` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_decimals():
            """Yield one random number of decimals between 1 and 5 inclusive."""
            yield random.choice(range(1, 6))

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
        _decimals = Chain(Default(0), RExt(DType(int), arg_decimals()))
Ejemplo n.º 7
0
    def align():
        # Argument-generation spec, presumably for `DataFrame.align` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame whose shape and labels are close to `v_self`'s.

            Row and column counts are requested within one of `v_self`'s
            (never below 1). Index and columns are each either copied from
            `v_self` (coin flip, and only when the lengths match) or drawn at
            random from the union of both frames' labels.
            """
            (nr, nc) = v_self.shape
            val = next(defaultRandDf(
                num_rows=random.choice([max((nr - 1), 1), nr, (nr + 1)]),
                num_columns=random.choice([max((nc - 1), 1), nc, (nc + 1)]),
                col_prefix='i1_',
                index_levels=v_self.index.nlevels,
                column_levels=v_self.columns.nlevels,
                value_bags=[*ints_bags, *floats_bags]))

            # random.sample() needs a sequence: passing a set is deprecated
            # since Python 3.9 and raises TypeError from 3.11 on, so each
            # de-duplicated label pool is materialised as a list first.
            if (coin_flip() == 0) and (len(val.index) == nr):
                val.index = v_self.index
            else:
                row_pool = list(set(list(v_self.index) + list(val.index)))
                picked_rows = random.sample(row_pool, len(val.index))
                if v_self.index.nlevels == 1:
                    val.index = pd.Index(picked_rows)
                else:
                    val.index = pd.MultiIndex.from_tuples(picked_rows)

            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                col_pool = list(set(list(v_self.columns) + list(val.columns)))
                picked_cols = random.sample(col_pool, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked_cols)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked_cols)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType([pd.DataFrame, pd.Series]), arg_other(_self))
Ejemplo n.º 8
0
    def astype():
        # Argument-generation spec, presumably for `DataFrame.astype` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_astype_partial(v_self):
            """Yield the expected output's column->dtype mapping when applicable.

            Only active when `_spec.depth` equals `_spec.max_depth`, and only
            when every output column also exists in `v_self`. Any failure while
            inspecting the output simply produces no candidate.
            """
            if _spec.depth == _spec.max_depth:
                output: pd.DataFrame = _spec.output
                dtype_map = None
                try:
                    if set(output.columns).issubset(set(v_self.columns)):
                        dtype_map = dict(output.dtypes)
                except Exception:
                    # Deliberate best-effort (e.g. the output is not a frame).
                    # Unlike the former bare `except:` this no longer swallows
                    # GeneratorExit / KeyboardInterrupt, and the yield below
                    # sits outside the try so errors thrown into the generator
                    # are not silenced either.
                    dtype_map = None
                if dtype_map is not None:
                    yield dtype_map

        def arg_dtype(v_self: pd.DataFrame):
            """Yield one random column->dtype mapping for `v_self`.

            Each numeric column is mapped to a different numeric dtype or left
            out (the `None` choice). Object columns, and columns whose dtype is
            not in the table, are always left out.
            """
            pool = ['int32', 'uint32', 'float64', 'float32', 'int64', 'uint64']
            # For every known dtype: "keep as is" (None) or any other dtype.
            mapping = {
                current: [None] + [alt for alt in pool if alt != current]
                for current in pool
            }
            mapping['object'] = [None]
            res = {}
            for col in v_self.columns:
                # Dtypes outside the table (bool, datetime64, ...) used to raise
                # KeyError here; they are now treated like object columns.
                options = mapping.get(str(v_self.dtypes[col]), [None])
                chosen = random.choice(options)
                if chosen is not None:
                    res[col] = chosen
            yield res

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _dtype = Chain(RExt(DType(dict), arg_dtype(_self)),
                       arg_astype_partial(_self))
Ejemplo n.º 9
0
    def combine_first():
        # Argument-generation spec, presumably for `DataFrame.combine_first`
        # (inferred from the function name - confirm where it is registered).

        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame to pair with `v_self` as `other`.

            On one coin outcome the frame is requested with `v_self`'s shape
            and takes over its index and columns. Otherwise a freely shaped
            frame is generated and its index and columns are drawn at random
            from the union of both frames' labels.
            """
            (nr, nc) = v_self.shape
            bags = [*string_bags, *ints_bags] + [*floats_bags, moar_nans_floats_bag]
            if coin_flip() == 0:
                val = next(defaultRandDf(num_columns=nc, num_rows=nr, value_bags=bags))
                val.columns = v_self.columns
                val.index = v_self.index
            else:
                val = next(defaultRandDf(index_levels=v_self.index.nlevels,
                                         column_levels=v_self.columns.nlevels,
                                         col_prefix='i1_', value_bags=bags))
                # random.sample() needs a sequence: passing a set is deprecated
                # since Python 3.9 and raises TypeError from 3.11 on, so each
                # de-duplicated label pool is materialised as a list first.
                row_pool = list(set(list(v_self.index) + list(val.index)))
                picked_rows = random.sample(row_pool, len(val.index))
                if v_self.index.nlevels == 1:
                    val.index = pd.Index(picked_rows)
                else:
                    val.index = pd.MultiIndex.from_tuples(picked_rows)

                col_pool = list(set(list(v_self.columns) + list(val.columns)))
                picked_cols = random.sample(col_pool, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked_cols)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked_cols)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*string_bags, *ints_bags] + [*floats_bags, moar_nans_floats_bag]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Ejemplo n.º 10
0
    def query():
        # Argument-generation spec, presumably for `DataFrame.query` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_expr(v_self: pd.DataFrame):
            """Yield one random boolean query expression over `v_self`'s columns.

            Builds comparison clauses against values taken from the columns
            themselves, then joins between one and five of them with random
            `and` / `or` connectives. Yields nothing when no clause can be
            built (no int/float/object columns).
            """
            pool = []
            dtypes = v_self.dtypes
            for col in v_self:
                kind = str(dtypes[col])
                vals = list(v_self[col])
                if ('int' in kind) or ('float' in kind):
                    for op in ('>', '<', '==', '!='):
                        pool.append('{} {} {}'.format(col, op, random.choice(vals)))
                elif 'object' in kind:
                    # Object cells are rendered with repr() so that strings
                    # become quoted literals; unquoted they would be parsed as
                    # column/variable names by DataFrame.query.
                    for op in ('==', '!='):
                        pool.append('{} {} {!r}'.format(col, op, random.choice(vals)))
            # NOTE(review): NaN cells still render as the bare name `nan`,
            # which query() may not resolve - confirm whether inputs can hold
            # NaN here.
            if not pool:
                # random.randint(1, 0) would raise ValueError; produce no
                # candidate instead, as the other specs do for empty input.
                return
            sample_size = random.randint(1, min(5, len(pool)))
            sample = random.sample(pool, sample_size)
            expr = sample[0]
            for clause in sample[1:]:
                expr += ' {} '.format(random.choice(['and', 'or']))
                expr += clause
            yield expr

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _expr = RExt(DType(str), arg_expr(_self))
Ejemplo n.º 11
0
    def isin():
        # Argument-generation spec, presumably for `DataFrame.isin` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_values(v_self: pd.DataFrame):
            """Yield a random non-empty list of cells taken from `v_self`."""
            cells = list(v_self.values.flatten())
            # At least one cell, at most all but one (or one for tiny frames).
            upper = max((len(cells) - 1), 1)
            how_many = random.randint(1, upper)
            yield random.sample(cells, how_many)

        # NOTE(review): arg_values yields a list while the declared type is
        # DType(dict) - confirm this is intended.
        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _values = RExt(DType(dict), arg_values(_self))
Ejemplo n.º 12
0
    def clip_lower():
        # Argument-generation spec, presumably for `DataFrame.clip_lower`
        # (inferred from the function name - confirm where it is registered).
        # NOTE(review): `DataFrame.clip_lower` was removed in pandas 1.0 -
        # confirm the targeted pandas version.

        def arg_threshold(v_self: pd.DataFrame):
            """Yield one float drawn uniformly between the smallest and largest non-string cell.

            Yields nothing when `v_self` has no non-string cell.
            """
            vals = [x for x in v_self.values.flatten() if not isinstance(x, str)]
            if not vals:
                # min()/max() raise ValueError on an empty sequence; produce
                # no candidate instead, mirroring the guard in the `clip` spec.
                return
            yield random.uniform(min(vals), max(vals))

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _threshold = RExt(DType(float), arg_threshold(_self))
Ejemplo n.º 13
0
    def apply():
        # Argument-generation spec, presumably for `DataFrame.apply` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_func(v_self: pd.DataFrame):
            """Yield two `Lambda`s over one randomly chosen numeric column.

            Yields nothing when `v_self` has no numeric column.
            """
            numeric_names = list(v_self.select_dtypes(include=np.number).columns)
            if not numeric_names:
                return
            choice = random.choice(numeric_names)
            templates = ('lambda x: x["{}"] > 1', 'lambda x: x["{}"] + 1')
            for template in templates:
                yield Lambda(template.format(choice))

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _func = RExt(DType(Callable), arg_func(_self))
Ejemplo n.º 14
0
    def reindex_like():
        # Argument-generation spec, presumably for `DataFrame.reindex_like`
        # (inferred from the function name - confirm where it is registered).

        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame whose index is drawn from both frames' index labels.

            The column prefix passed to `defaultRandDf` is randomly either
            empty or `'i1_'`.
            """
            val = next(defaultRandDf(index_levels=v_self.index.nlevels,
                                     col_prefix=random.choice(['', 'i1_']),
                                     value_bags=[*ints_bags, *floats_bags]))
            # random.sample() needs a sequence: passing a set is deprecated
            # since Python 3.9 and raises TypeError from 3.11 on, so the
            # de-duplicated label pool is materialised as a list first.
            row_pool = list(set(list(v_self.index) + list(val.index)))
            picked_rows = random.sample(row_pool, len(val.index))
            if v_self.index.nlevels == 1:
                val.index = pd.Index(picked_rows)
            else:
                val.index = pd.MultiIndex.from_tuples(picked_rows)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Ejemplo n.º 15
0
    def head():
        # Argument-generation spec, presumably for `DataFrame.head` (inferred
        # from the function name - confirm where this spec is registered).
        def arg_head_partial(v_self: pd.DataFrame):
            """Offer candidate values for `n`.

            When `_spec.depth` equals `_spec.max_depth`, the expected output's
            row count is offered first at cost 0. Every value from 1 to
            `v_self`'s row count follows.
            """
            if _spec.depth == _spec.max_depth:
                expected: pd.DataFrame = _spec.output
                yield AnnotatedVal(expected.shape[0], cost=0)

            row_count = v_self.shape[0]
            yield from Select(list(range(1, row_count + 1)))

        def arg_n(v_self: pd.DataFrame):
            """Yield one random `n` from {5} plus 1 .. rows - 1."""
            pool = list(set([5, *range(1, len(v_self))]))
            yield random.choice(pool)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _n = Chain(Default(5), RExt(DType(int), arg_n(_self)), arg_head_partial(_self))
Ejemplo n.º 16
0
    def ne():
        # Argument-generation spec, presumably for `DataFrame.ne` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_other(v_self: pd.DataFrame):
            """Yield a copy of `v_self` with randomly selected cells replaced by random values.

            A random boolean frame decides, cell by cell, whether the original
            value is kept (`where` keeps cells whose condition is true).
            """
            nr, nc = v_self.shape

            # Boolean selector labelled exactly like v_self.
            keep: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc, value_bags=bool_bags))
            keep.index = v_self.index
            keep.columns = v_self.columns

            # Replacement values, also labelled like v_self.
            replacement: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc))
            replacement.index = v_self.index
            replacement.columns = v_self.columns

            yield v_self.where(keep, other=replacement)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Ejemplo n.º 17
0
    def take():
        # Argument-generation spec, presumably for `DataFrame.take` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_indices(v_self: pd.DataFrame):
            """Yield a shuffled, non-empty list of distinct positions along one axis.

            A coin flip picks whether positions refer to rows or to columns.
            """
            nr, nc = v_self.shape
            extent = nr if coin_flip() == 0 else nc
            positions = random.sample(range(extent), random.choice(range(1, (extent + 1))))
            random.shuffle(positions)
            yield positions

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _indices = RExt(DType(Sequence), arg_indices(_self))
Ejemplo n.º 18
0
    def combine():
        # Argument-generation spec, presumably for `DataFrame.combine`
        # (inferred from the function name - confirm where it is registered).

        def arg_func():
            """Yield one of two element-wise combiner `Lambda`s, chosen at random."""
            sources = (
                'lambda s1, s2: s1.mask(s1 < s2, s2)',
                'lambda s1, s2: s1.mask(s1 > s2, s2)',
            )
            candidates = [Lambda(src) for src in sources]
            yield random.choice(candidates)

        def arg_other(v_self: pd.DataFrame):
            """Yield one random numeric frame labelled exactly like `v_self`."""
            nr, nc = v_self.shape
            other: pd.DataFrame = next(
                defaultRandDf(num_rows=nr, num_columns=nc, value_bags=[*ints_bags, *floats_bags]))
            other.columns = v_self.columns
            other.index = v_self.index
            yield other

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _func = RExt(DType(Callable), arg_func())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Ejemplo n.º 19
0
    def clip():
        # Argument-generation spec, presumably for `DataFrame.clip` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_lower(v_self: pd.DataFrame):
            """Yield one float drawn uniformly between the smallest and largest numeric cell.

            Yields nothing when no cell passes `is_int` / `is_float`.
            """
            numeric = [x for x in v_self.values.flatten() if is_int(x) or is_float(x)]
            if not numeric:
                return
            yield random.uniform(min(numeric), max(numeric))

        def arg_upper(v_self: pd.DataFrame, v_lower):
            """Yield one float drawn uniformly between `v_lower` and the largest numeric cell.

            When `v_lower` is None the smallest numeric cell is used instead.
            Yields nothing when no cell passes `is_int` / `is_float`.
            """
            numeric = [x for x in v_self.values.flatten() if is_int(x) or is_float(x)]
            if not numeric:
                return
            floor = min(numeric) if v_lower is None else v_lower
            yield random.uniform(floor, max(numeric))

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _lower = Chain(Default(None), RExt(DType(float), arg_lower(_self)))
        _upper = Chain(Default(None), RExt(DType(float), arg_upper(_self, _lower)))
Ejemplo n.º 20
0
    def reindex():
        # Argument-generation spec, presumably for `DataFrame.reindex` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_labels(v_self: pd.DataFrame):
            """Yield a random label list mixing existing labels with newly generated ones.

            A coin flip picks the row index or the columns as the source. The
            result has as many labels as that axis, sampled from the existing
            labels plus the values `StrColGen` generates for half that count.
            """
            (nr, nc) = v_self.shape
            if coin_flip() == 0:
                existing = list(v_self.index)
                count = nr
            else:
                existing = list(v_self.columns)
                count = nc
            fresh = list(StrColGen(all_distinct=True).generate((count // 2))[1].values())
            yield list(random.sample((existing + fresh), count))

        def arg_fill_value():
            """Yield one float drawn uniformly from [-100, 100]."""
            yield random.uniform((- 100), 100)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _labels = RExt(DType([list, dict]), arg_labels(_self))
        # `np.NaN` was removed in NumPy 2.0; `np.nan` is the same value and
        # exists in every NumPy version.
        _fill_value = Chain(Default(np.nan), RExt(DType(float), arg_fill_value()))
Ejemplo n.º 21
0
    def mask():
        # Argument-generation spec, presumably for `DataFrame.mask` (inferred
        # from the function name - confirm where this spec is registered).

        def arg_cond(v_self: pd.DataFrame):
            """Yield one random boolean frame labelled exactly like `v_self`."""
            rows, cols = v_self.shape
            cond: pd.DataFrame = next(defaultRandDf(num_rows=rows, num_columns=cols, value_bags=bool_bags))
            cond.index = v_self.index
            cond.columns = v_self.columns
            yield cond

        def arg_other(v_self: pd.DataFrame):
            """Yield one random frame of replacement values labelled exactly like `v_self`."""
            rows, cols = v_self.shape
            other: pd.DataFrame = next(defaultRandDf(num_rows=rows, num_columns=cols))
            other.index = v_self.index
            other.columns = v_self.columns
            yield other

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _cond = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_cond(_self))
        _other = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_other(_self))
Ejemplo n.º 22
0
 def dropna():
     # Spec presumably for `DataFrame.dropna` (inferred from the name - confirm).
     # `_self` is a DataFrame-typed RExt fed by defaultRandDf(nan_prob=0.5);
     # presumably about half of the generated cells are NaN - verify in
     # defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
Ejemplo n.º 23
0
 def set_index():
     # Spec presumably for `DataFrame.set_index` (inferred from the name -
     # confirm). Only `_self` is declared here: a DataFrame-typed RExt fed by
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Ejemplo n.º 24
0
 def select_dtypes():
     # Spec presumably for `DataFrame.select_dtypes` (inferred from the name -
     # confirm). Only `_self` is declared here: a DataFrame-typed RExt fed by
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Ejemplo n.º 25
0
 def idxmax():
     # Spec presumably for `DataFrame.idxmax` (inferred from the name - confirm).
     # `_self` is a DataFrame-typed RExt fed by defaultRandDf(nan_prob=0.2);
     # presumably about a fifth of the generated cells are NaN - verify in
     # defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.2))
Ejemplo n.º 26
0
 def filter():
     # Spec presumably for `DataFrame.filter` (inferred from the name - confirm).
     # NOTE(review): the name shadows the builtin `filter` in whatever scope
     # this def lives in - confirm that no sibling code relies on the builtin.
     # Only `_self` is declared here: a DataFrame-typed RExt fed by
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Ejemplo n.º 27
0
 def equals():
     # Spec presumably for `DataFrame.equals` (inferred from the name - confirm).
     # Both `_self` and `_other` are DataFrame-typed RExt declarations fed by
     # independent defaultRandDf() generators.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
     # The column prefix for `_other` is picked at random between '' and 'i1_'.
     # NOTE(review): random.choice runs when this statement is evaluated, so
     # the prefix may be fixed once per evaluation rather than per generated
     # frame - confirm.
     _other = RExt(DType(pd.DataFrame),
                   defaultRandDf(col_prefix=random.choice(['', 'i1_'])))
Ejemplo n.º 28
0
 def get_ftype_counts():
     # Spec presumably for `DataFrame.get_ftype_counts` (inferred from the
     # name - confirm).
     # NOTE(review): that method was removed in pandas 1.0 - confirm the
     # targeted pandas version.
     # Only `_self` is declared here: a DataFrame-typed RExt fed by
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Ejemplo n.º 29
0
 def duplicated():
     # Spec presumably for `DataFrame.duplicated` (inferred from the name -
     # confirm). `_self` is a DataFrame-typed RExt fed by
     # defaultRandDf(min_height=3); presumably this forces at least three
     # rows - verify in defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(min_height=3))
Ejemplo n.º 30
0
 def drop():
     # Spec presumably for `DataFrame.drop` (inferred from the name - confirm).
     # Only `_self` is declared here: a DataFrame-typed RExt fed by
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())