Example #1
0
    def query():
        # Spec for DataFrame.query: `_self` is a random frame and `_expr` is a
        # random boolean expression built from that frame's columns.
        # NOTE(review): `_self` is passed straight to arg_expr although it is
        # bound to an RExt(...) result; presumably the surrounding generator DSL
        # substitutes the concrete frame for `v_self` -- confirm.
        def arg_expr(v_self: pd.DataFrame):
            # Candidate comparison clauses, a few per supported column.
            pool = []
            dtypes = v_self.dtypes
            for col in v_self:
                dtype_name = str(dtypes[col])
                vals = list(v_self[col])
                if not vals:
                    # random.choice raises IndexError on an empty column.
                    continue
                if ('int' in dtype_name) or ('float' in dtype_name):
                    for op in ('>', '<', '==', '!='):
                        pool.append('{} {} {}'.format(col, op,
                                                      random.choice(vals)))
                elif 'object' in dtype_name:
                    for op in ('==', '!='):
                        # repr() quotes string values so the clause compares
                        # against a literal instead of a bare identifier.
                        pool.append('{} {} {!r}'.format(col, op,
                                                        random.choice(vals)))
            if not pool:
                # No usable column: random.randint(1, 0) would raise
                # ValueError, so produce no expression at all.
                return
            sample_size = random.randint(1, min(5, len(pool)))
            clauses = random.sample(pool, sample_size)
            # Join the sampled clauses with randomly chosen connectives.
            expr = clauses[0]
            for clause in clauses[1:]:
                expr += ' {} {}'.format(random.choice(['and', 'or']), clause)
            yield expr

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _expr = RExt(DType(str), arg_expr(_self))
Example #2
0
    def merge():
        # Spec for DataFrame.merge: `_right` is a second random frame whose
        # columns are randomly made to overlap (in values and sometimes in
        # name) with same-dtype columns of `_self`.
        def arg_right(v_self: pd.DataFrame):
            new_df: pd.DataFrame = next(defaultRandDf(col_prefix='i1_'))

            # Group column names by dtype, separately for each frame.
            own_by_dtype = collections.defaultdict(list)
            new_by_dtype = collections.defaultdict(list)
            for (name, dtype) in dict(v_self.dtypes).items():
                own_by_dtype[dtype].append(name)
            for (name, dtype) in dict(new_df.dtypes).items():
                new_by_dtype[dtype].append(name)

            shared_dtypes = (set(own_by_dtype.keys()) & set(new_by_dtype.keys()))
            for dtype in shared_dtypes:
                own_cols = list(own_by_dtype[dtype])
                new_cols = list(new_by_dtype[dtype])
                random.shuffle(own_cols)
                random.shuffle(new_cols)
                for (own_col, new_col) in list(zip(own_cols, new_cols)):
                    if coin_flip() != 0:
                        continue
                    # Refill the new column from the union of both columns'
                    # values so the two frames share some keys.
                    mixed = list(new_df[new_col]) + list(v_self[own_col])
                    new_df[new_col] = random.sample(mixed, new_df.shape[0])
                    # Optionally give it the same name as the matching column.
                    if (coin_flip() == 0) and (own_col not in new_df.columns):
                        new_df = new_df.rename({new_col: own_col}, axis=1)
            yield new_df

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _right = RExt(DType(pd.DataFrame), arg_right(_self))
Example #3
0
    def corrwith():
        # Spec for DataFrame.corrwith: `_other` is a numeric random frame that
        # shares the index of `_self` and has fully or partially overlapping
        # column labels.
        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            val = next(
                defaultRandDf(num_rows=nr,
                              column_levels=v_self.columns.nlevels,
                              col_prefix='i1_',
                              value_bags=[*ints_bags, *floats_bags]))
            val.index = v_self.index
            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                # random.sample() rejects sets on Python 3.11+, so build an
                # order-preserving de-duplicated list of candidate labels.
                labels = list(
                    dict.fromkeys(list(v_self.columns) + list(val.columns)))
                picked = random.sample(labels, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked)
            yield val

        _self = RExt(DType(pd.DataFrame),
                     defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Example #4
0
    def round():
        # Spec for DataFrame.round: `_decimals` defaults to 0, otherwise a
        # random precision between 1 and 5.

        def arg_decimals():
            precisions = list(range(1, 6))
            yield random.choice(precisions)

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
        _decimals = Chain(Default(0), RExt(DType(int), arg_decimals()))
Example #5
0
    def align():
        # Spec for DataFrame.align: `_other` is a numeric random frame whose
        # shape is within one row/column of `_self` and whose index and
        # column labels fully or partially overlap with it.

        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            val = next(defaultRandDf(num_rows=random.choice([max((nr - 1), 1), nr, (nr + 1)]),
                                     num_columns=random.choice([max((nc - 1), 1), nc, (nc + 1)]), col_prefix='i1_',
                                     index_levels=v_self.index.nlevels, column_levels=v_self.columns.nlevels,
                                     value_bags=[*ints_bags, *floats_bags]))

            if (coin_flip() == 0) and (len(val.index) == nr):
                val.index = v_self.index
            else:
                # random.sample() rejects sets on Python 3.11+; de-duplicate
                # into a list while keeping a stable order.
                row_labels = list(dict.fromkeys(list(v_self.index) + list(val.index)))
                picked_rows = random.sample(row_labels, len(val.index))
                if v_self.index.nlevels == 1:
                    val.index = pd.Index(picked_rows)
                else:
                    val.index = pd.MultiIndex.from_tuples(picked_rows)

            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                col_labels = list(dict.fromkeys(list(v_self.columns) + list(val.columns)))
                picked_cols = random.sample(col_labels, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked_cols)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked_cols)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType([pd.DataFrame, pd.Series]), arg_other(_self))
Example #6
0
    def diff():
        # Spec for DataFrame.diff: `_periods` defaults to 1, otherwise a random
        # shift drawn from -(rows - 1) .. rows - 1.
        def arg_periods(v_self: pd.DataFrame):
            num_rows = v_self.shape[0]
            shifts = range(1 - num_rows, num_rows)
            yield random.choice(shifts)

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
        _periods = Chain(Default(1), RExt(DType(int), arg_periods(_self)))
Example #7
0
    def combine_first():
        # Spec for DataFrame.combine_first: `_other` is either a frame with
        # exactly the labels of `_self`, or a frame whose index/column labels
        # are sampled from the union of both frames' labels.

        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            bags = [*string_bags, *ints_bags] + [*floats_bags, moar_nans_floats_bag]
            if coin_flip() == 0:
                # Same shape and same labels as the frame being patched.
                val = next(defaultRandDf(num_columns=nc, num_rows=nr, value_bags=bags))
                val.columns = v_self.columns
                val.index = v_self.index
            else:
                val = next(defaultRandDf(index_levels=v_self.index.nlevels, column_levels=v_self.columns.nlevels,
                                         col_prefix='i1_', value_bags=bags))

                # random.sample() rejects sets on Python 3.11+; de-duplicate
                # into a list while keeping a stable order.
                row_labels = list(dict.fromkeys(list(v_self.index) + list(val.index)))
                picked_rows = random.sample(row_labels, len(val.index))
                if v_self.index.nlevels == 1:
                    val.index = pd.Index(picked_rows)
                else:
                    val.index = pd.MultiIndex.from_tuples(picked_rows)

                col_labels = list(dict.fromkeys(list(v_self.columns) + list(val.columns)))
                picked_cols = random.sample(col_labels, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked_cols)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked_cols)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*string_bags, *ints_bags] + [*floats_bags, moar_nans_floats_bag]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Example #8
0
    def astype():
        # Spec for DataFrame.astype: `_dtype` is a random {column: dtype}
        # mapping, followed (at maximum depth) by the dtypes of the target
        # output when its columns are a subset of the input's.
        def arg_astype_partial(v_self):
            if _spec.depth == _spec.max_depth:
                v_self: pd.DataFrame = v_self
                output: pd.DataFrame = _spec.output
                # Best effort: the output may not be a frame at all. Catch
                # Exception rather than everything so KeyboardInterrupt,
                # SystemExit and GeneratorExit are not swallowed.
                try:
                    columns_match = set(output.columns).issubset(
                        set(v_self.columns))
                    target = dict(output.dtypes) if columns_match else None
                except Exception:
                    return
                if target is not None:
                    yield target

        def arg_dtype(v_self: pd.DataFrame):
            pool = ['int32', 'uint32', 'float64', 'float32', 'int64', 'uint64']
            # For every numeric dtype: keep it (None) or cast to any other one.
            mapping = {
                name: (([None] + pool[:i]) + pool[(i + 1):])
                for (i, name) in enumerate(pool)
            }
            mapping['object'] = [None]
            dtypes = v_self.dtypes
            res = {}
            for col in v_self.columns:
                # Dtypes missing from the table (e.g. bool) are left alone
                # instead of raising KeyError.
                options = mapping.get(str(dtypes[col]), [None])
                chosen = random.choice(options)
                if chosen is not None:
                    res[col] = chosen
            yield res

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _dtype = Chain(RExt(DType(dict), arg_dtype(_self)),
                       arg_astype_partial(_self))
Example #9
0
    def fillna():
        # Spec for DataFrame.fillna: `_limit` and `_value` each default to
        # None, then a random candidate, then enumerated candidates.
        def arg_value():
            # At maximum depth, offer every non-null cell of the target output.
            if _spec.depth == _spec.max_depth:
                output: pd.DataFrame = _spec.output
                all_values = {
                    cell for col in output for cell in output[col]
                    if (not pd.isnull(cell))
                }
                for candidate in Select(all_values):
                    yield AnnotatedVal(candidate, cost=2)

        def arg_limit(v_self: pd.DataFrame):
            # Enumerate 1 .. max(rows, columns).
            longest_axis = max(v_self.shape)
            for candidate in Select(range(1, (longest_axis + 1))):
                yield AnnotatedVal(candidate, cost=5)

        def rarg_limit(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            # Pick one axis at random and draw a limit below its length.
            extent = nr if coin_flip() == 0 else nc
            yield random.choice(range(1, max(extent, 2)))

        def rarg_value():
            low, high = (-1000), 1000
            yield random.uniform(low, high)

        _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
        _limit = Chain(Default(None), RExt(DType(int), rarg_limit(_self)),
                       arg_limit(_self))
        _value = Chain(Default(None), RExt(FType(np.isscalar), rarg_value()),
                       Ext(DType([dict, pd.Series, pd.DataFrame])),
                       arg_value())
Example #10
0
    def add():
        # Spec for DataFrame.add: `_other` is a numeric random frame sharing
        # the index of `_self`, with the same or a neighbouring column count
        # and fully or partially overlapping column labels.
        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            # Clamp to at least one column (as align does) so a one-column
            # input never requests a zero-column frame.
            v_nc = random.choice([nc, max((nc - 1), 1), nc + 1])
            val = next(
                defaultRandDf(num_rows=nr,
                              num_columns=v_nc,
                              column_levels=v_self.columns.nlevels,
                              col_prefix='i1_',
                              value_bags=[*ints_bags, *floats_bags]))
            val.index = v_self.index
            if (coin_flip() == 0) and (len(val.columns) == nc):
                val.columns = v_self.columns
            else:
                # random.sample() rejects sets on Python 3.11+; de-duplicate
                # into a list while keeping a stable order.
                labels = list(
                    dict.fromkeys(list(v_self.columns) + list(val.columns)))
                picked = random.sample(labels, len(val.columns))
                if v_self.columns.nlevels == 1:
                    val.columns = pd.Index(picked)
                else:
                    val.columns = pd.MultiIndex.from_tuples(picked)
            yield val

        def arg_fill_value():
            yield random.uniform((-100), 100)

        _self = RExt(DType(pd.DataFrame),
                     defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
        _fill_value = Chain(Default(None), RExt(DType(float),
                                                arg_fill_value()))
Example #11
0
    def clip_lower():
        # Spec for DataFrame.clip_lower: `_threshold` is a random float between
        # the smallest and largest non-string cell of `_self`.

        def arg_threshold(v_self: pd.DataFrame):
            vals = [x for x in v_self.values.flatten() if not isinstance(x, str)]
            if not vals:
                # min()/max() raise ValueError on an empty sequence; yield
                # nothing instead, matching the guard used by clip.
                return
            yield random.uniform(min(vals), max(vals))

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _threshold = RExt(DType(float), arg_threshold(_self))
Example #12
0
    def isin():
        # Spec for DataFrame.isin: `_values` is a random, non-empty sample of
        # the cells of `_self`.
        def arg_values(v_self: pd.DataFrame):
            cells = list(v_self.values.flatten())
            # Sample at most len - 1 cells, but always at least one.
            upper = max((len(cells) - 1), 1)
            how_many = random.randint(1, upper)
            yield random.sample(cells, how_many)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _values = RExt(DType(dict), arg_values(_self))
Example #13
0
    def apply():
        # Spec for DataFrame.apply: `_func` is a small lambda over one randomly
        # chosen numeric column of `_self`.
        def arg_func(v_self: pd.DataFrame):
            numeric_cols = list(v_self.select_dtypes(include=np.number).columns)
            if not numeric_cols:
                # Nothing numeric to operate on: offer no function.
                return
            choice = random.choice(numeric_cols)
            templates = ('lambda x: x["{}"] > 1', 'lambda x: x["{}"] + 1')
            for template in templates:
                yield Lambda(template.format(choice))

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _func = RExt(DType(Callable), arg_func(_self))
def RExt(dtype: DType,
         rgen=None,
         spec: SearchSpec = None,
         depth: int = 1,
         mode: str = None,
         tracker: OpTracker = None,
         arg_name: str = None,
         identifier: str = None,
         constraint: Callable[[Any], Any] = None,
         **kwargs):
    """Yield candidate values for an argument, in random order.

    Candidates are ``Fetcher`` references to every entry of ``spec.inputs``
    and ``spec.intermediates[:depth - 1]`` that satisfies both
    ``dtype.hasinstance`` and ``constraint``. If ``rgen`` is given, one
    ``NewInp`` wrapping ``next(rgen)`` is offered as well.

    Before each candidate is yielded, the choice is written to
    ``tracker.record`` under ``'ext_<arg_name>_<identifier>'`` (fetched
    value) or ``'rext_<arg_name>_<identifier>'`` (freshly generated value);
    any previous entry under either key is removed first.

    Args:
        dtype: Type filter; only its ``hasinstance`` method is used.
        rgen: Optional iterator producing a fresh value via ``next``.
        spec: Object exposing ``inputs`` and ``intermediates`` sequences.
        depth: Only the first ``depth - 1`` intermediates are considered.
        mode: Must be ``'training-data'``.
        tracker: Object exposing a ``record`` dict.
        arg_name: Argument name used in the record keys.
        identifier: Suffix used in the record keys.
        constraint: Optional predicate; defaults to accepting everything.
        **kwargs: Accepted and ignored.

    Raises:
        AutoPandasException: If ``mode`` is not ``'training-data'``. Because
            this is a generator function, the error surfaces on the first
            ``next()`` call rather than at call time.
    """
    if constraint is None:

        def constraint(x):
            return True

    if mode != 'training-data':
        raise AutoPandasException("Unrecognized mode {} in RExt".format(mode))

    pool: List[Optional[Value]] = []
    sources = (('inps', spec.inputs),
               ('intermediates', spec.intermediates[:depth - 1]))
    for source, candidates in sources:
        for idx, val in enumerate(candidates):
            if dtype.hasinstance(val) and constraint(val):
                pool.append(Fetcher(val=val, source=source, idx=idx))

    if rgen is not None:
        # None marks the "generate a brand new input" option.
        pool.append(None)

    random.shuffle(pool)
    label = 'ext_' + arg_name + '_' + identifier
    rlabel = 'rext_' + arg_name + '_' + identifier
    exhausted = object()
    for selection in pool:
        tracker.record.pop(label, None)
        tracker.record.pop(rlabel, None)
        if selection is None:
            #  We've decided to create a new input altogether
            # A bare next(rgen) on an exhausted iterator would raise
            # StopIteration inside this generator, which PEP 479 turns into
            # RuntimeError. Skip the option when rgen has nothing to give.
            val = next(rgen, exhausted)
            if val is exhausted:
                continue
            tracker.record[rlabel] = {'val': val, 'arg_name': arg_name}
            yield NewInp(val)

        else:
            selection: Fetcher
            tracker.record[label] = {
                'source': selection.source,
                'idx': selection.idx
            }
            yield selection
Example #15
0
    def head():
        # Spec for DataFrame.head: `_n` defaults to 5, then a random row
        # count, then enumerated row counts.
        def arg_head_partial(v_self: pd.DataFrame):
            # At maximum depth, try the row count of the target output first.
            if _spec.depth == _spec.max_depth:
                output: pd.DataFrame = _spec.output
                target_rows = output.shape[0]
                yield AnnotatedVal(target_rows, cost=0)

            row_counts = list(range(1, v_self.shape[0] + 1))
            yield from Select(row_counts)

        def arg_n(v_self: pd.DataFrame):
            # 5 plus every count below the frame length, de-duplicated.
            candidates = list(set([5, *range(1, len(v_self))]))
            yield random.choice(candidates)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _n = Chain(Default(5), RExt(DType(int), arg_n(_self)), arg_head_partial(_self))
Example #16
0
    def reindex_like():
        # Spec for DataFrame.reindex_like: `_other` is a numeric random frame
        # whose index labels are sampled from the union of its own labels and
        # those of `_self`.

        def arg_other(v_self: pd.DataFrame):
            val = next(defaultRandDf(index_levels=v_self.index.nlevels, col_prefix=random.choice(['', 'i1_']),
                                     value_bags=[*ints_bags, *floats_bags]))
            # random.sample() rejects sets on Python 3.11+; de-duplicate into
            # a list while keeping a stable order.
            labels = list(dict.fromkeys(list(v_self.index) + list(val.index)))
            picked = random.sample(labels, len(val.index))
            if v_self.index.nlevels == 1:
                val.index = pd.Index(picked)
            else:
                val.index = pd.MultiIndex.from_tuples(picked)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Example #17
0
    def ne():
        # Spec for DataFrame.ne: `_other` is a copy of `_self` in which a
        # random subset of cells has been replaced by random values.

        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            shape_kwargs = dict(num_rows=nr, num_columns=nc)

            # Boolean frame deciding which cells keep their original value.
            cond: pd.DataFrame = next(defaultRandDf(value_bags=bool_bags, **shape_kwargs))
            cond.columns, cond.index = v_self.columns, v_self.index

            # Replacement values for the cells where cond is False.
            val: pd.DataFrame = next(defaultRandDf(**shape_kwargs))
            val.columns, val.index = v_self.columns, v_self.index

            yield v_self.where(cond, val)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Example #18
0
    def take():
        # Spec for DataFrame.take: `_indices` is a random, non-empty list of
        # distinct positions along a randomly chosen axis of `_self`.

        def arg_indices(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            # Row positions on heads, column positions on tails.
            axis_len = nr if coin_flip() == 0 else nc
            positions = range(axis_len)
            how_many = random.choice(range(1, (axis_len + 1)))
            val = random.sample(positions, how_many)
            random.shuffle(val)
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _indices = RExt(DType(Sequence), arg_indices(_self))
Example #19
0
    def combine():
        # Spec for DataFrame.combine: `_func` is one of two element-wise
        # min/max style lambdas and `_other` is a numeric random frame with
        # the same labels as `_self`.

        def arg_func():
            sources = ('lambda s1, s2: s1.mask(s1 < s2, s2)',
                       'lambda s1, s2: s1.mask(s1 > s2, s2)')
            pool = [Lambda(src) for src in sources]
            yield random.choice(pool)

        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            numeric_bags = [*ints_bags, *floats_bags]
            val: pd.DataFrame = next(
                defaultRandDf(num_rows=nr, num_columns=nc, value_bags=numeric_bags))
            val.columns, val.index = v_self.columns, v_self.index
            yield val

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _func = RExt(DType(Callable), arg_func())
        _other = RExt(DType(pd.DataFrame), arg_other(_self))
Example #20
0
    def clip():
        # Spec for DataFrame.clip: `_lower` and `_upper` each default to None,
        # otherwise a random float within the numeric range of `_self`, with
        # `_upper` drawn starting from `_lower` when one was chosen.

        def arg_lower(v_self: pd.DataFrame):
            vals = [x for x in v_self.values.flatten() if (is_int(x) or is_float(x))]
            if not vals:
                return
            smallest = min(vals)
            largest = max(vals)
            yield random.uniform(smallest, largest)

        def arg_upper(v_self: pd.DataFrame, v_lower):
            vals = [x for x in v_self.values.flatten() if (is_int(x) or is_float(x))]
            if not vals:
                return
            # Without a lower bound, start from the smallest numeric cell.
            floor = min(vals) if v_lower is None else v_lower
            yield random.uniform(floor, max(vals))

        _self = RExt(DType(pd.DataFrame), defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
        _lower = Chain(Default(None), RExt(DType(float), arg_lower(_self)))
        _upper = Chain(Default(None), RExt(DType(float), arg_upper(_self, _lower)))
Example #21
0
    def reindex():
        # Spec for DataFrame.reindex: `_labels` mixes existing row or column
        # labels of `_self` with freshly generated ones; `_fill_value`
        # defaults to NaN, otherwise a random float.

        def arg_labels(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            # Work on the index on heads, on the columns on tails.
            if coin_flip() == 0:
                existing, count = list(v_self.index), nr
            else:
                existing, count = list(v_self.columns), nc
            fresh = list(StrColGen(all_distinct=True).generate((count // 2))[1].values())
            yield random.sample((existing + fresh), count)

        def arg_fill_value():
            yield random.uniform((- 100), 100)

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _labels = RExt(DType([list, dict]), arg_labels(_self))
        # np.nan instead of np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while np.nan exists in every NumPy version.
        _fill_value = Chain(Default(np.nan), RExt(DType(float), arg_fill_value()))
Example #22
0
    def mask():
        # Spec for DataFrame.mask: `_cond` is a boolean frame and `_other` a
        # random frame, both with the shape and labels of `_self`.

        def arg_cond(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            flags: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc, value_bags=bool_bags))
            flags.columns, flags.index = v_self.columns, v_self.index
            yield flags

        def arg_other(v_self: pd.DataFrame):
            (nr, nc) = v_self.shape
            replacement: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc))
            replacement.columns, replacement.index = v_self.columns, v_self.index
            yield replacement

        _self = RExt(DType(pd.DataFrame), defaultRandDf())
        _cond = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_cond(_self))
        _other = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_other(_self))
Example #23
0
 def dropna():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf(nan_prob=0.5).
     # NOTE(review): nan_prob presumably raises the share of missing cells so
     # dropna has something to drop -- confirm against defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
Example #24
0
 def set_index():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Example #25
0
 def select_dtypes():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Example #26
0
 def idxmax():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf(nan_prob=0.2).
     # NOTE(review): nan_prob presumably controls how many cells are missing
     # -- confirm against defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.2))
Example #27
0
 def filter():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Example #28
0
 def equals():
     # Declares two independent RExt arguments over pd.DataFrame: `_self` and
     # `_other`, each backed by its own defaultRandDf generator.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
     # The column prefix ('' or 'i1_') is picked once, when this statement
     # runs, and passed to defaultRandDf as col_prefix.
     _other = RExt(DType(pd.DataFrame),
                   defaultRandDf(col_prefix=random.choice(['', 'i1_'])))
Example #29
0
 def get_ftype_counts():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf() with its default settings.
     _self = RExt(DType(pd.DataFrame), defaultRandDf())
Example #30
0
 def duplicated():
     # Declares `_self` as an RExt over pd.DataFrame whose random source is
     # defaultRandDf(min_height=3).
     # NOTE(review): min_height presumably sets a minimum row count so that
     # duplicate rows are possible -- confirm against defaultRandDf.
     _self = RExt(DType(pd.DataFrame), defaultRandDf(min_height=3))