def query():
    """Generator spec for DataFrame.query: synthesizes a random boolean
    expression over the columns of a randomly generated input frame."""

    def arg_expr(v_self: pd.DataFrame):
        # Collect candidate atomic comparisons, dispatching on column dtype.
        pool = []
        dtypes = v_self.dtypes
        for col in v_self:
            dtype = dtypes[col]
            vals = list(v_self[col])
            if ('int' in str(dtype)) or ('float' in str(dtype)):
                pool.append('{} > {}'.format(col, random.choice(vals)))
                pool.append('{} < {}'.format(col, random.choice(vals)))
                pool.append('{} == {}'.format(col, random.choice(vals)))
                pool.append('{} != {}'.format(col, random.choice(vals)))
            elif 'object' in str(dtype):
                # BUG FIX: string values must be quoted ({!r}); unquoted they
                # are parsed by DataFrame.query as (undefined) identifiers.
                pool.append('{} == {!r}'.format(col, random.choice(vals)))
                pool.append('{} != {!r}'.format(col, random.choice(vals)))
        # Join 1-5 random atoms with random 'and'/'or' connectives.
        sample_size = random.randint(1, min(5, len(pool)))
        sample = random.sample(pool, sample_size)
        expr = sample[0]
        for i in range(1, len(sample)):
            expr += ' {} '.format(random.choice(['and', 'or']))
            expr += sample[i]
        yield expr

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _expr = RExt(DType(str), arg_expr(_self))
def merge():
    """Generator spec for DataFrame.merge: synthesizes a right-hand frame that
    shares some values (and possibly some column names) with the input so that
    a merge can produce matching rows."""

    def arg_right(v_self: pd.DataFrame):
        # Fresh random frame; 'i1_' prefix keeps its column names distinct.
        new_df: pd.DataFrame = next(defaultRandDf(col_prefix='i1_'))
        # Group column names by dtype for both frames.
        dg1 = collections.defaultdict(list)
        dg2 = collections.defaultdict(list)
        for (k, v) in dict(v_self.dtypes).items():
            dg1[v].append(k)
        for (k, v) in dict(new_df.dtypes).items():
            dg2[v].append(k)
        # Only dtypes present in BOTH frames can form merge-compatible pairs.
        c = (set(dg1.keys()) & set(dg2.keys()))
        for dt in c:
            cols1 = list(dg1[dt])
            cols2 = list(dg2[dt])
            random.shuffle(cols1)
            random.shuffle(cols2)
            pairs = list(zip(cols1, cols2))
            for pair in pairs:
                if coin_flip() == 0:
                    # Mix values from the input column into the new column so
                    # a merge on this pair has overlapping keys.
                    new_df[pair[1]] = random.sample(
                        (list(new_df[pair[1]]) + list(v_self[pair[0]])),
                        new_df.shape[0])
                if (coin_flip() == 0) and (pair[0] not in new_df.columns):
                    # Occasionally adopt the input's column name so the two
                    # frames share a named key column.
                    new_df = new_df.rename({
                        pair[1]: pair[0],
                    }, axis=1)
        yield new_df

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _right = RExt(DType(pd.DataFrame), arg_right(_self))
def corrwith():
    """Generator spec for DataFrame.corrwith: builds a numeric companion frame
    sharing the input's index and (partially) overlapping columns."""

    def arg_other(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        val = next(
            defaultRandDf(num_rows=nr, column_levels=v_self.columns.nlevels,
                          col_prefix='i1_',
                          value_bags=[*ints_bags, *floats_bags]))
        # Align rows exactly; columns overlap only partially (below).
        val.index = v_self.index
        if (coin_flip() == 0) and (len(val.columns) == nc):
            val.columns = v_self.columns
        elif v_self.columns.nlevels == 1:
            # BUG FIX: random.sample() no longer accepts a set (deprecated in
            # Python 3.9, TypeError since 3.11) - materialize a list first.
            val.columns = pd.Index(
                random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
        else:
            val.columns = pd.MultiIndex.from_tuples(
                random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
        yield val

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
def round():
    """Generator spec for DataFrame.round.

    NOTE: intentionally shadows the builtin round() - the spec's name must
    match the pandas API member it models."""

    def arg_decimals():
        # Random precision between 1 and 5 decimal places.
        yield random.choice([1, 2, 3, 4, 5])

    _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
    _decimals = Chain(Default(0), RExt(DType(int), arg_decimals()))
def align():
    """Generator spec for DataFrame.align: builds a companion frame whose
    shape, index and columns all partially overlap the input's."""

    def arg_other(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        # Jitter the shape by +/-1 around the input's (floored at 1).
        val = next(defaultRandDf(
            num_rows=random.choice([max((nr - 1), 1), nr, (nr + 1)]),
            num_columns=random.choice([max((nc - 1), 1), nc, (nc + 1)]),
            col_prefix='i1_',
            index_levels=v_self.index.nlevels,
            column_levels=v_self.columns.nlevels,
            value_bags=[*ints_bags, *floats_bags]))
        # BUG FIX (all four sample sites below): random.sample() no longer
        # accepts a set (deprecated in 3.9, TypeError since Python 3.11) -
        # materialize a list before sampling.
        if (coin_flip() == 0) and (len(val.index) == nr):
            val.index = v_self.index
        elif v_self.index.nlevels == 1:
            val.index = pd.Index(random.sample(
                list(set(list(v_self.index) + list(val.index))),
                len(val.index)))
        else:
            val.index = pd.MultiIndex.from_tuples(random.sample(
                list(set(list(v_self.index) + list(val.index))),
                len(val.index)))
        if (coin_flip() == 0) and (len(val.columns) == nc):
            val.columns = v_self.columns
        elif v_self.columns.nlevels == 1:
            val.columns = pd.Index(random.sample(
                list(set(list(v_self.columns) + list(val.columns))),
                len(val.columns)))
        else:
            val.columns = pd.MultiIndex.from_tuples(random.sample(
                list(set(list(v_self.columns) + list(val.columns))),
                len(val.columns)))
        yield val

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _other = RExt(DType([pd.DataFrame, pd.Series]), arg_other(_self))
def diff():
    """Generator spec for DataFrame.diff."""

    def arg_periods(v_self: pd.DataFrame):
        # Any shift in [-(nr-1), nr-1] keeps at least one comparable row.
        (nr, _) = v_self.shape
        yield random.choice(range((-(nr - 1)), nr))

    _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.1))
    _periods = Chain(Default(1), RExt(DType(int), arg_periods(_self)))
def combine_first():
    """Generator spec for DataFrame.combine_first: either a fully aligned
    same-shape frame, or one with partially overlapping row/column labels."""

    def arg_other(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        if coin_flip() == 0:
            # Same-shape frame with identical labels (full alignment).
            val = next(defaultRandDf(num_columns=nc, num_rows=nr,
                                     value_bags=(
                                         [*string_bags, *ints_bags] +
                                         [*floats_bags, moar_nans_floats_bag])))
            val.columns = v_self.columns
            val.index = v_self.index
        else:
            val = next(defaultRandDf(index_levels=v_self.index.nlevels,
                                     column_levels=v_self.columns.nlevels,
                                     col_prefix='i1_',
                                     value_bags=(
                                         [*string_bags, *ints_bags] +
                                         [*floats_bags, moar_nans_floats_bag])))
            # BUG FIX (all four sample sites below): random.sample() no longer
            # accepts a set (TypeError since Python 3.11) - wrap in list().
            if v_self.index.nlevels == 1:
                val.index = pd.Index(random.sample(
                    list(set(list(v_self.index) + list(val.index))),
                    len(val.index)))
            else:
                val.index = pd.MultiIndex.from_tuples(random.sample(
                    list(set(list(v_self.index) + list(val.index))),
                    len(val.index)))
            if v_self.columns.nlevels == 1:
                val.columns = pd.Index(random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
            else:
                val.columns = pd.MultiIndex.from_tuples(random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
        yield val

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*string_bags, *ints_bags] +
                               [*floats_bags, moar_nans_floats_bag]))
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
def astype():
    """Generator spec for DataFrame.astype: picks a per-column dtype mapping,
    or reads the mapping straight off the target output at max depth."""

    def arg_astype_partial(v_self):
        # At max search depth the desired dtypes are exactly the output's.
        if _spec.depth == _spec.max_depth:
            v_self: pd.DataFrame = v_self
            output: pd.DataFrame = _spec.output
            try:
                if set(output.columns).issubset(set(v_self.columns)):
                    yield dict(output.dtypes)
            except Exception:
                # BUG FIX: narrowed from a bare except, which also swallowed
                # KeyboardInterrupt/SystemExit. Best-effort: unhashable or
                # otherwise incompatible columns simply yield nothing.
                pass

    def arg_dtype(v_self: pd.DataFrame):
        # Each numeric column may be retargeted to a different numeric dtype
        # (or left alone via None); object columns are never converted.
        pool = ['int32', 'uint32', 'float64', 'float32', 'int64', 'uint64']
        mapping = {
            pool[i]: (([None] + pool[:i]) + pool[(i + 1):])
            for i in range(len(pool))
        }
        mapping['object'] = [None]
        res = {}
        for col in v_self.columns:
            chosen = random.choice(mapping[str(v_self.dtypes[col])])
            if chosen is not None:
                res[col] = chosen
        yield res

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _dtype = Chain(RExt(DType(dict), arg_dtype(_self)),
                   arg_astype_partial(_self))
def fillna():
    """Generator spec for DataFrame.fillna over a NaN-heavy random frame."""

    def arg_value():
        # At max depth, mine candidate replacement values from the target
        # output (any non-null cell could have been the fill value).
        if _spec.depth == _spec.max_depth:
            output: pd.DataFrame = _spec.output
            all_values = set([
                i for col in output for i in output[col] if (not pd.isnull(i))
            ])
            yield from map(lambda x: AnnotatedVal(x, cost=2),
                           Select(all_values))

    def arg_limit(v_self: pd.DataFrame):
        # Enumerative (non-random) candidates for the fill limit.
        yield from map(lambda x: AnnotatedVal(x, cost=5),
                       Select(range(1, (max(v_self.shape) + 1))))

    def rarg_limit(v_self: pd.DataFrame):
        # Random fill limit bounded by one axis length (min 1).
        (nr, nc) = v_self.shape
        if coin_flip() == 0:
            yield random.choice(range(1, max(nr, 2)))
        else:
            yield random.choice(range(1, max(nc, 2)))

    def rarg_value():
        # Random scalar fill value.
        yield random.uniform((-1000), 1000)

    _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
    _limit = Chain(Default(None), RExt(DType(int), rarg_limit(_self)),
                   arg_limit(_self))
    _value = Chain(Default(None), RExt(FType(np.isscalar), rarg_value()),
                   Ext(DType([dict, pd.Series, pd.DataFrame])), arg_value())
def add():
    """Generator spec for DataFrame.add: numeric companion frame with matching
    index and (partially) overlapping columns, plus an optional fill value."""

    def arg_other(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        # BUG FIX: clamp nc-1 at 1 (consistent with align()); otherwise a
        # single-column input could request a zero-column frame.
        v_nc = random.choice([nc, max(nc - 1, 1), nc + 1])
        val = next(
            defaultRandDf(num_rows=nr, num_columns=v_nc,
                          column_levels=v_self.columns.nlevels,
                          col_prefix='i1_',
                          value_bags=[*ints_bags, *floats_bags]))
        val.index = v_self.index
        if (coin_flip() == 0) and (len(val.columns) == nc):
            val.columns = v_self.columns
        elif v_self.columns.nlevels == 1:
            # BUG FIX: random.sample() on a set raises TypeError on
            # Python 3.11+ - materialize a list first.
            val.columns = pd.Index(
                random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
        else:
            val.columns = pd.MultiIndex.from_tuples(
                random.sample(
                    list(set(list(v_self.columns) + list(val.columns))),
                    len(val.columns)))
        yield val

    def arg_fill_value():
        yield random.uniform((-100), 100)

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
    _fill_value = Chain(Default(None), RExt(DType(float), arg_fill_value()))
def clip_lower():
    """Generator spec for DataFrame.clip_lower: threshold drawn uniformly
    between the frame's minimum and maximum numeric values."""

    def arg_threshold(v_self: pd.DataFrame):
        # Drop string cells before taking min/max of the flattened values.
        vals = list(filter((lambda x: (not isinstance(x, str))),
                           list(v_self.values.flatten())))
        yield random.uniform(min(vals), max(vals))

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
    _threshold = RExt(DType(float), arg_threshold(_self))
def isin():
    """Generator spec for DataFrame.isin: membership list sampled from the
    frame's own flattened values."""

    def arg_values(v_self: pd.DataFrame):
        vals = list(v_self.values.flatten())
        # Strict subset (len-1 max) so isin is not trivially all-True.
        sample_size = random.randint(1, max((len(vals) - 1), 1))
        yield list(random.sample(vals, sample_size))

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    # NOTE(review): the random generator yields a list, while the declared
    # fetch dtype is dict - presumably so existing dict-valued inputs can also
    # be reused; confirm against RExt's dtype semantics.
    _values = RExt(DType(dict), arg_values(_self))
def apply():
    """Generator spec for DataFrame.apply: row-wise lambdas referencing one
    randomly chosen numeric column."""

    def arg_func(v_self: pd.DataFrame):
        numeric_cols = v_self.select_dtypes(include=np.number).columns
        # No numeric columns: nothing sensible to apply, yield nothing.
        if len(numeric_cols) == 0:
            return
        choice = random.choice(list(numeric_cols))
        # One predicate and one arithmetic transform over the chosen column.
        yield Lambda('lambda x: x["{}"] > 1'.format(choice))
        yield Lambda('lambda x: x["{}"] + 1'.format(choice))

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _func = RExt(DType(Callable), arg_func(_self))
def RExt(dtype: DType, rgen=None, spec: SearchSpec = None, depth: int = 1,
         mode: str = None, tracker: OpTracker = None, arg_name: str = None,
         identifier: str = None, constraint: Callable[[Any], Any] = None,
         **kwargs):
    """Yield candidate argument values: existing inputs/intermediates from
    *spec* that match *dtype* and *constraint*, plus (if *rgen* is given) one
    freshly generated value, in random order.

    Each yielded candidate is recorded in *tracker* under an ext/rext label
    so the choice can be replayed; the records for both labels are cleared
    before every yield. Only 'training-data' mode is supported.
    """
    if constraint is None:
        # No filter supplied: accept every dtype-matching value.
        def constraint(x):
            return True
    if mode != 'training-data':
        raise AutoPandasException("Unrecognized mode {} in RExt".format(mode))

    # Gather every existing value that satisfies both dtype and constraint.
    candidates: List[Optional[Value]] = []
    for i, inp in enumerate(spec.inputs):
        if dtype.hasinstance(inp) and constraint(inp):
            candidates.append(Fetcher(val=inp, source='inps', idx=i))
    # Only intermediates produced before this depth are visible.
    for i, inter in enumerate(spec.intermediates[:depth - 1]):
        if dtype.hasinstance(inter) and constraint(inter):
            candidates.append(Fetcher(val=inter, source='intermediates', idx=i))
    if rgen is not None:
        # None is a sentinel meaning "invent a brand-new input via rgen".
        candidates.append(None)
    random.shuffle(candidates)

    label = 'ext_' + arg_name + '_' + identifier
    rlabel = 'rext_' + arg_name + '_' + identifier
    for picked in candidates:
        # Reset both records so only the current choice is tracked.
        tracker.record.pop(label, None)
        tracker.record.pop(rlabel, None)
        if picked is None:
            # Generate a fresh input and record it under the rext label.
            fresh = next(rgen)
            tracker.record[rlabel] = {'val': fresh, 'arg_name': arg_name}
            yield NewInp(fresh)
        else:
            picked: Fetcher
            tracker.record[label] = {
                'source': picked.source,
                'idx': picked.idx
            }
            yield picked
def head():
    """Generator spec for DataFrame.head."""

    def arg_head_partial(v_self: pd.DataFrame):
        # At max depth the output's row count is the ideal n (cost 0),
        # then fall back to enumerating every feasible row count.
        if _spec.depth == _spec.max_depth:
            output: pd.DataFrame = _spec.output
            yield AnnotatedVal(output.shape[0], cost=0)
        yield from Select(list(range(1, v_self.shape[0] + 1)))

    def arg_n(v_self: pd.DataFrame):
        # Random n: the pandas default 5 plus each row count below len.
        pool = list(set(([5] + list(range(1, len(v_self))))))
        yield random.choice(pool)

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _n = Chain(Default(5), RExt(DType(int), arg_n(_self)),
               arg_head_partial(_self))
def reindex_like():
    """Generator spec for DataFrame.reindex_like: companion frame whose index
    partially overlaps the input's index."""

    def arg_other(v_self: pd.DataFrame):
        val = next(defaultRandDf(index_levels=v_self.index.nlevels,
                                 col_prefix=random.choice(['', 'i1_']),
                                 value_bags=[*ints_bags, *floats_bags]))
        # BUG FIX (both sample sites): random.sample() no longer accepts a set
        # (deprecated in 3.9, TypeError since Python 3.11) - wrap in list().
        if v_self.index.nlevels == 1:
            val.index = pd.Index(random.sample(
                list(set(list(v_self.index) + list(val.index))),
                len(val.index)))
        else:
            val.index = pd.MultiIndex.from_tuples(random.sample(
                list(set(list(v_self.index) + list(val.index))),
                len(val.index)))
        yield val

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
def ne():
    """Generator spec for DataFrame.ne: the comparand is the input frame with
    a random subset of cells replaced, so some cells compare unequal and some
    compare equal."""

    def arg_other(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        # Random boolean mask aligned with the input.
        cond: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc,
                                                value_bags=bool_bags))
        cond.columns = v_self.columns
        cond.index = v_self.index
        # Replacement values used where the mask is False.
        val: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc))
        val.columns = v_self.columns
        val.index = v_self.index
        yield v_self.where(cond, val)

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
def take():
    """Generator spec for DataFrame.take: random subset of positions along
    either axis."""

    def arg_indices(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        if coin_flip() == 0:
            # Row positions. (The extra shuffle is redundant - sample()
            # already returns a random order - but harmless.)
            val = random.sample(range(nr), random.choice(range(1, (nr + 1))))
            random.shuffle(val)
            yield val
        else:
            # Column positions.
            val = random.sample(range(nc), random.choice(range(1, (nc + 1))))
            random.shuffle(val)
            yield val

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _indices = RExt(DType(Sequence), arg_indices(_self))
def combine():
    """Generator spec for DataFrame.combine: an element-wise min/max-style
    combiner plus a fully aligned numeric companion frame."""

    def arg_func():
        # Series-wise element maximum / minimum, expressed via mask.
        pool = [Lambda('lambda s1, s2: s1.mask(s1 < s2, s2)'),
                Lambda('lambda s1, s2: s1.mask(s1 > s2, s2)')]
        yield random.choice(pool)

    def arg_other(v_self: pd.DataFrame):
        # Same shape and labels as the input so combine aligns fully.
        (nr, nc) = v_self.shape
        val: pd.DataFrame = next(
            defaultRandDf(num_rows=nr, num_columns=nc,
                          value_bags=[*ints_bags, *floats_bags]))
        val.columns = v_self.columns
        val.index = v_self.index
        yield val

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
    _func = RExt(DType(Callable), arg_func())
    _other = RExt(DType(pd.DataFrame), arg_other(_self))
def clip():
    """Generator spec for DataFrame.clip: lower/upper bounds drawn from the
    frame's numeric value range; upper is never drawn below lower."""

    def arg_lower(v_self: pd.DataFrame):
        # Numeric cells only; yield nothing if there are none.
        vals = list(filter((lambda x: (is_int(x) or is_float(x))),
                           list(v_self.values.flatten())))
        if len(vals) == 0:
            return
        yield random.uniform(min(vals), max(vals))

    def arg_upper(v_self: pd.DataFrame, v_lower):
        vals = list(filter((lambda x: (is_int(x) or is_float(x))),
                           list(v_self.values.flatten())))
        if len(vals) == 0:
            return
        # If no lower bound was chosen, anchor the upper draw at the min.
        if v_lower is None:
            v_lower = min(vals)
        yield random.uniform(v_lower, max(vals))

    _self = RExt(DType(pd.DataFrame),
                 defaultRandDf(value_bags=[*ints_bags, *floats_bags]))
    _lower = Chain(Default(None), RExt(DType(float), arg_lower(_self)))
    _upper = Chain(Default(None), RExt(DType(float), arg_upper(_self, _lower)))
def reindex():
    """Generator spec for DataFrame.reindex: new labels mix existing ones with
    freshly generated strings, on either axis, plus an optional fill value."""

    def arg_labels(v_self: pd.DataFrame):
        (nr, nc) = v_self.shape
        if coin_flip() == 0:
            # Row labels: keep some existing, mix in up to nr//2 new strings.
            vals = list(v_self.index)
            new_vals = list(StrColGen(all_distinct=True)
                            .generate((nr // 2))[1].values())
            yield list(random.sample((vals + new_vals), nr))
        else:
            # Column labels, same scheme.
            vals = list(v_self.columns)
            new_vals = list(StrColGen(all_distinct=True)
                            .generate((nc // 2))[1].values())
            yield list(random.sample((vals + new_vals), nc))

    def arg_fill_value():
        yield random.uniform((- 100), 100)

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _labels = RExt(DType([list, dict]), arg_labels(_self))
    _fill_value = Chain(Default(np.NaN), RExt(DType(float), arg_fill_value()))
def mask():
    """Generator spec for DataFrame.mask: a fully aligned boolean condition
    frame and a fully aligned replacement frame."""

    def arg_cond(v_self: pd.DataFrame):
        # Boolean frame with the input's exact shape and labels.
        (nr, nc) = v_self.shape
        val: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc,
                                               value_bags=bool_bags))
        val.columns = v_self.columns
        val.index = v_self.index
        yield val

    def arg_other(v_self: pd.DataFrame):
        # Replacement frame, also fully aligned with the input.
        (nr, nc) = v_self.shape
        val: pd.DataFrame = next(defaultRandDf(num_rows=nr, num_columns=nc))
        val.columns = v_self.columns
        val.index = v_self.index
        yield val

    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _cond = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_cond(_self))
    _other = RExt(DType([Sequence, pd.DataFrame, Callable]), arg_other(_self))
def dropna():
    """Generator spec for DataFrame.dropna: input frame is NaN-heavy (0.5
    probability per cell) so there is always something to drop."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.5))
def set_index():
    """Generator spec for DataFrame.set_index: plain random input frame."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf())
def select_dtypes():
    """Generator spec for DataFrame.select_dtypes: plain random input frame."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf())
def idxmax():
    """Generator spec for DataFrame.idxmax: some NaNs (0.2 probability) so
    NaN-skipping behavior is exercised."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf(nan_prob=0.2))
def filter():
    """Generator spec for DataFrame.filter (note: intentionally shadows the
    builtin filter - the name must match the pandas API member)."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf())
def equals():
    """Generator spec for DataFrame.equals: second frame is independent, with
    column names that may or may not collide with the first's."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf())
    _other = RExt(DType(pd.DataFrame),
                  defaultRandDf(col_prefix=random.choice(['', 'i1_'])))
def get_ftype_counts():
    """Generator spec for DataFrame.get_ftype_counts: plain random frame."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf())
def duplicated():
    """Generator spec for DataFrame.duplicated: at least 3 rows so duplicates
    are possible."""
    _self = RExt(DType(pd.DataFrame), defaultRandDf(min_height=3))