def get_text(data, limit, vert, col_width):
    """Returns the text to be printed by text_table."""
    return tz.pipe(
        head(data, limit),
        lambda table: transpose(table) if vert else table,
        tz.partial(wrap_table, col_width=col_width),
        tz.partial(text_table, print_out=False),
    )
def make_border(
        points: np.ndarray, edges: Complex
) -> Tuple[np.ndarray, Complex, Complex, PSet[Cycle], PSet[Cycle]]:
    def first_index(array: np.ndarray, value: np.ndarray) -> int:
        return next(i for i, _ in enumerate(array)
                    if np.linalg.norm(value - _) < EPSILON)

    first_index_points = partial(first_index, points)
    corners = v(v(-0.5, 0.5), v(-0.5, -0.5), v(0.5, -0.5), v(0.5, 0.5))
    ul, dl, dr, ur = pipe(corners, map(np.array), map(first_index_points))

    max_ind = len(points)
    cul = max_ind
    cdl = max_ind + 1
    cdr = max_ind + 2
    cur = max_ind + 3

    left_c = v(ul, cul, cdl, dl)
    right_c = v(dr, cdr, cur, ur)
    down_c = v(dl, cdl, cdr, dr)
    up_c = v(ur, cur, cul, ul)

    red_base_cs = s(left_c, right_c)
    blue_base_cs = s(up_c, down_c)

    def border_edges(pts: np.ndarray, es: Complex, coord: int,
                     side: float) -> Complex:
        # pts[vert][coord] - side is a scalar, so abs() suffices here
        return pset(edge for edge in es
                    if all(abs(pts[vert][coord] - side) < EPSILON
                           for vert in edge))

    border_edges_from_square_side = partial(border_edges, points, edges)
    left_faces = faces_from_edges(
        border_edges_from_square_side(0, -0.5) | outer_edges_from_cycle(left_c))
    right_faces = faces_from_edges(
        border_edges_from_square_side(0, 0.5) | outer_edges_from_cycle(right_c))
    down_faces = faces_from_edges(
        border_edges_from_square_side(1, -0.5) | outer_edges_from_cycle(down_c))
    up_faces = faces_from_edges(
        border_edges_from_square_side(1, 0.5) | outer_edges_from_cycle(up_c))

    red_base = closure(left_faces | right_faces)
    blue_base = closure(down_faces | up_faces)

    border_points = np.array(corners) * BORDER_SCALE
    aug_points = np.concatenate((points, border_points))
    return aug_points, blue_base, red_base, blue_base_cs, red_base_cs
def out_of_time_and_space_splitter(train_data: pd.DataFrame,
                                   n_splits: int,
                                   in_time_limit: DateType,
                                   time_column: str,
                                   space_column: str,
                                   holdout_gap: timedelta = timedelta(days=0)) -> SplitterReturnType:
    """
    Makes K grouped train/test split folds for cross validation.
    The folds are made so that every ID is used at least once for
    evaluating and K-1 times for training.
    Also, for each fold, evaluation will always be out-of-ID and out-of-time.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame that will be split into K out-of-time and ID
        folds for cross validation.

    n_splits : int
        The number of folds K for the K-Fold cross validation strategy.

    in_time_limit : str or datetime.datetime
        A string representing the end time of the training data. It should be
        in the same format as the Date column in `train_data`.

    time_column : str
        The name of the Date column of `train_data`.

    space_column : str
        The name of the ID column of `train_data`.

    holdout_gap : datetime.timedelta
        Timedelta of the gap between the end of the training period and the
        start of the validation period.
    """

    # first generate folds by space, using GroupKFold
    # GroupKFold is not randomized, which is why there is no random_state here
    train_data = train_data.reset_index()
    space_folds = GroupKFold(n_splits).split(train_data, groups=train_data[space_column])

    if isinstance(in_time_limit, str):
        in_time_limit = datetime.strptime(in_time_limit, "%Y-%m-%d")

    # train_indexes have time_column <= in_time_limit
    # test_indexes have time_column > in_time_limit
    folds = pipe(space_folds,
                 partial(starmap, lambda f_train, f_test: [train_data.iloc[f_train][time_column],
                                                           train_data.iloc[f_test][time_column]]),
                 partial(starmap, lambda train, test: (train[train <= in_time_limit],  # filter train time
                                                       test[test > (in_time_limit + holdout_gap)])),  # filter test time
                 list)

    logs = list(map(_log_time_fold, folds))  # get fold logs
    folds_indexes = _lc_fold_to_indexes(folds)  # final formatting with idx
    return folds_indexes, logs
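# Hedged usage sketch for the splitter above (the toy frame and column
# names are illustrative, not from the original module):
#
#   df = pd.DataFrame({
#       "customer_id": list("abc") * 4,
#       "date": pd.to_datetime(["2020-01-01"] * 6 + ["2020-07-01"] * 6),
#   })
#   folds_indexes, logs = out_of_time_and_space_splitter(
#       df, n_splits=3, in_time_limit="2020-06-30",
#       time_column="date", space_column="customer_id")
#
# Each fold trains on customer IDs outside the held-out group and on dates
# up to 2020-06-30, and evaluates on the held-out IDs strictly after that
# date (plus `holdout_gap`, if given).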
def explode(b, components=10):
    return pipe(
        b,
        partial(find_peaks, distance=1000),
        get(0),
        sliding_window(2),
        map(lambda x: b[x[0]:x[1]]),
        map(lambda x: x / np.quantile(np.abs(x), 0.9)),
        # np.int was removed in NumPy 1.24; plain int behaves the same here
        map(lambda x: x[np.round(np.linspace(0, x.shape[0] - 1,
                                             num=1000)).astype(int)]),
        map(partial(fft, n=components)),
        map(lambda x: np.hstack(np.real(x)).reshape(-1)),
        list,
    )
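# Hedged usage sketch, assuming scipy's `find_peaks`/`fft` and the curried
# toolz helpers are in scope as in `explode` above. A slow sine keeps the
# detected peaks well over `distance=1000` samples apart:
_sig = np.sin(np.linspace(0, 20 * np.pi, 50_000))
_features = explode(_sig, components=10)  # one length-10 real vector
                                          # per inter-peak segment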
def output_feature_importance(result: ModelCVResult):
    try:
        return {
            'label': "Feature importance",
            'type': 'featureImportance',
            'key': 'correlation_table',
            'data': {
                "features": list(
                    pipeline(
                        enumerate(result['feature_importance']),
                        [
                            map(lambda item: {
                                "name": result['feature_importance'].index[item[0]],
                                "name_detailed": format_feature_detailed(
                                    result['feature_importance'].index[item[0]]),
                                "importance": item[1],
                            }),
                            list,
                            partial(sorted, key=itemgetter("importance"),
                                    reverse=True),
                        ],
                    )),
            },
            'width': 700,
        }
    except (ValueError, TypeError):
        pass
def __init__(self) -> None:
    if SEED is not None:
        np.random.seed(SEED)

    def reflect(u: np.ndarray, w: np.ndarray, a: float) -> np.ndarray:
        return u - 2 * np.broadcast_to(w, u.shape) * np.reshape(
            np.dot(u, w) - a, (len(u), 1))

    control_points = np.random.rand(BOARD_SIZE, 2) - 0.5
    reflect_control_points = partial(reflect, control_points)
    down_reflect = reflect_control_points(np.array([0, 1]), -0.5)
    up_reflect = reflect_control_points(np.array([0, 1]), 0.5)
    left_reflect = reflect_control_points(np.array([1, 0]), -0.5)
    right_reflect = reflect_control_points(np.array([1, 0]), 0.5)
    extended_points = np.concatenate(
        (control_points, up_reflect, down_reflect, left_reflect, right_reflect))

    voronoi = sp.spatial.Voronoi(extended_points)
    self.cycles = freeze(
        np.array(voronoi.regions)[voronoi.point_region[:voronoi.npoints // 5]])
    edges = edges_from_cycles(self.cycles)
    verts = verts_from_edges(edges)
    self.points, self.blue_base, self.red_base, self.blue_base_cs, self.red_base_cs = \
        self.make_border(voronoi.vertices, edges)
    self.xs = verts | edges | self.blue_base | self.red_base
def composer(self, tokens):
    return compose(*pipe(
        tokens,
        reversed,
        filter(first),
        map(lambda arg: partial(arg[0], *arg[1], **arg[2])
            if any(arg[1:]) else arg[0]),
        list,
    ))
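# Hedged note on the token shape `composer` appears to expect: an iterable
# of (callable, args, kwargs) triples. Falsy callables are dropped by
# `filter(first)`, and `reversed` makes `compose` apply the tokens in
# their original order. Illustrative call (not from the source):
#
#   g = composer(self, [(abs, (), {}), (operator.add, (10,), {})])
#   g(-3)  # -> 13: abs runs first, then the partially applied add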
def get_explotion(f, components=25):
    try:
        return pipe(f, wavfile.read, get(1),
                    partial(explode, components=components))
    except Exception:  # bare except would also swallow KeyboardInterrupt
        return [np.zeros(int(round(components))) * np.nan]
def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    Returns
    -------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn)

    zipped_logs = pipe(folds,
                       enumerate,
                       map(fold_iter),
                       partial(zip, logs))

    def _join_split_log(log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"), "split_log", split_log)

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)
    return assoc(first_train_log, "validator_log", list(validator_logs))
def get_latent_space(model, X_train, X_test, y_train, y_test, tag, labels,
                     feature_names):
    latent = pipe(
        X_test,
        model.transform,
        partial(pd.DataFrame, columns=["Component 1", "Component 2"]),
    )
    return (latent
            .assign(label=y_test)
            .assign(label=lambda d: d.label.replace(labels))
            .assign(tag=tag))
def get_components(model, X, y, tag):
    latent = pipe(
        X,
        model.fit_transform,
        StandardScaler().fit_transform,
        PCA(whiten=True).fit_transform,
        partial(pd.DataFrame, columns=["Component 1", "Component 2"]),
    )
    return latent.assign(y=y).assign(tag=tag)
def _prepare_kwargs(self, **kwargs):
    """Filter the keywords against the function's arguments.

    Call any value that is callable; no arguments are applied to these
    functions.
    """
    return valmap(
        self._call_lazy_function,
        merge(
            keyfilter(partial(operator.contains, self.arguments),
                      self.keywords),
            kwargs,
        ))
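# Hedged behavior sketch: keys of `self.keywords` absent from
# `self.arguments` are filtered out, explicitly passed kwargs win the
# merge, and (assuming `_call_lazy_function` invokes callables and passes
# other values through) a value like {"seed": lambda: 42} comes back as
# {"seed": 42}.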
def text_table(data, print_out=True, **tabulate_args):
    """A partial version of the tabulate.tabulate function."""
    defaults = {
        "tablefmt": "fancy_grid",
        "floatfmt": ",.2f",
        "headers": "keys",
    }
    tabulate_args = {**defaults, **tabulate_args}
    tab = tz.partial(tabulate, **tabulate_args)
    if print_out:
        print(tab(data))
    else:
        return tab(data)
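# Minimal usage sketch, assuming `tabulate` and `toolz as tz` are imported
# as the snippet above implies:
_rows = [{"name": "a", "score": 1.5}, {"name": "b", "score": 2.25}]
text_table(_rows)                        # prints a fancy_grid table
_s = text_table(_rows, print_out=False)  # returns the rendered string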
def get_waveform(f, components=20):
    try:
        return pipe(
            f,
            read_wav,
            get(1),
            lambda x: x[::250],
            lambda x: x / np.quantile(np.abs(x), 0.9),
            partial(fft, n=components),
            np.real,
            pd.Series,
        )
    except Exception:  # bare except would also swallow KeyboardInterrupt
        return pd.Series(np.zeros(components) * np.nan)
def get_id_resources(resource_name: str, *,
                     form_key: str = None,
                     id_key: str = 'id',
                     meta_f=empty_dict,
                     unpack_f=do_nothing,
                     single_unpack_f=do_nothing,
                     help=None,
                     memo=False,
                     **iter_kw):
    def getter(parent_endpoint: IdResourceEndpoint, *, do_memo=True):
        if (memo and do_memo) and cache_has_key(parent_endpoint, resource_name):
            return cache_get(parent_endpoint, resource_name)
        return pipe(
            parent_endpoint(resource_name).iter('get', **merge(
                {
                    'iter_f': compose(
                        from_multiple_response(
                            form_key=form_key,
                            id_key=id_key,
                            meta_f=meta_f,
                            unpack_f=unpack_f,
                            single_unpack_f=single_unpack_f,
                        ),
                    ),
                },
                iter_kw)),
            tuple,
            memoize_resources(parent_endpoint, resource_name),
        )

    getter.__doc__ = help or ''
    getter.reset_cache = partial(
        reset_cache_for_endpoint_by_resource_name,
        resource_name=resource_name,
    )
    getter.reset_cache.__doc__ = f'''
    Reset the "{resource_name}" cache for a given Endpoint.
    '''
    return getter
def get_bag(build_dir: Path, base_dtype: str = "xml") -> db.Bag:
    """Possible to do any text pre-processing here."""
    dtype_path = get_datapaths(build_dir).get(base_dtype)
    schema = get_schema(build_dir)
    filepaths = dtype_path.glob(f"**/*.{base_dtype}")

    _update_authors = flip(update_in(func=flatten_authors), ("authors",))
    _update_keywords = lambda d: pipe(
        d,
        *[update_in(func=split_keywords, keys=[kw])
          for kw in (col for col in d.keys() if col.endswith("_keywords"))],
    )

    return (db.from_sequence(filepaths)
            .map(partial(load_xml, schema))
            .map(_update_authors)
            .map(_update_keywords))
def update(self, index):
    if index:
        self.files.value = clips[index[0]]
    wav_file = pipe(self.files.value, lambda f: os.path.join(data_path, f))
    data = pipe(wav_file, wavfile.read, get(1))
    time = pipe(data,
                lambda x: x[::400] / np.max(np.abs(x)),
                hv.Curve).opts(width=400, xlabel="time",
                               ylabel="waveform", height=300)
    frequency = pipe(
        data,
        partial(fft, n=1000),
        np.real,
        lambda x: x / np.max(np.abs(x)),
        hv.Curve,
    ).opts(xlabel="frequency", ylabel="amplitude", width=400, height=300)
    return time + frequency
def __init__(
        self,
        data=None,
        index=None,
        columns=None,
        estimator=None,
        parent=None,
        feature_level=None,
        copy=False,
        extensions=[
            'harness.python.ext.base.JinjaExtension',
            'harness.python.ext.SciKit.SciKitExtension',
            'harness.python.ext.Bokeh.BokehModelsExtension',
            'harness.python.ext.Bokeh.BokehPlottingExtension',
            'harness.python.ext.Bokeh.BokehChartsExtension',
        ],
):
    kwargs = dict(
        estimator=estimator,
        parent=parent,
        feature_level=feature_level,
        extensions=extensions,
    )
    self.set_params(**kwargs)

    for ext in self.extensions:
        if ext not in self.env.extensions:
            self.env.add_extension(ext)
        ext = self.env.extensions[ext]
        if (ext.mixin is not None
                and ext.mixin not in self.__class__.__bases__):
            self.__class__.__bases__ += (ext.mixin,)

    kwargs = pipe(locals(),
                  keyfilter(partial(operator.contains, self._blacklist)),
                  valfilter(complement(lambda x: x is None)))
    super().__init__(**kwargs)
def find_files(start, filename):
    start_path = Path(start).resolve()

    log.info(f'Finding image {filename}')
    if Path(start_path, filename).exists():
        path = Path(start_path, filename)
        log.info(f'  found: {path}')
        return [path]

    file_re = re.compile(filename.lower().replace('.', '\\.').replace('*', '.*'))
    log.info(f'  searching with re: {file_re.pattern}')
    paths = _.pipe(
        __.walk(start_path),
        filter(lambda p: file_re.search(p.name.lower())),
        _.partial(sorted, key=lambda p: p.name),
        tuple,
    )
    if not paths:
        log.error(f'Could not find any filenames meeting the search string'
                  f' "{filename}" looking in directory: {start_path}')
    return paths
sfilter = silent_filter


def is_iterable(it):
    try:
        iter(it)
    except TypeError:
        return False
    else:
        return True


# partial operators
add = partial(reduce, op.add)
sub = partial(reduce, op.sub)
mul = partial(reduce, op.mul)
div = partial(reduce, op.truediv)  # operator.div is Python 2 only

# recursive
# TODO recursive sub-namespace... for example recursive.reduce,
# recursive.map (?)

# boolean operators
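# Usage note for the partial operators above (hedged, illustrative): each
# one folds a whole iterable with its binary op rather than taking two
# arguments:
#   add([1, 2, 3])    # -> 6
#   mul([2, 3, 4])    # -> 24
#   sub([10, 3, 2])   # -> 5   (left-to-right, so order matters)
#   div([100, 5, 2])  # -> 10.0 with op.truediv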
    _.map(lambda b: int(b, 16)),
    _.map(hex),
    _.map(lambda h: h[2:]),
    _.map(lambda h: h.zfill(2)),
    ':'.join,
    lookup_mac,
)

win_mac_conv = mac_conv('-')
macos_mac_conv = mac_conv(':')

# -----------------------------------------------------------------------
# ARP
# -----------------------------------------------------------------------

arp_output_macos = _.partial(getoutput, 'arp -a')
arp_macos_re = re.compile(
    fr'^(?P<name>[?.\w-]*)\s+\((?P<ip>{ip_re})\) at (?P<mac>.*?) on .*$'
)

arp_output_win = _.partial(getoutput, 'arp -a')
arp_win_re = re.compile(
    fr'^\s+(?P<ip>{ip_re})\s+(?P<mac>.*?)\s+\w+\s*$'
)


def get_arp_data(arp_output, regex, mac_conv):
    return _.pipe(
        arp_output.splitlines(),
        _.map(regex.match),
        _.filter(None),
        _.map(__.call('groupdict')),
def into(_, b, **kwargs):
    return pipe(b, chunks,
                map(partial(into, np.ndarray(0))),
                map(partial(into, list)),
                concat)
import typing as t
from datetime import datetime

from toolz.curried import partial, flip, valmap, compose

import snug

from . import types

xml = snug.xml

registry = snug.load.PrimitiveRegistry({
    bool: dict(true=True, false=False).__getitem__,
    datetime: partial(flip(datetime.strptime), '%Y-%m-%dT%H:%M:%S%z'),
    str: str.strip,
    **{c: c for c in [
        int,
        float,
        types.Journey.Status,
        types.Journey.Component.Status,
    ]}
}) | snug.load.GenericRegistry({
    t.List: snug.load.list_loader,
}) | snug.load.get_optional_loader | snug.load.DataclassRegistry({
    types.Station: {**valmap(xml.textgetter, {
        'code': 'Code',
        'type': 'Type',
        'country': 'Land',
def __iter__(self):
    return pipe(self.chunks(),
                map(partial(nd.as_py, tuple=True)),
                concat)
def validator(train_data: pd.DataFrame,
              split_fn: SplitterFnType,
              train_fn: LearnerFnType,
              eval_fn: EvalFnType,
              perturb_fn_train: PerturbFnType = identity,
              perturb_fn_test: PerturbFnType = identity,
              predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold by calling
    ``validator_iteration``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    perturb_fn_train : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at train-time.

    perturb_fn_test : PerturbFnType
        A partially defined corruption function that takes a dataset and
        returns a corrupted dataset. Perturbation applied at test-time.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    -------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    train_fn = compose(train_fn, perturb_fn_train)
    eval_fn = compose(eval_fn, perturb_fn_test)

    def fold_iter(fold: Tuple[int, Tuple[pd.Index, pd.Index]]) -> LogType:
        (fold_num, (train_index, test_indexes)) = fold
        return validator_iteration(train_data, train_index, test_indexes,
                                   fold_num, train_fn, eval_fn, predict_oof)

    zipped_logs = pipe(folds,
                       enumerate,
                       map(fold_iter),
                       partial(zip, logs))

    def _join_split_log(log_tuple: Tuple[LogType, LogType]) -> Tuple[LogType, LogType]:
        train_log = {}
        split_log, validator_log = log_tuple
        train_log["train_log"] = validator_log["train_log"]
        return train_log, assoc(dissoc(validator_log, "train_log"), "split_log", split_log)

    def get_perturbed_columns(perturbator: PerturbFnType) -> List[str]:
        args = inspect.getfullargspec(perturbator).kwonlydefaults
        return args['cols'] if args else []

    train_logs, validator_logs = zip(*map(_join_split_log, zipped_logs))

    first_train_log = first(train_logs)

    perturbator_log = {'perturbated_train': [], 'perturbated_test': []}  # type: LogType
    if perturb_fn_train != identity:
        perturbator_log['perturbated_train'] = get_perturbed_columns(perturb_fn_train)
    if perturb_fn_test != identity:
        perturbator_log['perturbated_test'] = get_perturbed_columns(perturb_fn_test)
    first_train_log = assoc(first_train_log, "perturbator_log", perturbator_log)

    return assoc(first_train_log, "validator_log", list(validator_logs))
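# Hedged usage sketch; the names below are illustrative partial
# applications, not from the original module:
#
#   split_fn = partial(out_of_time_and_space_splitter, n_splits=5,
#                      in_time_limit="2020-06-30", time_column="date",
#                      space_column="customer_id")
#   results = validator(train_df, split_fn, train_fn, eval_fn)
#   results["validator_log"]  # one log entry per fold
#   results["train_log"]      # training log taken from the first fold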
def parallel_validator(train_data: pd.DataFrame,
                       split_fn: SplitterFnType,
                       train_fn: LearnerFnType,
                       eval_fn: EvalFnType,
                       n_jobs: int = 1,
                       predict_oof: bool = False) -> ValidatorReturnType:
    """
    Splits the training data into folds given by the split function and
    performs a train-evaluation sequence on each fold. Tries to run each
    fold in parallel using up to n_jobs processes.

    Parameters
    ----------
    train_data : pandas.DataFrame
        A Pandas' DataFrame with training data

    split_fn : function pandas.DataFrame -> list of tuple
        Partially defined split function that takes a dataset and returns
        a list of folds. Each fold is a Tuple of arrays. The first array in
        each tuple contains training indexes while the second array
        contains validation indexes.

    train_fn : function pandas.DataFrame -> prediction_function, predictions_dataset, logs
        A partially defined learning function that takes a training set and
        returns a predict function, a dataset with training predictions and
        training logs.

    eval_fn : function pandas.DataFrame -> dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    n_jobs : int
        Number of parallel processes to spawn.

    predict_oof : bool
        Whether to return out of fold predictions on the logs

    Returns
    -------
    A list of log-like dictionary evaluations.
    """
    folds, logs = split_fn(train_data)

    dumped_train_fn = cloudpickle.dumps(train_fn)
    dumped_eval_fn = cloudpickle.dumps(eval_fn)

    result = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(parallel_validator_iteration)(train_data, x, dumped_train_fn,
                                              dumped_eval_fn, predict_oof)
        for x in enumerate(folds))
    gc.collect()

    train_log = {"train_log": [fold_result["train_log"] for fold_result in result]}

    @curry
    def kwdissoc(d: Dict, key: str) -> Dict:
        return dissoc(d, key)

    validator_logs = pipe(result,
                          partial(zip, logs),
                          map(lambda log_tuple: assoc(log_tuple[1], "split_log", log_tuple[0])),
                          map(kwdissoc(key="train_log")),
                          list)

    return assoc(train_log, "validator_log", validator_logs)
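# Design note (hedged): train_fn and eval_fn are round-tripped through
# cloudpickle so they could cross process boundaries; with the "threading"
# backend used here the folds share one process, so the serialization
# mainly guards against non-picklable closures if a process backend is
# swapped in.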
def _get_param_names(cls):
    """Ignore the parameters that are specific to the dataframe."""
    return pipe(
        super()._get_param_names(),
        filter(complement(partial(operator.contains, cls._blacklist))),
        list)
def _register(method):
    get_ipython().register_magic_function(
        partial(_wraps_magic, method, **kwargs),
        magic_kind='line_cell',
        magic_name=name)
def register_mistune_magic(**kwargs):
    magical('mistune', display='HTML', lang='markdown')(
        compose(partial(markdown, **kwargs),
                _render_jinja2_with_globals,
                Template))
def register_yaml_magic(loader=yaml.SafeLoader):
    magical('yaml', display=print, lang='yaml')(
        compose(partial(yaml.load, Loader=loader),
                _render_jinja2_with_globals,
                Template))
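# Hedged usage sketch (assumes an IPython session where `magical` registers
# a cell magic): after register_yaml_magic(), a %%yaml cell body flows
# Template -> _render_jinja2_with_globals -> yaml.load(..., Loader=loader),
# applied right to left as `compose` dictates, and the parsed document is
# displayed with print.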
def get_best_performing_log(log_list: LogListType,
                            extractor: ExtractorFnType,
                            metric_name: str) -> Dict:
    logs_eval = [get_avg_metric_from_extractor(log, extractor, metric_name)
                 for log in log_list]
    return pipe(logs_eval,
                partial(zip, log_list),
                partial(sorted, reverse=True, key=lambda x: x[1]))[0][0]
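# Hedged walkthrough: each log is scored with the extractor, zip pairs the
# logs with their scores, sorting on x[1] descending puts the best score
# first, and [0][0] pulls out the winning log. Illustrative call (the
# argument names are assumptions, not from the source):
#
#   best = get_best_performing_log(tuning_logs, evaluator_extractor, "auc")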