def test_curried_namespace():
    exceptions = import_module('toolz.curried.exceptions')
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, toolz.curry):
            return False
        nargs = toolz.functoolz.num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        return nargs == 1 and toolz.functoolz.has_keywords(func)

    def curry_namespace(ns):
        return {
            name: toolz.curry(f) if should_curry(f) else f
            for name, f in ns.items()
            if '__' not in name
        }

    from_toolz = curry_namespace(vars(toolz))
    from_exceptions = curry_namespace(vars(exceptions))
    namespace.update(toolz.merge(from_toolz, from_exceptions))

    namespace = toolz.valfilter(callable, namespace)
    curried_namespace = toolz.valfilter(callable, toolz.curried.__dict__)

    if namespace != curried_namespace:
        missing = set(namespace) - set(curried_namespace)
        if missing:
            raise AssertionError(
                'There are missing functions in toolz.curried:\n %s'
                % ' \n'.join(sorted(missing)))
        extra = set(curried_namespace) - set(namespace)
        if extra:
            raise AssertionError(
                'There are extra functions in toolz.curried:\n %s'
                % ' \n'.join(sorted(extra)))
        unequal = toolz.merge_with(list, namespace, curried_namespace)
        unequal = toolz.valfilter(lambda x: x[0] != x[1], unequal)
        messages = []
        for name, (orig_func, auto_func) in sorted(unequal.items()):
            if name in from_exceptions:
                messages.append(
                    '%s should come from toolz.curried.exceptions' % name)
            elif should_curry(getattr(toolz, name)):
                messages.append('%s should be curried from toolz' % name)
            else:
                messages.append(
                    '%s should come from toolz and NOT be curried' % name)
        raise AssertionError('\n'.join(messages))
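
# Illustrative sketch (not part of the original suite): what the test above
# guarantees in practice. toolz.curried wraps eligible callables, including the
# re-exported builtin `sorted`, in toolz.curry, so a keyword-only call returns
# a partial application.
def example_curried_sorted():
    from toolz.curried import sorted as curried_sorted
    by_length = curried_sorted(key=len)  # curried: still waiting for the iterable
    assert by_length(['bbb', 'a', 'cc']) == ['a', 'cc', 'bbb']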
# Assumed imports: re from the stdlib; `parse` from a JSONPath library
# (e.g. jsonpath_ng); `glom` from the glom package; pipe, mapcat, filter,
# map and sorted from toolz.curried.
def fetch(content, prefix):
    return {
        "parts": pipe(
            parse("$..layers").find(content),
            mapcat(lambda m: m.value),
            filter(lambda v: v["exportOptions"]["exportFormats"]),
            filter(lambda v: re.match(prefix, v["name"])),
            map(lambda v: glom(
                v,
                {
                    "key": "name",
                    "layout": (
                        "frame",
                        {
                            "left": ("x", round),
                            "top": ("y", round),
                            "width": ("width", round),
                            "height": ("height", round),
                        },
                    ),
                },
            )),
            sorted(key=lambda p: p["key"]),
            list,
        )
    }
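
# Hedged sketch of the same filter/map/sort pipeline on plain dicts, using only
# toolz.curried (jsonpath and glom are left out; the data shape is invented).
def example_layer_pipeline():
    from toolz.curried import pipe, filter, map, sorted
    layers = [
        {'name': 'icon/b', 'width': 10.4},
        {'name': 'icon/a', 'width': 20.6},
        {'name': 'text/x', 'width': 5.0},
    ]
    parts = pipe(
        layers,
        filter(lambda v: v['name'].startswith('icon/')),
        map(lambda v: {'key': v['name'], 'width': round(v['width'])}),
        sorted(key=lambda p: p['key']),
        list,
    )
    assert [p['key'] for p in parts] == ['icon/a', 'icon/b']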
# `ast_dir_path` is a module-level global; `mapcat` comes from toolz.
def iter_json_file_names(*pathnames):
    for json_file_path in sorted(mapcat(
            lambda pathname: glob.iglob(os.path.join(ast_dir_path, pathname)),
            pathnames,
            )):
        json_file_name = os.path.basename(json_file_path)
        yield json_file_name
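
# Minimal sketch of the toolz.mapcat behaviour relied on above: map each
# pathname to a (possibly empty) list of matches, then flatten the results.
def example_mapcat():
    from toolz import mapcat
    groups = {'*.json': ['a.json', 'b.json'], '*.yaml': []}
    assert list(mapcat(groups.get, ['*.json', '*.yaml'])) == ['a.json', 'b.json']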
def sort_y_proba_by_prevalence(y_proba: DataFrame, y_true: Series) -> DataFrame:
    y_proba_new = y_proba.copy()
    y_pred: Series = get_y_pred_from_y_proba(y_proba)
    class_mapping = get_cluster_mapping_by_prevalence(y_pred, y_true)
    for from_class, to_class in class_mapping.items():
        y_proba_new[to_class] = y_proba[from_class]
    y_proba_new_reindexed = y_proba_new.reindex(sorted(y_proba_new.columns), axis=1)
    return y_proba_new_reindexed
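
# Hedged sketch of the relabeling step above with a toy mapping (the real
# mapping comes from get_cluster_mapping_by_prevalence): copy each probability
# column to its new label, then reindex to sorted column order.
def example_relabel_proba_columns():
    from pandas import DataFrame
    y_proba = DataFrame({0: [0.9, 0.2], 1: [0.1, 0.8]})
    mapping = {0: 1, 1: 0}  # hypothetical: swap the two class labels
    relabeled = y_proba.copy()
    for from_class, to_class in mapping.items():
        relabeled[to_class] = y_proba[from_class]
    relabeled = relabeled.reindex(sorted(relabeled.columns), axis=1)
    assert relabeled[1].tolist() == [0.9, 0.2]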
def get_cluster_mapping_by_prevalence(
    y_pred: Series,
    y_true: Series,
) -> Dict:
    def get_1_prevalence(distribution: Series) -> float:
        # Share of the positive class; 0 when the cluster has no positives.
        try:
            prevalence_1 = distribution[1] / distribution.sum()
        except KeyError:
            prevalence_1 = 0
        return prevalence_1

    return pipe(
        get_counts_per_cluster(y_pred, y_true),
        dict.items,
        partial(map_tuples,
                lambda index, distribution: (index, get_1_prevalence(distribution))),
        sorted(key=get(1)),
        enumerate,
        partial(map_tuples, lambda index, item: (item[0], index)),
        list,
        from_items,
    )
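
# Hedged sketch of the rank-by-prevalence idea above using only toolz
# primitives (map_tuples and from_items are project helpers, replaced inline
# here): sort items ascending by value, then use the rank as the new label.
def example_rank_by_value():
    from toolz.curried import pipe, sorted, get
    prevalence = {'c0': 0.7, 'c1': 0.1, 'c2': 0.4}
    ranked = pipe(
        prevalence.items(),
        sorted(key=get(1)),  # ascending by prevalence
        enumerate,           # new label = rank
        lambda pairs: {item[0]: rank for rank, item in pairs},
    )
    assert ranked == {'c1': 0, 'c2': 1, 'c0': 2}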
def test_sorted():
    assert sorted(key=second)([(1, 2), (2, 1)]) == [(2, 1), (1, 2)]
def sort(iterable: Iterable[Any], key: Callable[[Any], Any]):
    return sorted(iterable, key=key)
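
# Illustrative usage of the wrapper above: identical to the builtin, but with
# `key` as an explicit, documented parameter.
def example_sort():
    assert sort(['bbb', 'a', 'cc'], key=len) == ['a', 'cc', 'bbb']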
# Assumed imports: `ztest` from statsmodels.stats.weightstats, `chi2_contingency`
# from scipy.stats, `std` from numpy; the remaining helpers are project-local.
def measure_cluster_features_statistics(X: DataFrame, y_pred: Series):
    X = X.copy()
    log_transformed = ('LPRA', 'LINS', 'LLEPT', 'LFERR', 'LALDO', 'LCRTSL')
    for feature in log_transformed:
        if feature not in X:
            logging.warning(f'Feature {feature} not present')
            continue
        # Undo the log10 transform before computing descriptive statistics.
        X[feature] = 10**X[feature]
    non_normal_features = (
        'LFERR', 'LGGT', 'SS', 'LPRA', 'LINS', 'LLEPT', 'LALDO',
        'LCRTSL', 'SA_V3', 'RA1_AVL',
    )
    # Drop metadata columns if present.
    for column in ('DBIRTH', 'DATT', 'SFILE'):
        if column in X:
            del X[column]
    labels = get_cluster_identifiers(y_pred)
    X_clustered = [X[y_pred == label] for label in labels]
    categorical_features = get_categorical_features(X)
    continuous_features = [
        column for column in X.columns if column not in categorical_features
    ]
    data_frame = DataFrame(index=X.columns)
    rename_features = {}

    for cluster_index, cluster in enumerate(X_clustered):
        cluster_feature_statistics = {}
        for feature in categorical_features:
            if feature == 'SOCK':
                total = len(cluster[feature].dropna())
                classes = sorted(cluster[feature].unique())
                value = '/'.join(
                    f'{round((len(cluster[cluster[feature] == class_value]) / total) * 100, 1)}'
                    for class_value in classes
                )
                rename_features[feature] = feature
            else:
                value_count = cluster[feature].value_counts()
                if len(value_count) < 2:
                    logging.warning(f'Skipped feature {feature}')
                    continue
                value = format_count_and_percentage(value_count, decimals=1)
            cluster_feature_statistics[feature] = value
        for column in continuous_features:
            mean_value = float(cluster[column].mean())
            if column in non_normal_features:
                # 10th-90th percentile range for non-normally distributed features.
                spread_statistic = (
                    f' ({round(cluster[column].quantile(0.1), 2)}'
                    f'-{round(cluster[column].quantile(0.9), 2)})'
                )
            else:
                # Sample standard deviation otherwise.
                spread_statistic = f' ± {round(std(cluster[column], ddof=1), 3)}'
            cluster_feature_statistics[column] = (
                str(round_digits(mean_value, 3)) + spread_statistic
            )
        cluster_column_key = f'cluster {cluster_index + 1} (n={len(cluster)})'
        data_frame[cluster_column_key] = Series(cluster_feature_statistics)

    for cluster1, cluster2 in combinations(range(len(X_clustered)), 2):
        # noinspection PyUnresolvedReferences
        continuous_statistics = {
            column: format_p_value(
                ztest(
                    X_clustered[cluster1][column].dropna(),
                    X_clustered[cluster2][column].dropna(),
                )[1]
            )
            for column in continuous_features
        }
        # sklearn.feature_selection.chi2(DataFrame, y)
        categorical_statistics = {
            column: format_p_value(
                chi2_contingency(
                    count_values_and_align(
                        X_clustered[cluster1][column],
                        X_clustered[cluster2][column],
                    ),
                    correction=False,
                )[1]
            )
            for column in categorical_features
        }
        data_frame[f'p value {cluster1} ⇄ {cluster2} (95 %)'] = Series(
            {**categorical_statistics, **continuous_statistics}
        )

    missing_values = Series(X.isnull().sum(), index=data_frame.index)
    data_frame = data_frame.assign(missing=missing_values)
    data_frame.rename(index=rename_features, inplace=True)
    return data_frame
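
# Hedged mini-example of the pairwise tests used above, on made-up samples:
# ztest (statsmodels) for continuous columns, chi2_contingency (scipy) for
# categorical count tables.
def example_pairwise_tests():
    from scipy.stats import chi2_contingency
    from statsmodels.stats.weightstats import ztest
    p_continuous = ztest([1.0, 1.2, 0.9, 1.1] * 5, [1.5, 1.7, 1.6, 1.4] * 5)[1]
    p_categorical = chi2_contingency([[30, 10], [12, 28]], correction=False)[1]
    return p_continuous, p_categorical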
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Increase output verbosity')
    parser.add_argument('json_dir', help='Directory containing the JSON AST and data files')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
    )

    if not os.path.exists(args.json_dir):
        parser.error('json_dir {!r} does not exist'.format(args.json_dir))

    if not os.path.isdir(generated_dir_path):
        os.mkdir(generated_dir_path)

    # Initialize a variables_definitions object and set global variable in visitors
    variables_definitions = python_source_visitors.variables_definitions = VariablesDefinitions()

    # Transpile verification functions
    verif_sources = list(
        mapcat(load_verifs_file, iter_ast_json_file_names(filenames=['coc*.json', 'coi*.json']))
    )
    verifs_source = Template("""\
from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_errors(formulas, saisie_variables):
    errors = []
$verifs
    return errors or None
""").substitute(verifs=textwrap.indent('\n'.join(verif_sources), prefix=4 * ' '))
    write_source_file(
        file_name='verifs.py',
        source=verifs_source,
    )

    # Transpile formulas
    constants = loaders.load_constants()
    source_by_formula_name = dict(list(mapcat(
        load_regles_file,
        iter_ast_json_file_names(filenames=['chap-*.json', 'res-ser*.json']),
    )))

    def get_formula_source(variable_name):
        source = source_by_formula_name.get(variable_name)
        if source is not None:
            return source
        if variables_definitions.is_saisie(variable_name):
            return python_source_visitors.make_formula_source(
                expression='saisie_variables.get({!r}, 0)'.format(variable_name),
                formula_name=variable_name,
            )
        if variable_name in constants:
            return python_source_visitors.make_formula_source(
                expression='constants[{!r}]'.format(variable_name),
                formula_name=variable_name,
            )
        if variables_definitions.is_calculee(variable_name):
            if not variables_definitions.is_calculee(variable_name, kind='base'):
                log.debug('Variable {!r} is declared in tgvH file but has no formula'.format(variable_name))
            return python_source_visitors.make_formula_source(
                expression='0',
                formula_name=variable_name,
            )
        assert False, variable_name

    # Merge variable names coming from dependencies graph and variables definitions
    # because some variables are missing in tgvH file;
    # or some constants are declared in tgvH but are not used in formulas, only in verifs.
    dependencies_by_formula_name = loaders.load_formulas_dependencies()
    all_variable_names = set(concatv(
        dependencies_by_formula_name.keys(),
        concat(dependencies_by_formula_name.values()),
        variables_definitions.definition_by_variable_name.keys(),
        constants.keys(),
    ))
    write_source_file(
        file_name='formulas.py',
        source=Template("""\
from __future__ import division

import inspect

from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_formulas(cache, constants, saisie_variables):
    formulas = {}
$formulas
    return formulas
""").substitute(
            formulas=textwrap.indent(
                '\n'.join(map(get_formula_source, sorted(all_variable_names))),
                prefix=4 * ' ',
            ),
        ),
    )

    return 0
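
# Hedged mini-example of the code-generation pattern above: string.Template
# plus textwrap.indent splice generated statements into a function body.
def example_template_indent():
    import textwrap
    from string import Template
    body = '\n'.join(['errors.append("x")', 'errors.append("y")'])
    return Template("""\
def get_errors():
    errors = []
$body
    return errors
""").substitute(body=textwrap.indent(body, prefix=4 * ' '))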
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
    Performs feature selection based on the evaluation of the test vs the
    evaluation of the test with randomly shuffled features.

    Parameters
    ----------
    log : LogType
        Dictionary of evaluation logs.

    predict_fn : function pandas.DataFrame -> pandas.DataFrame
        A partially defined predictor that takes a DataFrame and returns the
        predicted score for this dataframe.

    eval_fn : function DataFrame -> log dict
        A partially defined evaluation function that takes a dataset with
        prediction and returns the evaluation logs.

    eval_data : pandas.DataFrame
        Data used to evaluate the model after shuffling.

    extractor : function str -> float
        An extractor that takes a string and returns the value of that string
        in a dict.

    metric_name : str
        String with the name of the column that refers to the metric column
        to be extracted.

    max_removed_by_step : int (default 50)
        The maximum number of features to remove. It will only consider the
        least max_removed_by_step in terms of feature importance. If
        speed_up_by_importance=True it will first filter the least relevant
        features and shuffle only those. If speed_up_by_importance=False it
        will shuffle all features and drop the last max_removed_by_step in
        terms of PIMP. In both cases, the features will only be removed if the
        drop in performance is up to the defined threshold.

    threshold : float (default 0.005)
        Threshold for model performance comparison.

    speed_up_by_importance : bool (default False)
        If it should narrow the search by looking at feature importance first
        before computing PIMP importance. If True, it will only shuffle the
        top max_removed_by_step features in terms of feature importance.

    parallel : bool (default False)

    nthread : int (default 1)

    seed : int (default 7)
        Random seed.

    Returns
    -------
    features : list of str
        The remaining features after removing based on feature importance.
    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        return eval_data.assign(**{feature: eval_data[feature].sample(frac=1.0)})

    feature_to_delta_metric = compose(lambda m: curr_metric - m,
                                      get_avg_metric_from_extractor(extractor=extractor, metric_name=metric_name),
                                      gen_validator_log(fold_num=0, test_size=eval_size),
                                      eval_fn, predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature) for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()
    else:
        feature_to_delta_metric = {feature: feature_to_delta_metric(feature)
                                   for feature in features_to_shuffle}

    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step),
                list)
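
# Hedged mini-example of the selection tail above with made-up deltas: keep
# keys whose delta metric is below the threshold, order them by delta, and cap
# how many get removed per step.
def example_select_features():
    from toolz.curried import pipe, valfilter, sorted, take
    delta = {'f1': -0.01, 'f2': 0.002, 'f3': 0.5}
    removable = pipe(delta,
                     valfilter(lambda d: d < 0.005),
                     sorted(key=delta.get),
                     take(2),
                     list)
    assert removable == ['f1', 'f2']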