from importlib import import_module

import toolz
import toolz.curried


def test_curried_namespace():
    exceptions = import_module('toolz.curried.exceptions')
    namespace = {}

    def should_curry(func):
        if not callable(func) or isinstance(func, toolz.curry):
            return False
        nargs = toolz.functoolz.num_required_args(func)
        if nargs is None or nargs > 1:
            return True
        return nargs == 1 and toolz.functoolz.has_keywords(func)

    def curry_namespace(ns):
        return {
            name: toolz.curry(f) if should_curry(f) else f
            for name, f in ns.items() if '__' not in name
        }

    from_toolz = curry_namespace(vars(toolz))
    from_exceptions = curry_namespace(vars(exceptions))
    namespace.update(toolz.merge(from_toolz, from_exceptions))

    namespace = toolz.valfilter(callable, namespace)
    curried_namespace = toolz.valfilter(callable, toolz.curried.__dict__)

    if namespace != curried_namespace:
        missing = set(namespace) - set(curried_namespace)
        if missing:
            raise AssertionError(
                'There are missing functions in toolz.curried:\n    %s' %
                '\n    '.join(sorted(missing)))
        extra = set(curried_namespace) - set(namespace)
        if extra:
            raise AssertionError(
                'There are extra functions in toolz.curried:\n    %s' %
                '\n    '.join(sorted(extra)))
        unequal = toolz.merge_with(list, namespace, curried_namespace)
        unequal = toolz.valfilter(lambda x: x[0] != x[1], unequal)
        messages = []
        for name, (orig_func, auto_func) in sorted(unequal.items()):
            if name in from_exceptions:
                messages.append(
                    '%s should come from toolz.curried.exceptions' % name)
            elif should_curry(getattr(toolz, name)):
                messages.append('%s should be curried from toolz' % name)
            else:
                messages.append(
                    '%s should come from toolz and NOT be curried' % name)
        raise AssertionError('\n'.join(messages))
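For context on what the test enforces: each auto-curried function in toolz.curried is a toolz.curry object, so it supports partial application. A minimal sketch:

import toolz
import toolz.curried

assert isinstance(toolz.curried.map, toolz.curry)

inc_all = toolz.curried.map(lambda x: x + 1)  # partially applied
assert list(inc_all([1, 2, 3])) == [2, 3, 4]  # called with the remaining argument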
Example #2
def fetch(content, prefix):
    # Assumed imports: re, a JSONPath parse(), glom(), and the curried
    # pipe/mapcat/filter/map/sorted from toolz.curried.
    return {
        "parts": pipe(
            parse("$..layers").find(content),
            mapcat(lambda m: m.value),
            filter(lambda v: v["exportOptions"]["exportFormats"]),
            filter(lambda v: re.match(prefix, v["name"])),
            map(lambda v: glom(
                v,
                {
                    "key": "name",
                    "layout": (
                        "frame",
                        {
                            "left": ("x", round),
                            "top": ("y", round),
                            "width": ("width", round),
                            "height": ("height", round),
                        },
                    ),
                },
            )),
            sorted(key=lambda p: p["key"]),
            list,
        )
    }
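The pipeline above works because toolz.curried also curries the builtins filter, map, and sorted, so each can be partially applied inside pipe. A self-contained sketch of the same pattern, with hypothetical layer records standing in for the parsed design file:

from toolz.curried import filter, map, pipe, sorted

layers = [
    {"name": "icon/home", "frame": {"x": 10.4}},
    {"name": "icon/back", "frame": {"x": 0.2}},
    {"name": "title", "frame": {"x": 5.0}},
]

parts = pipe(
    layers,
    filter(lambda v: v["name"].startswith("icon/")),  # curried: awaits the sequence
    map(lambda v: {"key": v["name"], "left": round(v["frame"]["x"])}),
    sorted(key=lambda p: p["key"]),                   # curried builtin sorted
    list,
)
assert [p["key"] for p in parts] == ["icon/back", "icon/home"]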
Example #3
def iter_json_file_names(*pathnames):
    # ast_dir_path is a module-level constant in the source project.
    for json_file_path in sorted(mapcat(
                lambda pathname: glob.iglob(os.path.join(ast_dir_path, pathname)),
                pathnames,
                )):
        json_file_name = os.path.basename(json_file_path)
        yield json_file_name
Example #5
def sort_y_proba_by_prevalence(y_proba: DataFrame, y_true: Series) -> DataFrame:
    y_proba_new = y_proba.copy()

    y_pred: Series = get_y_pred_from_y_proba(y_proba)

    class_mapping = get_cluster_mapping_by_prevalence(y_pred, y_true)

    for from_class, to_class in class_mapping.items():
        y_proba_new[to_class] = y_proba[from_class]

    y_proba_new_reindexed = y_proba_new.reindex(sorted(y_proba_new.columns), axis=1)
    return y_proba_new_reindexed
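The closing reindex only sorts the relabeled columns; a minimal pandas sketch of that step, with hypothetical two-class probabilities:

import pandas as pd

y_proba = pd.DataFrame({1: [0.3, 0.9], 0: [0.7, 0.1]})
y_proba = y_proba.reindex(sorted(y_proba.columns), axis=1)
assert list(y_proba.columns) == [0, 1]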
Example #6
def get_cluster_mapping_by_prevalence(
    y_pred: Series,
    y_true: Series,
) -> Dict:

    def get_1_prevalence(distribution: Series) -> float:
        try:
            prevalence_1 = distribution[1] / distribution.sum()
        except KeyError:
            prevalence_1 = 0.0
        return prevalence_1

    return pipe(
        get_counts_per_cluster(y_pred, y_true),
        dict.items,
        partial(map_tuples, lambda index, distribution: (index, get_1_prevalence(distribution))),
        sorted(key=get(1)),
        enumerate,
        partial(map_tuples, lambda index, item: (item[0], index)),
        list,
        from_items,
    )
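map_tuples and from_items are project helpers (assumed here to star-apply a binary function over tuples and to build a dict from key/value pairs). The ranking idea itself can be sketched with plain Python, using hypothetical cluster-to-prevalence values:

prevalence = {0: 0.7, 1: 0.1, 2: 0.4}  # hypothetical cluster -> prevalence of class 1

ranked = sorted(prevalence.items(), key=lambda kv: kv[1])
mapping = {cluster: rank for rank, (cluster, _) in enumerate(ranked)}
assert mapping == {1: 0, 2: 1, 0: 2}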
Example #7
def test_sorted():
    assert sorted(key=second)([(1, 2), (2, 1)]) == [(2, 1), (1, 2)]
Example #8
def sort(iterable: Iterable[Any], key: Callable[[Any], Any]):
    return sorted(iterable, key=key)
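For illustration, a call on hypothetical records:

records = [{"id": 3}, {"id": 1}, {"id": 2}]
assert sort(records, key=lambda r: r["id"]) == [{"id": 1}, {"id": 2}, {"id": 3}]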
Example #9
def measure_cluster_features_statistics(X: DataFrame, y_pred: Series):
    X = X.copy()

    log_transformed = ('LPRA', 'LINS', 'LLEPT', 'LFERR', 'LALDO', 'LCRTSL')

    for feature in log_transformed:
        if feature not in X:
            logging.warning(f'Feature {feature} not present')
            continue

        X[feature] = 10**X[feature]

    non_normal_features = (
        'LFERR', 'LGGT', 'SS', 'LPRA', 'LINS', 'LLEPT', 'LALDO', 'LCRTSL', 'SA_V3',
        'RA1_AVL'
    )

    # Drop these columns if present.
    for column in ('DBIRTH', 'DATT', 'SFILE'):
        try:
            del X[column]
        except KeyError:
            pass

    labels = get_cluster_identifiers(y_pred)
    X_clustered = [X[y_pred == label] for label in labels]
    categorical_features = get_categorical_features(X)

    continuous_features = [column for column in X.columns if column not in categorical_features]
    data_frame = DataFrame(index=X.columns)
    rename_features = {}

    for cluster_index, cluster in enumerate(X_clustered):
        cluster_feature_statistics = {}

        for feature in categorical_features:
            if feature == 'SOCK':
                total = len(cluster[feature].dropna())
                classes = sorted(cluster[feature].unique())
                value = '/'.join(
                    [
                        f'{round((len(cluster[cluster[feature] == class_value]) / total) * 100, 1)}'
                        for class_value in classes
                    ]
                )
                rename_features[feature] = feature
            else:
                value_count = cluster[feature].value_counts()
                if len(value_count) < 2:
                    warning(f'Skipped feature {feature}')
                    continue
                value = format_count_and_percentage(value_count, decimals=1)

            cluster_feature_statistics[feature] = value

        for column in continuous_features:
            mean_value = float(cluster[column].mean())

            if column in non_normal_features:
                spread_statistic = f' ({round(cluster[column].quantile(0.1), 2)}' \
                                   f'-{round(cluster[column].quantile(0.9), 2)})'
            else:
                spread_statistic = f' ± {round(std(cluster[column], ddof=1), 3)}'

            cluster_feature_statistics[column] = str(round_digits(mean_value, 3)) + spread_statistic

        cluster_column_key = f'cluster {cluster_index + 1} (n={len(cluster)})'
        data_frame[cluster_column_key] = Series(cluster_feature_statistics)

    for cluster1, cluster2 in combinations(range(len(X_clustered)), 2):
        # noinspection PyUnresolvedReferences
        continuous_statistics = {
            column: format_p_value(
                ztest(
                    X_clustered[cluster1][column].dropna(),
                    X_clustered[cluster2][column].dropna(),
                )[1]
            )
            for column in continuous_features
        }
        # sklearn.feature_selection.chi2(DataFrame, y)
        categorical_statistics = {
            column: format_p_value(
                chi2_contingency(
                    count_values_and_align(
                        X_clustered[cluster1][column], X_clustered[cluster2][column]
                    ),
                    correction=False,
                )[1]
            )
            for column in categorical_features
        }

        data_frame[f'p value {cluster1} ⇄ {cluster2} (95 %)'] = Series(
            {
                **categorical_statistics,
                **continuous_statistics
            }
        )
    missing_values = Series(X.isnull().sum(), index=data_frame.index)
    data_frame = data_frame.assign(missing=missing_values)
    data_frame.rename(index=rename_features, inplace=True)

    return data_frame
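For reference, the two significance tests used in the pairwise comparison above, in isolation (hypothetical data; ztest comes from statsmodels, chi2_contingency from scipy):

import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.weightstats import ztest

rng = np.random.default_rng(0)
a, b = rng.normal(0.0, 1.0, 100), rng.normal(0.2, 1.0, 100)
p_continuous = ztest(a, b)[1]  # two-sample z-test; element 1 is the p value

table = np.array([[30, 70], [45, 55]])  # aligned value counts per cluster
p_categorical = chi2_contingency(table, correction=False)[1]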
Example #10
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Increase output verbosity')
    parser.add_argument('json_dir', help='Directory containing the JSON AST and data files')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
        )

    if not os.path.exists(args.json_dir):
        parser.error('json_dir {!r} does not exist'.format(args.json_dir))

    if not os.path.isdir(generated_dir_path):  # generated_dir_path: module-level constant
        os.mkdir(generated_dir_path)

    # Initialize a variables_definitions object and set global variable in visitors

    variables_definitions = python_source_visitors.variables_definitions = VariablesDefinitions()

    # Transpile verification functions

    verif_sources = list(
        mapcat(load_verifs_file, iter_ast_json_file_names(filenames=['coc*.json', 'coi*.json']))
        )
    verifs_source = Template("""\
from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_errors(formulas, saisie_variables):
    errors = []

$verifs
    return errors or None
""").substitute(verifs=textwrap.indent('\n'.join(verif_sources), prefix=4 * ' '))
    write_source_file(
        file_name='verifs.py',
        source=verifs_source,
        )

    # Transpile formulas

    constants = loaders.load_constants()
    source_by_formula_name = dict(list(mapcat(
        load_regles_file,
        iter_ast_json_file_names(filenames=['chap-*.json', 'res-ser*.json']),
        )))

    def get_formula_source(variable_name):
        source = source_by_formula_name.get(variable_name)
        if source is not None:
            return source
        if variables_definitions.is_saisie(variable_name):
            return python_source_visitors.make_formula_source(
                expression='saisie_variables.get({!r}, 0)'.format(variable_name),
                formula_name=variable_name,
                )
        if variable_name in constants:
            return python_source_visitors.make_formula_source(
                expression='constants[{!r}]'.format(variable_name),
                formula_name=variable_name,
                )
        if variables_definitions.is_calculee(variable_name):
            if not variables_definitions.is_calculee(variable_name, kind='base'):
                log.debug('Variable {!r} is declared in tgvH file but has no formula'.format(variable_name))
            return python_source_visitors.make_formula_source(
                expression='0',
                formula_name=variable_name,
                )
        assert False, variable_name

    # Merge variable names coming from dependencies graph and variables definitions
    # because some variables are missing in tgvH file;
    # or some constants are declared in tgvH but are not used in formulas, only in verifs.
    dependencies_by_formula_name = loaders.load_formulas_dependencies()
    all_variable_names = set(concatv(
        dependencies_by_formula_name.keys(),
        concat(dependencies_by_formula_name.values()),
        variables_definitions.definition_by_variable_name.keys(),
        constants.keys(),
        ))
    write_source_file(
        file_name='formulas.py',
        source=Template("""\
from __future__ import division

import inspect

from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_formulas(cache, constants, saisie_variables):
    formulas = {}

$formulas
    return formulas
""").substitute(
            formulas=textwrap.indent(
                '\n'.join(map(get_formula_source, sorted(all_variable_names))),
                prefix=4 * ' ',
                ),
            ),
        )

    return 0
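Both generated modules rely only on the standard library's string.Template and textwrap.indent; the substitution step in isolation, with hypothetical formula bodies:

import textwrap
from string import Template

formula_sources = "formulas['A'] = lambda: 0\nformulas['B'] = lambda: 1"
source = Template("""\
def get_formulas():
    formulas = {}

$formulas
    return formulas
""").substitute(formulas=textwrap.indent(formula_sources, prefix=4 * ' '))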
Example #11
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
        Performs feature selection based on the evaluation of the test data versus
        the evaluation of the test data with randomly shuffled features

        Parameters
        ----------
        log : LogType
            Dictionary of evaluation logs.

        predict_fn: function pandas.DataFrame -> pandas.DataFrame
            A partially defined predictor that takes a DataFrame and returns the
            predicted score for this dataframe

        eval_fn : function DataFrame -> log dict
            A partially defined evaluation function that takes a dataset with prediction and
            returns the evaluation logs.

        eval_data: pandas.DataFrame
            Data used to evaluate the model after shuffling

        extractor: function str -> float
            An extractor that takes a string and returns the value of that key from a log dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        max_removed_by_step: int (default 50)
            The maximum number of features to remove. It will only consider the least max_removed_by_step in terms of
            feature importance. If speed_up_by_importance=True it will first filter the least relevant features and
            shuffle only those. If speed_up_by_importance=False it will shuffle all features and drop the last
            max_removed_by_step in terms of PIMP (permutation importance). In both cases, the features will only be
            removed if the drop in performance is within the defined threshold.

        threshold: float (default 0.005)
            Threshold for model performance comparison

        speed_up_by_importance: bool (default False)
            If it should narrow the search by looking at feature importance first before getting PIMP importance. If
            True, it will only shuffle the top max_removed_by_step features in terms of feature importance.

        parallel: bool (default False)
            If True, evaluate the shuffled features in parallel threads.

        nthread: int (default 1)
            Number of threads used when parallel=True.

        seed: int (default 7)
            Random seed

        Returns
        -------
        features: list of str
            The features selected for removal: shuffling each of them changed the
            metric by less than the threshold

    """
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:] \
        if speed_up_by_importance else get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        return eval_data.assign(
            **{feature: eval_data[feature].sample(frac=1.0)})

    feature_to_delta_metric = compose(
        lambda m: curr_metric - m,
        get_avg_metric_from_extractor(extractor=extractor,
                                      metric_name=metric_name),
        gen_validator_log(fold_num=0, test_size=eval_size), eval_fn,
        predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(feature_to_delta_metric)(feature)
            for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {
            feature: feature_to_delta_metric(feature)
            for feature in features_to_shuffle
        }

    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step), list)
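The closing pipe filters the delta dict, then sorts and truncates its keys (iterating a dict yields its keys). A standalone sketch with hypothetical deltas:

from toolz.curried import pipe, valfilter, sorted, take

deltas = {"f1": -0.010, "f2": 0.200, "f3": 0.001}  # hypothetical metric deltas
removable = pipe(
    deltas,
    valfilter(lambda delta: delta < 0.005),  # shuffling these barely hurt the metric
    sorted(key=deltas.get),                  # least harmful first
    take(2),
    list,
)
assert removable == ["f1", "f3"]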