Exemple #1
0
def test_function_has_named_argument():
    """Check ``function_has_named_argument`` on several kinds of callables.

    Plain functions, functions taking ``*args`` / ``**kwargs``, methods
    (reached through the class and through an instance), static methods and
    callable objects are probed. Each one declares ``a`` and ``b``, so both
    must be reported; ``c`` is never declared explicitly and must not be
    reported, even for the callables that accept ``**kwargs``.
    """

    def plain(a, b):
        pass

    def with_kwargs(a, b, **kwargs):
        pass

    def with_defaults(a=None, b=10, *args, **kwargs):
        pass

    class Foo(object):
        def f(self, a, b):
            pass

        @staticmethod
        def f2(a, b):
            pass

    class Functor(object):
        def __call__(self, a, b):
            pass

    candidates = [
        plain,
        with_kwargs,
        with_defaults,
        Foo.f,
        Foo().f,
        Foo.f2,
        Foo().f2,
        Functor(),
    ]

    for candidate in candidates:
        for declared in ("a", "b"):
            assert function_has_named_argument(candidate, declared)
        assert not function_has_named_argument(candidate, "c")
Exemple #2
0
def _multimetric_score_with_group(estimator, X_test, y_test, groups_test,
                                  scorers):
    """Return a dict of score for multimetric scoring"""
    # Copy of sklearn '_multimetric_score' but where the 'groups' can be passed to the scorer
    scores = {}

    for name, scorer in scorers.items():
        has_group = groups_test is not None and function_has_named_argument(
            scorer, "groups")
        if y_test is None:
            if has_group:
                score = scorer(estimator, X_test, groups_test)
            else:
                score = scorer(estimator, X_test)

        else:
            if has_group:
                score = scorer(estimator, X_test, y_test, groups_test)
            else:
                score = scorer(estimator, X_test, y_test)

        if hasattr(score, 'item'):
            try:
                # e.g. unwrap memmapped scalars
                score = score.item()
            except ValueError:
                # non-scalar?
                pass
        scores[name] = score

        if not isinstance(score, numbers.Number):
            raise ValueError("scoring must return a number, got %s (%s) "
                             "instead. (scorer=%s)" %
                             (str(score), type(score), name))
    return scores
Exemple #3
0
    def fit_command(self, job_ids):
        """Launch the final fit of one (or more) model(s).

        It can be executed using the 'fit' command keyword followed by
        '--job_ids ***'.

        For every job id, in the order given, it will:
            * reload the data (``self.reload()``)
            * rebuild the model from the "model_json" entry of the job
              parameters and fit it on ``self.dfX`` / ``self.y``;
              ``self.groups`` is passed as 'groups' when it is not None and
              ``model.fit`` has a 'groups' argument
            * save the fitted model as a pickle, along with its json
              description, under "saved_models"

        Returns the list of fitted models, one per job id.
        """
        fitted_models = []

        for job_id in job_ids:
            print("fitting of job_id '%s'" % job_id)
            self.reload()

            job_param = self.data_persister.read(job_id,
                                                 path="job_param",
                                                 write_type=SavingType.json)
            model_json = job_param["model_json"]
            model = sklearn_model_from_param(model_json)
            print("start fitting...")

            # 'groups' is only handed over when the model can take it and we have some
            fit_kwargs = {}
            if function_has_named_argument(model.fit, "groups") and self.groups is not None:
                fit_kwargs["groups"] = self.groups
            model.fit(self.dfX, self.y, **fit_kwargs)

            print("...model fitted!")

            # Persist the fitted object and the json it was built from
            self.data_persister.write(model,
                                      job_id,
                                      path="saved_models",
                                      write_type=SavingType.pickle)
            self.data_persister.write(model_json,
                                      job_id,
                                      path="saved_models",
                                      write_type=SavingType.json)

            print("model persisted")

            fitted_models.append(model)

        return fitted_models
Exemple #4
0
def try_to_find_features_names(model, input_features=None):
    """Try to retrieve the names of the features produced by ``model``.

    The lookup goes through the following attempts, in order:

    1. ``model.get_feature_names`` when it exists. ``input_features`` is
       passed to it when it is not None and the method has an
       'input_features' argument. A ValueError / AttributeError raised by the
       call is swallowed and the next attempts are tried.
    2. if ``model`` has a 'steps' attribute (a pipeline): the answer of the
       last step.
    3. if ``model`` has a 'transformer_list' attribute: the names of every
       transformer, each prefixed by '<transformer name>__'. If one of the
       transformers has no known names, the whole result is None.

    Returns the names found, or None when they can't be determined.
    """
    # TODO : it should take an 'input_features_names' field as input, to pass on to get_features_names
    # TODO : we need to test whether the model accepts 'input_features_names'
    # TODO : for pipelines it should iterate with 'input_features_names' = get_features_names(last step)

    if hasattr(model, "get_feature_names"):
        # Only hand 'input_features' over if we have some AND the method accepts it
        use_input_features = input_features is not None and function_has_named_argument(
            model.get_feature_names, "input_features")

        names = None
        try:
            if use_input_features:
                names = model.get_feature_names(input_features)
            else:
                names = model.get_feature_names()
        except (ValueError, AttributeError):
            # the model couldn't answer: fall through to the other strategies
            pass

        if names is not None:
            return names

    if hasattr(model, "steps"):
        # Pipeline : delegate to its last step
        final_step = model.steps[-1][1]
        return try_to_find_features_names(final_step,
                                          input_features=input_features)

    if not hasattr(model, "transformer_list"):
        # No other idea
        return None

    # Rmk : FeatureUnion usually already implements 'get_feature_names'
    prefixed_names = []
    for prefix, sub_model in model.transformer_list:
        sub_names = try_to_find_features_names(sub_model,
                                               input_features=input_features)
        if sub_names is None:
            # one unknown block makes the whole union unknown
            return None
        prefixed_names.extend([prefix + "__" + sub_name for sub_name in sub_names])

    return prefixed_names
Exemple #5
0
def _score_with_group(estimator,
                      X_test,
                      y_test,
                      groups_test,
                      scorer,
                      is_multimetric=False):
    """Compute the score(s) of an estimator on a given test set.

    Will return a single float if is_multimetric is False and a dict of floats,
    if is_multimetric is True
    """
    # Copy of sklearn '_score' but where the 'groups' can be passed to the scorer
    if isinstance(y_test, pd.DataFrame):
        y_test = y_test.values

    if is_multimetric:
        return _multimetric_score_with_group(estimator, X_test, y_test,
                                             groups_test, scorer)
    else:
        has_group = groups_test is not None and function_has_named_argument(
            scorer, "groups")
        # True if :
        # * group is passed to the function
        # * the scorer accepts a 'group' argument

        if y_test is None:
            if has_group:
                score = scorer(estimator, X_test, groups_test)
            else:
                score = scorer(estimator, X_test)
        else:
            if has_group:
                score = scorer(estimator, X_test, y_test, groups_test)
            else:
                score = scorer(estimator, X_test, y_test)

        if hasattr(score, "item"):
            try:
                # e.g. unwrap memmapped scalars
                score = score.item()
            except ValueError:
                # non-scalar?
                pass

        if not isinstance(score, numbers.Number):
            raise ValueError("scoring must return a number, got %s (%s) "
                             "instead. (scorer=%r)" %
                             (str(score), type(score), scorer))
    return score
Exemple #6
0
def _compute_one_fold(
    fold_index,
    train,
    test,
    multi_output_proba,
    all_classes,
    classes,
    estimator,
    X,
    y,
    groups,
    scorers,
    verbose,
    fit_params,
    return_predict,
    method,
    no_scoring,
):
    """Fit, predict and score a clone of ``estimator`` on one fold.

    Steps:
        * clone the estimator
        * split X, y (and groups when given) into train / test using the
          ``train`` and ``test`` indices
        * fit the clone on the train part; the groups are passed as 'groups'
          when they exist and ``fit`` has such an argument
        * if ``return_predict``: call ``method`` of the fitted clone on the
          test part; for 'predict_proba', 'predict_log_proba' and
          'decision_function' the output is re-aligned on ``classes``
          (or on each entry of ``all_classes`` if ``multi_output_proba``)
        * unless ``no_scoring``: score the clone on test and on train with
          ``scorers`` (a dictionary of scorers)

    Returns
    -------
    result : OrderedDict
        'test_<name>' and 'train_<name>' scores and 'score_time' (these only
        when scoring is done), 'fit_time', 'n_test_samples' and 'fold_nb'.
    result_predict : tuple or None
        ``(predictions, test)`` if ``return_predict``, else None.
    test_scores_dictionary : dict or None
        The test scores, None if ``no_scoring``.
    """
    if verbose:
        print("cv %d started\n" % fold_index)

    ### Clone the estimator ###
    cloned_estimator = sklearn.base.clone(estimator)

    ### split train test ###
    safe_split = sklearn.model_selection._validation._safe_split

    def _split_groups(indices, *train_indices):
        # Same split as for X, applied on the groups (None if there are no groups)
        if groups is None:
            return None
        groups_subset, _ = safe_split(estimator, groups, None, indices,
                                      *train_indices)
        return groups_subset

    X_train, y_train = safe_split(estimator, X, y, train)
    groups_train = _split_groups(train)

    X_test, y_test = safe_split(estimator, X, y, test, train)
    groups_test = _split_groups(test, train)

    index_test = X_test.index if hasattr(X_test, "index") else test

    # Try to subset the fit_params if that is possible, Ex : 'sample_weight=np.array(....)' should be subsetted but not 'epochs=10'
    fit_params = _check_fit_params(X, {} if fit_params is None else fit_params,
                                   train)
    start_fit = time()

    ### Fit estimator ###
    pass_groups = groups_train is not None and function_has_named_argument(
        cloned_estimator.fit, "groups")
    fit_args = (X_train, ) if y_train is None else (X_train, y_train)
    if pass_groups:
        cloned_estimator.fit(*fit_args, groups=groups_train, **fit_params)
    else:
        cloned_estimator.fit(*fit_args, **fit_params)

    fit_time = time() - start_fit

    result_predict = None
    if return_predict:
        predictions = getattr(cloned_estimator, method)(X_test)

        ## re-alignement with class ##
        if method in ("predict_proba", "predict_log_proba",
                      "decision_function"):

            def _align_predict(raw_predictions, expected_classes,
                               fitted_classes):
                # One column per expected class; the classes that the fitted
                # clone doesn't know keep the filling value
                lowest_float = np.finfo(raw_predictions.dtype).min
                fill_value = 0 if method == "predict_proba" else lowest_float

                aligned = pd.DataFrame(fill_value,
                                       index=index_test,
                                       columns=expected_classes)

                for column_nb, class_label in enumerate(fitted_classes):
                    aligned[class_label] = raw_predictions[:, column_nb]

                return aligned

            if multi_output_proba:
                predictions = [
                    _align_predict(one_prediction, one_classes,
                                   one_fitted_classes)
                    for one_prediction, one_classes, one_fitted_classes in
                    zip(predictions, all_classes, cloned_estimator.classes_)
                ]
            else:
                predictions = _align_predict(predictions, classes,
                                             cloned_estimator.classes_)

        result_predict = (predictions, test)

    result = OrderedDict()

    test_scores_dictionary = None
    if not no_scoring:
        ### Score test ###
        # Here : scorers is a dictionary of scorers, hence is_multimetric = True
        start_score = time()
        test_scores_dictionary = _score_with_group(cloned_estimator,
                                                   X_test,
                                                   y_test,
                                                   groups_test,
                                                   scorer=scorers,
                                                   is_multimetric=True)
        score_time = time() - start_score

        ### Score train ###
        train_scores_dictionary = _score_with_group(cloned_estimator,
                                                    X_train,
                                                    y_train,
                                                    groups_train,
                                                    scorer=scorers,
                                                    is_multimetric=True)

        ### Put everything into a dictionary ###
        for prefix, scores_dictionary in (("test", test_scores_dictionary),
                                          ("train", train_scores_dictionary)):
            for score_name, score_value in scores_dictionary.items():
                result["%s_%s" % (prefix, score_name)] = score_value

    result["fit_time"] = fit_time

    if not no_scoring:
        result["score_time"] = score_time

    n_test_samples = sklearn.model_selection._validation._num_samples(X_test)
    result["n_test_samples"] = n_test_samples
    result["fold_nb"] = fold_index

    return result, result_predict, test_scores_dictionary
Exemple #7
0
    def _approx_cross_validation_pre_calculation(
        self,
        X,
        y,
        groups,
        scoring,
        cv,
        verbose,
        fit_params_step,
        return_predict,
        method,
        no_scoring,
        stopping_round,
        stopping_threshold,
        nodes_not_to_crossvalidate,
        nodes_cant_cv_transform,
        kwargs_step,
    ):
        """ sub-method to loop through the nodes of the pipeline and pre-compute everything that can be pre-computed """

        data_dico = {}  # Will contain transformed blocks at each node

        nodes_done = set()
        for node in self._nodes_order:

            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes
            if not concat_at_this_node:
                raise NotImplementedError(
                    "Approx cross-validation does't work if no concatenation (node %s)"
                    % str(node))

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            # if getattr(self,"_return_before_node",None) is not None and getattr(self,"_return_before_node",None) == node:
            #    return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Carefull : here it is not necessary always in the same order

            #### I'll use the order in which the edges were given

            # Concatenation : alphabetical order

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################

                # ==> Apply on original data
                lastX = X

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                lastX = data_dico[predecessors[0]]
                # data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                edges_number = self._get_edges_number(predecessors, node)
                predecessors = sorted(predecessors,
                                      key=lambda p:
                                      (edges_number.get(p, -1), p))
                self._all_concat_order[node] = predecessors

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]

                if self.verbose:
                    print("start aggregation...")

                # if do_fit:
                output_type = guess_output_type(all_lastX)
                self._all_concat_type[node] = output_type
                # else:
                #    output_type = self._all_concat_type[node]
                has_none = False
                for x in all_lastX:
                    if x is None:
                        has_none = True
                        break

                # None in all_lastX

                if has_none:
                    lastX = None
                else:
                    lastX = generic_hstack(all_lastX, output_type=output_type)

            if node != self._terminal_node and lastX is not None:
                # This is not the end of the graph

                if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                    ### 1) Node should BE crossvalitaded  ...
                    ### 2) ... and we CAN use 'cv_transform'

                    if self.verbose:
                        print("do crossvalidation on %s" % node)

                    _, data_dico[node] = cross_validation(
                        model,
                        lastX,
                        y,
                        groups=groups,
                        cv=cv,
                        verbose=verbose,
                        fit_params=fit_params_step[node],
                        return_predict=True,
                        method="transform",
                        no_scoring=True,
                        stopping_round=None,
                        stopping_threshold=None,
                        **kwargs_step[node])

                elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                    ### 1) Node should BE crossvalitated ...
                    ### 2) ... but we can't use 'cv_transform'

                    if self.verbose:
                        print("can't do node %s" % node)
                    data_dico[node] = None  # Can't compute this node

                else:
                    ### Node that shouldn't be cross-validated ###

                    if self.verbose:
                        print("skip crossvalidation on %s" % node)
                    cloned_model = clone(model)
                    if groups is not None and function_has_named_argument(
                            cloned_model.fit_transform, "groups"):
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, groups, **fit_params_step[node])
                    else:
                        data_dico[node] = cloned_model.fit_transform(
                            lastX, y, **fit_params_step[node])

            elif lastX is not None:

                ### CV no matter what at the last node ###

                #                if node not in nodes_not_to_crossvalidate and node not in nodes_cant_cv_transform:
                #
                #                    # This is the last node of the Graph
                #                    result = approx_cross_validation( model, lastX, y, groups = groups, scoring = scoring, cv = cv ,
                #                                                verbose = verbose, fit_params = fit_params_step[node],
                #                                                return_predict = return_predict , method = method, no_scoring = no_scoring,
                #                                                stopping_round = stopping_round, stopping_threshold = stopping_threshold,
                #                                                **kwargs_step[node])
                #
                #                elif node not in nodes_not_to_crossvalidate and node in nodes_cant_cv_transform:
                #                    pass
                #
                #                else:

                # This is the last node of the Graph
                result = cross_validation(
                    model,
                    lastX,
                    y,
                    groups=groups,
                    scoring=scoring,
                    cv=cv,
                    verbose=verbose,
                    fit_params=fit_params_step[node],
                    return_predict=return_predict,
                    method=method,
                    no_scoring=no_scoring,
                    stopping_round=stopping_round,
                    stopping_threshold=stopping_threshold,
                    **kwargs_step[node])

                # Rmk : if we do that so column regarding the time of fit are 'false' : they will only account for the time spent in the last node

                return True, data_dico, result
            #                return result

            else:
                ###
                if self.verbose:
                    print("can't compute node %s because lastX is None" % node)
                data_dico[node] = None
                # return result

        return False, data_dico, None  # None : no result yet
Exemple #8
0
    def _fit_transform(self,
                       X,
                       y=None,
                       groups=None,
                       method=None,
                       fit_params=None):
        """ main method of GraphPipeline, handles the fit and predict of object """
        do_fit = method in ("fit", "fit_transform", "fit_predict")

        if not self._already_fitted and not do_fit:
            raise NotFittedError("Please fit the model before")

        # Split fit_params into a 'step-by-step' dictionnary
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        data_dico = {}  # Will contain transformed blocks at each node
        feature_dico = {}  # Will contain the get_feature_names() of each node

        if do_fit:
            input_features = getattr(X, "columns", None)
            if input_features is not None:
                input_features = list(input_features)

            self._Xinput_features = input_features

        else:
            input_features = self._Xinput_features

        nodes_done = set()
        for node in self._nodes_order:

            nodes_done.add(node)

            if self.verbose:
                print("start processing node %s ..." % node)

            ### Debugging Help ###
            if (getattr(self, "_return_before_node", None) is not None
                    and getattr(self, "_return_before_node", None) == node):
                return data_dico

            model = self._models[node]

            predecessors = list(self.complete_graph.predecessors(node))
            # Carefull : here it is not necessary always in the same order

            #### I'll use the order in which the edges were given

            # Concatenation : alphabetical order
            concat_at_this_node = self.no_concat_nodes is None or node not in self.no_concat_nodes

            if len(predecessors) == 0:
                #########################
                ###  No predecessors  ###
                #########################
                if concat_at_this_node:
                    lastX = X

                else:
                    lastX = {"_data": X}
                # ==> Apply on original data

                last_features = input_features

            elif len(predecessors) == 1:
                ########################
                ###  One predecessor ###
                ########################

                # ==> Apply on data coming out of last node
                if concat_at_this_node:
                    lastX = data_dico[predecessors[0]]
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

                last_features = feature_dico[predecessors[0]]

            elif len(predecessors) > 1:
                #######################
                ###  More than one  ###
                #######################
                # ==> concat all the predecessors node and apply it

                ### Fix concatenation order ###
                if do_fit:
                    edges_number = self._get_edges_number(predecessors, node)
                    predecessors = sorted(predecessors,
                                          key=lambda p:
                                          (edges_number.get(p, -1), p))
                    self._all_concat_order[node] = predecessors
                else:
                    predecessors = self._all_concat_order[node]

                all_lastX = [
                    data_dico[predecessor] for predecessor in predecessors
                ]
                all_last_features = [
                    feature_dico[predecessor] for predecessor in predecessors
                ]

                if all_last_features is None or None in all_last_features:
                    last_features = None
                else:
                    last_features = unlist(all_last_features)

                # all_columns_names = [try_to_find_features_names( self._models[predecessor], input_features = input_features)
                #        for predecessor, input_features in zip(predecessors, all_last_features)]

                # for predecessor, input_features in zip(predecessors,all_last_features):
                #    try_to_find_features_names( self._models[predecessor], input_features = input_features)

                if self.verbose:
                    print("start aggregation...")

                if do_fit:
                    output_type = guess_output_type(all_lastX)
                    self._all_concat_type[node] = output_type
                else:
                    output_type = self._all_concat_type[node]

                if concat_at_this_node:
                    lastX = generic_hstack(all_lastX,
                                           output_type=output_type,
                                           all_columns_names=all_last_features)
                else:
                    lastX = {
                        predecessor: data_dico[predecessor]
                        for predecessor in predecessors
                    }

            if node != self._terminal_node:
                # This is not the end of the graph
                if do_fit:
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        data_dico[node] = model.fit_transform(
                            lastX, y, groups=groups, **fit_params_step[node])
                    else:
                        data_dico[node] = model.fit_transform(
                            lastX, y, **fit_params_step[node])

                    # ICI : on pourrait sauté le fit pour certains models dans le fit params
                    # Quelque-chose comme :

                    # if node in preffited_models:
                    #
                    # self._model[node] = preffited_models[node]
                    # model = preffited_models[node]
                    # + copy model into pipeline

                    #    data_dico[node] = model.transform(lastX, y)
                    # else:
                    #    data_dico[node] = model.fit_transform(lastX, y, **fit_params_step[node] )

                else:
                    data_dico[node] = model.transform(lastX)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)

            else:
                # This is the last node of the Graph
                if method == "fit":
                    if groups is not None and function_has_named_argument(
                            model.fit, "groups"):
                        model.fit(lastX, y, groups, **fit_params_step[node])
                    else:
                        model.fit(lastX, y, **fit_params_step[node])
                    result = self

                elif method == "fit_predict":
                    if groups is not None and function_has_named_argument(
                            model.fit_predict, "groups"):
                        result = model.fit_predict(lastX, y, groups,
                                                   **fit_params_step[node])
                    else:
                        result = model.fit_predict(lastX, y,
                                                   **fit_params_step[node])

                elif method == "fit_transform":
                    if groups is not None and function_has_named_argument(
                            model.fit_transform, "groups"):
                        result = model.fit_transform(lastX, y, groups,
                                                     **fit_params_step[node])
                    else:
                        result = model.fit_transform(lastX, y,
                                                     **fit_params_step[node])

                elif method == "transform":
                    result = model.transform(lastX)

                elif method == "predict":
                    result = model.predict(lastX)

                elif method == "predict_proba":
                    result = model.predict_proba(lastX)

                elif method == "predict_log_proba":
                    result = model.predict_log_proba(lastX)

                elif method == "decision_function":
                    result = model.decision_function(lastX)

                elif method == "score":
                    result = model.score(lastX, y)

                else:
                    raise ValueError("I don't know that kind of method '%s' " %
                                     method)

                feature_dico[node] = try_to_find_features_names(
                    model, input_features=last_features)
                return result

            #######################
            #### Dico cleaning ####
            #######################
            # I'll do a step of cleaning to remove useless blocks in memory
            # I need to remove data in nodes that wont be accessed anymore
            still_usefull = set()
            for n in self.complete_graph.nodes:
                if n in nodes_done:
                    continue

                p = list(self.complete_graph.predecessors(n))
                still_usefull.update(p)

            for n in data_dico.keys():
                if data_dico[n] is None:
                    continue
                if n not in still_usefull:
                    if self.verbose:
                        print("deleting useless node %s" % n)
                    data_dico[n] = None