Beispiel #1
0
 def build_target(self):
     """Build the target and cache it with its positive/negative labels.

     Calls ``build_target_safe(self.target, self.data)`` and stores:

     * ``self.y`` -- the built target,
     * ``self.positives`` / ``self.negatives`` -- index labels of the
       entries that are truthy / falsy after a cast to ``bool``,
     * ``self.n_positives`` / ``self.n_negatives`` -- their counts.

     NOTE(review): assumes the built target behaves like a pandas Series
     (boolean-mask indexing, ``.index``) -- confirm against
     ``build_target_safe``.
     """
     target, _ = build_target_safe(self.target, self.data)
     self.y = target
     # One boolean mask drives both the positive and the negative selection.
     is_positive = target.astype('bool')
     self.negatives = target[~is_positive].index
     self.positives = target[is_positive].index
     self.n_positives = len(self.positives)
     self.n_negatives = len(self.negatives)
Beispiel #2
0
def cross_validate(model_def, data, folds, repeat=1):
    """Fit and evaluate ``model_def`` on every fold, ``repeat`` times over.

    Args:
        model_def: Model definition. Its ``evaluation_target`` attribute is
            read here; the object itself is forwarded to ``fit_model``,
            ``generate_test`` and ``Result``.
        data: Data set; ``data.loc[test_index]`` selects the test rows of
            each fold.
        folds: Either an int, in which case the folds are created with
            ``make_default_folds(num_folds=folds, data=data)``, or an
            iterable of ``(train_index, test_index)`` or
            ``(train_index, test_index, prep_index)`` entries. The index
            objects must provide ``.intersection()`` (pandas ``Index`` and
            ``set`` both do).
        repeat: Number of passes made over ``folds``.

    Returns:
        A list holding one ``Result`` per fold per pass.

    Raises:
        ValueError: If a fold does not have 2 or 3 elements.
        AssertionError: If the train and test labels of a fold overlap.

    NOTE(review): when ``folds`` is a one-shot iterator, passes after the
    first one see no folds at all.
    """
    results = []

    if isinstance(folds, int):
        folds = make_default_folds(num_folds=folds, data=data)

    for _ in range(repeat):
        for fold in folds:
            if len(fold) == 2:
                train_index, test_index = fold
                prep_index = None
            elif len(fold) == 3:
                train_index, test_index, prep_index = fold
            else:
                raise ValueError("Fold is not of right dimension (%d, not 2 or 3)" % len(fold))

            # ``a & b`` on pandas Index objects is an element-wise logical
            # operation since pandas 2.0, not a set intersection, so the
            # overlap is computed explicitly.
            fold_overlap = train_index.intersection(test_index)
            assert len(fold_overlap) == 0, "train and test overlap!!! %s, %s" % (train_index, test_index)

            x_train, y_train, fitted_model = fit_model(model_def, data, prep_index, train_index)
            test_data = data.loc[test_index]
            x_test, y_test = generate_test(model_def, test_data, fitted_model)

            # Second guard, on the rows that were actually built.
            built_overlap = x_train.index.intersection(x_test.index)
            assert len(built_overlap) == 0, "train and test overlap!!! %s" % (built_overlap)

            y_preds = predict(fitted_model, x_test)
            if model_def.evaluation_target is not None:
                # Score against a dedicated evaluation target when one is set.
                y_test, _fitted_target = build_target_safe(model_def.evaluation_target, test_data)
            results.append(
                Result(x_train, x_test, y_train, y_test, y_preds, model_def, fitted_model, data))

    return results
Beispiel #3
0
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Build the training features and target for ``model_def``.

    ``data`` and both indexes are first passed through
    ``filter_data_and_indexes``; the feature set and the target are then
    built from the filtered values and reindexed to ``train_index``.

    Returns:
        ``(x_train, y_train, fitted_features, fitted_target)``.
    """
    data, prep_index, train_index = filter_data_and_indexes(
        model_def, data, prep_index, train_index
    )
    features, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index
    )
    target, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index
    )
    # Align both outputs to the (filtered) training index.
    return (
        features.reindex(train_index),
        target.reindex(train_index),
        fitted_features,
        fitted_target,
    )
Beispiel #4
0
def predict(fitted_model, x_data):
    """Predict for ``x_data`` with the estimator held by ``fitted_model``.

    Args:
        fitted_model: Object exposing ``model_def`` and
            ``fitted_estimator`` (whose ``predict`` method is called).
        x_data: Feature frame; its index becomes the index of the result.

    Returns:
        The predictions as a ``pd.Series`` indexed like ``x_data``. When
        ``model_def.evaluation_transformation`` is set, the first element
        returned by ``build_target_safe`` for that transformation is
        returned instead.

    The transformation reads the raw predictions from a temporary column
    named ``model_def.predictions_name`` on ``x_data``. That column is
    removed again even when the transformation raises, so the caller's
    frame is not left modified.
    """
    model_def = fitted_model.model_def
    raw_predictions = fitted_model.fitted_estimator.predict(x_data)
    predictions = pd.Series(raw_predictions, index=x_data.index)
    if model_def.evaluation_transformation is not None:
        x_data[model_def.predictions_name] = predictions
        try:
            predictions, _ = build_target_safe(model_def.evaluation_transformation, x_data)
        finally:
            # Always undo the temporary mutation of the caller's frame.
            del x_data[model_def.predictions_name]
    return predictions
Beispiel #5
0
 def _train(self, train_datas):
     """Select columns from the concatenated training frames.

     The frames in ``train_datas`` are concatenated along ``axis=1``, the
     target is built from ``self.data`` and passed through
     ``reindex_safe`` with the concatenated frame's index, and
     ``self.selector.select`` picks the columns.

     Returns:
         Whatever ``self.selector.select`` returns.
     """
     combined = concat(train_datas, axis=1)
     target, _ = build_target_safe(self.target, self.data)
     target = reindex_safe(target, combined.index)

     # ``threshold_arg`` wins; ``n_keep`` is only used when it is None.
     selection_arg = self.threshold_arg
     if selection_arg is None:
         selection_arg = self.n_keep
     return self.selector.select(combined, target, selection_arg)
Beispiel #6
0
 def _train(self, train_datas):
     """Run the selector on the column-wise concatenation of ``train_datas``.

     Steps:
         1. Concatenate ``train_datas`` with ``axis=1``.
         2. Build the target from ``self.target`` and ``self.data``, then
            pass it through ``reindex_safe`` with the joined frame's index.
         3. Call ``self.selector.select`` with ``self.threshold_arg``, or
            with ``self.n_keep`` when ``self.threshold_arg`` is None.

     Returns:
         The result of ``self.selector.select``.
     """
     joined = concat(train_datas, axis=1)
     built_target, _ = build_target_safe(self.target, self.data)
     aligned_target = reindex_safe(built_target, joined.index)

     threshold = self.threshold_arg
     limit = self.n_keep if threshold is None else threshold
     return self.selector.select(joined, aligned_target, limit)
Beispiel #7
0
def predict(fitted_model, x_data):
    """Return predictions for ``x_data`` as a Series on ``x_data.index``.

    Args:
        fitted_model: Provides ``model_def`` and ``fitted_estimator``; the
            latter's ``predict`` method produces the raw predictions.
        x_data: Feature frame to predict on.

    Returns:
        A ``pd.Series`` of predictions, or -- when
        ``model_def.evaluation_transformation`` is not None -- the first
        element returned by ``build_target_safe`` for that transformation.

    While the transformation runs, the raw predictions sit in a scratch
    column of ``x_data`` named ``model_def.predictions_name``. The column
    is dropped in a ``finally`` clause so that a failing transformation
    cannot leave it behind on the caller's frame.
    """
    model_def = fitted_model.model_def
    predictions = pd.Series(
        fitted_model.fitted_estimator.predict(x_data), index=x_data.index)

    if model_def.evaluation_transformation is None:
        return predictions

    x_data[model_def.predictions_name] = predictions
    try:
        transformed, _ = build_target_safe(
            model_def.evaluation_transformation, x_data)
    finally:
        # Restore the caller's frame whether or not the transformation worked.
        del x_data[model_def.predictions_name]
    return transformed
Beispiel #8
0
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Create the training set (features and target) for ``model_def``.

    The inputs go through ``filter_data_and_indexes`` first. Features come
    from ``build_featureset_safe`` and the target from
    ``build_target_safe``; both are reindexed to the filtered
    ``train_index`` before being returned.

    Returns:
        A 4-tuple ``(x_train, y_train, fitted_features, fitted_target)``.
    """
    filtered = filter_data_and_indexes(model_def, data, prep_index, train_index)
    data, prep_index, train_index = filtered

    x_built, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index)
    y_built, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index)

    x_train = x_built.reindex(train_index)
    y_train = y_built.reindex(train_index)
    return x_train, y_train, fitted_features, fitted_target
Beispiel #9
0
 def _train(self, train_data):
     """Aggregate the target per group, pooling infrequent groups together.

     Values of ``train_data[self.group_by]`` that occur fewer than
     ``self.min_sample`` times are relabelled ``'__other'``; ``self.func``
     is then applied to the target values of every resulting group.

     Two scratch columns (``'__grouping'`` and ``'__target'``) are added to
     ``train_data`` while the aggregation runs. They are removed again
     before returning, including when the aggregation raises.

     Returns:
         A ``(keys, vals)`` tuple: ``keys`` is the list of group labels that
         met ``self.min_sample``; ``vals`` maps each grouping label to its
         aggregated target value.
     """
     y, _ = build_target_safe(self.target, train_data)
     counts = train_data[self.group_by].value_counts()
     # Series.iterkv() is gone from pandas; items() yields the same
     # (label, count) pairs.
     keys = [k for k, v in counts.items() if v >= self.min_sample]
     # Set lookup keeps the per-row membership test O(1); ``keys`` itself
     # stays a list because it is part of the return value.
     frequent = set(keys)
     try:
         train_data['__grouping'] = train_data[self.group_by].map(
             lambda x: x if x in frequent else '__other')
         train_data['__target'] = y
         vals = train_data.groupby('__grouping').agg(
             {'__target': self.func})['__target'].to_dict()
     finally:
         # Never leave the scratch columns behind on the caller's frame.
         for column in ('__target', '__grouping'):
             if column in train_data.columns:
                 del train_data[column]
     logging.debug("Preparing Target Aggregations:")
     # dict views cannot be sliced on Python 3, so materialise them first.
     logging.debug("%s", list(vals.items())[:10])
     return (keys, vals)
Beispiel #10
0
def fit_model(model_def, data, prep_index=None, train_index=None):
    """Build the training set for ``model_def`` and fit its estimator.

    Features and target are built with ``build_featureset_safe`` and
    ``build_target_safe``; ``model_def.estimator.fit`` is then called on
    them.

    Returns:
        ``(x_train, y_train, fitted_model)``, where ``fitted_model`` is a
        ``FittedModel`` bundling ``model_def``, the fitted features, the
        fitted target and a ``FittedEstimator``.
    """
    # Training set.
    features, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index
    )
    target, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index
    )

    # Estimator fit.
    model_def.estimator.fit(features, target)

    # The original author questioned whether this wrapper is necessary.
    wrapped_estimator = FittedEstimator(model_def.estimator, features, target)

    return (
        features,
        target,
        FittedModel(model_def, fitted_features, fitted_target, wrapped_estimator),
    )
Beispiel #11
0
 def _train(self, train_data):
     """Compute one aggregated target value per group, plus a fallback.

     ``self.func`` applied to the whole target gives a global value.

     * With ``self.regularize`` set, every group of
       ``train_data[self.group_by]`` gets a weighted mix of its own
       ``self.func`` value (weight: group size) and the global value
       (weight: ``self.min_sample``).
     * Otherwise, groups with fewer than ``self.min_sample`` rows are
       pooled under ``'__other'`` and ``self.func`` is applied per group.

     ``'__other'`` is always present in the result; it falls back to the
     global value when no pooled group produced it.

     The scratch columns ``'__target'`` and ``'__grouping'`` added to
     ``train_data`` are removed again even when the aggregation raises.

     Returns:
         A dict mapping group label to aggregated target value.
     """
     y, _ = build_target_safe(self.target, train_data)
     global_value = self.func(y)
     train_data['__target'] = y
     try:
         if self.regularize:
             f = lambda x: (self.func(x) * x.size + global_value * self.min_sample) / (x.size + self.min_sample)
             vals = train_data.groupby(self.group_by).agg({'__target': f})['__target'].to_dict()
         else:
             counts = train_data[self.group_by].value_counts()
             # Series.iterkv() is gone from pandas; items() yields the same
             # (label, count) pairs. A set keeps the per-row lookup O(1).
             frequent = {k for k, v in counts.items() if v >= self.min_sample}
             train_data['__grouping'] = train_data[self.group_by].map(
                 lambda x: x if x in frequent else '__other')
             try:
                 vals = train_data.groupby('__grouping').agg({'__target': self.func})['__target'].to_dict()
             finally:
                 del train_data['__grouping']
     finally:
         # Never leave the scratch column behind on the caller's frame.
         del train_data['__target']
     if '__other' not in vals:
         vals['__other'] = global_value
     logging.debug("Preparing Target Aggregations:")
     # dict views cannot be sliced on Python 3, so materialise them first.
     logging.debug("%s", list(vals.items())[:10])
     return vals
Beispiel #12
0
def cross_validate(model_def, data, folds, repeat=1):
    """Cross-validate ``model_def`` over ``folds``, repeating ``repeat`` times.

    Args:
        model_def: Model definition. Its ``evaluation_target`` attribute is
            read here; the object is otherwise passed on to ``fit_model``,
            ``generate_test`` and ``Result``.
        data: Data set; test rows are taken with ``data.loc[test_index]``.
        folds: An int (folds are then produced by
            ``make_default_folds(num_folds=folds, data=data)``) or an
            iterable of 2-element ``(train_index, test_index)`` or 3-element
            ``(train_index, test_index, prep_index)`` folds. Index objects
            need an ``.intersection()`` method, which pandas ``Index`` and
            ``set`` both provide.
        repeat: How many times to iterate over ``folds``.

    Returns:
        A list of ``Result`` objects, one per fold per repetition.

    Raises:
        ValueError: If a fold has neither 2 nor 3 elements.
        AssertionError: If a fold's train and test labels overlap.

    NOTE(review): a one-shot iterator passed as ``folds`` is exhausted
    after the first repetition.
    """
    results = []

    if isinstance(folds, int):
        folds = make_default_folds(num_folds=folds, data=data)

    for _ in range(repeat):
        for fold in folds:
            if len(fold) == 2:
                train_index, test_index = fold
                prep_index = None
            elif len(fold) == 3:
                train_index, test_index, prep_index = fold
            else:
                raise ValueError(
                    "Fold is not of right dimension (%d, not 2 or 3)" %
                    len(fold))

            # The ``&`` operator on pandas Index objects stopped meaning
            # "set intersection" in pandas 2.0 (it is element-wise now),
            # so the overlap is asked for by name.
            shared_labels = train_index.intersection(test_index)
            assert len(shared_labels) == 0, (
                "train and test overlap!!! %s, %s" % (train_index, test_index))

            x_train, y_train, fitted_model = fit_model(model_def, data,
                                                       prep_index, train_index)
            test_data = data.loc[test_index]
            x_test, y_test = generate_test(model_def, test_data, fitted_model)

            # Re-check on the frames that were actually built.
            shared_rows = x_train.index.intersection(x_test.index)
            assert len(shared_rows) == 0, (
                "train and test overlap!!! %s" % (shared_rows))

            y_preds = predict(fitted_model, x_test)
            if model_def.evaluation_target is not None:
                # A dedicated evaluation target replaces the test target.
                y_test, _fitted_target = build_target_safe(
                    model_def.evaluation_target, test_data)
            result = Result(x_train, x_test, y_train, y_test, y_preds,
                            model_def, fitted_model, data)
            results.append(result)

    return results
Beispiel #13
0
 def _train(self, train_data):
     """Map each group label to an aggregated target value.

     The global value is ``self.func`` applied to the full target.

     * If ``self.regularize`` is set, each group of
       ``train_data[self.group_by]`` is assigned
       ``(func(group) * size + global * min_sample) / (size + min_sample)``.
     * Otherwise groups with fewer than ``self.min_sample`` rows are merged
       into ``'__other'`` and ``self.func`` is applied to each group.

     The returned dict always contains ``'__other'``; the global value is
     used when the aggregation did not produce that key.

     ``train_data`` temporarily gains the columns ``'__target'`` and
     ``'__grouping'``; both are removed again, also on failure.

     Returns:
         A dict of group label -> aggregated target value.
     """
     y, _ = build_target_safe(self.target, train_data)
     global_value = self.func(y)
     train_data['__target'] = y
     try:
         if self.regularize:
             f = lambda x: (self.func(x) * x.size + global_value * self.
                            min_sample) / (x.size + self.min_sample)
             vals = train_data.groupby(self.group_by).agg(
                 {'__target': f})['__target'].to_dict()
         else:
             vc = train_data[self.group_by].value_counts()
             # items() replaces the removed Series.iterkv(); the labels go
             # into a set so the membership test below is O(1) per row.
             kept = {k for k, v in vc.items() if v >= self.min_sample}
             train_data['__grouping'] = train_data[self.group_by].map(
                 lambda x: x if x in kept else '__other')
             try:
                 vals = train_data.groupby('__grouping').agg(
                     {'__target': self.func})['__target'].to_dict()
             finally:
                 del train_data['__grouping']
     finally:
         # The caller's frame must not keep the scratch column on errors.
         del train_data['__target']
     if '__other' not in vals:
         vals['__other'] = global_value
     logging.debug("Preparing Target Aggregations:")
     # On Python 3 ``dict.items()`` is a view and cannot be sliced directly.
     logging.debug("%s", list(vals.items())[:10])
     return vals
Beispiel #14
0
def generate_train(model_def, data, prep_index=None, train_index=None):
    """Build the training features and target for ``model_def``.

    ``model_def.features`` is handed to ``build_featureset_safe`` and
    ``model_def.target`` to ``build_target_safe``, each together with
    ``data``, ``prep_index`` and ``train_index``.

    Returns:
        ``(x_train, y_train, fitted_features, fitted_target)``.
    """
    features, fitted_features = build_featureset_safe(
        model_def.features, data, prep_index, train_index
    )
    target, fitted_target = build_target_safe(
        model_def.target, data, prep_index, train_index
    )
    return features, target, fitted_features, fitted_target