def extract_feature_labels_weights(
        df: pd.DataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Select features, labels, targets, sample weights and gross loss on one shared index.

    :param df: source frame handed to ``get_pandas_object`` for every selection
    :param features_and_labels: object providing the ``features``, ``labels``, ``targets``,
        ``sample_weights``, ``gross_loss`` and ``label_type`` attributes read below
    :param kwargs: forwarded unchanged to ``get_pandas_object``
    :return: ``((features, min_required_samples), labels, targets, sample_weights, gross_loss)``
        where every frame is restricted to the common index and ``min_required_samples`` is
        ``len(df) - len(features) + 1`` computed on the features before that restriction
    """
    def _select_optional(selector):
        # ``dropna`` is applied through ``call_if_not_none`` (presumably skipped for None selections)
        return call_if_not_none(get_pandas_object(df, selector, **kwargs), 'dropna')

    feature_frame = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()
    label_frame = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    target_frame = _select_optional(features_and_labels.targets)
    weight_frame = _select_optional(features_and_labels.sample_weights)
    gross_loss_frame = _select_optional(features_and_labels.gross_loss)

    shared_index = intersection_of_index(
        feature_frame, label_frame, target_frame, weight_frame, gross_loss_frame)

    # the optional label cast happens after the index has been determined
    label_type = features_and_labels.label_type
    if label_type is not None:
        label_frame = label_frame.astype(label_type)

    min_required_samples = len(df) - len(feature_frame) + 1
    features_with_required_samples = (feature_frame.loc[shared_index], min_required_samples)

    return (
        features_with_required_samples,
        label_frame.loc[shared_index],
        loc_if_not_none(target_frame, shared_index),
        loc_if_not_none(weight_frame, shared_index),
        loc_if_not_none(gross_loss_frame, shared_index),
    )
def extract_feature_labels_weights(df: Typing.PatchedDataFrame, features_and_labels, **kwargs) -> FeaturesWithLabels:
    """Collect features, labels, latent, targets, sample weights and gross loss on one shared index.

    :param df: source frame handed to the extractors
    :param features_and_labels: object describing what to select; ``sample_weights`` and
        ``gross_loss`` are read here, everything else by ``extract_features`` / ``extract_labels``
    :param kwargs: forwarded unchanged to the extractors and to ``get_pandas_object``
    :return: a ``FeaturesWithLabels`` whose first member is a ``FeaturesWithRequiredSamples`` built
        from the index-filtered features, ``len(df) - len(features) + 1`` and ``len(features.columns)``
    """
    features, targets, latent = extract_features(df, features_and_labels, **kwargs)
    labels = extract_labels(df, features_and_labels, **kwargs)

    raw_weights = get_pandas_object(df, features_and_labels.sample_weights, **kwargs)
    sample_weights = call_if_not_none(raw_weights, 'dropna')
    raw_gross_loss = get_pandas_object(df, features_and_labels.gross_loss, **kwargs)
    gross_loss = call_if_not_none(raw_gross_loss, 'dropna')

    # sanity check: look for infinite values in every frame that is present
    for frame in (features, labels, targets, sample_weights, gross_loss):
        if frame is None:
            continue

        # the values may be nested arrays, so the check runs on the un-nested values
        flat_values = flatten_nested_list(frame._.values, np.max)
        largest = max([v.max() for v in flat_values])
        if not (np.isscalar(largest) and np.isinf(largest)):
            continue

        _log.warning(
            f"features containing infinit number\n"
            f"{frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)]}"
        )
        # turn infinities into NaN and drop the affected rows in place
        frame.replace([np.inf, -np.inf], np.nan, inplace=True)
        frame.dropna(inplace=True)

    # restrict everything to the index shared by all frames
    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    if isinstance(features, tuple):
        filtered_features = tuple([f.loc[common_index] for f in features])
    else:
        filtered_features = features.loc[common_index]

    features_with_required_samples = FeaturesWithRequiredSamples(
        filtered_features,
        len(df) - len(features) + 1,
        len(features.columns),
    )

    return FeaturesWithLabels(
        features_with_required_samples,
        labels.loc[common_index],
        loc_if_not_none(latent, common_index),
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index),
    )
def extract_feature_labels_weights(
        df: Typing.PatchedDataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Select features, labels, targets, sample weights and gross loss on one shared index.

    Features and targets come from ``extract_features``; labels, sample weights and gross loss
    are selected here. Frames in which an infinite maximum is detected get their infinities
    replaced by NaN and the affected rows dropped before the common index is computed.

    :param df: source frame handed to the extractors
    :param features_and_labels: object providing the ``labels``, ``sample_weights``,
        ``gross_loss`` and ``label_type`` attributes read below
    :param kwargs: forwarded unchanged to ``extract_features`` and ``get_pandas_object``
    :return: ``((features, min_required_samples), labels, targets, sample_weights, gross_loss)``;
        ``features`` is a tuple of frames when ``extract_features`` returned a tuple
    """
    _, features, targets = extract_features(df, features_and_labels, **kwargs)
    labels = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs), 'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs), 'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # do some sanity check for any non numeric values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is None:
            continue

        max_value = frame._.max()
        # NOTE(review): only the maximum is inspected, so a frame whose only non-finite values
        # are -inf is presumably left untouched - confirm whether that is intended
        if np.isscalar(max_value) and np.isinf(max_value):
            # the rows are passed as a lazy %-argument; without the "%s" placeholder the
            # logging module fails to format the record and the rows are never shown
            _log.warning(
                "features containing infinit number\n%s",
                frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)])
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
            frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    if isinstance(features, tuple):
        filtered_features = tuple([f.loc[common_index] for f in features])
    else:
        filtered_features = features.loc[common_index]

    return ((filtered_features, len(df) - len(features) + 1),
            labels.loc[common_index],
            loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
def extract_features(df: pd.DataFrame, features_and_labels, **kwargs) -> Tuple[List, pd.DataFrame, pd.DataFrame]:
    """Select the feature frame and the optional target frame and align both on a shared index.

    :param df: source frame handed to ``get_pandas_object``
    :param features_and_labels: object providing ``features``, ``targets`` and ``label_columns``
    :param kwargs: forwarded unchanged to ``get_pandas_object``
    :return: ``(label_columns, features, targets)`` with features and targets restricted to the
        index returned by ``intersection_of_index``
    :raises ValueError: if the feature frame has no rows left after ``dropna``
    """
    feature_frame = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()

    raw_targets = get_pandas_object(df, features_and_labels.targets, **kwargs)
    target_frame = call_if_not_none(raw_targets, 'dropna')

    shared_index = intersection_of_index(feature_frame, target_frame)

    # the emptiness check looks at the features before they are cut to the shared index
    if len(feature_frame) <= 0:
        raise ValueError("not enough data!")

    return (
        features_and_labels.label_columns,
        feature_frame.loc[shared_index],
        loc_if_not_none(target_frame, shared_index),
    )
def extract_feature_labels_weights(
        df: Typing.PatchedDataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Select features, labels, targets, sample weights and gross loss on one shared index.

    Frames in which an infinite maximum is detected get their infinities replaced by NaN and
    the affected rows dropped before the common index is computed.

    :param df: source frame handed to ``get_pandas_object`` for every selection
    :param features_and_labels: object providing the ``features``, ``labels``, ``targets``,
        ``sample_weights``, ``gross_loss`` and ``label_type`` attributes read below
    :param kwargs: forwarded unchanged to ``get_pandas_object``
    :return: ``((features, min_required_samples), labels, targets, sample_weights, gross_loss)``
        where ``min_required_samples`` is ``len(df) - len(features) + 1``
    """
    features = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs), 'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs), 'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # sanity check for infinite values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is None:
            continue

        max_value = frame._.values.max()
        # NOTE(review): only the maximum is inspected, so a frame whose only non-finite values
        # are -inf is presumably left untouched - confirm whether that is intended
        if np.isscalar(max_value) and np.isinf(max_value):
            # the rows are passed as a lazy %-argument; without the "%s" placeholder the
            # logging module fails to format the record and the rows are never shown
            _log.warning(
                "features containing infinit number\n%s",
                frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)])
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
            frame.dropna(inplace=True)

    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    return ((features.loc[common_index], len(df) - len(features) + 1),
            labels.loc[common_index],
            loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
def extract_features(df: pd.DataFrame, features_and_labels, **kwargs) -> FeaturesWithTargets:
    """Select features, targets and latent frames and align features and targets on a shared index.

    :param df: source frame handed to ``get_pandas_object``
    :param features_and_labels: object providing ``features`` (a single selector or a tuple of
        selectors), ``targets`` and ``latent``
    :param kwargs: forwarded unchanged to ``get_pandas_object``
    :return: ``FeaturesWithTargets(features, targets, latent)`` restricted to the index returned
        by ``intersection_of_index(features, targets)``
    :raises ValueError: if the features have length zero after ``dropna``
    """
    feature_spec = features_and_labels.features

    if not isinstance(feature_spec, tuple):
        features = get_pandas_object(df, feature_spec, **kwargs).dropna()
    else:
        # allow multiple feature sets i.e. for multi input layered networks
        per_input_frames = [
            get_pandas_object(df, spec, **kwargs).dropna() for spec in feature_spec
        ]
        features = MultiFrameDecorator(per_input_frames, True)

    raw_targets = get_pandas_object(df, features_and_labels.targets, **kwargs)
    targets = call_if_not_none(raw_targets, 'dropna')

    raw_latent = get_pandas_object(df, features_and_labels.latent, **kwargs)
    latent = call_if_not_none(raw_latent, 'dropna')

    # only features and targets take part in the index intersection
    shared_index = intersection_of_index(features, targets)

    if len(features) <= 0:
        raise ValueError("not enough data!")

    return FeaturesWithTargets(
        features.loc[shared_index],
        loc_if_not_none(targets, shared_index),
        loc_if_not_none(latent, shared_index),
    )
def train_test_sampler(self) -> Sampler:
    """Split every frame into a training and a test part and wrap both in a ``Sampler``.

    The split indices come from ``self.splitter.train_test_split`` applied to the index of the
    first frame; the ``Sampler`` also receives ``self.splitter.cross_validation``.
    """
    def _select(index):
        # entries of ``self.frames`` go through ``loc_if_not_none`` (presumably None stays None)
        return [loc_if_not_none(frame, index) for frame in self.frames]

    reference_index = self.frames[0].index
    train_idx, test_idx = self.splitter.train_test_split(reference_index)

    train_frames = _select(train_idx)
    test_frames = _select(test_idx)

    return Sampler(train_frames, test_frames, self.splitter.cross_validation)
def __init__(self,
             frames: XYWeight,
             splitter: Callable[[Any], Tuple[pd.Index, pd.Index]] = None,
             filter: Callable[[Any], bool] = None,
             cross_validation: Union['BaseCrossValidator', Callable[[Any], Generator[Tuple[np.ndarray, np.ndarray], None, None]]] = None,
             epochs: int = 1,
             batch_size: int = None,
             fold_epochs: int = 1,
             on_start: Callable = None,
             on_epoch: Callable = None,
             on_batch: Callable = None,
             on_fold: Callable = None,
             on_fold_epoch: Callable = None,
             after_epoch: Callable = None,
             after_batch: Callable = None,
             after_fold: Callable = None,
             after_fold_epoch: Callable = None,
             after_end: Callable = None,
             **kwargs):
    """Store the sampling configuration and pre-compute the train/test split.

    :param frames: the frames to sample from; they are restricted to their sorted common index
    :param splitter: optional callable producing ``(train_idx, test_idx)``; without it the whole
        common index is used for training and the test index is empty
    :param filter: optional per-sample predicate, stored as ``self.filter``
    :param cross_validation: optional cross validator; its ``split`` attribute is used when
        present, otherwise the object itself is stored as the callable
    :param epochs: number of epochs, stored as ``self.epochs``
    :param batch_size: batch size, stored as ``self.batch_size``
    :param fold_epochs: number of epochs per fold, stored as ``self.fold_epochs``
    :param on_start: callback stored as is (same for all ``on_*`` / ``after_*`` parameters)
    :param kwargs: accepted but not used in this method
    """
    # align all frames on one sorted common index
    shared_index = intersection_of_index(*frames).sort_values()
    self.common_index = shared_index
    self.frames = XYWeight(*[loc_if_not_none(frame, shared_index) for frame in frames])

    # plain configuration
    self.epochs = epochs
    self.batch_size = batch_size
    self.fold_epochs = fold_epochs
    self.splitter = splitter
    self.filter = filter

    # callbacks fired before a phase ...
    self.on_start = on_start
    self.on_epoch = on_epoch
    self.on_batch = on_batch
    self.on_fold = on_fold
    self.on_fold_epoch = on_fold_epoch

    # ... and after a phase
    self.after_epoch = after_epoch
    self.after_batch = after_batch
    self.after_fold = after_fold
    self.after_fold_epoch = after_fold_epoch
    self.after_end = after_end

    # split training and test data
    if self.splitter is None:
        self.train_idx, self.test_idx = self.common_index, pd.Index([])
    else:
        if isinstance(self.common_index, pd.MultiIndex):
            _log.warning(
                "The Data provided uses a `MultiIndex`, eventually you want to set the "
                "`partition_row_multi_index` parameter in your splitter")

        self.train_idx, self.test_idx = call_callable_dynamic_args(
            self.splitter, self.common_index, **self.frames.to_dict())

    # without cross validation there is nothing more to set up
    if cross_validation is None:
        self.nr_folds = None
        self.cross_validation = None
        return

    uses_multi_index = isinstance(self.common_index, pd.MultiIndex)
    if uses_multi_index and not isinstance(cross_validation, PartitionedOnRowMultiIndexCV):
        # cross validators need to fold within each group of a multi index row index, a wrapper can be provided
        _log.warning(
            "The Data provided uses a `MultiIndex` but the cross validation is not wrapped in "
            "`PartitionedOnRowMultiIndexCV`")

    if epochs is None or epochs > 1:
        _log.warning(
            f"using epochs > 1 together with cross folding may lead to different folds for each epoch!"
            f"{cross_validation}")

    # -1 marks a cross validator that cannot report its number of splits
    if hasattr(cross_validation, "get_n_splits"):
        self.nr_folds = cross_validation.get_n_splits()
    else:
        self.nr_folds = -1

    if hasattr(cross_validation, "split"):
        self.cross_validation = cross_validation.split
    else:
        self.cross_validation = cross_validation
def to_dict(self, loc=None):
    """Return the ``x``, ``y`` and ``weight`` members as a dict keyed by those names.

    :param loc: optional locator; when given, every value is passed through
        ``loc_if_not_none(value, loc)`` before it is returned
    :return: dict with the keys ``"x"``, ``"y"`` and ``"weight"``
    """
    members = {"x": self.x, "y": self.y, "weight": self.weight}

    if loc is None:
        return members

    return {key: loc_if_not_none(value, loc) for key, value in members.items()}
def sample_for_training(self) -> Generator[FoldXYWeight, None, None]:
    """Yield training batches per epoch, fold and fold epoch while firing the configured callbacks.

    Nesting of the loops is epoch -> fold -> fold epoch -> batch. The ``on_*`` callbacks are
    invoked when the matching level starts, the ``after_*`` callbacks when it ends. A
    ``StopIteration`` raised by ``after_fold_epoch`` ends the current fold when its message is
    the number of that fold, otherwise it ends the whole generator.

    :return: generator of ``FoldXYWeight(epoch, fold, fold_epoch, *batch_frames)``
    """
    # without a cross validator a single pseudo fold leaves train and test sets unchanged
    cross_validation = self.cross_validation if self.cross_validation is not None else lambda x: [(None, None)]

    # filter samples
    if self.filter is not None:
        train_idx = [
            idx for idx in self.train_idx
            if call_callable_dynamic_args(self.filter, idx, **self.frames.to_dict(idx))
        ]
    else:
        train_idx = self.train_idx

    # update frame views
    train_frames = XYWeight(*[loc_if_not_none(f, train_idx) for f in self.frames])
    test_frames = XYWeight(*[loc_if_not_none(f, self.test_idx) for f in self.frames])

    # call for start ...
    call_callable_dynamic_args(
        self.on_start,
        epochs=self.epochs,
        batch_size=self.batch_size,
        fold_epochs=self.fold_epochs,
        features=exec_if_not_none(lambda x: x.columns.tolist(), self.frames.x),
        labels=exec_if_not_none(lambda y: y.columns.tolist(), self.frames.y),
        cross_validation=self.nr_folds is not None)

    # generate samples; ``iter(int, 1)`` never terminates and is used when epochs is None
    for epoch in (range(self.epochs) if self.epochs is not None else iter(int, 1)):
        call_callable_dynamic_args(self.on_epoch, epoch=epoch)
        fold_iter = enumerate(
            call_callable_dynamic_args(cross_validation, train_idx, **train_frames.to_dict()))

        for fold, (cv_train_i, cv_test_i) in fold_iter:
            call_callable_dynamic_args(self.on_fold, epoch=epoch, fold=fold)

            # if we dont have any cross validation the training and test sets stay unchanged
            cv_train_idx = train_idx if cv_train_i is None else train_idx[cv_train_i]

            # build our test data sets
            if cv_test_i is not None:
                if cv_test_i.ndim > 1:
                    # one test set per column of the 2-dimensional test index array
                    cv_test_frames = [
                        XYWeight(*[loc_if_not_none(f, train_idx[cv_test_i[:, i]]) for f in self.frames])
                        for i in range(cv_test_i.shape[1])
                    ]
                else:
                    cv_test_frames = [
                        XYWeight(*[loc_if_not_none(f, train_idx[cv_test_i]) for f in self.frames])
                    ]
            else:
                if len(self.test_idx) <= 0:
                    cv_test_frames = []
                else:
                    cv_test_frames = [
                        XYWeight(*[loc_if_not_none(f, self.test_idx) for f in self.frames])
                    ]

            for fold_epoch in range(self.fold_epochs):
                # fire the fold-epoch hook here; calling ``on_fold`` again would leave
                # ``on_fold_epoch`` unused and repeat the fold hook for every fold epoch
                call_callable_dynamic_args(self.on_fold_epoch, epoch=epoch, fold=fold, fold_epoch=fold_epoch)

                # build our training data sets aka batches
                cv_train_frames = XYWeight(*[loc_if_not_none(f, cv_train_idx) for f in self.frames])

                # theoretically we could already yield cv_train_frames, cv_test_frames
                # but lets create batches first and then yield all together
                nr_instances = len(cv_train_idx)
                nice_i = max(nr_instances - 2, 0)
                bs = min(nr_instances, self.batch_size) if self.batch_size is not None else nr_instances

                batch_iter = range(0, nr_instances, bs)
                for i in batch_iter:
                    call_callable_dynamic_args(self.on_batch, epoch=epoch, fold=fold, fold_epoch=fold_epoch, batch=i)

                    # batches starting near the end begin one row earlier; the start is clamped
                    # at 0 because a start of -1 would wrap around to the last row
                    start = i if i < nice_i else max(i - 1, 0)
                    yield FoldXYWeight(
                        epoch, fold, fold_epoch,
                        *(f.iloc[start:i + bs] if f is not None else None for f in cv_train_frames))

                    call_callable_dynamic_args(self.after_batch, epoch=epoch, fold=fold, fold_epoch=fold_epoch, batch=i)

                # end of fold epoch
                try:
                    call_callable_dynamic_args(
                        self.after_fold_epoch, epoch=epoch, fold=fold, fold_epoch=fold_epoch,
                        train_data=cv_train_frames, test_data=cv_test_frames)
                except StopIteration as sie:
                    call_callable_dynamic_args(
                        self.after_fold, epoch=epoch, fold=fold,
                        train_data=cv_train_frames, test_data=cv_test_frames)

                    if str(sie).isnumeric() and int(str(sie)) == fold:
                        # we just want to stop this fold
                        # NOTE(review): the end-of-fold call below fires ``after_fold`` a second
                        # time after this break - confirm whether that is intended
                        break
                    else:
                        # we need to stop any further generation of sample and call all left callbacks
                        call_callable_dynamic_args(
                            self.after_epoch, epoch=epoch, train_data=train_frames, test_data=test_frames)
                        call_callable_dynamic_args(self.after_end)
                        return

            # end of fold
            call_callable_dynamic_args(
                self.after_fold, epoch=epoch, fold=fold,
                train_data=cv_train_frames, test_data=cv_test_frames)

        # end of epoch
        call_callable_dynamic_args(
            self.after_epoch, epoch=epoch, train_data=train_frames, test_data=test_frames)

    # end of generator
    call_callable_dynamic_args(self.after_end)
def test_loc_if_not_none(self):
    """``loc_if_not_none`` selects by label on a frame and returns None for a None input."""
    present = pd.DataFrame({"A": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
    missing = None

    # label 1 maps to the first row, whose only value is 1
    selected = loc_if_not_none(present, 1)
    self.assertEqual(1, selected.values[0])

    self.assertIsNone(loc_if_not_none(missing, 1))
def fit(df: pd.DataFrame,
        model_provider: Callable[[int], Model],
        test_size: float = 0.4,
        youngest_size: float = None,
        cross_validation: Tuple[int, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] = None,
        test_validate_split_seed=42,
        hyper_parameter_space: Dict = None,
        **kwargs) -> Fit:
    """
    Fit a model on a train/test split of the DataFrame, optionally tuning hyper parameters first.

    :param df: the DataFrame you apply this function to
    :param model_provider: a callable which provides a new :class:`.Model` instance i.e. for each hyper parameter if
        hyper parameter tuning is enforced. Usually all the Model subclasses implement __call__ thus they are a
        provider of itself
    :param test_size: the fraction [0, 1] of random samples which are used for a test set
    :param youngest_size: the fraction [0, 1] of the test samples which are not random but are the youngest
    :param cross_validation: tuple of number of epochs for each fold provider and a cross validation provider
    :param test_validate_split_seed: seed if train, test splitting needs to be reproduceable. A magic seed 'youngest' is
        available, which just uses the youngest data as test data
    :param hyper_parameter_space: space of hyper parameters passed as kwargs to your model provider. Keys starting
        with ``__`` are handed to the optimizer without the prefix, plain int/float/bool values are treated as
        constants. The passed dict is not modified.
    :param kwargs: merged with the kwargs of the model and of its features and labels definition
    :return: returns a :class:`pandas_ml_utils.model.fitting.fit.Fit` object
    """

    trails = None
    model = model_provider()
    kwargs = merge_kwargs(model.features_and_labels.kwargs, model.kwargs, kwargs)
    (features, min_required_samples), labels, targets, weights = \
        extract(model.features_and_labels, df, extract_feature_labels_weights, **kwargs)

    start_performance_count = perf_counter()
    _log.info("create model")

    # get indices and make training and test data sets
    train_idx, test_idx = train_test_split(features.index, test_size, youngest_size, test_validate_split_seed)
    train = (features.loc[train_idx], labels.loc[train_idx], loc_if_not_none(weights, train_idx))
    test = (features.loc[test_idx], labels.loc[test_idx], loc_if_not_none(weights, test_idx))

    # eventually perform a hyper parameter optimization first
    if hyper_parameter_space is not None:
        # work on a shallow copy: the pops below would otherwise empty the caller's dict and
        # a second call with the same dict would lose its constants and optimizer parameters
        search_space = dict(hyper_parameter_space)

        # next isolate hyperopt parameters and constants only used for hyper parameter tuning like early stopping
        constants = {}
        hyperopt_params = {}
        for k, v in list(search_space.items()):
            if k.startswith("__"):
                hyperopt_params[k[2:]] = search_space.pop(k)
            elif isinstance(v, (int, float, bool)):
                constants[k] = search_space.pop(k)

        # optimize hyper parameters
        model, trails = __hyper_opt(search_space,
                                    hyperopt_params,
                                    constants,
                                    model_provider,
                                    cross_validation,
                                    train,
                                    test)

    # finally train the model with eventually tuned hyper parameters
    __train_loop(model, cross_validation, train, test)
    _log.info(f"fitting model done in {perf_counter() - start_performance_count: .2f} sec!")

    # assemble result objects
    prediction_train = to_pandas(model.predict(train[0].ml.values), train_idx, labels.columns)
    prediction_test = to_pandas(model.predict(test[0].ml.values), test_idx, labels.columns)

    targets = (loc_if_not_none(targets, train_idx), loc_if_not_none(targets, test_idx))
    df_train = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[0],
        PREDICTION_COLUMN_NAME: prediction_train,
        LABEL_COLUMN_NAME: train[1],
        FEATURE_COLUMN_NAME: train[0]
    })
    df_test = assemble_prediction_frame({
        TARGET_COLUMN_NAME: targets[1],
        PREDICTION_COLUMN_NAME: prediction_test,
        LABEL_COLUMN_NAME: test[1],
        FEATURE_COLUMN_NAME: test[0]
    })

    # update model properties and return the fit
    model._validation_indices = test_idx
    model.features_and_labels._min_required_samples = min_required_samples
    model.features_and_labels._label_columns = labels.columns
    return Fit(model, model.summary_provider(df_train), model.summary_provider(df_test), trails)