def test_intersection_of_tuples(self):
    """Check `intersection_of_index` when one argument is a `MultiFrameDecorator`.

    The decorator is passed once as the last and once as the first argument;
    in both cases only the labels shared by all three frames (3 and 4) are
    expected to remain.
    """
    all_rows = pd.DataFrame({}, index=[1, 2, 3, 4])
    missing_one = pd.DataFrame({}, index=[2, 3, 4])
    missing_two = pd.DataFrame({}, index=[1, 3, 4])

    # decorator in trailing position
    decorated_last = intersection_of_index(
        all_rows, MultiFrameDecorator([missing_one, missing_two], True))
    # decorator in leading position
    decorated_first = intersection_of_index(
        MultiFrameDecorator([all_rows, missing_one], True), missing_two)

    for common in (decorated_last, decorated_first):
        self.assertListEqual([3, 4], common.tolist())
def extract_feature_labels_weights(
        df: pd.DataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Extract features, labels and the optional frames, aligned on one index.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `features`, `labels`,
        `targets`, `sample_weights`, `gross_loss` selectors and `label_type`
    :param kwargs: forwarded unchanged to `get_pandas_object`
    :return: `((features, n), labels, targets, sample_weights, gross_loss)`
        where every frame is restricted to the common index and
        `n = len(df) - len(features) + 1`; optional frames are passed through
        `loc_if_not_none`
    """

    def _optional(selector):
        # optional frames are only `dropna`-ed through `call_if_not_none`
        return call_if_not_none(get_pandas_object(df, selector, **kwargs), 'dropna')

    features = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    targets = _optional(features_and_labels.targets)
    sample_weights = _optional(features_and_labels.sample_weights)
    gross_loss = _optional(features_and_labels.gross_loss)

    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    # cast the labels only when an explicit label type was requested
    label_type = features_and_labels.label_type
    if label_type is not None:
        labels = labels.astype(label_type)

    features_with_count = (features.loc[common_index], len(df) - len(features) + 1)
    return (
        features_with_count,
        labels.loc[common_index],
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index),
    )
def test_intersection_of_index(self):
    """Check that `intersection_of_index` keeps only labels present in every frame."""
    frames = [
        pd.DataFrame({}, index=labels)
        for labels in ([1, 2, 3, 4], [2, 3, 4], [1, 3, 4])
    ]

    common = intersection_of_index(*frames)

    # 3 and 4 are the only labels shared by all three frames
    self.assertListEqual([3, 4], common.tolist())
def extract_features(df: pd.DataFrame, features_and_labels, **kwargs) -> Tuple[List, pd.DataFrame, pd.DataFrame]:
    """Extract the feature frame and the optional target frame on a shared index.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `features` and `targets`
        selectors as well as `label_columns`
    :param kwargs: forwarded unchanged to `get_pandas_object`
    :return: `(label_columns, features, targets)` with both frames restricted
        to the common index; targets are passed through `loc_if_not_none`
    :raises ValueError: if no feature rows are left after dropping NaNs
    """
    features = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()

    raw_targets = get_pandas_object(df, features_and_labels.targets, **kwargs)
    targets = call_if_not_none(raw_targets, 'dropna')

    common_index = intersection_of_index(features, targets)

    # the check looks at the NaN-free features, not at the intersected index
    if len(features) < 1:
        raise ValueError("not enough data!")

    label_columns = features_and_labels.label_columns
    aligned_features = features.loc[common_index]
    aligned_targets = loc_if_not_none(targets, common_index)
    return (label_columns, aligned_features, aligned_targets)
def ta_delta_hedged_price(df: Typing.PatchedDataFrame, benchmark):
    """Return the exponentiated cumulative sum of log returns in excess of a benchmark.

    :param df: price data; may be one- or multi-dimensional
    :param benchmark: selector resolved via `get_pandas_object`; if it is not
        itself a pandas object and names a column of `df`, that column is
        removed from `df` before the returns are calculated
    :return: `np.exp` of the cumulative sum of
        `ta_log_returns(df) - ta_log_returns(benchmark)`
    """
    benchmark_frame = get_pandas_object(df, benchmark)

    # align both sides on the rows they have in common
    shared_rows = intersection_of_index(df, benchmark_frame)
    prices = df.loc[shared_rows]
    benchmark_frame = benchmark_frame.loc[shared_rows]

    # drop the benchmark column from the prices when it was selected by name
    selected_by_name = (
        hasattr(prices, "columns")
        and not isinstance(benchmark, Typing.AnyPandasObject)
        and benchmark in prices.columns
    )
    if selected_by_name:
        prices = prices.drop(benchmark, axis=1)

    benchmark_returns = ta_log_returns(benchmark_frame)
    if prices.ndim > 1:
        # repeat the benchmark returns once per price column so the shapes match
        as_column = benchmark_returns.values.reshape(-1, 1)
        benchmark_returns = np.repeat(as_column, prices.shape[1], axis=1)

    excess_returns = ta_log_returns(prices) - benchmark_returns
    return np.exp(excess_returns.cumsum())
def extract_feature_labels_weights(df: Typing.PatchedDataFrame, features_and_labels, **kwargs) -> FeaturesWithLabels:
    """Extract features, labels and optional frames, drop infinite rows and align them.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `sample_weights` and
        `gross_loss` selectors; it is also handed to `extract_features` and
        `extract_labels`
    :param kwargs: forwarded unchanged to the extraction helpers
    :return: a `FeaturesWithLabels` holding the features (wrapped in
        `FeaturesWithRequiredSamples` together with
        `len(df) - len(features) + 1` and the number of feature columns),
        labels, latent, targets, sample weights and gross loss, all restricted
        to the common index
    """
    features, targets, latent = extract_features(df, features_and_labels, **kwargs)
    labels = extract_labels(df, features_and_labels, **kwargs)

    raw_weights = get_pandas_object(df, features_and_labels.sample_weights, **kwargs)
    sample_weights = call_if_not_none(raw_weights, 'dropna')

    raw_gross_loss = get_pandas_object(df, features_and_labels.gross_loss, **kwargs)
    gross_loss = call_if_not_none(raw_gross_loss, 'dropna')

    # do some sanity check for any non numeric values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is None:
            continue

        # we could have nested arrays so we need to use the un-nested values
        values = flatten_nested_list(frame._.values, np.max)
        peaks = [v.max() for v in values]
        max_value = max(peaks)

        if not (np.isscalar(max_value) and np.isinf(max_value)):
            continue

        _log.warning(
            f"features containing infinit number\n"
            f"{frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)]}"
        )
        # turn infinities into NaN and remove the affected rows in place
        frame.replace([np.inf, -np.inf], np.nan, inplace=True)
        frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    if isinstance(features, tuple):
        aligned_features = tuple([f.loc[common_index] for f in features])
    else:
        aligned_features = features.loc[common_index]

    features_with_samples = FeaturesWithRequiredSamples(
        aligned_features,
        len(df) - len(features) + 1,
        len(features.columns),
    )

    return FeaturesWithLabels(
        features_with_samples,
        labels.loc[common_index],
        loc_if_not_none(latent, common_index),
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index),
    )
def extract_features(df: pd.DataFrame, features_and_labels, **kwargs) -> Tuple[List, pd.DataFrame, pd.DataFrame]:
    """Extract one or several feature frames plus the optional targets on a shared index.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `features` and `targets`
        selectors as well as `label_columns`; when `features` is a tuple every
        element is extracted separately and wrapped in a `MultiFrameDecorator`
    :param kwargs: forwarded unchanged to `get_pandas_object`
    :return: `(label_columns, features, targets)` with features and targets
        restricted to the common index; targets are passed through
        `loc_if_not_none`
    :raises ValueError: if `len(features)` is not positive
    """
    selectors = features_and_labels.features

    if not isinstance(selectors, tuple):
        features = get_pandas_object(df, selectors, **kwargs).dropna()
    else:
        # allow multiple feature sets i.e. for multi input layered networks
        feature_sets = [get_pandas_object(df, f, **kwargs).dropna() for f in selectors]
        features = MultiFrameDecorator(feature_sets, True)

    raw_targets = get_pandas_object(df, features_and_labels.targets, **kwargs)
    targets = call_if_not_none(raw_targets, 'dropna')

    common_index = intersection_of_index(features, targets)

    if len(features) < 1:
        raise ValueError("not enough data!")

    label_columns = features_and_labels.label_columns
    aligned_features = features.loc[common_index]
    aligned_targets = loc_if_not_none(targets, common_index)
    return (label_columns, aligned_features, aligned_targets)
def extract_feature_labels_weights(
        df: Typing.PatchedDataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Extract features, labels and optional frames, drop infinite rows and align them.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `labels`,
        `sample_weights`, `gross_loss` selectors and `label_type`; it is also
        handed to `extract_features`
    :param kwargs: forwarded unchanged to the extraction helpers
    :return: `((features, n), labels, targets, sample_weights, gross_loss)`
        where every frame is restricted to the common index and
        `n = len(df) - len(features) + 1`; optional frames are passed through
        `loc_if_not_none`
    """
    _, features, targets = extract_features(df, features_and_labels, **kwargs)
    labels = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs), 'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs), 'dropna')

    # cast the labels only when an explicit label type was requested
    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # do some sanity check for any non numeric values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is None:
            continue

        # NOTE(review): only the maximum is inspected, so a frame whose only
        # non-finite value is -inf would not be cleaned here - confirm intent
        max_value = frame._.max()
        if np.isscalar(max_value) and np.isinf(max_value):
            # the rows are passed as a lazy logging argument; the message needs
            # a `%s` placeholder, otherwise formatting the record fails with
            # "not all arguments converted" and the warning is lost
            _log.warning(
                "features containing infinit number\n%s",
                frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)])

            # turn infinities into NaN and remove the affected rows in place
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
            frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    if isinstance(features, tuple):
        aligned_features = tuple([f.loc[common_index] for f in features])
    else:
        aligned_features = features.loc[common_index]

    return (
        (aligned_features, len(df) - len(features) + 1),
        labels.loc[common_index],
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index),
    )
def extract_feature_labels_weights(
        df: Typing.PatchedDataFrame,
        features_and_labels,
        **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Extract features, labels and optional frames, drop infinite rows and align them.

    :param df: source data frame the selectors are applied to
    :param features_and_labels: object providing the `features`, `labels`,
        `targets`, `sample_weights`, `gross_loss` selectors and `label_type`
    :param kwargs: forwarded unchanged to `get_pandas_object`
    :return: `((features, n), labels, targets, sample_weights, gross_loss)`
        where every frame is restricted to the common index and
        `n = len(df) - len(features) + 1`; optional frames are passed through
        `loc_if_not_none`
    """
    features = get_pandas_object(df, features_and_labels.features, **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels, **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs), 'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs), 'dropna')

    # cast the labels only when an explicit label type was requested
    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # sanity check: look for infinite values in every frame that is present
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is None:
            continue

        # NOTE(review): only the maximum is inspected, so a frame whose only
        # non-finite value is -inf would not be cleaned here - confirm intent
        max_value = frame._.values.max()
        if np.isscalar(max_value) and np.isinf(max_value):
            # the rows are passed as a lazy logging argument; the message needs
            # a `%s` placeholder, otherwise formatting the record fails with
            # "not all arguments converted" and the warning is lost
            _log.warning(
                "features containing infinit number\n%s",
                frame[frame.apply(lambda r: np.isinf(r.values).any(), axis=1)])

            # turn infinities into NaN and remove the affected rows in place
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
            frame.dropna(inplace=True)

    common_index = intersection_of_index(features, labels, targets, sample_weights, gross_loss)

    return (
        (features.loc[common_index], len(df) - len(features) + 1),
        labels.loc[common_index],
        loc_if_not_none(targets, common_index),
        loc_if_not_none(sample_weights, common_index),
        loc_if_not_none(gross_loss, common_index),
    )
def __init__(self,
             frames: XYWeight,
             splitter: Callable[[Any], Tuple[pd.Index, pd.Index]] = None,
             filter: Callable[[Any], bool] = None,
             cross_validation: Union['BaseCrossValidator', Callable[[Any], Generator[Tuple[np.ndarray, np.ndarray], None, None]]] = None,
             epochs: int = 1,
             batch_size: int = None,
             fold_epochs: int = 1,
             on_start: Callable = None,
             on_epoch: Callable = None,
             on_batch: Callable = None,
             on_fold: Callable = None,
             on_fold_epoch: Callable = None,
             after_epoch: Callable = None,
             after_batch: Callable = None,
             after_fold: Callable = None,
             after_fold_epoch: Callable = None,
             after_end: Callable = None,
             **kwargs):
    """Align the frames on their common index and prepare split and folding.

    :param frames: the frames to sample from; each one is restricted to the
        sorted intersection of all indices
    :param splitter: optional callable producing `(train_idx, test_idx)`; it is
        invoked through `call_callable_dynamic_args` with the common index and
        the frames as keyword arguments. Without a splitter all rows become
        training rows and the test index is empty
    :param filter: stored on the instance as is
    :param cross_validation: optional cross validator; its `split` attribute is
        used if it has one, otherwise the object itself is stored
    :param epochs: stored on the instance; a warning is logged when it is
        `None` or greater than 1 while `cross_validation` is given
    :param batch_size: stored on the instance as is
    :param fold_epochs: stored on the instance as is
    :param on_start: callback, stored on the instance as is (as are all other
        `on_*` and `after_*` callbacks)
    :param kwargs: accepted but not used by this constructor
    """
    shared_index = intersection_of_index(*frames).sort_values()
    self.common_index = shared_index
    self.frames = XYWeight(*[loc_if_not_none(f, shared_index) for f in frames])

    self.epochs = epochs
    self.batch_size = batch_size
    self.fold_epochs = fold_epochs
    self.splitter = splitter
    self.filter = filter

    # callbacks
    self.on_start = on_start
    self.on_epoch = on_epoch
    self.on_batch = on_batch
    self.on_fold = on_fold
    self.on_fold_epoch = on_fold_epoch
    self.after_epoch = after_epoch
    self.after_batch = after_batch
    self.after_fold = after_fold
    self.after_fold_epoch = after_fold_epoch
    self.after_end = after_end

    has_multi_index = isinstance(self.common_index, pd.MultiIndex)

    # split training and test data
    if self.splitter is None:
        self.train_idx, self.test_idx = self.common_index, pd.Index([])
    else:
        if has_multi_index:
            _log.warning(
                "The Data provided uses a `MultiIndex`, eventually you want to set the "
                "`partition_row_multi_index` parameter in your splitter")

        self.train_idx, self.test_idx = call_callable_dynamic_args(
            self.splitter, self.common_index, **self.frames.to_dict())

    if cross_validation is None:
        self.nr_folds = None
        self.cross_validation = None
        return

    if has_multi_index and not isinstance(cross_validation, PartitionedOnRowMultiIndexCV):
        # cross validators need to fold within each group of a multi index row index, a wrapper can be provided
        _log.warning(
            "The Data provided uses a `MultiIndex` but the cross validation is not wrapped in "
            "`PartitionedOnRowMultiIndexCV`")

    if epochs is None or epochs > 1:
        _log.warning(
            f"using epochs > 1 together with cross folding may lead to different folds for each epoch!"
            f"{cross_validation}")

    # -1 marks an unknown number of folds when the validator cannot report it
    if hasattr(cross_validation, "get_n_splits"):
        self.nr_folds = cross_validation.get_n_splits()
    else:
        self.nr_folds = -1

    if hasattr(cross_validation, "split"):
        self.cross_validation = cross_validation.split
    else:
        self.cross_validation = cross_validation
def __init__(self, frames: Tuple[Typing.PatchedDataFrame], use_index_intersection=False):
    """Keep the given frames and decide which index represents them.

    :param frames: the frames to wrap
    :param use_index_intersection: if truthy the index is the result of
        `intersection_of_index` over all frames, otherwise the index of the
        first frame is used
    """
    self._frames = frames

    if use_index_intersection:
        self._index = intersection_of_index(*frames)
    else:
        self._index = frames[0].index