def extract_feature_labels_weights(
    df: pd.DataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    """Extract features, labels and the optional frames on a shared index.

    Args:
        df: source frame every selection is taken from.
        features_and_labels: object providing the ``features``, ``labels``,
            ``targets``, ``sample_weights``, ``gross_loss`` selectors and
            an optional ``label_type``.
        **kwargs: forwarded unchanged to ``get_pandas_object``.

    Returns:
        A tuple ``((features, n), labels, targets, sample_weights,
        gross_loss)`` where every frame is restricted to the index common
        to all extracted objects and ``n`` is
        ``len(df) - len(features) + 1`` computed on the NaN-free features.
        ``targets``, ``sample_weights`` and ``gross_loss`` are passed
        through ``loc_if_not_none``.
    """
    spec = features_and_labels

    def _optional(selector):
        # optional selections: dropna is only invoked via call_if_not_none
        return call_if_not_none(
            get_pandas_object(df, selector, **kwargs), 'dropna')

    # mandatory selections, dropna() is always applied
    features = get_pandas_object(df, spec.features, **kwargs).dropna()
    labels = get_pandas_object(df, spec.labels, **kwargs).dropna()

    targets = _optional(spec.targets)
    sample_weights = _optional(spec.sample_weights)
    gross_loss = _optional(spec.gross_loss)

    shared_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    # cast the labels only when a label type was requested
    if spec.label_type is not None:
        labels = labels.astype(spec.label_type)

    selected_features = features.loc[shared_index]
    # presumably the minimum number of rows needed to obtain one feature
    # row -- TODO confirm against the caller
    required_samples = len(df) - len(features) + 1

    return (
        (selected_features, required_samples),
        labels.loc[shared_index],
        loc_if_not_none(targets, shared_index),
        loc_if_not_none(sample_weights, shared_index),
        loc_if_not_none(gross_loss, shared_index),
    )
    def test_call_if_not_none(self):
        """call_if_not_none passes None through and leaves its input intact."""
        nan_frame = pd.DataFrame({"a": [np.nan]})

        # nothing to call on a missing object
        self.assertIsNone(call_if_not_none(None, 'dropna'))

        # dropna removes the single NaN row from the result ...
        dropped = call_if_not_none(nan_frame, 'dropna')
        self.assertEqual(0, len(dropped))

        # ... while the original frame still holds its row
        self.assertEqual(1, len(nan_frame))
def extract_feature_labels_weights(df: Typing.PatchedDataFrame,
                                   features_and_labels,
                                   **kwargs) -> FeaturesWithLabels:
    """Extract all model frames from ``df`` aligned on one common index.

    Features, targets and latent come from ``extract_features``, labels
    from ``extract_labels``; sample weights and gross loss are selected
    with ``get_pandas_object`` and ``dropna`` is applied when they exist.
    Frames whose maximum value is infinite get their infinite values
    replaced by NaN and the affected rows dropped (in place) after a
    warning has been logged.

    Args:
        df: source frame every selection is taken from.
        features_and_labels: object describing the selections
            (``sample_weights`` and ``gross_loss`` are read here).
        **kwargs: forwarded to the extraction helpers.

    Returns:
        ``FeaturesWithLabels`` built from the index-filtered features
        (wrapped in ``FeaturesWithRequiredSamples``), labels, latent,
        targets, sample weights and gross loss.
    """
    features, targets, latent = extract_features(df, features_and_labels,
                                                 **kwargs)
    labels = extract_labels(df, features_and_labels, **kwargs)

    raw_weights = get_pandas_object(df, features_and_labels.sample_weights,
                                    **kwargs)
    sample_weights = call_if_not_none(raw_weights, 'dropna')

    raw_loss = get_pandas_object(df, features_and_labels.gross_loss,
                                 **kwargs)
    gross_loss = call_if_not_none(raw_loss, 'dropna')

    # sanity check: look for infinite values in every available frame
    for frame in (features, labels, targets, sample_weights, gross_loss):
        if frame is None:
            continue

        # values may be nested arrays, so work on the un-nested values
        flat = flatten_nested_list(frame._.values, np.max)
        largest = max(v.max() for v in flat)

        # NOTE(review): only the maximum is inspected, so a frame holding
        # -inf but no +inf is presumably left untouched -- confirm
        if not (np.isscalar(largest) and np.isinf(largest)):
            continue

        infinite_rows = frame[frame.apply(
            lambda r: np.isinf(r.values).any(), axis=1)]
        _log.warning(f"features containing infinit number\n{infinite_rows}")

        frame.replace([np.inf, -np.inf], np.nan, inplace=True)
        frame.dropna(inplace=True)

    # restrict everything to the index shared by all frames
    shared_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    if isinstance(features, tuple):
        selected_features = tuple([f.loc[shared_index] for f in features])
    else:
        selected_features = features.loc[shared_index]

    features_with_samples = FeaturesWithRequiredSamples(
        selected_features,
        len(df) - len(features) + 1,
        len(features.columns))

    return FeaturesWithLabels(
        features_with_samples,
        labels.loc[shared_index],
        loc_if_not_none(latent, shared_index),
        loc_if_not_none(targets, shared_index),
        loc_if_not_none(sample_weights, shared_index),
        loc_if_not_none(gross_loss, shared_index))
Example #4
0
def extract_feature_labels_weights(
    df: Typing.PatchedDataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    """Extract features, labels and the optional frames on a shared index.

    Features and targets come from ``extract_features``; labels, sample
    weights and gross loss are selected with ``get_pandas_object``. Frames
    whose maximum is infinite are cleaned in place (infinite values become
    NaN and the rows are dropped) after a warning listing the offending
    rows has been logged.

    Args:
        df: source frame every selection is taken from.
        features_and_labels: object providing the ``labels``,
            ``sample_weights``, ``gross_loss`` selectors and an optional
            ``label_type``.
        **kwargs: forwarded to the extraction helpers.

    Returns:
        ``((features, n), labels, targets, sample_weights, gross_loss)``
        where all frames are restricted to the common index and ``n`` is
        ``len(df) - len(features) + 1``. ``features`` is a tuple of frames
        when the extracted features are a tuple.
    """
    _, features, targets = extract_features(df, features_and_labels, **kwargs)
    labels = get_pandas_object(df, features_and_labels.labels,
                               **kwargs).dropna()
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # do some sanity check for any non numeric values in any of the data frames
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is not None:
            # named max_value so the builtin ``max`` is not shadowed
            max_value = frame._.max()

            if np.isscalar(max_value) and np.isinf(max_value):
                # the "%s" placeholder is required: without it the logging
                # module fails to format the record and the offending rows
                # are never written to the log
                _log.warning(
                    "features containing infinit number\n%s",
                    frame[frame.apply(lambda r: np.isinf(r.values).any(),
                                      axis=1)])
                frame.replace([np.inf, -np.inf], np.nan, inplace=True)
                frame.dropna(inplace=True)

    # now get the common index and return the filtered data frames
    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    return ((tuple([f.loc[common_index] for f in features])
             if isinstance(features, tuple) else features.loc[common_index],
             len(df) - len(features) + 1), labels.loc[common_index],
            loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
def extract_features(df: pd.DataFrame, features_and_labels,
                     **kwargs) -> Tuple[List, pd.DataFrame, pd.DataFrame]:
    """Extract the features and the optional targets from ``df``.

    Args:
        df: source frame the selections are taken from.
        features_and_labels: object providing the ``features`` and
            ``targets`` selectors as well as ``label_columns``.
        **kwargs: forwarded unchanged to ``get_pandas_object``.

    Returns:
        ``(label_columns, features, targets)`` with features and targets
        restricted to their common index; ``targets`` is passed through
        ``loc_if_not_none``.

    Raises:
        ValueError: if no feature rows are left after ``dropna()``.
    """
    features = get_pandas_object(df, features_and_labels.features,
                                 **kwargs).dropna()

    raw_targets = get_pandas_object(df, features_and_labels.targets,
                                    **kwargs)
    targets = call_if_not_none(raw_targets, 'dropna')

    shared_index = intersection_of_index(features, targets)

    # without a single feature row there is nothing to work with
    if len(features) < 1:
        raise ValueError("not enough data!")

    return (
        features_and_labels.label_columns,
        features.loc[shared_index],
        loc_if_not_none(targets, shared_index),
    )
Example #6
0
def extract_feature_labels_weights(
    df: Typing.PatchedDataFrame, features_and_labels, **kwargs
) -> Tuple[Tuple[pd.DataFrame, int], pd.DataFrame, pd.DataFrame, pd.Series,
           pd.Series]:
    """Extract features, labels and the optional frames on a shared index.

    Every selection is made with ``get_pandas_object`` and ``dropna`` is
    applied (for the optional frames only when they exist). Frames whose
    maximum is infinite are cleaned in place (infinite values become NaN
    and the rows are dropped) after a warning listing the offending rows
    has been logged.

    Args:
        df: source frame every selection is taken from.
        features_and_labels: object providing the ``features``, ``labels``,
            ``targets``, ``sample_weights``, ``gross_loss`` selectors and
            an optional ``label_type``.
        **kwargs: forwarded unchanged to ``get_pandas_object``.

    Returns:
        ``((features, n), labels, targets, sample_weights, gross_loss)``
        where all frames are restricted to the common index and ``n`` is
        ``len(df) - len(features) + 1``.
    """
    features = get_pandas_object(df, features_and_labels.features,
                                 **kwargs).dropna()
    labels = get_pandas_object(df, features_and_labels.labels,
                               **kwargs).dropna()
    targets = call_if_not_none(
        get_pandas_object(df, features_and_labels.targets, **kwargs), 'dropna')
    sample_weights = call_if_not_none(
        get_pandas_object(df, features_and_labels.sample_weights, **kwargs),
        'dropna')
    gross_loss = call_if_not_none(
        get_pandas_object(df, features_and_labels.gross_loss, **kwargs),
        'dropna')

    if features_and_labels.label_type is not None:
        labels = labels.astype(features_and_labels.label_type)

    # sanity check: look for infinite values in every available frame
    for frame in [features, labels, targets, sample_weights, gross_loss]:
        if frame is not None:
            # named max_value so the builtin ``max`` is not shadowed
            max_value = frame._.values.max()

            if np.isscalar(max_value) and np.isinf(max_value):
                # the "%s" placeholder is required: without it the logging
                # module fails to format the record and the offending rows
                # are never written to the log
                _log.warning(
                    "features containing infinit number\n%s",
                    frame[frame.apply(lambda r: np.isinf(r.values).any(),
                                      axis=1)])
                frame.replace([np.inf, -np.inf], np.nan, inplace=True)
                frame.dropna(inplace=True)

    # restrict all frames to the index they have in common
    common_index = intersection_of_index(features, labels, targets,
                                         sample_weights, gross_loss)

    return ((features.loc[common_index], len(df) - len(features) + 1),
            labels.loc[common_index], loc_if_not_none(targets, common_index),
            loc_if_not_none(sample_weights, common_index),
            loc_if_not_none(gross_loss, common_index))
def extract_features(df: pd.DataFrame, features_and_labels,
                     **kwargs) -> FeaturesWithTargets:
    """Extract features, targets and latent from ``df``.

    When ``features_and_labels.features`` is a tuple, each element is
    selected separately and the results are wrapped in a
    ``MultiFrameDecorator`` (e.g. for multi input layered networks).

    Args:
        df: source frame the selections are taken from.
        features_and_labels: object providing the ``features``,
            ``targets`` and ``latent`` selectors.
        **kwargs: forwarded unchanged to ``get_pandas_object``.

    Returns:
        ``FeaturesWithTargets`` of features, targets and latent, each
        restricted to the index common to features and targets (latent is
        not part of the index intersection).

    Raises:
        ValueError: if no feature rows are left after ``dropna()``.
    """
    feature_spec = features_and_labels.features

    if not isinstance(feature_spec, tuple):
        features = get_pandas_object(df, feature_spec, **kwargs).dropna()
    else:
        # one frame per feature set, combined into a single decorator
        frames = [
            get_pandas_object(df, selector, **kwargs).dropna()
            for selector in feature_spec
        ]
        features = MultiFrameDecorator(frames, True)

    raw_targets = get_pandas_object(df, features_and_labels.targets,
                                    **kwargs)
    targets = call_if_not_none(raw_targets, 'dropna')

    raw_latent = get_pandas_object(df, features_and_labels.latent, **kwargs)
    latent = call_if_not_none(raw_latent, 'dropna')

    shared_index = intersection_of_index(features, targets)

    # without a single feature row there is nothing to work with
    if len(features) < 1:
        raise ValueError("not enough data!")

    return FeaturesWithTargets(
        features.loc[shared_index],
        loc_if_not_none(targets, shared_index),
        loc_if_not_none(latent, shared_index))