Example #1
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal
import nyaggle.feature_store as fs
# get_temp_directory: the suite's local test helper, assumed to yield a temporary directory path.


def test_load_feature():
    # Round-trip check: a feature saved under id 0 loads back identical to the original frame.
    df = pd.DataFrame()

    df['a'] = np.arange(100)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        df_loaded = fs.load_feature(0, tmp)
        assert_frame_equal(df, df_loaded)
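
For context, nyaggle's feature store persists each feature as a Feather file named after the feature id, so the round trip above is roughly equivalent to the sketch below. This is an illustration rather than nyaggle's actual implementation; the ".f" extension and the helper names are assumptions.

import os
import pandas as pd

def save_feature_sketch(df: pd.DataFrame, feature_name, directory: str) -> None:
    # hypothetical stand-in for fs.save_feature: one Feather file per feature id
    df.to_feather(os.path.join(directory, f"{feature_name}.f"))

def load_feature_sketch(feature_name, directory: str) -> pd.DataFrame:
    # hypothetical stand-in for fs.load_feature
    return pd.read_feather(os.path.join(directory, f"{feature_name}.f"))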
Example #2
def test_load_feature_ignore_all_columns():
    # Columns listed in ignore_columns are dropped on load; names not present in
    # the saved feature (here 'X') are skipped rather than raising.
    df = pd.DataFrame()

    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(int)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        df_loaded = fs.load_feature(0, tmp, ignore_columns=['a', 'b', 'c', 'X'])

        assert_frame_equal(df_loaded, df.drop(['a', 'b', 'c'], axis=1))
Example #3
def test_various_dtypes():
    # Saving and loading preserves each numeric dtype exactly, from float down to uint8.
    df = pd.DataFrame()

    df['a'] = np.arange(100).astype(float)
    df['b'] = np.arange(100).astype(int)
    df['c'] = np.arange(100).astype(np.uint8)
    df['d'] = np.arange(100).astype(np.uint16)
    df['e'] = np.arange(100).astype(np.uint32)
    df['f'] = np.arange(100).astype(np.int8)
    df['g'] = np.arange(100).astype(np.int16)
    df['h'] = np.arange(100).astype(np.int32)
    df['i'] = np.arange(100).astype(np.int64)

    with get_temp_directory() as tmp:
        fs.save_feature(df, 0, tmp)

        df_loaded = fs.load_feature(0, tmp)
        assert_frame_equal(df, df_loaded)
Example #4
import pandas as pd
from nyaggle.experiment import run_experiment
from nyaggle.feature_store import load_features, load_feature
from sklearn.metrics import average_precision_score

from src.utils import prauc, get_folds

submission = pd.read_csv("input/atmaCup5__sample_submission.csv")
all_df = load_feature("all", "working")

# Concatenate the named saved features onto the base frame; the id columns
# inside the saved features are dropped via ignore_columns.
data = load_features(
    all_df,
    feature_names=[
        "fitting",
        "peak_around",
        "intensity_stats",
        "savgol_peak",
        "spec_percentile",
        "fitting_combination",
    ],
    ignore_columns=["spectrum_id", "spectrum_filename", "chip_id"],
)

# Rows with a target value are train; rows where it is missing are test
# (both were stored in a single combined feature frame).
train = data[data.target.notnull()].copy()
test = data[data.target.isnull()].copy()

target_col = "target"
drop_cols = ["spectrum_id", "spectrum_filename", "chip_id"]
X_train = train.drop(drop_cols + [target_col], axis=1)
y_train = train[target_col]
X_test = test.drop(drop_cols + [target_col], axis=1)
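
The excerpt stops before the imported run_experiment, prauc and get_folds are actually called. A minimal sketch of how the training call might continue is shown below; the LightGBM parameters are placeholders and the behaviour of the project helpers is assumed, only the run_experiment interface itself comes from nyaggle.

params = {
    "objective": "binary",
    "learning_rate": 0.05,
    "num_leaves": 31,
}

result = run_experiment(
    params,
    X_train,
    y_train,
    X_test=X_test,
    cv=get_folds(train),   # project helper, assumed to return a scikit-learn style CV splitter
    eval_func=prauc,       # project metric, assumed to take (y_true, y_pred) like average_precision_score
    sample_submission=submission,
)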
Example #5
import numpy as np
import pandas as pd
import scipy.signal
from nyaggle.feature_store import load_feature


# The original listing starts mid-function; the signature below is inferred from the
# call in __main__, and the imports above are the ones this excerpt needs.
def create_pad_spectrum(df, spec):
    # Pivot each spectrum's intensities into fixed-width columns, forward-filling
    # shorter spectra so every row carries 512 intensity_* values.
    spec = spec.copy()
    spec["wave_index"] = spec.groupby("spectrum_filename").intensity.transform(
        lambda x: np.arange(len(x)))
    feat = pd.pivot(spec,
                    index="spectrum_filename",
                    columns="wave_index",
                    values="intensity").ffill(axis=1)
    feat.columns = [f"intensity_{i:03d}" for i in range(512)]
    df = df.merge(feat, left_on="spectrum_filename", right_index=True)
    # Return only the newly added intensity columns, aligned to df's row order.
    return df.iloc[:, -len(feat.columns):]


if __name__ == "__main__":
    submission = pd.read_csv("input/atmaCup5__sample_submission.csv")
    train = pd.read_csv("input/train.csv")
    all_df = load_feature("all", "working")
    spec = load_feature("spec", "working")
    pad_spec = create_pad_spectrum(all_df, spec)

    # add derivative spectra
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html
    spec_array = np.stack(
        [
            pad_spec.values,
            scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=0, axis=1),
            scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=1, axis=1),
            scipy.signal.savgol_filter(pad_spec, 5, 2, deriv=2, axis=1),
        ],
        axis=1,
    )  # (14388, 4, 512)
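
The excerpt ends with the stacked raw and derivative spectra; the original feature computation is not shown. Purely as an illustration of what a "savgol_peak"-style feature could look like, the sketch below derives per-channel peak statistics from spec_array with scipy.signal.find_peaks. The prominence threshold and column names are arbitrary assumptions.

peak_feats = []
for row in spec_array:  # row has shape (4, 512): raw spectrum + three Savitzky-Golay channels
    stats = []
    for channel in row:
        peaks, props = scipy.signal.find_peaks(channel, prominence=0.01)
        stats.append(len(peaks))                                         # number of prominent peaks
        stats.append(props["prominences"].max() if len(peaks) else 0.0)  # strongest prominence
    peak_feats.append(stats)

peak_df = pd.DataFrame(
    peak_feats,
    index=pad_spec.index,
    columns=[f"peak_{c}_{s}" for c in range(4) for s in ("count", "max_prom")],
)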