コード例 #1
0
def test_determine_survival():
    """Check determine_survival for an ordinary age and an extreme age.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir``; both queries are made for ``sex="male"``.
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))

    # approx() wraps the reference value so that the 1% relative tolerance
    # is anchored to the expectation rather than to the value under test,
    # and so that failure output labels "expected" correctly.
    prob = determine_survival(df, 20, age=33, sex="male")
    assert prob == pytest.approx(0.1914, rel=0.01)

    # For age=150 the result must compare equal to zero exactly.
    prob = determine_survival(df, 20, age=150, sex="male")
    assert prob == 0
コード例 #2
0
def test_create_new_features():
    """Check aggregate statistics of the columns added by create_new_features.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir``. The test inspects the ``Fare_scaled``, ``Age_log`` and
    ``Sex`` columns of the returned frame.
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))
    df_new_features = create_new_features(df)

    # Fare_scaled is exponentiated before averaging (presumably it is a
    # log-transformed fare -- confirm against create_new_features).
    fare_mean = np.exp(df_new_features.Fare_scaled).mean()

    # approx() wraps the reference value so the 1% relative tolerance is
    # computed from the expectation, not from the value under test.
    assert fare_mean == pytest.approx(34.5500, rel=0.01)
    assert df_new_features.Age_log.sum() == pytest.approx(3355.0958, rel=0.01)
    assert df_new_features.Sex.sum() == 466
コード例 #3
0
def test_loading():
    """Verify the type, shape, index and ``Label`` column of the loaded data."""
    train_csv = os.path.join(data_dir, 'train.csv')
    test_csv = os.path.join(data_dir, 'test.csv')
    df = load_dataset(train_csv, test_csv)

    expected_rows = 1309

    assert isinstance(df, pd.DataFrame)
    assert df.shape == (expected_rows, 10)
    # The index must be the plain sequence 0 .. expected_rows - 1.
    assert df.index.tolist() == list(range(expected_rows))

    labels = df['Label']
    # Only the two markers 'Train' and 'Test' may appear in Label.
    assert set(labels.unique().tolist()) == {'Train', 'Test'}
    assert labels.value_counts().Train == 891
コード例 #4
0
def test_normalise():
    """Check the mean of the normalised ``Fare`` column for each ``Pclass``.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir`` and passed through ``normalise(df, "Fare")``; the mean of
    ``Fare`` is then compared against a reference value for classes 1-3.
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))
    # Original author's note: 0.1708061762649229 for col="Fare"
    scaled_df = normalise(df, "Fare")

    expected_mean_by_pclass = {1: 0.1708, 2: 0.2881, 3: 0.1912}
    for pclass, expected_mean in expected_mean_by_pclass.items():
        # Select the column as a Series so .mean() returns a scalar directly.
        # Indexing a label-indexed Series with ``[0]`` relies on a positional
        # fallback that pandas has deprecated.
        scaled_mean = scaled_df.loc[scaled_df.Pclass.eq(pclass), "Fare"].mean()
        # approx() wraps the reference value so the 1% relative tolerance is
        # anchored to the expectation.
        assert scaled_mean == pytest.approx(expected_mean, rel=0.01), (
            "unexpected mean scaled Fare for Pclass %d" % pclass)
コード例 #5
0
def test_miss_data():
    """Check the missing-value summary returned by get_missing_values.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir``. The summary's ``Total`` and ``Percent`` columns are
    checked overall and for its first and last rows.
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))
    df_miss = get_missing_values(df)

    # Sum of the Total column across every row of the summary.
    miss_total = df_miss.Total.sum()
    assert miss_total == 682

    # approx() wraps the reference value so the 1% relative tolerance is
    # computed from the expectation, not from the value under test.
    first_row = df_miss.iloc[0]
    assert first_row.Total == 418
    assert first_row.Percent == pytest.approx(31.9328, rel=0.01)

    last_row = df_miss.iloc[-1]
    assert last_row.Total == 0
    # A relative tolerance around zero is zero, so this comparison is
    # governed by approx()'s default absolute tolerance.
    assert last_row.Percent == pytest.approx(0, rel=0.01)
コード例 #6
0
def test_get_survived_per_class():
    """Compare get_survived_per_class output, serialised to JSON, by group.

    The function is called once grouped by ``Pclass`` and once grouped by
    ``Sex``; each result's ``to_json()`` string must match exactly.
    """
    train_csv = os.path.join(data_dir, 'train.csv')
    test_csv = os.path.join(data_dir, 'test.csv')
    df = load_dataset(train_csv, test_csv)

    expected_pclass_json = (
        '{"Pclass":{"0":1,"1":2,"2":3},"Survived":{"0":0.63,"1":0.47,"2":0.24}}'
    )
    expected_sex_json = (
        '{"Sex":{"0":"female","1":"male"},"Survived":{"0":0.74,"1":0.19}}'
    )

    by_pclass = get_survived_per_class(df, group_by_column_name="Pclass")
    assert by_pclass.to_json() == expected_pclass_json

    by_sex = get_survived_per_class(df, group_by_column_name="Sex")
    assert by_sex.to_json() == expected_sex_json
コード例 #7
0
def test_outliers():
    """Check the outlier count and the max-fare name from get_outliers."""
    train_csv = os.path.join(data_dir, 'train.csv')
    test_csv = os.path.join(data_dir, 'test.csv')
    df = load_dataset(train_csv, test_csv)

    # Reference values the two returned items must equal.
    expected_count = 171
    expected_name = 'Ward, Miss. Anna'

    outlier_result = get_outliers(df)
    n_outliers, name_max_fare = outlier_result

    assert n_outliers == expected_count
    assert name_max_fare == expected_name
コード例 #8
0
def test_correlation():
    """Check that get_correlation returns roughly 0.178 for the dataset.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir``.
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))
    result = get_correlation(df)
    # approx() wraps the reference value so the 1% relative tolerance is
    # anchored to the expectation rather than to the value under test.
    assert result == pytest.approx(0.178, rel=0.01)
コード例 #9
0
def test_substitute_value():
    """Spot-check two cells of the frame returned by substitute_missing_values.

    The dataset is loaded from ``train.csv`` and ``test.csv`` under
    ``data_dir``. The ``Age`` at position 17 and the ``Fare`` at position
    1043 are compared against reference values (presumably these cells are
    missing in the raw data -- confirm against the CSV files).
    """
    df = load_dataset(os.path.join(data_dir, 'train.csv'),
                      os.path.join(data_dir, 'test.csv'))
    result = substitute_missing_values(df)
    # approx() wraps the reference value so the 1% relative tolerance is
    # computed from the expectation, not from the value under test.
    assert result.iloc[17].Age == pytest.approx(29.8811, rel=0.01)
    assert result.iloc[1043].Fare == pytest.approx(14.4542, rel=0.01)