Example #1
0
 def test_imputeFeature_value(self):
     """Impute 'b' with method='value' and check the constant lands in row 2."""
     fillValue = -5
     frame = self.dfs['nan2'].copy()  # copy so the shared fixture is not modified
     utdata.imputeFeature(data=frame,
                          feature='b',
                          method='value',
                          methodValue=fillValue)
     # Row 2 of 'b' is presumably the missing entry of the 'nan2' fixture.
     imputed = frame.loc[2, 'b']
     self.assertAlmostEqual(imputed, fillValue)
Example #2
0
 def test_imputeFeature_regress(self):
     """Impute 'b' with method='linear' (excluding 'a') and check rows 1 and 2."""
     frame = self.dfs['nan3'].copy()  # copy so the shared fixture is not modified
     utdata.imputeFeature(data=frame,
                          feature='b',
                          method='linear',
                          methodExclude=['a'])
     # (row label, expected value of 'b' after imputation), checked in order.
     expectations = ((1, 1), (2, 2))
     for row, expected in expectations:
         self.assertAlmostEqual(frame.loc[row, 'b'], expected)
Example #3
0
 def test_imputeFeature_trivial_mean_median_mode(self):
     """Check that mean, median and mode imputation of 'a' each turn 'nan1' into 'full'."""
     methods = ('mean', 'median', 'mode')
     for imputeMethod in methods:
         # One sub-test per method so a failure names the offending method.
         with self.subTest(method=imputeMethod):
             frame = self.dfs['nan1'].copy()
             utdata.imputeFeature(data=frame,
                                  feature='a',
                                  method=imputeMethod)
             # Only values are compared; dtype differences are tolerated.
             pd.testing.assert_frame_equal(frame,
                                           self.dfs['full'],
                                           check_dtype=False)
Example #4
0
def _encodeEmbarkedAndSex(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Impute Embarked and one-hot encode Embarked and Sex

    Args
        frame: DataFrame with Embarked and Sex columns; it is passed to
            utdata.imputeFeature, which is expected to fill Embarked in place

    Returns
        New DataFrame where Embarked and Sex are replaced by indicator columns;
        EmbarkedQ and Sexmale are dropped and Sexfemale is renamed to Female
    """
    # The reference columns dropped below (EmbarkedQ, Sexmale) and the renamed
    # Sexfemale only exist when the categories occur in the input, so verify
    # that before encoding.
    for column, categories in (('Sex', ('male', 'female')),
                               ('Embarked', ('S', 'C', 'Q'))):
        present = frame[column].values
        for category in categories:
            assert category in present, (
                "value '%s' not found in column '%s'" % (category, column))

    utdata.imputeFeature(frame,
                         feature='Embarked',
                         method='mode',
                         verbose=False)
    encoded = pd.get_dummies(frame,
                             columns=['Embarked', 'Sex'],
                             prefix_sep='',
                             dtype=utdata.CATEGORICAL_TYPE)
    encoded = encoded.drop(columns=['EmbarkedQ', 'Sexmale'])
    return encoded.rename(columns={'Sexfemale': 'Female'})


def _addSyntheticFeatures(frame: pd.DataFrame) -> pd.DataFrame:
    """
    Add missing-value indicators and one-hot encoded passenger titles

    Args
        frame: DataFrame with Cabin, Age and Name columns; the CabinNan, AgeNan
            and Title columns are added to it in place before encoding

    Returns
        New DataFrame with CabinNan, AgeNan and the Title indicator columns
        (all except TitleMr and TitleMrs) appended
    """
    frame['CabinNan'] = frame['Cabin'].isna().astype(utdata.CATEGORICAL_TYPE)
    # Must be computed before Age is imputed, otherwise the flag is always 0.
    frame['AgeNan'] = frame['Age'].isna().astype(utdata.CATEGORICAL_TYPE)

    titles = frame['Name'].apply(utdata.getTitle)
    # Map the French forms onto their English equivalents.
    titles.loc[titles == 'Mlle'] = 'Miss'
    titles.loc[titles == 'Mme'] = 'Mrs'
    # Everything outside the four common titles is lumped into one category.
    commonTitles = ['Mr', 'Miss', 'Mrs', 'Master']
    titles.loc[~titles.isin(commonTitles)] = 'Rare'
    frame['Title'] = titles

    encoded = pd.get_dummies(frame,
                             columns=['Title'],
                             prefix_sep='',
                             dtype=utdata.CATEGORICAL_TYPE)
    return encoded.drop(columns=['TitleMr', 'TitleMrs'])


def _featuresPipeline(data: pd.DataFrame,
                      sibSpCutoff: Union[None, int] = 1,
                      parchCutoff: Union[None, int] = 1,
                      ageImputeMethod: str = 'mean',
                      syntheticFeatures: bool = False) -> pd.DataFrame:
    """
    Data cleaning of features, the full pipeline

    The input is copied first, so the caller's DataFrame is left untouched.

    Args
        data: DataFrame with features as columns; the columns Sex, Embarked,
            Cabin, Ticket, Name, Fare, Age, SibSp and Parch are accessed
        sibSpCutoff: Upper level to clip SibSp feature, None to skip clipping
        parchCutoff: Upper level to clip Parch feature, None to skip clipping
        ageImputeMethod: Method name forwarded to utdata.imputeFeature for the
            Age feature (documented so far as 'mean', 'median', 'mode',
            'logistic', 'tree'; see utdata.imputeFeature for the supported set)
        syntheticFeatures: Add new synthetic features CabinNan, AgeNan and the
            Title indicator columns

    Returns
        DataFrame with transformed features as columns; Cabin, Ticket and Name
        are removed, Embarked and Sex are replaced by indicator columns
    """

    dataC = data.copy()  # type: pd.DataFrame

    # -- Embarked, Sex
    dataC = _encodeEmbarkedAndSex(dataC)

    # -- Cabin, Name, Ticket
    if syntheticFeatures:
        dataC = _addSyntheticFeatures(dataC)
    dataC = dataC.drop(columns=['Cabin', 'Ticket', 'Name'])

    # -- Fare, Age
    utdata.clipFeature(dataC, 'Fare', nStd=3)
    utdata.clipFeature(dataC, 'Age', nStd=3)
    utdata.imputeFeature(dataC, feature='Fare', method='mean', verbose=False)
    # methodValue / methodExclude are presumably only consulted by some of the
    # imputation methods; they are always passed so any method name works.
    utdata.imputeFeature(dataC,
                         feature='Age',
                         method=ageImputeMethod,
                         methodValue=-100,
                         methodExclude=['Survived', 'EmbarkedS', 'EmbarkedC'],
                         verbose=False)
    # Internal invariant: nothing may be missing after imputation.
    assert dataC.isna().sum().sum() == 0, dataC.isna().sum()

    # -- SibSp, Parch
    for column, cutoff in (('SibSp', sibSpCutoff), ('Parch', parchCutoff)):
        if cutoff is not None:
            dataC.loc[dataC[column] > cutoff, column] = cutoff

    return dataC
Example #5
0
 def test_imputeFeature_median(self):
     """Impute 'b' with method='median' and check that row 2 becomes 0."""
     frame = self.dfs['nan2'].copy()  # copy so the shared fixture is not modified
     utdata.imputeFeature(data=frame,
                          feature='b',
                          method='median')
     imputed = frame.loc[2, 'b']
     self.assertAlmostEqual(imputed, 0.0)