Example 1

import random
import time

import numpy as np
import pandas as pd
import unittest2

# Project-specific helpers (import path assumed; adjust to the repo layout)
from features import (FEATURES, FeatureAdder, Preprocessor, StatsCalculator,
                      find_train_stats)


class TestFeatureAdder(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.feature_adder = FeatureAdder()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None
        self.feature_adder = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_max_index_feature(self):
        """
        Test that new feature 'max_feature_2_index' lies in proper range and has dtype 'int64'
        """
        df = self._get_df()
        new_feature = 'max_feature_2_index'

        df = self.feature_adder.max_index_feature(df)
        valid_range, valid_dtype = (0, 255), 'int64'

        # use unittest assertions instead of a bare `assert` (which is
        # stripped under `python -O`), and check the two conditions separately
        self.assertTrue(df[new_feature].between(*valid_range).all(),
                        "max_feature_2_index feature not in range")
        self.assertEqual(df[new_feature].dtype, valid_dtype,
                         "max_feature_2_index feature has wrong dtype")

    def test_abs_mean_diff_feature(self):
        """
        Test that new feature 'max_feature_2_abs_mean_diff' is valid
        """
        df = self._get_df()
        df = self.feature_adder.max_index_feature(df)
        new_feature = 'max_feature_2_abs_mean_diff'
        cols = np.array(self.col_names)[df['max_feature_2_index'].values]
        train_stats = find_train_stats('data/train.tsv', chunksize=10000)
        df = self.feature_adder.abs_mean_diff_feature(
            df.loc[:, df.columns != 'id_job'], train_stats)
        results = []

        for i, col in enumerate(cols):
            # keep in mind outliers in the test data
            lower_bound, upper_bound = 0, train_stats[col]['std']
            results.append(lower_bound <= df[new_feature].iloc[i] <= upper_bound)

        self.assertTrue(
            np.all(results),
            "max_feature_2_abs_mean_diff feature not in expected range"
        )


class TestStatsCalculator(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_mean_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)

        res = self.stats_calc.calc_mean(df, col)
        valid_res = np.mean(df[col])

        self.assertEqual(res, valid_res, "Wrong mean calculation")

    def test_std_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)

        res = self.stats_calc.calc_std(df, col)
        valid_res = np.std(df[col])

        self.assertEqual(res, valid_res, "Wrong std calculation")

    def test_speed(self):
        """
        Test parallelized mean calculation
        """
        df = self._get_df()
        col = random.choice(self.col_names)

        def wrapper(func):
            def inner(df, col, multiproc=False):
                start = time.time()
                # forward the flag so the parallel path is actually exercised
                # (assumes calc_mean accepts a `multiproc` keyword)
                result = func(df, col, multiproc=multiproc)
                end = time.time()
                print(f'\nResult of calculation: {result}')
                if multiproc:
                    print(f'Timing of calc in parallel: {end - start}')
                else:
                    print(f'Timing of sequential calc: {end - start}')
                return result

            return inner

        timed_calc = wrapper(self.stats_calc.calc_mean)
        seq_res = timed_calc(df, col)
        par_res = timed_calc(df, col, multiproc=True)

        true_value = np.mean(df[col])
        self.assertAlmostEqual(seq_res, true_value, msg="Wrong sequential mean calculation")
        self.assertAlmostEqual(par_res, true_value, msg="Wrong parallel mean calculation")