Esempio n. 1
0
    def test_contains_works_with_AgeBin_and_string(self):
        # contains() is exercised with the same inner bin supplied two ways:
        # first as its string encoding, then as a parsed AgeBin object.
        container = AgeBin(10, 99)
        inner_as_string = "[10, 15)"

        self.assertTrue(container.contains(inner_as_string))
        self.assertTrue(container.contains(AgeBin.from_string(inner_as_string)))
    def test_upsample_agebin_raises_if_data_does_not_contain_requested_agebin(
            self):
        # Every requested bin below must make the grouped apply of
        # upsample_agebin raise NotUpsampleable.
        uncovered_bin_strings = (
            '[0:10)',   # overlap lower age
            '[10:18)',  # overlap upper age
            # overlap both lower and upper ages (requesting too large an age range on both sides)
            '[4:16)',
        )
        for bin_string in uncovered_bin_strings:
            requested_bin = AgeBin.from_string(bin_string)
            with self.assertRaises(NotUpsampleable):
                self.grouped_data.apply(
                    upsample_agebin,
                    age_bin=requested_bin,
                    aggregated_cols=self.aggregation_columns,
                    weighted_cols=self.weighted_columns,
                    weighting_col=self.weighting_column)
    def test_upsample_agebin_raises_if_data_not_edge_aligned(self):
        # Every requested bin below must make the grouped apply of
        # upsample_agebin raise NotUpsampleable.
        misaligned_bin_strings = (
            '[6:10)',  # not lower-edge aligned
            '[5:14)',  # not upper-edge aligned
            '[6:14)',  # not lower or upper edge aligned
        )
        for bin_string in misaligned_bin_strings:
            requested_bin = AgeBin.from_string(bin_string)
            with self.assertRaises(NotUpsampleable):
                self.grouped_data.apply(
                    upsample_agebin,
                    age_bin=requested_bin,
                    aggregated_cols=self.aggregation_columns,
                    weighted_cols=self.weighted_columns,
                    weighting_col=self.weighting_column)
Esempio n. 4
0
 def test_equality_comparison(self):
     # The == operator is checked directly: identical bounds compare equal,
     # while a different end or a different start does not.
     reference = AgeBin(0, 99)
     self.assertTrue(reference == AgeBin(0, 99))

     for start, end in ((0, 98), (1, 99)):
         self.assertFalse(reference == AgeBin(start, end))
Esempio n. 5
0
    def test_instantiation(self):
        # Constructed without a delimiter: the class default is expected.
        default_bin = AgeBin(15, 49)
        for attribute, wanted in (('start', 15),
                                  ('end', 49),
                                  ('delimiter', AgeBin.DEFAULT_DELIMITER)):
            self.assertEqual(getattr(default_bin, attribute), wanted)

        # Constructed with an explicit delimiter: it must be stored as given.
        custom_bin = AgeBin(0, 99, delimiter=', ')
        for attribute, wanted in (('start', 0),
                                  ('end', 99),
                                  ('delimiter', ', ')):
            self.assertEqual(getattr(custom_bin, attribute), wanted)
Esempio n. 6
0
    def test_merge_raises_if_not_consecutive_ages(self):
        teens = AgeBin(10, 15)
        far_away = AgeBin(100, 200)

        # bins that share no edge are rejected in either direction
        with self.assertRaises(AgeBin.NotMergeable):
            teens.merge(other_bin=far_away)
        with self.assertRaises(AgeBin.NotMergeable):
            far_away.merge(other_bin=teens)

        just_below = AgeBin(5, 10)
        # wrong order
        with self.assertRaises(AgeBin.NotMergeable):
            teens.merge(other_bin=just_below)
        # right order
        combined = AgeBin(5, 15)
        self.assertEqual(just_below.merge(teens), combined)
Esempio n. 7
0
    def test_merge_works_with_AgeBin_and_string(self):
        # merge() is given the upper bin first as a string, then as the
        # AgeBin parsed from that same string; both must give the same result.
        merged_total = AgeBin(10, 99)
        lower = AgeBin(10, 15)
        upper_as_string = "[15:99)"

        self.assertEqual(lower.merge(upper_as_string), merged_total)
        self.assertEqual(lower.merge(AgeBin.from_string(upper_as_string)),
                         merged_total)
Esempio n. 8
0
def upsample_agebin(grouped_data, age_bin, aggregated_cols, weighted_cols, weighting_col):
    """
    Upsample a pandas DataFrame containing an 'AgeBin' column to the requested age_bin.

    Intended to be supplied as a dataframe groupby argument (to run on each group). It is ok for data outside the
    requested upsample range to be in this dataframe; it will simply be excluded in the result.

    Example usage:
        age_stratified_dataframe.groupby(['Year', 'Gender']).apply(
            upsample_agebin, age_bin=AgeBin(15, 49), aggregated_cols=['Count'],
            weighted_cols=['Prevalence'], weighting_col='Count')

    :param grouped_data: the DataFrame of one group (as handed over by groupby(...).apply); must have an 'AgeBin'
        column.
    :param age_bin: an AgeBin object representing inclusive lower and exclusive upper bounds.
    :param aggregated_cols: columns in the grouped data/dataframe to plainly sum over the in-range rows
    :param weighted_cols: columns in the grouped data/dataframe to do weighted sums of
    :param weighting_col: column providing the weights: each in-range row is weighted by its value in this column
        divided by the in-range total of this column. It does not have to be listed in aggregated_cols.
    :return: A pandas DataFrame object with one row containing the requested AgeBin-upsampled result
    :raises NotUpsampleable: if AgeBin.can_upsample_bins rejects the 'AgeBin' values for the requested age_bin
    """
    # Further notes:
    # verify we can do the requested upsample; this requires EXACT stitching of 'AgeBin' values to contain age_bin,
    # though data outside the requested range will be ignored here and in the upsampling.
    if not AgeBin.can_upsample_bins(grouped_data['AgeBin'], age_bin):
        raise NotUpsampleable('Cannot upsample to age bin: %s . Data is missing.' % age_bin)

    # filter out data rows that are out of our requested age range, e.g. [50:55) is not in range of [15:49)
    in_range = [age_bin.contains(ab) for ab in grouped_data['AgeBin']]
    filtered_df = grouped_data.loc[in_range]

    # grab row 0 and keep it as our base result (it carries the columns that are neither aggregated nor weighted);
    # apply upsampled AgeBin
    result = grouped_data[0:1].reset_index(drop=True)
    result['AgeBin'] = str(age_bin)

    # aggregated data items: plain sums over the in-range rows
    for channel in aggregated_cols:
        result[channel] = np.sum(filtered_df[channel])

    # The weight total is computed directly from the filtered rows instead of being picked up as a side effect of
    # the aggregation loop, so the weighted sums also work when weighting_col is not one of aggregated_cols
    # (previously that left the total as None and the division below raised TypeError).
    total_weight = np.sum(filtered_df[weighting_col])

    # weighted sum items: model and reference data
    fraction = filtered_df[weighting_col] / total_weight
    for channel in weighted_cols:
        result[channel] = np.sum(fraction * filtered_df[channel])

    return result
Esempio n. 9
0
 def test_from_string_works_properly(self):
     # A multi-character delimiter must be recovered along with both bounds.
     parsed = AgeBin.from_string('[0:;:99)')
     for attribute, wanted in (('start', 0), ('end', 99), ('delimiter', ':;:')):
         self.assertEqual(getattr(parsed, attribute), wanted)
Esempio n. 10
0
    def test_can_upample_bins_raises_if_target_bin_not_contained_by_bins(self):
        # Despite the name, nothing is expected to raise here: can_upsample_bins
        # must simply answer False for every target listed.
        bins = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
        rejected_targets = (
            (0, 16), (-1, 15),
            (15, 99), (16, 99),
            (-5, 0), (-5, 1),
        )
        for start, end in rejected_targets:
            target = AgeBin(start, end)
            self.assertFalse(AgeBin.can_upsample_bins(bins=bins, target_bin=target))
Esempio n. 11
0
 def test_merge_bins_works_with_AgeBin_and_string(self):
     # merge_bins receives a mix of a string-encoded bin and an AgeBin object.
     mixed_bins = ['[0:49)', AgeBin(49, 99)]
     self.assertEqual(AgeBin.merge_bins(bins=mixed_bins), AgeBin(0, 99))
    def test_upsample_agebin_works(self):
        def normalized(frame):
            # Bring a frame into a comparable row order: by Gender, fresh index.
            return frame.sort_values('Gender').reset_index(
                drop=True).sort_index()

        def upsampled(bin_string):
            # Run upsample_agebin over the grouped fixture for one target bin.
            applied = self.grouped_data.apply(
                upsample_agebin,
                age_bin=AgeBin.from_string(bin_string),
                aggregated_cols=self.aggregation_columns,
                weighted_cols=self.weighted_columns,
                weighting_col=self.weighting_column)
            return normalized(applied)

        def expected_frame(bin_string, male_values, female_values):
            # Each *_values is (Prevalence, Sim_Prevalence, Count); the key
            # order of the row dicts fixes the column order of the frame.
            rows = []
            for gender, values in (('Male', male_values),
                                   ('Female', female_values)):
                prevalence, sim_prevalence, count = values
                rows.append({
                    'Gender': gender,
                    'AgeBin': bin_string,
                    'Prevalence': prevalence,
                    'Sim_Prevalence': sim_prevalence,
                    'Count': count
                })
            return normalized(pd.DataFrame(rows))

        # single-bin targets are compared exactly
        result = upsampled('[5:10)')
        expected_result = expected_frame('[5:10)', (0.1, 0.4, 5), (0.3, 0.2, 20))
        self.assertTrue(result.equals(expected_result))

        result = upsampled('[10:15)')
        expected_result = expected_frame('[10:15)', (0.2, 0.3, 15), (0.4, 0.1, 20))
        self.assertTrue(result.equals(expected_result))

        # target spanning two bins
        result = upsampled('[5:15)')
        expected_result = expected_frame('[5:15)', (0.175, 0.325, 20), (0.35, 0.15, 40))

        numerical_cols = ['Prevalence', 'Sim_Prevalence', 'Count']
        other_cols = ['Gender', 'AgeBin']

        # checking that numerical values are REALLY close; off a bit due to division in algorithm
        numbers_match = np.allclose(result[numerical_cols],
                                    expected_result[numerical_cols],
                                    atol=1e-16,
                                    rtol=0)
        self.assertTrue(numbers_match)

        # checking non-numerical values are EXACT
        self.assertTrue(result[other_cols].equals(expected_result[other_cols]))
Esempio n. 13
0
 def test_merge_bins_works_with_unsorted_AgeBins(self):
     # The bins are deliberately listed upper-first.
     reversed_bins = [AgeBin(49, 99), AgeBin(0, 49)]
     self.assertEqual(AgeBin.merge_bins(bins=reversed_bins), AgeBin(0, 99))
Esempio n. 14
0
    def test_contains_works_properly(self):
        big_ab = AgeBin(10, 99)

        # testing a variety of edge cases, both 'contained' and not 'contained'
        not_contained = ((0, 9), (0, 10), (0, 11),
                         (98, 200), (99, 200), (100, 200))
        for start, end in not_contained:
            self.assertFalse(big_ab.contains(AgeBin(start, end)))

        # identical bounds: containment must hold both ways
        twin = AgeBin(10, 99)
        self.assertTrue(big_ab.contains(twin))
        self.assertTrue(twin.contains(big_ab))

        for start, end in ((10, 15), (15, 30)):
            self.assertTrue(big_ab.contains(AgeBin(start, end)))

        top_slice = AgeBin(90, 99)
        self.assertTrue(big_ab.contains(top_slice))
        self.assertFalse(top_slice.contains(big_ab))  # and check the inverse case...
Esempio n. 15
0
 def test_merge_bins_works_properly(self):
     # Three bins sharing edges are expected to collapse into one [0, 15) bin.
     consecutive = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
     self.assertEqual(AgeBin.merge_bins(bins=consecutive), AgeBin(0, 15))
Esempio n. 16
0
 def test_merge_sets_proper_delimiter(self):
     # The bin merge() is called on carries a custom delimiter; the merged
     # bin is expected to carry that delimiter.
     custom = AgeBin(5, 10, delimiter='###')
     plain = AgeBin(10, 15)
     combined = custom.merge(plain)

     # sanity check: the two inputs really do use different delimiters
     self.assertNotEqual(custom.delimiter, plain.delimiter)
     self.assertEqual(combined.delimiter, '###')
Esempio n. 17
0
 def test_merge_bins_raises_if_unmergeable(self):
     # The first bin ends at 5 but the second starts at 6.
     gapped = [AgeBin(0, 5), AgeBin(6, 10)]
     with self.assertRaises(AgeBin.NotMergeable):
         AgeBin.merge_bins(bins=gapped)
Esempio n. 18
0
 def test_can_upsample_bins_works_properly(self):
     # Every target below starts and ends on edges of the listed bins, and
     # can_upsample_bins is expected to accept each of them.
     bins = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]
     accepted_targets = ((0, 15), (0, 10), (5, 10), (5, 15), (0, 5), (10, 15))
     for start, end in accepted_targets:
         target = AgeBin(start, end)
         self.assertTrue(AgeBin.can_upsample_bins(bins=bins, target_bin=target))
Esempio n. 19
0
 def test_can_upsample_bins_works_if_no_bins_are_provided(self):
     # An empty bin list is expected to yield False for the target.
     self.assertFalse(AgeBin.can_upsample_bins(bins=[], target_bin=AgeBin(0, 5)))
Esempio n. 20
0
    def test_can_upsample_bins_raises_if_target_bin_edges_do_not_line_up(self):
        # Despite the name, nothing is expected to raise here: can_upsample_bins
        # must answer False for each misaligned target.
        bins = [AgeBin(0, 5), AgeBin(5, 10), AgeBin(10, 15)]

        # baseline: a target whose edges match the bins is accepted
        self.assertTrue(AgeBin.can_upsample_bins(bins=bins, target_bin=AgeBin(0, 15)))

        # a variety of misalignments relative to stated bins
        for start, end in ((0, 14), (5, 11), (4, 10), (1, 15)):
            target = AgeBin(start, end)
            self.assertFalse(AgeBin.can_upsample_bins(bins=bins, target_bin=target))
Esempio n. 21
0
 def test_test_can_upsample_bins_works_with_AgeBin_and_string(self):
     # The middle bin is supplied as a string while its neighbours are AgeBin objects.
     mixed_bins = [AgeBin(0, 5), '[5:::10)', AgeBin(10, 15)]
     target = AgeBin(5, 15)
     self.assertTrue(AgeBin.can_upsample_bins(bins=mixed_bins, target_bin=target))