def test_all_missing(self):
    """Call ``util.mean_impute_frame`` on input whose first series is all NaN.

    The result is discarded and nothing is asserted, so this passes whenever
    the calls complete without raising.
    NOTE(review): presumably an expected-exception decorator or context
    manager belongs with this test -- confirm against the full test class.
    """
    labels = ["bla", "blue", "blo"]
    all_nan = [np.nan] * 3
    middle_gap = [0.4, np.nan, 0.8]
    tail_gap = [0.3, 0.0, np.nan]

    frame = self.make_input(labels, [all_nan, middle_gap, tail_gap])
    util.mean_impute_frame(frame)
Beispiel #2
0
    def test_all_missing(self):
        """Feed ``util.mean_impute_frame`` a frame with one all-NaN series.

        Nothing is asserted on the return value; the test succeeds as long as
        building the input and imputing it do not raise.
        NOTE(review): an expected-failure check may be missing here -- verify
        against the enclosing test class.
        """
        nan = np.nan
        series = [
            [nan, nan, nan],  # no observed value at all
            [0.4, nan, 0.8],
            [0.3, 0.0, nan],
        ]
        frame = self.make_input(["bla", "blue", "blo"], series)

        util.mean_impute_frame(frame)
    def test_impute_one_series(self):
        """Expect the single NaN in the middle series to become that series' mean.

        The observed values 0.4 and 0.8 average to 0.6; the two complete
        series are expected back unchanged.
        """
        labels = ["bla", "blue", "blo"]
        complete_a = [0.2, 0.4, 0.4]
        gappy = [0.4, np.nan, 0.8]
        complete_b = [0.3, 0.0, 0.8]

        # Each pair is (expected values, 0/1 list); in this fixture the 1
        # sits exactly at the position that is NaN in the input.
        wanted = self.make_expected(labels, [
            (complete_a, [0, 0, 0]),
            ([0.4, 0.6, 0.8], [0, 1, 0]),
            (complete_b, [0, 0, 0]),
        ])

        frame = self.make_input(labels, [complete_a, gappy, complete_b])
        got = util.mean_impute_frame(frame)
        assert_frame_equal(wanted, got)
Beispiel #4
0
    def test_no_impute(self):
        """Expect input without any NaN to come back with its values unchanged.

        Every series is paired with an all-zero list in the expected frame.
        """
        labels = ["bla", "blue", "blo"]
        first = [0.2, 0.4, 0.4]
        second = [0.4, 0.8, 0.8]
        third = [0.3, 0.0, 0.8]

        # (values, 0/1 list) per series; all zeros because nothing is missing.
        pairs = [(column, [0, 0, 0]) for column in (first, second, third)]
        wanted = self.make_expected(labels, pairs)

        frame = self.make_input(labels, [first, second, third])
        got = util.mean_impute_frame(frame)
        assert_frame_equal(wanted, got)
Beispiel #5
0
    def test_impute_three_series(self):
        """Expect NaNs in all three series to be filled with per-series means.

        Expected fills: 0.4 (only observed value), 0.6 (mean of 0.4 and 0.8)
        and 0.15 (mean of 0.3 and 0.0).
        """
        nan = np.nan
        labels = ["bla", "blue", "blo"]
        raw = [
            [nan, 0.4, nan],
            [0.4, nan, 0.8],
            [0.3, 0.0, nan],
        ]

        # (expected values, 0/1 list) per series; the 1s line up with the
        # NaN positions of the matching row in ``raw``.
        wanted = self.make_expected(labels, [
            ([0.4, 0.4, 0.4], [1, 0, 1]),
            ([0.4, 0.6, 0.8], [0, 1, 0]),
            ([0.3, 0.0, 0.15], [0, 0, 1]),
        ])

        got = util.mean_impute_frame(self.make_input(labels, raw))
        assert_frame_equal(wanted, got)
    def test_all_missing_not_in_subset(self):
        """Expect an all-NaN series to be left alone when it is outside ``subset``.

        Only ``"original2"`` is passed as subset; the expected frame has the
        third series imputed (0.15 is the mean of 0.3 and 0.0) and the other
        two unchanged, paired with ``None`` instead of a 0/1 list.
        NOTE(review): presumably "original2" is the name ``make_input`` gives
        the third series -- confirm in the helper.
        """
        labels = ["bla", "blue", "blo"]
        all_nan = [np.nan, np.nan, np.nan]
        middle_gap = [0.4, np.nan, 0.8]
        tail_gap = [0.3, 0.0, np.nan]
        chosen = ["original2"]

        wanted = self.make_expected(labels, [
            (all_nan, None),
            (middle_gap, None),
            ([0.3, 0.0, 0.15], [0, 0, 1]),
        ])

        frame = self.make_input(labels, [all_nan, middle_gap, tail_gap])
        got = util.mean_impute_frame(frame, subset=chosen)
        assert_frame_equal(wanted, got)
    def test_impute_empty_subset(self):
        """Expect an empty ``subset`` to leave every series untouched.

        All three series contain NaN, yet the expected frame repeats the raw
        values and pairs each with ``None`` instead of a 0/1 list.
        """
        labels = ["bla", "blue", "blo"]
        raw0 = [np.nan, 0.4, np.nan]
        raw1 = [0.4, np.nan, 0.8]
        raw2 = [0.3, 0.0, np.nan]
        nothing_selected = []

        pairs = [(column, None) for column in (raw0, raw1, raw2)]
        wanted = self.make_expected(labels, pairs)

        frame = self.make_input(labels, [raw0, raw1, raw2])
        got = util.mean_impute_frame(frame, subset=nothing_selected)
        assert_frame_equal(wanted, got)
Beispiel #8
0
    def test_all_missing_not_in_subset(self):
        """Check that a fully missing series outside ``subset`` is not imputed.

        With ``subset=["original2"]`` the expected frame keeps the first two
        series as given (paired with ``None``) and fills the NaN of the third
        with 0.15, the mean of 0.3 and 0.0.
        NOTE(review): assumes "original2" refers to the third series as named
        by ``make_input`` -- verify.
        """
        nan = np.nan
        labels = ["bla", "blue", "blo"]
        empty_series = [nan, nan, nan]
        second = [0.4, nan, 0.8]
        third = [0.3, 0.0, nan]
        subset = ["original2"]

        unchanged_a = (empty_series, None)
        unchanged_b = (second, None)
        filled = ([0.3, 0.0, 0.15], [0, 0, 1])
        wanted = self.make_expected(labels, [unchanged_a, unchanged_b, filled])

        frame = self.make_input(labels, [empty_series, second, third])
        got = util.mean_impute_frame(frame, subset=subset)
        assert_frame_equal(wanted, got)
Beispiel #9
0
    def test_impute_empty_subset(self):
        """Check that passing ``subset=[]`` results in no imputation at all.

        The expected frame is built from the raw (NaN-containing) values, each
        paired with ``None`` rather than a 0/1 list.
        """
        nan = np.nan
        labels = ["bla", "blue", "blo"]
        columns = [
            [nan, 0.4, nan],
            [0.4, nan, 0.8],
            [0.3, 0.0, nan],
        ]
        subset = []

        wanted = self.make_expected(
            labels, [(values, None) for values in columns])

        frame = self.make_input(labels, columns)
        got = util.mean_impute_frame(frame, subset=subset)
        assert_frame_equal(wanted, got)
Beispiel #10
0
def get_dataset(schema, features, start_date, end_date, only_residential):
    """Load labels and the requested features and return a shuffled ``Dataset``.

    Args:
        schema: Passed through to ``FeatureLoader``.
        features: Feature names to load; each must be present in
            ``feature_loaders``. Iterated once for validation and then handed
            to ``group_features_by_loader``.
        start_date: Object with ``strftime``; formatted as ``YYYY-MM-DD``
            before being given to ``FeatureLoader``.
        end_date: Same handling as ``start_date``.
        only_residential: Passed through to ``loader.load_labels``.

    Returns:
        ``Dataset(parcels_inspections, features, labels, features.columns)``
        where ``features`` is the mean-imputed frame without the
        ``viol_outcome`` column and ``labels`` holds that column's values.

    Raises:
        UnknownFeatureError: For the first requested feature that is not in
            ``feature_loaders``.

    Side effects:
        Writes the ``mean_market_value`` column to ``tax.csv`` in the current
        working directory when that column is part of the dataset.
    """
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    loader = FeatureLoader(schema, start_date, end_date)

    # Reject unknown feature names before any loading work is done.
    for feature in features:
        if feature not in feature_loaders:
            raise UnknownFeatureError(feature)
    features = group_features_by_loader(features)

    # Start from the labels and left-join every feature group onto them, so
    # the label rows define the index of the final dataset.
    dataset = loader.load_labels(only_residential)
    for loading_method, feature_group in features:
        feature_df = loader.load_feature_group(loading_method, feature_group)
        dataset = dataset.join(feature_df, how='left')

    # Randomize the row order; the index is flattened first so the
    # permutation runs over plain row positions, then restored.
    dataset = dataset.reset_index()
    dataset = dataset.reindex(np.random.permutation(dataset.index))
    dataset = dataset.set_index(["parcel_id", "inspection_date"])

    # The dump used to be unconditional and raised KeyError whenever the
    # requested features did not produce this column.
    if "mean_market_value" in dataset.columns:
        dataset["mean_market_value"].to_csv("tax.csv")

    # Split the dataset into labels, features and the row identifiers.
    labels = dataset["viol_outcome"].values
    features = dataset.drop('viol_outcome', axis=1)
    parcels_inspections = dataset.index.values

    # Impute missing feature values.
    features = util.mean_impute_frame(features)

    logger.debug("Dataset has {} rows and {} features".format(
        len(labels), len(features.columns)))
    return Dataset(parcels_inspections, features, labels, features.columns)
def get_dataset(schema, features, start_date, end_date, only_residential):
    """Build a randomly ordered ``Dataset`` of labels plus imputed features.

    Args:
        schema: Forwarded to ``FeatureLoader``.
        features: Names of the features to load. Every name must be a member
            of ``feature_loaders``; the collection is iterated for validation
            and then passed to ``group_features_by_loader``.
        start_date: Object providing ``strftime``; converted to a
            ``YYYY-MM-DD`` string for ``FeatureLoader``.
        end_date: Converted the same way as ``start_date``.
        only_residential: Forwarded to ``loader.load_labels``.

    Returns:
        ``Dataset(parcels_inspections, features, labels, features.columns)``:
        the index values of the shuffled dataset, the mean-imputed feature
        frame (everything except ``viol_outcome``), the ``viol_outcome``
        values, and the feature column labels.

    Raises:
        UnknownFeatureError: On the first feature name missing from
            ``feature_loaders``.

    Side effects:
        If the joined dataset has a ``mean_market_value`` column, that column
        is written to ``tax.csv`` in the current working directory.
    """
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    loader = FeatureLoader(schema, start_date, end_date)

    # Make sure that all features to be loaded actually exist.
    for feature in features:
        if feature not in feature_loaders:
            raise UnknownFeatureError(feature)
    features = group_features_by_loader(features)

    # Load inspections, then left-join each feature group so every group
    # ends up aligned to the inspections' index.
    inspections = loader.load_labels(only_residential)
    dataset = inspections
    for loading_method, feature_group in features:
        feature_df = loader.load_feature_group(loading_method, feature_group)
        dataset = dataset.join(feature_df, how='left')

    # Shuffle the rows: flatten the index, reorder by a random permutation of
    # the row labels, and put the original index columns back.
    dataset = dataset.reset_index()
    dataset = dataset.reindex(np.random.permutation(dataset.index))
    dataset = dataset.set_index(["parcel_id", "inspection_date"])

    # Only dump the column when it exists; an unconditional lookup raised
    # KeyError for feature selections that do not include it.
    if "mean_market_value" in dataset.columns:
        dataset["mean_market_value"].to_csv("tax.csv")

    # Separate labels, features and row identifiers.
    labels = dataset["viol_outcome"].values
    features = dataset.drop('viol_outcome', axis=1)
    parcels_inspections = dataset.index.values

    # Fill missing feature values.
    features = util.mean_impute_frame(features)

    logger.debug("Dataset has {} rows and {} features".format(len(labels), len(features.columns)))
    return Dataset(parcels_inspections, features, labels, features.columns)