def test_all_missing(self):
    """Smoke test: a frame whose first column is entirely NaN must
    still be imputable without raising.

    No return value is checked here — only that the call completes.
    """
    idx = ["bla", "blue", "blo"]
    col0 = [np.nan, np.nan, np.nan]  # fully missing column
    col1 = [0.4, np.nan, 0.8]
    col2 = [0.3, 0.0, np.nan]
    frame = self.make_input(idx, [col0, col1, col2])
    util.mean_impute_frame(frame)
def test_all_missing(self):
    """Imputing a frame containing an all-NaN column does not raise.

    This is a smoke test only; the imputed result is not inspected.
    """
    labels = ["bla", "blue", "blo"]
    columns = [
        [np.nan, np.nan, np.nan],  # every value missing
        [0.4, np.nan, 0.8],
        [0.3, 0.0, np.nan],
    ]
    util.mean_impute_frame(self.make_input(labels, columns))
def test_impute_one_series(self):
    """Only the column with a missing entry is imputed; complete columns
    are untouched and their missingness indicators stay all-zero."""
    idx = ["bla", "blue", "blo"]
    col0 = [0.2, 0.4, 0.4]
    col1 = [0.4, np.nan, 0.8]  # middle value missing; column mean is 0.6
    col2 = [0.3, 0.0, 0.8]
    expected = self.make_expected(idx, [
        (col0, [0, 0, 0]),
        ([0.4, 0.6, 0.8], [0, 1, 0]),  # NaN replaced by the mean, flagged
        (col2, [0, 0, 0]),
    ])
    actual = util.mean_impute_frame(self.make_input(idx, [col0, col1, col2]))
    assert_frame_equal(expected, actual)
def test_no_impute(self):
    """A frame with no missing values passes through unchanged, with
    all-zero missingness indicators for every column."""
    idx = ["bla", "blue", "blo"]
    cols = [
        [0.2, 0.4, 0.4],
        [0.4, 0.8, 0.8],
        [0.3, 0.0, 0.8],
    ]
    no_flags = [0, 0, 0]
    expected = self.make_expected(idx, [(c, no_flags) for c in cols])
    actual = util.mean_impute_frame(self.make_input(idx, cols))
    assert_frame_equal(expected, actual)
def test_impute_three_series(self):
    """Every column with NaNs is imputed with its own column mean, and
    each filled position is flagged in that column's indicator."""
    idx = ["bla", "blue", "blo"]
    col0 = [np.nan, 0.4, np.nan]  # mean of remaining value: 0.4
    col1 = [0.4, np.nan, 0.8]     # mean: 0.6
    col2 = [0.3, 0.0, np.nan]     # mean: 0.15
    expected = self.make_expected(idx, [
        ([0.4, 0.4, 0.4], [1, 0, 1]),
        ([0.4, 0.6, 0.8], [0, 1, 0]),
        ([0.3, 0.0, 0.15], [0, 0, 1]),
    ])
    actual = util.mean_impute_frame(self.make_input(idx, [col0, col1, col2]))
    assert_frame_equal(expected, actual)
def test_all_missing_not_in_subset(self):
    """With a column subset given, only the listed columns are imputed —
    columns outside the subset keep their NaNs and get no indicator."""
    idx = ["bla", "blue", "blo"]
    col0 = [np.nan, np.nan, np.nan]  # outside subset: left as-is
    col1 = [0.4, np.nan, 0.8]        # outside subset: left as-is
    col2 = [0.3, 0.0, np.nan]        # in subset: imputed with mean 0.15
    expected = self.make_expected(idx, [
        (col0, None),
        (col1, None),
        ([0.3, 0.0, 0.15], [0, 0, 1]),
    ])
    frame = self.make_input(idx, [col0, col1, col2])
    actual = util.mean_impute_frame(frame, subset=["original2"])
    assert_frame_equal(expected, actual)
def test_impute_empty_subset(self):
    """An empty subset means no column is imputed: all values (including
    NaNs) pass through and no indicator columns are created."""
    idx = ["bla", "blue", "blo"]
    cols = [
        [np.nan, 0.4, np.nan],
        [0.4, np.nan, 0.8],
        [0.3, 0.0, np.nan],
    ]
    expected = self.make_expected(idx, [(c, None) for c in cols])
    actual = util.mean_impute_frame(self.make_input(idx, cols), subset=[])
    assert_frame_equal(expected, actual)
def test_all_missing_not_in_subset(self):
    """Columns outside the requested subset — even fully-missing ones —
    are left untouched; only the subset column is mean-imputed."""
    row_labels = ["bla", "blue", "blo"]
    untouched_a = [np.nan, np.nan, np.nan]  # all-NaN but not in subset
    untouched_b = [0.4, np.nan, 0.8]
    imputable = [0.3, 0.0, np.nan]          # mean of present values: 0.15
    frame = self.make_input(row_labels, [untouched_a, untouched_b, imputable])
    expected = self.make_expected(row_labels, [
        (untouched_a, None),
        (untouched_b, None),
        ([0.3, 0.0, 0.15], [0, 0, 1]),
    ])
    assert_frame_equal(
        expected, util.mean_impute_frame(frame, subset=["original2"]))
def test_impute_empty_subset(self):
    """Passing subset=[] disables imputation entirely: every column is
    returned unchanged with no indicator column."""
    row_labels = ["bla", "blue", "blo"]
    series_a = [np.nan, 0.4, np.nan]
    series_b = [0.4, np.nan, 0.8]
    series_c = [0.3, 0.0, np.nan]
    frame = self.make_input(row_labels, [series_a, series_b, series_c])
    expected = self.make_expected(row_labels, [
        (series_a, None),
        (series_b, None),
        (series_c, None),
    ])
    assert_frame_equal(expected, util.mean_impute_frame(frame, subset=[]))
def get_dataset(schema, features, start_date, end_date, only_residential):
    """Build a modeling Dataset: load labels and feature groups, merge,
    shuffle, and mean-impute missing feature values.

    Args:
        schema: database schema passed to FeatureLoader.
        features: iterable of feature names; each must be registered in
            the module-level `feature_loaders` mapping.
        start_date, end_date: datetime-like bounds, formatted 'YYYY-MM-DD'
            before being handed to the loader.
        only_residential: if true, restrict labels to residential parcels
            (forwarded to `loader.load_labels`).

    Returns:
        Dataset(index values, imputed feature frame, label array,
        feature column names).

    Raises:
        UnknownFeatureError: if any requested feature is not registered.
    """
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    loader = FeatureLoader(schema, start_date, end_date)

    # Fail fast: every requested feature must have a registered loader.
    for feature in features:
        if feature not in feature_loaders:
            raise UnknownFeatureError(feature)
    feature_groups = group_features_by_loader(features)

    # Start from the inspections (labels); left-joining each feature group
    # onto it guarantees a common index and ordering for all frames.
    dataset = loader.load_labels(only_residential)
    for loading_method, feature_group in feature_groups:
        feature_df = loader.load_feature_group(loading_method, feature_group)
        dataset = dataset.join(feature_df, how='left')

    # Randomize row order. NOTE: uses the global NumPy RNG — seed it
    # upstream if reproducible shuffling is required.
    dataset = dataset.reset_index()
    dataset = dataset.reindex(np.random.permutation(dataset.index))
    dataset = dataset.set_index(["parcel_id", "inspection_date"])
    # (Removed a leftover debug dump that wrote "tax.csv" to the CWD on
    # every call.)

    # Split into labels / feature frame / index, then impute NaNs with
    # per-column means.
    labels = dataset["viol_outcome"].values
    feature_frame = dataset.drop('viol_outcome', axis=1)
    parcels_inspections = dataset.index.values
    feature_frame = util.mean_impute_frame(feature_frame)

    logger.debug("Dataset has {} rows and {} features".format(
        len(labels), len(feature_frame.columns)))
    return Dataset(parcels_inspections, feature_frame, labels,
                   feature_frame.columns)
def get_dataset(schema, features, start_date, end_date, only_residential):
    """Assemble a Dataset from labels plus the requested feature groups.

    Loads inspection labels, left-joins each feature group onto them (so
    all frames share one index and ordering), shuffles the rows, splits
    labels from features, and mean-imputes missing feature values.

    Args:
        schema: database schema passed to FeatureLoader.
        features: iterable of feature names; each must appear in the
            module-level `feature_loaders` registry.
        start_date, end_date: datetime-like bounds, formatted 'YYYY-MM-DD'.
        only_residential: restrict labels to residential parcels.

    Returns:
        Dataset(index values, imputed feature frame, label array,
        feature column names).

    Raises:
        UnknownFeatureError: for any unregistered feature name.
    """
    start_date = start_date.strftime('%Y-%m-%d')
    end_date = end_date.strftime('%Y-%m-%d')
    loader = FeatureLoader(schema, start_date, end_date)

    # Validate up front so a typo fails before any expensive loading.
    for feature in features:
        if feature not in feature_loaders:
            raise UnknownFeatureError(feature)
    grouped_features = group_features_by_loader(features)

    # Labels first; each feature group is left-joined onto them.
    dataset = loader.load_labels(only_residential)
    for loading_method, feature_group in grouped_features:
        feature_df = loader.load_feature_group(loading_method, feature_group)
        dataset = dataset.join(feature_df, how='left')

    # Shuffle row order via the global NumPy RNG (seed upstream for
    # reproducibility), then restore the compound index.
    dataset = dataset.reset_index()
    dataset = dataset.reindex(np.random.permutation(dataset.index))
    dataset = dataset.set_index(["parcel_id", "inspection_date"])
    # (A leftover debug write of "tax.csv" was removed here — it dumped a
    # column to the current directory on every call.)

    # Split into labels, feature frame, and index; impute NaNs with
    # column means.
    labels = dataset["viol_outcome"].values
    feature_frame = dataset.drop('viol_outcome', axis=1)
    parcels_inspections = dataset.index.values
    feature_frame = util.mean_impute_frame(feature_frame)

    logger.debug("Dataset has {} rows and {} features".format(
        len(labels), len(feature_frame.columns)))
    return Dataset(parcels_inspections, feature_frame, labels,
                   feature_frame.columns)