def test_all_values():
    """Check the indicator frame when every possible value occurs in the input."""
    categories = ["val1", "val2", "val3"]
    series = pd.Series(["val1", "val3", "val1", "val2"])

    # One row per input element; the 1 sits in the column of that element.
    indicator_rows = [
        (1, 0, 0),
        (0, 0, 1),
        (1, 0, 0),
        (0, 1, 0),
    ]
    wanted = pd.DataFrame(indicator_rows, columns=categories)

    assert_frame_equal(wanted, get_dummies(series, categories))
def test_empty_input():
    """Check that an empty series gives an empty frame with all value columns."""
    categories = ["val1", "val2", "val3"]
    empty_series = pd.Series([])

    # No rows, but the columns for every possible value are still expected.
    wanted = pd.DataFrame([], columns=categories)
    result = get_dummies(empty_series, categories)

    assert_frame_equal(wanted, result)
def test_only_nan():
    """Check that an all-NaN series gives a frame filled entirely with NaN."""
    categories = ["val1", "val2", "val3"]
    nan_series = pd.Series([np.nan] * 4)

    # Every input element is NaN, so every expected row is NaN in all columns.
    nan_row = (np.nan, np.nan, np.nan)
    wanted = pd.DataFrame([nan_row] * 4, columns=categories)

    result = get_dummies(nan_series, categories)
    assert_frame_equal(wanted, result)
def make_house_type_features(db_connection):
    """
    Get information whether a house is a single-family, two-family,
    three-family, multi-family home or mixed use (residential + commercial)

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe indexed by parcel_id, with one indicator column per
    house type ("single-family", "two-family", "three-family",
    "multi-family", "mixed-use"). Missing values in the indicator frame are
    replaced by 0.
    NOTE(review): the frame has one row per row returned by the join below;
    parcel_id may repeat if parcels_inspections holds several rows for the
    same parcel -- confirm against the table.
    """
    # Category labels shared by the use-code mapping and the dummy encoding.
    # Every label produced by use_codes must appear in this list.
    house_types = ["single-family", "two-family", "three-family",
                   "multi-family", "mixed-use"]

    query = ("SELECT inspections.parcel_id, parcels.class "
             "FROM parcels_inspections AS inspections "
             "JOIN shape_files.parcels_cincy AS parcels "
             "ON parcels.parcelid = inspections.parcel_id")
    df = pd.read_sql(query, con=db_connection)
    df = df.set_index("parcel_id")

    # map use code to type of home
    # (423 previously mapped to "mixed-used", which is not one of the
    # possible values handed to get_dummies; the label is now "mixed-use")
    use_codes = {423: "mixed-use",
                 510: "single-family",
                 520: "two-family",
                 530: "three-family",
                 550: "multi-family",
                 554: "multi-family",
                 552: "multi-family",
                 599: "multi-family"}

    # Codes that are not in the mapping become NaN.
    df["type"] = df["class"].apply(lambda cl: use_codes.get(cl, np.nan))

    df = util.get_dummies(df["type"], possible_values=house_types)
    df = df.fillna(0)
    return df
def test_illegal_value():
    """Call get_dummies with an input value that is not a possible value.

    NOTE(review): this block contains no assertion and no visible
    expected-exception check; presumably the call is meant to raise for
    "val4" -- confirm how the expected failure is declared.
    """
    categories = ["val1", "val2", "val3"]
    # "val4" is deliberately absent from the categories above.
    series = pd.Series(["val1", "val3", "val1", "val4"])
    get_dummies(series, categories)