def cast_column_to_type(col: dd.Series, expected_type: str):
    """Cast the given column to the expected type.

    Args:
        col: The column to convert.
        expected_type: The dtype the column should be converted to.

    Returns:
        The converted column, or ``None`` when
        ``similar_type(current_type, expected_type)`` is truthy and no
        conversion is performed.
    """
    current_type = col.dtype

    if similar_type(current_type, expected_type):
        logger.debug("...not converting.")
        return None

    current_float = pd.api.types.is_float_dtype(current_type)
    expected_integer = pd.api.types.is_integer_dtype(expected_type)
    if current_float and expected_integer:
        logger.debug("...truncating...")
        # Currently "trunc" can not be applied to NA (the pandas missing value type),
        # because NA is a different type. It works with np.nan though.
        # For our use case, that does not matter, as the conversion to integer later
        # will convert both NA and np.nan to NA.
        # NOTE: spelled np.nan on purpose; the np.NaN alias was removed in NumPy 2.0.
        col = da.trunc(col.fillna(value=np.nan))

    logger.debug(f"Need to cast from {current_type} to {expected_type}")
    return col.astype(expected_type)
def coerce_code(v: dd.Series, codes: List[int]) -> dd.Series:
    """Map a column onto the allowed ``codes``, using -1 for everything else.

    Values that cannot be parsed as numbers and values that are not listed
    in ``codes`` are both replaced by -1. The result is cast to ``int8``.
    """
    numeric = dd.to_numeric(v, errors="coerce")

    # Blank out every entry that is not one of the expected codes.
    is_expected = numeric.isin(codes)
    expected_only = numeric.where(is_expected, np.nan)

    # Missing entries (unparseable or unexpected) collapse to the sentinel.
    with_sentinel = expected_only.fillna(-1)
    return with_sentinel.astype("int8")