def test_dataset_profile_metrics():
    """Tracking classification metrics should populate the confusion matrix labels."""
    profile = DatasetProfile(name="test", model_profile=ModelProfile())
    assert profile.tags["name"] == "test"

    targets = ["cat", "dog", "pig"]
    predictions = ["cat", "dog", "dog"]
    scores = [0.1, 0.2, 0.4]
    profile.track_metrics(predictions, targets, scores)

    assert profile.model_profile.metrics.confusion_matrix.labels is not None
def __init__(
    self,
    dataset_name: str,
    dataset_timestamp: Optional[datetime.datetime] = None,
    session_timestamp: Optional[datetime.datetime] = None,
    writers: Optional[List["Writer"]] = None,
    verbose: bool = False,
):
    """
    Initialize the logger and its backing :class:`DatasetProfile`.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset; stored in the profile metadata.
    dataset_timestamp : datetime.datetime, optional
        Timestamp of the data being profiled.
    session_timestamp : datetime.datetime, optional
        Timestamp of the logging session; defaults to the current UTC time.
    writers : list of Writer, optional
        Writers used to persist the profile. Defaults to an empty list.
    verbose : bool
        Control output verbosity.
    """
    if session_timestamp is None:
        session_timestamp = datetime.datetime.now(datetime.timezone.utc)
    self.dataset_name = dataset_name
    # BUG FIX: the original signature read ``writers=List[Writer]``, which
    # assigns the *typing construct itself* as the default value instead of
    # declaring a type. Callers omitting ``writers`` would then iterate over
    # a typing object. Default to an empty writer list instead.
    self.writers = writers if writers is not None else []
    self.verbose = verbose
    self._profile = DatasetProfile(
        dataset_name,
        data_timestamp=dataset_timestamp,
        session_timestamp=session_timestamp,
    )
    self._active = True
def _write_protobuf(profile: DatasetProfile, rotation_suffix: str = None, **kwargs):
    """
    Write a protobuf serialization of the dataset profile to MLflow as a
    binary artifact under ``whylogs/<dataset_dir>``.

    Parameters
    ----------
    profile : DatasetProfile
        The profile to serialize and upload.
    rotation_suffix : str, optional
        Accepted for interface compatibility; not used by this writer.
    """
    import mlflow

    name = profile.name
    dataset_dir = name or "default"
    # BUG FIX: the original called tempfile.mkdtemp() and never removed the
    # directory, leaking one temp dir per write. TemporaryDirectory cleans
    # up automatically, even if serialization or the upload raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        logger.debug("Using tmp dir: %s", tmp_dir)
        output_dir = os.path.join(tmp_dir, dataset_dir)
        os.makedirs(output_dir, exist_ok=True)
        output = os.path.join(output_dir, "profile.bin")
        logger.debug("Writing logger %s's data to %s", output, f"whylogs/{dataset_dir}")
        profile.write_protobuf(output)
        mlflow.log_artifact(output, artifact_path=f"whylogs/{dataset_dir}")
    logger.debug("Successfully uploaded logger %s data to MLFlow", name)
def log_df_segment(self, df, segment: Segment):
    """
    Track a dataframe against the profile for the given segment.

    The segment entries are canonically ordered by their ``"key"`` field; a
    new segment profile is created (and registered under the segment's hash)
    the first time a segment is seen.
    """
    ordered = sorted(segment, key=lambda entry: entry["key"])
    existing = self.get_segment(ordered)
    if existing is not None:
        existing.track_dataframe(df)
        return

    # First time this segment is seen: build a dedicated profile for it,
    # tagging it with the JSON-encoded (ordered) segment definition.
    segment_profile = DatasetProfile(
        self.dataset_name,
        dataset_timestamp=datetime.datetime.now(datetime.timezone.utc),
        session_timestamp=self.session_timestamp,
        tags={**self.tags, "segment": json.dumps(ordered)},
        metadata=self.metadata,
        session_id=self.session_id,
        constraints=self.constraints,
    )
    segment_profile.track_dataframe(df)
    self._profiles[-1]["segmented_profiles"][hash_segment(ordered)] = segment_profile
def test_track_metrics():
    """Regression metrics tracked from a parquet fixture should match known values."""
    import pandas as pd

    mean_absolute_error = 85.94534216005789
    mean_squared_error = 11474.89611670205
    root_mean_squared_error = 107.12094154133472

    x1 = DatasetProfile(name="test")
    # FIX: the original wrapped os.path.join in a redundant second
    # os.path.join; a single call joins all the parts.
    df = pd.read_parquet(os.path.join(TEST_DATA_PATH, "metrics", "2021-02-12.parquet"))
    predictions = df["predictions"].to_list()
    x1.track_metrics(predictions, df["targets"].to_list())

    regression_metrics = x1.model_profile.metrics.regression_metrics
    assert regression_metrics is not None
    assert regression_metrics.count == len(predictions)
    assert regression_metrics.mean_squared_error() == pytest.approx(
        mean_squared_error, 0.01)
    assert regression_metrics.mean_absolute_error() == pytest.approx(
        mean_absolute_error, 0.01)
    assert regression_metrics.root_mean_squared_error() == pytest.approx(
        root_mean_squared_error, 0.01)
def _write_flat(self, profile: DatasetProfile, indent: int = 4, rotation_suffix: Optional[str] = None):
    """
    Write the flat-format outputs: a CSV summary table plus JSON files for
    frequent numbers, frequent strings, and histograms.

    Parameters
    ----------
    profile : DatasetProfile
        the dataset profile to output
    indent : int
        The JSON indentation to use. Default is 4
    rotation_suffix : str, optional
        Rotation marker; appended to the CSV *file* name but to the JSON
        *directory* names (NOTE(review): this asymmetry reproduces the
        original behavior — confirm it is intentional).
    """
    summary = profile.to_summary()
    base = os.path.join(self.output_path, self.path_suffix(profile))

    # CSV summary table (rotation suffix in the file name).
    csv_name = self.file_name(profile, ".csv", rotation_suffix)
    with open(os.path.join(base, "flat_table", csv_name), "wt") as out:
        get_dataset_frame(summary).to_csv(out, index=False)

    # JSON outputs (rotation suffix in the directory name, plain file name).
    json_name = self.file_name(profile, ".json")
    dir_tag = rotation_suffix or ""

    def _dump_json(dir_prefix, payload):
        # One JSON artifact per flattened view of the summary.
        with open(os.path.join(base, f"{dir_prefix}{dir_tag}", json_name), "wt") as out:
            json.dump(payload, out, indent=indent)

    _dump_json("freq_numbers", flatten_dataset_histograms(summary))
    _dump_json("frequent_strings", flatten_dataset_frequent_strings(summary))
    _dump_json("histogram", flatten_dataset_histograms(summary))
def _write_protobuf(self, profile: DatasetProfile, rotation_suffix: Optional[str] = None):
    """
    Write the delimited-protobuf serialization of the profile under the
    ``protobuf`` directory of this writer's output path (S3).
    """
    target_dir = os.path.join(self.output_path, self.path_suffix(profile), "protobuf")
    target = os.path.join(target_dir, self.file_name(profile, ".bin", rotation_suffix))
    with open(target, "wb") as out:
        out.write(profile.serialize_delimited())
def _write_protobuf(self, profile: DatasetProfile, rotation_suffix: Optional[str] = None):
    """
    Write the delimited-protobuf serialization of the profile to local disk,
    creating the ``protobuf`` output directory if needed.
    """
    out_dir = self.ensure_path(os.path.join(self.path_suffix(profile), "protobuf"))
    bin_name = self.file_name(profile, ".bin", rotation_suffix)
    with open(os.path.join(out_dir, bin_name), "wb") as out:
        out.write(profile.serialize_delimited())
def test_parse_from_protobuf_with_regression():
    """A profile serialized by the Java library should round-trip with its regression metrics intact."""
    # FIX: removed a dead statement — the original computed
    # ``os.path.dirname(os.path.realpath(__file__))`` and discarded the result.
    prof = DatasetProfile.read_protobuf(
        os.path.join(TEST_DATA_PATH, "metrics", "regression_java.bin"))
    assert prof.name == "my-model-name"
    assert prof.model_profile is not None
    assert prof.model_profile.metrics is not None
    # Smoke check kept from the original: accessing the confusion matrix on a
    # regression-only profile must not raise.
    prof.model_profile.metrics.confusion_matrix
    regression_met = prof.model_profile.metrics.regression_metrics
    assert regression_met is not None
    # metrics
    assert regression_met.count == 89
    assert regression_met.sum_abs_diff == pytest.approx(7649.1, 0.1)
    assert regression_met.sum_diff == pytest.approx(522.7, 0.1)
    assert regression_met.sum2_diff == pytest.approx(1021265.7, 0.1)
def _write_json(self, profile: DatasetProfile, rotation_suffix: Optional[str] = None, transport_params: Optional[dict] = None):
    """
    Write the profile's JSON summary under the ``json`` directory of this
    writer's output path.

    NOTE(review): ``open`` here must be smart_open's open rather than the
    builtin (builtin ``open`` has no ``transport_params``) — confirm against
    this module's imports.
    """
    target = os.path.join(
        self.output_path,
        self.path_suffix(profile),
        "json",
        self.file_name(profile, ".json", rotation_suffix),
    )
    summary = profile.to_summary()
    with open(target, "wt", transport_params=transport_params) as out:
        out.write(message_to_json(summary))
class Logger:
    """
    Class for logging WhyLogs statistics.

    TODO: logger overrides for session config

    Parameters
    ----------
    dataset_name : str
        The name of the dataset. Gets included in the DatasetProfile metadata
        and can be used in generated filenames.
    dataset_timestamp : datetime.datetime
        Timestamp of the data.
    session_timestamp : datetime.datetime
        Timestamp of the logging session; defaults to the current UTC time.
    writers : list
        A list of `Writer` objects which will be used to write the dataset
        profile. Defaults to an empty list.
    verbose : bool
        Control output verbosity
    """

    def __init__(
        self,
        dataset_name: str,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        writers: Optional[List["Writer"]] = None,
        verbose: bool = False,
    ):
        if session_timestamp is None:
            session_timestamp = datetime.datetime.now(datetime.timezone.utc)
        self.dataset_name = dataset_name
        # BUG FIX: the original signature read ``writers=List[Writer]``, which
        # assigns the *typing construct itself* as the default value instead
        # of declaring a type. Default to an empty writer list instead.
        self.writers = writers if writers is not None else []
        self.verbose = verbose
        self._profile = DatasetProfile(
            dataset_name,
            data_timestamp=dataset_timestamp,
            session_timestamp=session_timestamp,
        )
        self._active = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def flush(self):
        """
        Synchronously perform all remaining write tasks
        """
        if not self._active:
            print("WARNING: attempting to flush a closed logger")
            return
        for writer in self.writers:
            writer.write(self._profile)

    def close(self):
        """
        Flush and close out the logger.
        """
        if not self._active:
            print("WARNING: attempting to close a closed logger")
            return
        self.flush()
        self._active = False

    def log_dataframe(self, df: "pd.DataFrame"):
        """
        Generate and log a WhyLogs DatasetProfile from a pandas dataframe

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to log
        """
        if not self.is_active():
            return
        self._profile.track_dataframe(df)

    def is_active(self):
        """
        Return the boolean state of the logger
        """
        return self._active