def get_or_create_session(path_to_config: Optional[str] = None, report_progress: Optional[bool] = False):
    """
    Return the global session, creating it first when none is active.

    When an active global session already exists it is returned untouched and
    both arguments are ignored. Otherwise the configuration is read with
    ``load_config``; if that yields nothing, a built-in default configuration
    (local writer and local metadata writer, both using ``output``) is used.

    :param path_to_config: value forwarded to ``load_config``
    :type path_to_config: str
    :param report_progress: copied onto the configuration unless it is ``None``
    :type report_progress: bool
    :return: The global active session
    :rtype: Session
    """
    global _session

    # Guard clause: reuse whatever session is already running.
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug("Active session found, ignoring session kwargs")
        return _session

    config = load_config(path_to_config)
    if config is None:
        print("WARN: Missing config")
        default_writer = WriterConfig(type="local", output_path="output", formats=["all"])
        default_metadata = MetadataConfig(type="local", output_path="output", input_path="")
        config = SessionConfig(
            "default-project",
            "default-pipeline",
            [default_writer],
            default_metadata,
            False,
        )
    if report_progress is not None:
        config.report_progress = report_progress

    _session = session_from_config(config)
    return _session
def __init__(
    self,
    project: str,
    pipeline: str,
    writers: List[Writer],
    metadata_writer: Optional[MetadataWriter] = None,
    verbose: bool = False,
    with_rotation_time: str = None,
    cache_size: int = None,
    report_progress: bool = False,
):
    """
    Set up a new, active session.

    :param project: project name stored on the session
    :param pipeline: pipeline name stored on the session
    :param writers: output writers; ``None`` is treated as an empty list
    :param metadata_writer: optional metadata writer stored on the session
    :param verbose: verbosity flag stored on the session
    :param with_rotation_time: rotation time stored on the session
    :param cache_size: cache size stored on the session
    :param report_progress: progress-reporting flag stored on the session
    """
    self._py_logger = _getLogger(__name__)

    self.project = project
    self.pipeline = pipeline
    self.writers = [] if writers is None else writers
    self.metadata_writer = metadata_writer
    self.verbose = verbose

    # Session bookkeeping: a new session starts active and without loggers.
    self._active = True
    self._loggers = {}
    self._session_time = datetime.datetime.now()
    self._session_id = str(uuid4())
    self._config = SessionConfig(project, pipeline, self.writers, metadata_writer, verbose)

    self.with_rotation_time = with_rotation_time
    self.cache_size = cache_size
    self.report_progress = report_progress

    # enable special logic when starting/closing a Session if we're using whylabs client to save dataset profiles
    has_whylabs_writer = any(isinstance(candidate, WhyLabsWriter) for candidate in self.writers)
    self.use_whylabs_writer = _use_whylabs_client or has_whylabs_writer

    # add WhyLabs writer if it's not already present (which can happen if it's not specified in the config)
    if _use_whylabs_client and not has_whylabs_writer:
        self.writers.append(WhyLabsWriter(output_path=None, formats=["protobuf"]))
def test_log_metrics_with_boolean_labels(tmpdir):
    """Log boolean targets/predictions and expect a confusion matrix with two labels."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    targets = [True, False, True]
    predictions = [False, True, False]
    scores = [0.2, 0.5, 0.6]

    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)

        model_metrics = logger.profile.model_profile
        assert model_metrics is not None
        assert len(model_metrics.metrics.confusion_matrix.labels) == 2

    shutil.rmtree(out_dir, ignore_errors=True)
def test_segments_with_rotation(df_lending_club, tmpdir):
    """Log a segmented dataframe across rotations under frozen time and count the written files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session_config = SessionConfig("project", "pipeline", writers=[local_writer])
    one_second = dict(seconds=1)

    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger(
            "test",
            with_rotation_time="s",
            segments=["home_ownership"],
            profile_full_dataset=True,
            cache=1,
        ) as logger:
            # Two identical batches, each followed by a one second clock tick.
            for _ in range(2):
                logger.log_dataframe(df_lending_club)
                frozen_time.tick(delta=datetime.timedelta(**one_second))

            # A dataframe without the segment column is expected to raise KeyError.
            unrelated_df = util.testing.makeDataFrame()
            with pytest.raises(KeyError):
                logger.log_dataframe(unrelated_df)

    written = [name for _, _, names in os.walk(out_dir) for name in names]
    assert len(written) == 8
    shutil.rmtree(out_dir)
def test_log_dataframe(tmpdir, df_lending_club):
    """Log the lending club dataframe and check the flat summary size and the number of output files."""
    out_dir = tmpdir.mkdir("whylogs")

    local_writer = WriterConfig("local", ["protobuf", "flat"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    with session.logger("lendingclub") as logger:
        assert logger is not None
        logger.log_dataframe(df_lending_club)

        profile = logger.profile
        assert profile is not None

        flat_summary = profile.flat_summary()["summary"]
        assert len(flat_summary) == 151

    # Files are counted only after the logger context has exited.
    written = [name for _, _, names in os.walk(out_dir) for name in names]
    assert len(written) == 5
def test_log_multiple_calls(tmpdir, df_lending_club):
    """
    Log the same dataframe five times with different dataset timestamps and
    check that five times as many files are produced as for a single run.

    The test temporarily switches the working directory to ``script_dir`` and
    restores the previous one afterwards, even when an assertion fails.
    """
    # os.getcwd() returns the actual working directory. os.curdir is only the
    # constant ".", so chdir-ing back to it would restore nothing.
    original_dir = os.getcwd()
    os.chdir(script_dir)
    try:
        p = tmpdir.mkdir("whylogs")

        writer_config = WriterConfig(
            "local",
            ["protobuf", "flat"],
            p.realpath(),
            filename_template="dataset_summary-$dataset_timestamp",
        )
        # Round-trip the writer config through YAML; the parsed result is not used.
        yaml_data = writer_config.to_yaml()
        WriterConfig.from_yaml(yaml_data)

        session_config = SessionConfig("project", "pipeline", writers=[writer_config])
        session = session_from_config(session_config)

        now = datetime.datetime.now()
        for i in range(0, 5):
            # One logger per day offset, so every run gets its own dataset timestamp.
            with session.logger(dataset_timestamp=now + datetime.timedelta(days=i)) as logger:
                logger.log_dataframe(df_lending_club)

        output_files = []
        for _, _, files in os.walk(p):
            output_files += files
        # we run 5 times, so we should have five times more files than the above test
        assert len(output_files) == 25
    finally:
        os.chdir(original_dir)
def test_log_rotation_days(tmpdir):
    """Log dataframes across day boundaries under frozen time and expect three output files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session_config = SessionConfig("project", "pipeline", writers=[local_writer])

    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger("test", with_rotation_time="d", cache_size=1) as logger:
            # first day: a single batch
            logger.log_dataframe(util.testing.makeDataFrame())

            # one day later: two batches
            frozen_time.tick(delta=datetime.timedelta(days=1))
            logger.log_dataframe(util.testing.makeDataFrame())
            logger.log_dataframe(util.testing.makeDataFrame())

            # two more days later: a final batch
            frozen_time.tick(delta=datetime.timedelta(days=2))
            logger.log_dataframe(util.testing.makeDataFrame())

    written = [name for _, _, names in os.walk(out_dir) for name in names]
    assert len(written) == 3
    shutil.rmtree(out_dir)
def test_log_pil_image(tmpdir, image_files):
    """Log PIL images and check that every name in ``_EXPECTED_COLUMNS`` shows up in the profile."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    with session.logger("image_pil_test", with_rotation_time="1m", cache_size=1) as logger:
        for path in image_files:
            logger.log_image(Image.open(path))

        columns = logger.profile.columns
        for column_name in _EXPECTED_COLUMNS:
            assert column_name in columns, f"{column_name} not found in {columns}"

    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_metrics(tmpdir):
    """Log string class labels and expect one confusion-matrix label per class name."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    targets = ["class_name1", "class_name2", "class_name3"]
    predictions = ["class_name1", "class_name2", "class_name2"]
    scores = [0.2, 0.5, 0.6]
    num_labels = 3

    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)

        model_metrics = logger.profile.model_profile
        assert model_metrics is not None
        assert len(model_metrics.metrics.confusion_matrix.labels) == num_labels

    shutil.rmtree(out_dir)
def get_or_create_session():
    """
    Return the global session, creating it first when none is active.

    An already active global session is returned as is, without loading any
    configuration. Otherwise the configuration comes from ``load_config``,
    falling back to a default project with a local writer that uses
    ``output`` when nothing is found.

    Returns
    -------
    session : Session
        The global active session
    """
    global _session

    if _session is None or not _session.is_active():
        config = load_config()
        if config is None:
            print("WARN: Missing config")
            local_writer = WriterConfig(type="local", output_path="output", formats=["all"])
            config = SessionConfig("default-project", "default-pipeline", [local_writer], False)
        _session = session_from_config(config)
    else:
        _getLogger(__name__).debug("Active session found, ignoring session kwargs")

    return _session
def test_session_log_dataframe(df):
    """Log a dataframe through the session, then check logger lookup with and without a name."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)

    session.log_dataframe(df)

    assert session.logger() is not None
    named_logger = session.logger("default-project")
    assert named_logger.dataset_name == "default-project"
def test_session_log_dataframe():
    """
    Log a generated dataframe through the session, then check that loggers can
    be obtained both without a name and by the project name.
    """
    # The former `_session = None` only bound an unused local (there was no
    # `global` statement), and the returned profile was never read. Both were
    # dead code and have been removed.
    session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    session.log_dataframe(df)

    assert session.logger() is not None
    assert session.logger("default-project").dataset_name == "default-project"
def test_session_profile(df):
    """Log a dataframe through the session and expect four entries in the flat summary."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)

    profile = session.log_dataframe(df)
    assert profile is not None

    flat_summary = profile.flat_summary()["summary"]
    assert len(flat_summary) == 4
def __init__(
    self,
    project: str,
    pipeline: str,
    writers: List[Writer],
    verbose: bool = False,
    with_rotation_time: str = None,
    cache: int = None,
):
    """
    Set up a new, active session.

    :param project: project name stored on the session
    :param pipeline: pipeline name stored on the session
    :param writers: output writers; ``None`` is treated as an empty list
    :param verbose: verbosity flag stored on the session
    :param with_rotation_time: rotation time stored on the session
    :param cache: cache value stored on the session
    """
    self.project = project
    self.pipeline = pipeline
    self.writers = [] if writers is None else writers
    self.verbose = verbose

    # Session bookkeeping: a new session starts active and without loggers.
    self._active = True
    self._loggers = {}
    self._session_time = datetime.datetime.now()
    self._session_id = str(uuid4())
    self._config = SessionConfig(project, pipeline, self.writers, verbose)

    self.with_rotation_time = with_rotation_time
    self.cache = cache
def test_session_profile():
    """Log a generated dataframe through the session and expect four entries in the flat summary."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)

    generated_df = util.testing.makeDataFrame()
    profile = session.log_dataframe(generated_df)
    assert profile is not None

    flat_summary = profile.flat_summary()["summary"]
    assert len(flat_summary) == 4
def reset_default_session():
    """
    Close the global whylogs session, if there is one, and replace it with a
    new session.

    The new session is built from ``load_config``; when that returns nothing,
    a default project with a local writer that uses ``output`` is used.
    """
    global _session

    if _session is not None:
        _session.close()

    config: SessionConfig = load_config()
    if config is None:
        default_writer = WriterConfig(type="local", output_path="output", formats=["all"])
        config = SessionConfig("default-project", "default-pipeline", [default_writer], False)

    _session = session_from_config(config)
def test_segments(df_lending_club, tmpdir):
    """Log with two explicit segments and check the JSON ``segment`` tag on each segmented profile."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    rent_segment = [{"key": "home_ownership", "value": "RENT"}]
    mortgage_segment = [{"key": "home_ownership", "value": "MORTGAGE"}]

    session_config = SessionConfig("project", "pipeline", writers=[local_writer])
    with session_from_config(session_config) as session:
        with session.logger(
            "test",
            segments=[rent_segment, mortgage_segment],
            cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            fetched_mortgage = logger.get_segment(mortgage_segment)

    assert profile is None
    assert len(profiles) == 2

    # Profiles are checked in the iteration order of the mapping.
    profile_keys = list(profiles.keys())
    first_profile = profiles[profile_keys[0]]
    second_profile = profiles[profile_keys[1]]

    assert first_profile.tags["segment"] == json.dumps(rent_segment)
    assert second_profile.tags["segment"] == json.dumps(mortgage_segment)
    assert fetched_mortgage == second_profile

    shutil.rmtree(out_dir, ignore_errors=True)
def test_config_api(tmpdir):
    """Build a session from writer and session configs, log an empty dataframe, and close the session."""
    out_dir = tmpdir.mkdir("whylogs")

    local_writer = WriterConfig("local", ["protobuf", "flat"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    with session.logger("test_name") as logger:
        empty_df = pd.DataFrame()
        logger.log_dataframe(empty_df)

    session.close()
def test_segments_keys(df_lending_club, tmpdir):
    """Segment by two column names and expect 47 segmented profiles."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    segment_columns = ["emp_title", "home_ownership"]
    with session.logger("test", segments=segment_columns, cache_size=1) as logger:
        logger.log_dataframe(df_lending_club)
        assert len(logger.segmented_profiles) == 47

    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_multiple_segments(tmpdir):
    """Segment a small frame on ``x`` and ``y`` and expect nine segmented profiles."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    columns = {
        "x": [1, 2, 3, 1, 2, 3, 1, 2, 3],
        "y": [4, 5, 6, 5, 6, 4, 6, 4, 5],
        "z": [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3],
    }
    frame = pd.DataFrame(data=columns)

    with session.logger("image_test", segments=["x", "y"]) as logger:
        logger.log_segments(frame)
        assert len(logger.segmented_profiles) == 9
def test_log_rotation_hour(tmpdir, df):
    """Log before and after a three hour clock jump with hourly rotation and expect two output files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session_config = SessionConfig("project", "pipeline", writers=[local_writer])

    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        with session_from_config(session_config) as session:
            with session.logger("test", with_rotation_time="h", cache_size=1) as logger:
                logger.log_dataframe(df)

                # Move the frozen clock forward before logging again.
                frozen_time.tick(delta=datetime.timedelta(hours=3))
                logger.log(feature_name="E", value=4)
                logger.log_dataframe(df)

    written = [name for _, _, names in os.walk(out_dir) for name in names]
    assert len(written) == 2
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_image(tmpdir, image_files):
    """Log images by file path and expect 19 columns in the resulting profile."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    with session.logger("image_test") as logger:
        for path in image_files:
            logger.log_image(path)

        assert len(logger.profile.columns) == 19

    shutil.rmtree(out_dir, ignore_errors=True)
def test_segments(df_lending_club, tmpdir):
    """Log with two explicit segments and check the prefixed segment tags on each segmented profile."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    test_segments = [
        [{"key": "home_ownership", "value": "RENT"}],
        [{"key": "home_ownership", "value": "MORTGAGE"}],
    ]

    session_config = SessionConfig("project", "pipeline", writers=[local_writer])
    with session_from_config(session_config) as session:
        with session.logger("test", segments=test_segments, cache_size=1) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            fetched_mortgage = logger.get_segment(test_segments[1])

    assert profile is None
    assert len(profiles) == 2

    # Pair the first two profiles (in mapping order) with the segment definitions:
    # index 0 is expected to carry 'RENT', index 1 'MORTGAGE'.
    profile_keys = list(profiles.keys())
    for position in (0, 1):
        segment_profile = profiles[profile_keys[position]]
        prefixed_tags = [tag for tag in segment_profile.tags.keys() if tag.startswith(_TAG_PREFIX)]
        for tag in prefixed_tags:
            assert segment_profile.tags[tag] == test_segments[position][0][_TAG_VALUE]

    assert fetched_mortgage == profiles[profile_keys[1]]

    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_pil_image(tmpdir, image_files):
    """Log PIL images with per-second rotation and expect 19 columns in the profile."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)

    local_writer = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # Round-trip the writer config through YAML; the parsed result is not used.
    WriterConfig.from_yaml(local_writer.to_yaml())

    session = session_from_config(SessionConfig("project", "pipeline", writers=[local_writer]))

    with session.logger("image_pil_test", with_rotation_time="s", cache_size=1) as logger:
        for path in image_files:
            logger.log_image(Image.open(path))

        assert len(logger.profile.columns) == 19

    shutil.rmtree(out_dir)
class Session:
    """
    A logging session that owns a set of loggers keyed by dataset name.

    Parameters
    ----------
    project : str
        The project name. We will default to the project name when logging
        a dataset if the dataset name is not specified
    pipeline : str
        Name of the pipeline associated with this session
    writers : list
        configuration for the output writers. This is where the log data
        will go
    verbose : bool
        enable verbose logging or not. Default is ``False``
    with_rotation_time : str
        rotation time used by loggers that do not specify their own
    cache : int
        cache value used by loggers that do not specify their own
    """

    def __init__(
        self,
        project: str,
        pipeline: str,
        writers: List[Writer],
        verbose: bool = False,
        with_rotation_time: str = None,
        cache: int = None,
    ):
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache = cache

    def __enter__(self):
        # TODO: configure other aspects
        return self

    def __exit__(self, tpe, value, traceback):
        # Leaving the ``with`` block always closes the session.
        self.close()

    def __repr__(self):
        return self._config.to_yaml()

    def get_config(
        self,
    ):
        """Return the ``SessionConfig`` built when the session was created."""
        return self._config

    def is_active(self):
        """Return ``True`` until :meth:`close` has been called."""
        return self._active

    def logger(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Optional[Dict[str, str]] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str]]] = None,
        profile_full_dataset: bool = False,
        with_rotation_time: str = None,
        cache: int = None,
    ) -> Logger:
        """
        Create a new logger or return an existing one for a given dataset name.
        If no dataset_name is specified, we default to project name

        Parameters
        ----------
        dataset_name : str
            Name of the dataset. Default is the project name
        dataset_timestamp : datetime.datetime, optional
            The timestamp associated with the dataset. Could be the timestamp
            for the batch, or the timestamp for the window that you are
            tracking
        session_timestamp : datetime.datetime, optional
            Override the timestamp associated with the session. Normally you
            shouldn't need to override this value
        tags : dict, optional
            Tag the data with groupable information. For example, you might
            want to tag your data with the stage information (development,
            testing, production etc...). ``None`` means an empty dict
        metadata : dict
            Useful to debug the data source. You can associate non-groupable
            information in this field such as hostname
        segments : list, optional
            forwarded to the new logger
        profile_full_dataset : bool
            forwarded to the new logger
        with_rotation_time : str, optional
            rotation time; defaults to the session's value
        cache : int, optional
            cache value; defaults to the session's value

        Returns
        -------
        ylog : whylogs.app.logger.Logger
            whylogs logger

        Raises
        ------
        RuntimeError
            if the session has already been closed
        """
        if not self._active:
            raise RuntimeError("Session is already closed. Cannot create more loggers")

        if tags is None:
            # A fresh dict per call: a literal ``{}`` default would be one
            # object shared by every logger created without explicit tags.
            tags = {}

        if dataset_name is None:
            # using the project name for the datasetname
            dataset_name = self.project
        if session_timestamp is None:
            session_timestamp = self._session_time
        if with_rotation_time is None:
            with_rotation_time = self.with_rotation_time
        if cache is None:
            cache = self.cache

        # remove inactive loggers first (iterate over a copy while popping)
        for name, logger in list(self._loggers.items()):
            if not logger.is_active():
                self._loggers.pop(name)

        logger = self._loggers.get(dataset_name)
        if logger is None:
            logger = Logger(
                session_id=self._session_id,
                dataset_name=dataset_name,
                dataset_timestamp=dataset_timestamp,
                session_timestamp=session_timestamp,
                writers=self.writers,
                tags=tags,
                metadata=metadata,
                verbose=self.verbose,
                with_rotation_time=with_rotation_time,
                segments=segments,
                profile_full_dataset=profile_full_dataset,
                cache=cache,
            )
            self._loggers[dataset_name] = logger

        return logger

    def log_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str]]] = None,
        profile_full_dataset: bool = False,
    ) -> Optional[DatasetProfile]:
        """
        Perform statistics calculations and log a pandas dataframe

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :param segments: forwarded to :meth:`logger`
        :param profile_full_dataset: forwarded to :meth:`logger`
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the datasetname
            dataset_name = self.project

        ylog = self.logger(
            dataset_name,
            dataset_timestamp,
            session_timestamp,
            tags,
            metadata,
            segments=segments,
            profile_full_dataset=profile_full_dataset,
        )

        ylog.log_dataframe(df)

        # Closing the logger yields the value returned to the caller.
        return ylog.close()

    def profile_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Profile a Pandas dataframe without actually writing data to disk.
        This is useful when you just want to quickly capture and explore a dataset profile.

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        dataset_profile = self.new_profile(dataset_name, dataset_timestamp, session_timestamp, tags, metadata)

        if dataset_profile is None:
            return None

        dataset_profile.track_dataframe(df)

        return dataset_profile

    def new_profile(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Create an empty dataset profile with the metadata from the session.

        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging. The passed dict is not modified
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the datasetname
            dataset_name = self.project
        if session_timestamp is None:
            session_timestamp = self._session_time

        # Work on a copy so that adding the pipeline tag below does not leak
        # into the dict owned by the caller.
        tags = dict(tags) if tags is not None else {}
        if self.pipeline:
            tags["Pipeline"] = self.pipeline

        profile = DatasetProfile(
            dataset_name,
            dataset_timestamp=dataset_timestamp,
            session_timestamp=session_timestamp,
            tags=tags,
            metadata=metadata,
        )

        return profile

    def close(self):
        """
        Deactivate this session and flush all associated loggers
        """
        if not self._active:
            print("WARNING: attempting to close an inactive session")
            return

        self._active = False
        # Snapshot the items: remove_logger mutates self._loggers in the loop.
        loggers = list(self._loggers.items())
        for name, logger in loggers:
            if logger.is_active():
                logger.close()
            self.remove_logger(name)

    def remove_logger(self, dataset_name: str):
        """
        Remove a logger from the session.

        Parameters
        ----------
        dataset_name : str
            the name of the dataset. used to identify the logger

        Returns
        -------
        None

        Raises
        ------
        KeyError
            if no logger is registered under ``dataset_name``
        """
        if self._loggers.get(dataset_name) is None:
            raise KeyError("WARNING: logger {} is not present in the current Session".format(dataset_name))

        self._loggers.pop(dataset_name)
class Session:
    """
    A logging session that owns the loggers created through :meth:`logger`.

    Parameters
    ----------
    project : str
        The project name. We will default to the project name when logging
        a dataset if the dataset name is not specified
    pipeline : str
        Name of the pipeline associated with this session
    writers : list
        configuration for the output writers. This is where the log data
        will go
    metadata_writer : MetadataWriter, optional
        writer used by :meth:`estimate_segments` and handed to new loggers
    verbose : bool
        enable verbose logging or not. Default is ``False``
    with_rotation_time : str
        rotation time used by loggers that do not specify their own
    cache_size : int
        cache size stored on the session
    report_progress : bool
        when ``False`` the progress bar shown while closing is disabled
    """

    def __init__(
        self,
        project: str,
        pipeline: str,
        writers: List[Writer],
        metadata_writer: Optional[MetadataWriter] = None,
        verbose: bool = False,
        with_rotation_time: str = None,
        cache_size: int = None,
        report_progress: bool = False,
    ):
        self._py_logger = _getLogger(__name__)
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.metadata_writer = metadata_writer
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, metadata_writer, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache_size = cache_size
        self.report_progress = report_progress

        # enable special logic when starting/closing a Session if we're using whylabs client to save dataset profiles
        whylabs_writer_is_present = any(isinstance(w, WhyLabsWriter) for w in self.writers)
        self.use_whylabs_writer = _use_whylabs_client or whylabs_writer_is_present

        # add WhyLabs writer if it's not already present (which can happen if it's not specified in the config)
        if _use_whylabs_client and not whylabs_writer_is_present:
            self.writers.append(WhyLabsWriter(output_path=None, formats=["protobuf"]))

    def __enter__(self):
        if self.use_whylabs_writer:
            # Imported lazily, only when the WhyLabs client is actually used.
            from whylogs.whylabs_client.wrapper import start_session

            start_session()
        return self

    def __exit__(self, tpe, value, traceback):
        # Leaving the ``with`` block always closes the session.
        self.close()

    def __repr__(self):
        return self._config.to_yaml()

    def get_config(
        self,
    ):
        """Return the ``SessionConfig`` built when the session was created."""
        return self._config

    def is_active(self):
        """Return ``True`` until :meth:`close` has been called."""
        return self._active

    def logger(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str], str]] = None,
        profile_full_dataset: bool = False,
        with_rotation_time: str = None,
        cache_size: int = 1,
        constraints: DatasetConstraints = None,
    ) -> Logger:
        """
        Create a new logger or return an existing one created with the same arguments.
        If no dataset_name is specified, we default to project name

        Args:
            dataset_name: name of the dataset
            dataset_timestamp: timestamp of the dataset
            session_timestamp: timestamp of the session. Inherits from the session
            tags: metadata associated with the profile
            metadata: same as tags. Will be deprecated
            segments: slice of data that the profile belongs to
            profile_full_dataset: when segmenting dataset, an option to keep the full unsegmented profile of the
                dataset
            with_rotation_time: rotation time in minutes or hours ("1m", "1h"). Inherits from the session
            cache_size: size of the segment cache
            constraints: whylogs constraints to monitor against

        Raises:
            RuntimeError: if the session has already been closed
        """
        if not self._active:
            raise RuntimeError("Session is already closed. Cannot create more loggers")

        # Loggers are cached under the string form of all creation arguments.
        logger_key = str(
            _LoggerKey(
                dataset_name=dataset_name,
                dataset_timestamp=dataset_timestamp,
                session_timestamp=session_timestamp,
                tags=tags,
                metadata=metadata,
                segments=segments,
                profile_full_dataset=profile_full_dataset,
                with_rotation_time=with_rotation_time,
                cache_size=cache_size,
                constraints=constraints,
            )
        )
        logger = self._loggers.get(logger_key)

        if logger is None or not logger.is_active():
            logger = Logger(
                session_id=self._session_id,
                dataset_name=dataset_name or self.project,
                dataset_timestamp=dataset_timestamp,
                session_timestamp=session_timestamp or self._session_time,
                writers=self.writers,
                metadata_writer=self.metadata_writer,
                tags=tags or {},
                metadata=metadata,
                verbose=self.verbose,
                with_rotation_time=with_rotation_time or self.with_rotation_time,
                segments=segments,
                profile_full_dataset=profile_full_dataset,
                cache_size=cache_size,
                constraints=constraints,
            )
            self._loggers[logger_key] = logger

        return logger

    def get_logger(self, dataset_name: str = None):
        """
        Look up a cached logger, returning ``None`` when nothing matches.

        NOTE(review): ``logger()`` stores entries under ``str(_LoggerKey(...))``,
        so a plain dataset name may never match here; confirm the intended key.
        """
        return self._loggers.get(dataset_name, None)

    def log_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str], str]] = None,
        profile_full_dataset: bool = False,
        constraints: DatasetConstraints = None,
    ) -> Optional[DatasetProfile]:
        """
        Perform statistics calculations and log a pandas dataframe

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :param segments:
            Can be either:
            - Autosegmentation source, one of ["auto", "local"]
            - List of tag key value pairs for tracking data segments
            - List of tag keys for which we will track every value
            - None, no segments will be used
        :param profile_full_dataset: when segmenting dataset, an option to keep the full unsegmented profile of the
            dataset
        :param constraints: forwarded to :meth:`logger`
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the datasetname
            dataset_name = self.project

        ylog = self.logger(
            dataset_name,
            dataset_timestamp,
            session_timestamp,
            tags,
            metadata,
            segments=segments,
            profile_full_dataset=profile_full_dataset,
            constraints=constraints,
        )

        ylog.log_dataframe(df)

        # Closing the logger yields the value returned to the caller.
        return ylog.close()

    def profile_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Profile a Pandas dataframe without actually writing data to disk.
        This is useful when you just want to quickly capture and explore a dataset profile.

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        dataset_profile = self.new_profile(dataset_name, dataset_timestamp, session_timestamp, tags, metadata)

        if dataset_profile is None:
            return None

        dataset_profile.track_dataframe(df)

        return dataset_profile

    def new_profile(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Create an empty dataset profile with the metadata from the session.

        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging. The passed dict is not modified
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the dataset name
            dataset_name = self.project
        if session_timestamp is None:
            session_timestamp = self._session_time

        # Work on a copy so that adding the pipeline tag below does not leak
        # into the dict owned by the caller.
        tags = dict(tags) if tags is not None else {}
        if self.pipeline:
            tags["Pipeline"] = self.pipeline

        profile = DatasetProfile(
            dataset_name,
            dataset_timestamp=dataset_timestamp,
            session_timestamp=session_timestamp,
            tags=tags,
            metadata=metadata,
        )

        return profile

    def estimate_segments(
        self,
        df: pd.DataFrame,
        name: str,
        target_field: str = None,
        max_segments: int = 30,
        dry_run: bool = False,
    ) -> Optional[Union[List[Dict], List[str]]]:
        """
        Estimates the most important features and values on which to segment
        data profiling using entropy-based methods.

        :param df: the dataframe of data to profile
        :param name: name for discovery in the logger, automatically applied
            to loggers with same dataset_name
        :param target_field: target field (optional)
        :param max_segments: upper threshold for total combinations of segments,
            default 30
        :param dry_run: run calculation but do not write results to metadata
        :return: a list of segmentation feature names
        """
        segments = _estimate_segments(df=df, target_field=target_field, max_segments=max_segments)

        if dry_run:
            return segments

        if self.metadata_writer is None:
            # metadata_writer is optional; without one there is nowhere to
            # persist the result, so report it instead of failing on ``None``.
            self._py_logger.warning(
                "No metadata writer configured; estimated segments for %s were not written", name
            )
        else:
            self.metadata_writer.autosegmentation_write(name, segments)

        return segments

    def close(self):
        """
        Deactivate this session and flush all associated loggers
        """
        if not self._active:
            self._py_logger.warning("attempting to close an inactive session")
            return

        self._active = False
        # Snapshot the items: remove_logger mutates self._loggers in the loop.
        loggers = list(self._loggers.items())
        with tqdm(loggers, disable=self.report_progress is False) as t:
            for key, logger in t:
                t.set_description("Closing session")
                if logger.is_active():
                    logger.close()
                self.remove_logger(key)

        for w in self.writers:
            w.close()

        if self.use_whylabs_writer:
            # Imported lazily, only when the WhyLabs client is actually used.
            from whylogs.whylabs_client.wrapper import end_session

            url = end_session()
            if url:
                print(f"You can explore your data in the WhyLabs Platform here: {url}")

    def remove_logger(self, dataset_name: str):
        """
        Remove a logger from the session.

        Parameters
        ----------
        dataset_name : str
            the key the logger is stored under (``close`` passes the cache
            keys created by ``logger``)

        Returns
        -------
        None

        Raises
        ------
        KeyError
            if no logger is registered under ``dataset_name``
        """
        if self._loggers.get(dataset_name) is None:
            raise KeyError("WARNING: logger {} is not present in the current Session".format(dataset_name))

        self._loggers.pop(dataset_name)
def test_log_rotation_concurrency(tmpdir):
    """
    Stress test for time-based log rotation.

    Logs a large dataframe followed by a small one while the logger rotates
    every second, then checks that every non-empty summary file written to the
    temporary directory reports the same number of features.

    :param tmpdir: pytest temporary directory fixture used as the writer output root
    """
    import io  # local import: only needed to capture the profiler report below

    log_rotation_interval = "1s"
    sleep_interval = 2

    test_path = tmpdir.mkdir("log_rotation_concurrency_repro")
    writer_config = WriterConfig(
        "local",
        ["json"],
        test_path.realpath(),
        filename_template="dataset_summary-$dataset_timestamp",
    )

    # Load the full lending club 1000 csv, to get a chance at hitting the bug.
    csv_path = os.path.join(script_dir, "lending_club_1000.csv")
    full_df = pd.read_csv(csv_path)

    # full_df has shape (1000, 151) so create a test df with 4x size by doubling it 2 times.
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
    for _ in range(2):
        full_df = pd.concat([full_df, full_df])

    TEST_LOGGER.info(f"test dataframe has shape {full_df.shape}")

    # Create a whylogs logging session
    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    TEST_LOGGER.info(
        f"Running rotate log test with {log_rotation_interval} flush intervals and {sleep_interval}s pause"
    )

    profiler = cProfile.Profile()
    profiler.enable()
    try:
        with session.logger(tags={"datasetId": "model-1"}, with_rotation_time=log_rotation_interval) as ylog:
            # Log a larger dataframe to increase chance of rotation before seeing all columns
            ylog.log_dataframe(full_df)
            sleep(sleep_interval)
            # Log a smaller dataframe to get more features before rotation
            ylog.log_dataframe(full_df.head(n=2))
            sleep(sleep_interval)
    finally:
        # Always stop profiling, even if logging raised.
        profiler.disable()

    # print_stats() writes to the Stats stream and returns the Stats object,
    # so capture the report text explicitly instead of logging the return value.
    report = io.StringIO()
    pstats.Stats(profiler, stream=report).sort_stats("cumulative").print_stats(10)
    TEST_LOGGER.info(report.getvalue())

    # os.walk yields the files that live directly in `root`; they must not be
    # joined with the names of sibling sub-directories.
    output_files = []
    for root, _subdirs, file_names in os.walk(test_path):
        output_files.extend(os.path.join(root, file_name) for file_name in file_names)

    assert len(output_files) > 0, "No output files were generated during stress test"
    TEST_LOGGER.debug(f"Generated {len(output_files)} dataset summary files.")

    feature_counts = []
    for filename in output_files:
        feature_count = count_features(filename)
        if feature_count > 0:
            feature_counts.append((feature_count, filename))

    assert (
        len(feature_counts) > 0
    ), "feature counts are all empty, we expect some empty files with aggressive log rotation but not all empty!"
    TEST_LOGGER.debug(f"There were {len(feature_counts)} files with features.")

    # Check first, then report success, so the log line is only emitted when true.
    assert_all_elements_equal(feature_counts)
    TEST_LOGGER.info(f"Feature counts all same, first file with features was {feature_counts[0]}")

    rmtree(test_path, ignore_errors=True)
    TEST_LOGGER.debug(f"End cleaning up test directory {test_path}")