Example #1
def get_or_create_session(path_to_config: Optional[str] = None, report_progress: Optional[bool] = None):
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session.

    If an active session exists, return the session without loading new
    config.

    :return: The global active session
    :rtype: Session
    :type path_to_config: str
    """
    global _session
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug("Active session found, ignoring session kwargs")
    else:
        config = load_config(path_to_config)
        if config is None:
            print("WARN: Missing config")

            config = SessionConfig(
                "default-project",
                "default-pipeline",
                [WriterConfig(type="local", output_path="output", formats=["all"])],
                MetadataConfig(type="local", output_path="output", input_path=""),
                False,
            )
        if report_progress is not None:
            config.report_progress = report_progress

        _session = session_from_config(config)
    return _session
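A minimal usage sketch of the function above (the import path whylogs.app.session and the config filename are assumptions for illustration):

from whylogs.app.session import get_or_create_session

# First call: no active session yet, so config is loaded (or the
# "default-project" fallback is built) and a new session is created.
session = get_or_create_session()

# Subsequent calls return the same active session and ignore the kwargs.
same_session = get_or_create_session(path_to_config="whylogs.yaml")
assert session is same_session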
Example #2
    def __init__(
        self,
        project: str,
        pipeline: str,
        writers: List[Writer],
        metadata_writer: Optional[MetadataWriter] = None,
        verbose: bool = False,
        with_rotation_time: Optional[str] = None,
        cache_size: Optional[int] = None,
        report_progress: bool = False,
    ):
        self._py_logger = _getLogger(__name__)
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.metadata_writer = metadata_writer
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, metadata_writer, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache_size = cache_size
        self.report_progress = report_progress

        # enable special logic when starting/closing a Session if we're using whylabs client to save dataset profiles
        whylabs_writer_is_present = any(isinstance(w, WhyLabsWriter) for w in self.writers)
        self.use_whylabs_writer = _use_whylabs_client or whylabs_writer_is_present

        # add WhyLabs writer if it's not already present (which can happen if it's not specified in the config)
        if _use_whylabs_client and not whylabs_writer_is_present:
            self.writers.append(WhyLabsWriter(output_path=None, formats=["protobuf"]))
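A small sketch of the WhyLabs writer injection above, reusing the Session and WhyLabsWriter names from this example (whether a writer is appended depends on the module-level _use_whylabs_client flag):

session = Session("my-project", "my-pipeline", writers=[])
if session.use_whylabs_writer:
    # Either the flag was set or a WhyLabsWriter was configured explicitly;
    # in the flag case, the constructor appended one for us.
    assert any(isinstance(w, WhyLabsWriter) for w in session.writers)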
Example #3
def test_log_metrics_with_boolean_labels(tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)
    targets = [True, False, True]

    predictions = [False, True, False]
    scores = [0.2, 0.5, 0.6]
    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)

        profile = logger.profile
        metrics_profile = profile.model_profile

        assert metrics_profile is not None
        assert len(metrics_profile.metrics.confusion_matrix.labels) == 2
    shutil.rmtree(output_path, ignore_errors=True)
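For context, the expected label count is the size of the union of observed target and prediction values; a quick standalone check, not part of the test:

targets = [True, False, True]
predictions = [False, True, False]
labels = set(targets) | set(predictions)
assert len(labels) == 2  # {True, False}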
Example #4
def test_segments_with_rotation(df_lending_club, tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger("test", with_rotation_time='s', segments=["home_ownership"], profile_full_dataset=True, cache=1) as logger:
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))

            df = util.testing.makeDataFrame()
            with pytest.raises(KeyError):
                logger.log_dataframe(df)
    output_files = []
    for root, subdirs, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 8
    shutil.rmtree(output_path)
Example #5
def test_log_dataframe(tmpdir, df_lending_club):
    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    with session.logger("lendingclub") as logger:
        assert logger is not None
        logger.log_dataframe(df_lending_club)
        profile = logger.profile
        assert profile is not None

        summary = profile.flat_summary()

        flat_summary = summary['summary']

        assert len(flat_summary) == 151

    output_files = []
    for root, subdirs, files in os.walk(p):
        output_files += files
    assert len(output_files) == 5
Example #6
def test_log_multiple_calls(tmpdir, df_lending_club):
    original_dir = os.getcwd()
    os.chdir(script_dir)

    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath(
    ), filename_template="dataset_summary-$dataset_timestamp")
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig(
        "project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)

    now = datetime.datetime.now()
    for i in range(5):
        with session.logger(dataset_timestamp=now + datetime.timedelta(days=i)) as logger:
            logger.log_dataframe(df_lending_club)

    output_files = []
    for root, subdirs, files in os.walk(p):
        output_files += files
    # we run 5 times, so we should have five times more files than the above test
    assert len(output_files) == 25
    os.chdir(original_dir)
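The distinct per-run file names come from the $dataset_timestamp placeholder in the template; a rough standalone illustration, assuming whylogs expands it in the style of string.Template:

import datetime
from string import Template

template = Template("dataset_summary-$dataset_timestamp")
# Each run uses a different dataset timestamp, so each run writes its own
# files instead of overwriting the previous run's output.
stamp = datetime.datetime.now().timestamp()
print(template.substitute(dataset_timestamp=stamp))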
Example #7
def test_log_rotation_days(tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        session = session_from_config(session_config)
        with session.logger("test", with_rotation_time='d',
                            cache_size=1) as logger:
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            frozen_time.tick(delta=datetime.timedelta(days=1))
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
            frozen_time.tick(delta=datetime.timedelta(days=2))
            df = util.testing.makeDataFrame()
            logger.log_dataframe(df)
    output_files = []
    for root, subdirs, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 3
    shutil.rmtree(output_path)
Example #8
def test_log_pil_image(tmpdir, image_files):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_pil_test",
                        with_rotation_time="1m",
                        cache_size=1) as logger:

        for image_file_path in image_files:
            img = Image.open(image_file_path)
            logger.log_image(img)

        profile = logger.profile
        columns = profile.columns
        for column_name in _EXPECTED_COLUMNS:
            assert column_name in columns, f"{column_name} not found in {columns}"
    shutil.rmtree(output_path, ignore_errors=True)
Example #9
def test_log_metrics(tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)
    targets = ["class_name1", "class_name2", "class_name3"]

    predictions = ["class_name1", "class_name2", "class_name2"]
    scores = [0.2, 0.5, 0.6]
    num_labels = 3
    with session.logger("metrics_test") as logger:

        logger.log_metrics(targets, predictions, scores)

        profile = logger.profile
        metrics_profile = profile.model_profile

        assert metrics_profile is not None
        assert len(metrics_profile.metrics.confusion_matrix.labels) == num_labels
    shutil.rmtree(output_path)
Example #10
def get_or_create_session():
    """
    Retrieve the current active global session.

    If no active session exists, attempt to load config and create a new
    session.

    If an active session exists, return the session without loading new
    config.

    Returns
    -------
    session : Session
        The global active session
    """
    global _session
    if _session is not None and _session.is_active():
        _getLogger(__name__).debug(
            "Active session found, ignoring session kwargs")
    else:
        config = load_config()
        if config is None:
            print("WARN: Missing config")
            writer = WriterConfig(type="local",
                                  output_path="output",
                                  formats=["all"])
            config = SessionConfig("default-project", "default-pipeline",
                                   [writer], False)
        _session = session_from_config(config)
    return _session
Example #11
def test_session_log_dataframe(df):

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    session.log_dataframe(df)

    assert session.logger() is not None

    assert session.logger("default-project").dataset_name == "default-project"
Example #12
def test_session_log_dataframe():

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    profile = session.log_dataframe(df)

    assert profile is not None
    assert session.logger() is not None

    assert session.logger("default-project").dataset_name == "default-project"
Example #13
def test_session_profile(df):

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    profile = session.log_dataframe(df)
    assert profile is not None

    summary = profile.flat_summary()

    flat_summary = summary["summary"]
    assert len(flat_summary) == 4
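For context, pandas' util.testing.makeDataFrame() builds a small random frame with four float columns, which is where the expected summary length of 4 comes from; a quick standalone check (in the pandas version these tests target):

from pandas import util

df = util.testing.makeDataFrame()
assert df.shape == (30, 4)  # columns A through D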
Example #14
    def __init__(self,
                 project: str,
                 pipeline: str,
                 writers: List[Writer],
                 verbose: bool = False,
                 with_rotation_time: Optional[str] = None,
                 cache: Optional[int] = None):
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache = cache
Example #15
def test_session_profile():

    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    profile = session.log_dataframe(df)
    assert profile is not None

    summary = profile.flat_summary()

    flat_summary = summary['summary']
    assert len(flat_summary) == 4
Example #16
def reset_default_session():
    """
    Reset and deactivate the global whylogs logging session.
    """
    global _session
    if _session is not None:
        _session.close()
    config: Optional[SessionConfig] = load_config()
    if config is None:
        config = SessionConfig("default-project", "default-pipeline", [
            WriterConfig(type="local", output_path="output", formats=["all"])
        ], False)
    _session = session_from_config(config)
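A short usage sketch, assuming reset_default_session sits alongside get_or_create_session in whylogs' session module:

from whylogs.app.session import get_or_create_session, reset_default_session

# Close the current global session and rebuild a fresh one; handy in tests
# that need a clean slate between cases.
reset_default_session()
session = get_or_create_session()
assert session.is_active()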
Example #17
def test_segments(df_lending_club, tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])
    with session_from_config(session_config) as session:
        with session.logger(
            "test",
            segments=[
                [{"key": "home_ownership", "value": "RENT"}],
                [{"key": "home_ownership", "value": "MORTGAGE"}],
            ],
            cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            mortgage_segment = logger.get_segment([{"key": "home_ownership", "value": "MORTGAGE"}])

    assert profile is None
    assert len(profiles) == 2
    assert profiles[list(profiles.keys())[0]].tags["segment"] == json.dumps([{"key": "home_ownership", "value": "RENT"}])
    assert profiles[list(profiles.keys())[1]].tags["segment"] == json.dumps([{"key": "home_ownership", "value": "MORTGAGE"}])
    check_segment = profiles[list(profiles.keys())[1]]
    assert mortgage_segment == check_segment

    shutil.rmtree(output_path, ignore_errors=True)
Example #18
def test_config_api(tmpdir):
    p = tmpdir.mkdir("whylogs")

    writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("test_name") as logger:
        logger.log_dataframe(pd.DataFrame())
    session.close()
Example #19
def test_segments_keys(df_lending_club, tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)
    with session.logger("test", segments=["emp_title", "home_ownership"], cache_size=1) as logger:
        logger.log_dataframe(df_lending_club)
        profiles = logger.segmented_profiles
        assert len(profiles) == 47
    shutil.rmtree(output_path, ignore_errors=True)
Example #20
def test_log_multiple_segments(tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])

    session = session_from_config(session_config)

    df = pd.DataFrame(data={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3], "y": [4, 5, 6, 5, 6, 4, 6, 4, 5], "z": [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3]})
    with session.logger("image_test", segments=["x", "y"]) as logger:
        logger.log_segments(df)
        assert len(logger.segmented_profiles) == 9
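The expected count of 9 is simply the number of distinct (x, y) combinations in the frame; a standalone sanity check:

import pandas as pd

df = pd.DataFrame(data={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3], "y": [4, 5, 6, 5, 6, 4, 6, 4, 5]})
assert df.groupby(["x", "y"]).ngroups == 9  # every (x, y) pair is distinct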
Example #21
def test_log_rotation_hour(tmpdir, df):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        with session_from_config(session_config) as session:
            with session.logger("test", with_rotation_time="h", cache_size=1) as logger:
                logger.log_dataframe(df)
                frozen_time.tick(delta=datetime.timedelta(hours=3))
                logger.log(feature_name="E", value=4)
                logger.log_dataframe(df)

    output_files = []
    for _, _, files in os.walk(output_path):
        output_files += files
    assert len(output_files) == 2
    shutil.rmtree(output_path, ignore_errors=True)
Example #22
def test_log_image(tmpdir, image_files):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_test") as logger:

        for image_file_path in image_files:
            logger.log_image(image_file_path)

        profile = logger.profile
        columns = profile.columns
        assert len(columns) == 19
    shutil.rmtree(output_path, ignore_errors=True)
Example #23
def test_segments(df_lending_club, tmpdir):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)
    test_segments = [
        [{"key": "home_ownership", "value": "RENT"}],
        [{"key": "home_ownership", "value": "MORTGAGE"}],
    ]

    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    with session_from_config(session_config) as session:
        with session.logger(
            "test",
            segments=test_segments,
            cache_size=1,
        ) as logger:
            logger.log_dataframe(df_lending_club)
            profile = logger.profile
            profiles = logger.segmented_profiles
            mortgage_segment = logger.get_segment(test_segments[1])

    assert profile is None
    assert len(profiles) == 2
    segment_keys = [key for key in profiles[list(profiles.keys())[0]].tags.keys() if key.startswith(_TAG_PREFIX)]
    for segment_key in segment_keys:
        assert profiles[list(profiles.keys())[0]].tags[segment_key] == test_segments[0][0][_TAG_VALUE]  # 'RENT'

    segment_keys = [key for key in profiles[list(profiles.keys())[1]].tags.keys() if key.startswith(_TAG_PREFIX)]
    for segment_key in segment_keys:
        assert profiles[list(profiles.keys())[1]].tags[segment_key] == test_segments[1][0][_TAG_VALUE]  # 'MORTGAGE'

    check_segment = profiles[list(profiles.keys())[1]]
    assert mortgage_segment == check_segment
    shutil.rmtree(output_path, ignore_errors=True)
Example #24
def test_log_pil_image(tmpdir, image_files):
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)

    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])

    session = session_from_config(session_config)

    with session.logger("image_pil_test", with_rotation_time="s",
                        cache_size=1) as logger:

        for image_file_path in image_files:
            img = Image.open(image_file_path)
            logger.log_image(img)

        profile = logger.profile
        columns = profile.columns
        assert len(columns) == 19
    shutil.rmtree(output_path)
Example #25
class Session:
    """
    Parameters
    ----------
    project : str
        The project name. We will default to the project name when logging
        a dataset if the dataset name is not specified
    pipeline : str
        Name of the pipeline associated with this session
    writers : list
        configuration for the output writers. This is where the log data
        will go
    verbose : bool
        enable verbose logging or not. Default is ``False``
    """
    def __init__(self,
                 project: str,
                 pipeline: str,
                 writers: List[Writer],
                 verbose: bool = False,
                 with_rotation_time: Optional[str] = None,
                 cache: Optional[int] = None):
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache = cache

    def __enter__(self):
        # TODO: configure other aspects
        return self

    def __exit__(self, tpe, value, traceback):
        self.close()

    def __repr__(self):
        return self._config.to_yaml()

    def get_config(self):
        return self._config

    def is_active(self):
        return self._active

    def logger(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str]]] = None,
        profile_full_dataset: bool = False,
        with_rotation_time: Optional[str] = None,
        cache: Optional[int] = None,
    ) -> Logger:
        """
        Create a new logger or return an existing one for a given dataset name.
        If no dataset_name is specified, we default to project name

        Parameters
        ----------
        dataset_name : str
            Name of the dataset. Default is the project name
        dataset_timestamp : datetime.datetime, optional
            The timestamp associated with the dataset. Could be the timestamp
            for the batch, or the timestamp for the window that you are tracking
        tags : dict
            Tag the data with groupable information. For example, you might want
            to tag your data with stage information (development, testing,
            production, etc.)
        metadata : dict
            Useful for debugging the data source. You can associate non-groupable
            information in this field, such as the hostname
        session_timestamp : datetime.datetime, optional
            Override the timestamp associated with the session. Normally you
            shouldn't need to override this value
        Returns
        -------
        ylog : whylogs.app.logger.Logger
            whylogs logger
        """
        if not self._active:
            raise RuntimeError(
                "Session is already closed. Cannot create more loggers")

        if dataset_name is None:
            # using the project name for the dataset name
            dataset_name = self.project

        if session_timestamp is None:
            session_timestamp = self._session_time
        if with_rotation_time is None:
            with_rotation_time = self.with_rotation_time
        if cache is None:
            cache = self.cache
        # remove inactive loggers first
        for name, logger in list(self._loggers.items()):
            if not logger.is_active():
                self._loggers.pop(name)

        logger = self._loggers.get(dataset_name)

        if logger is None:
            logger = Logger(session_id=self._session_id,
                            dataset_name=dataset_name,
                            dataset_timestamp=dataset_timestamp,
                            session_timestamp=session_timestamp,
                            writers=self.writers,
                            tags=tags or {},
                            metadata=metadata,
                            verbose=self.verbose,
                            with_rotation_time=with_rotation_time,
                            segments=segments,
                            profile_full_dataset=profile_full_dataset,
                            cache=cache)
            self._loggers[dataset_name] = logger

        return logger

    def log_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str]]] = None,
        profile_full_dataset: bool = False,
    ) -> Optional[DatasetProfile]:
        """
        Perform statistics calculations and log a pandas dataframe

        :param df: the dataframe to profile 
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the dataset name
            dataset_name = self.project

        ylog = self.logger(
            dataset_name,
            dataset_timestamp,
            session_timestamp,
            tags,
            metadata,
            segments=segments,
            profile_full_dataset=profile_full_dataset,
        )

        ylog.log_dataframe(df)

        return ylog.close()

    def profile_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Profile a Pandas dataframe without actually writing data to disk.
        This is useful when you just want to quickly capture and explore a dataset profile.

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        dataset_profile = self.new_profile(dataset_name, dataset_timestamp,
                                           session_timestamp, tags, metadata)

        if dataset_profile is None:
            return None

        dataset_profile.track_dataframe(df)

        return dataset_profile

    def new_profile(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Create an empty dataset profile with the metadata from the session.

        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the datasetname
            dataset_name = self.project
        if session_timestamp is None:
            session_timestamp = self._session_time

        if tags is None:
            tags = dict()
        if self.pipeline:
            tags["Pipeline"] = self.pipeline

        profile = DatasetProfile(
            dataset_name,
            dataset_timestamp=dataset_timestamp,
            session_timestamp=session_timestamp,
            tags=tags,
            metadata=metadata,
        )

        return profile

    def close(self):
        """
        Deactivate this session and flush all associated loggers
        """
        if not self._active:
            print("WARNING: attempting to close an inactive session")
            return

        self._active = False
        loggers = list(self._loggers.items())
        for name, logger in loggers:
            if logger.is_active():
                logger.close()
            self.remove_logger(name)

    def remove_logger(self, dataset_name: str):
        """
        Remove a logger from the dataset. This is called by the logger when it's being closed

        Parameters
        ----------
        dataset_name : str
            The name of the dataset, used to identify the logger

        Returns
        -------
        None
        """
        if self._loggers.get(dataset_name) is None:
            raise KeyError(
                "WARNING: logger {} is not present in the current Session".
                format(dataset_name))

        self._loggers.pop(dataset_name)
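A brief end-to-end sketch of this Session version (hypothetical project names; SessionConfig and session_from_config are the same helpers used in the tests above):

import pandas as pd

config = SessionConfig("my-project", "my-pipeline", [], False)
with session_from_config(config) as session:
    # log_dataframe picks a logger named after the project (no dataset_name
    # given), profiles the frame, and returns the profile on logger close.
    profile = session.log_dataframe(pd.DataFrame({"a": [1, 2, 3]}))
    assert profile is not None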
Example #26
class Session:
    """
    Parameters
    ----------
    project : str
        The project name. We will default to the project name when logging
        a dataset if the dataset name is not specified
    pipeline : str
        Name of the pipeline associated with this session
    writers : list
        configuration for the output writers. This is where the log data
        will go
    verbose : bool
        enable verbose logging or not. Default is ``False``
    """

    def __init__(
        self,
        project: str,
        pipeline: str,
        writers: List[Writer],
        metadata_writer: Optional[MetadataWriter] = None,
        verbose: bool = False,
        with_rotation_time: Optional[str] = None,
        cache_size: Optional[int] = None,
        report_progress: bool = False,
    ):
        self._py_logger = _getLogger(__name__)
        if writers is None:
            writers = []
        self.project = project
        self.pipeline = pipeline
        self.writers = writers
        self.metadata_writer = metadata_writer
        self.verbose = verbose
        self._active = True
        self._loggers = {}
        self._session_time = datetime.datetime.now()
        self._session_id = str(uuid4())
        self._config = SessionConfig(project, pipeline, writers, metadata_writer, verbose)
        self.with_rotation_time = with_rotation_time
        self.cache_size = cache_size
        self.report_progress = report_progress

        # enable special logic when starting/closing a Session if we're using whylabs client to save dataset profiles
        whylabs_writer_is_present = any(isinstance(w, WhyLabsWriter) for w in self.writers)
        self.use_whylabs_writer = _use_whylabs_client or whylabs_writer_is_present

        # add WhyLabs writer if it's not already present (which can happen if it's not specified in the config)
        if _use_whylabs_client and not whylabs_writer_is_present:
            self.writers.append(WhyLabsWriter(output_path=None, formats=["protobuf"]))

    def __enter__(self):
        if self.use_whylabs_writer:
            from whylogs.whylabs_client.wrapper import start_session

            start_session()
        return self

    def __exit__(self, tpe, value, traceback):
        self.close()

    def __repr__(self):
        return self._config.to_yaml()

    def get_config(self):
        return self._config

    def is_active(self):
        return self._active

    def logger(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str], str]] = None,
        profile_full_dataset: bool = False,
        with_rotation_time: Optional[str] = None,
        cache_size: int = 1,
        constraints: DatasetConstraints = None,
    ) -> Logger:
        """
        Create a new logger or return an existing one for a given dataset name.
        If no dataset_name is specified, we default to project name

        Args:
            dataset_name: name of the dataset
            dataset_timestamp: timestamp of the dataset. Defaults to now
            session_timestamp: timestamp of the session. Inherits from the session
            tags: metadata associated with the profile
            metadata: same as tags. Will be deprecated
            segments: slice of data that the profile belongs to
            profile_full_dataset: when segmenting dataset, an option to keep the full unsegmented profile of the dataset
            with_rotation_time: rotation time in minutes or hours ("1m", "1h")
            cache_size: size of the segment cache
            constraints: whylogs constraints to monitor against
        """
        if not self._active:
            raise RuntimeError("Session is already closed. Cannot create more loggers")

        logger_key = str(
            _LoggerKey(
                dataset_name=dataset_name,
                dataset_timestamp=dataset_timestamp,
                session_timestamp=session_timestamp,
                tags=tags,
                metadata=metadata,
                segments=segments,
                profile_full_dataset=profile_full_dataset,
                with_rotation_time=with_rotation_time,
                cache_size=cache_size,
                constraints=constraints,
            )
        )
        logger = self._loggers.get(logger_key)

        if logger is None or not logger.is_active():
            logger = Logger(
                session_id=self._session_id,
                dataset_name=dataset_name or self.project,
                dataset_timestamp=dataset_timestamp,
                session_timestamp=session_timestamp or self._session_time,
                writers=self.writers,
                metadata_writer=self.metadata_writer,
                tags=tags or {},
                metadata=metadata,
                verbose=self.verbose,
                with_rotation_time=with_rotation_time or self.with_rotation_time,
                segments=segments,
                profile_full_dataset=profile_full_dataset,
                cache_size=cache_size,
                constraints=constraints,
            )
            self._loggers[logger_key] = logger

        return logger

    def get_logger(self, dataset_name: Optional[str] = None):
        return self._loggers.get(dataset_name, None)

    def log_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
        segments: Optional[Union[List[Dict], List[str], str]] = None,
        profile_full_dataset: bool = False,
        constraints: DatasetConstraints = None,
    ) -> Optional[DatasetProfile]:
        """
        Perform statistics calculations and log a pandas dataframe

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :param segments:
            Can be either:
            - Autosegmentation source, one of ["auto", "local"]
            - List of tag key value pairs for tracking data segments
            - List of tag keys for which we will track every value
            - None, no segments will be used
        :param profile_full_dataset: when segmenting dataset, an option to keep the full unsegmented profile of the dataset
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the dataset name
            dataset_name = self.project

        ylog = self.logger(
            dataset_name,
            dataset_timestamp,
            session_timestamp,
            tags,
            metadata,
            segments=segments,
            profile_full_dataset=profile_full_dataset,
            constraints=constraints,
        )

        ylog.log_dataframe(df)

        return ylog.close()

    def profile_dataframe(
        self,
        df: pd.DataFrame,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Profile a Pandas dataframe without actually writing data to disk.
        This is useful when you just want to quickly capture and explore a dataset profile.

        :param df: the dataframe to profile
        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        dataset_profile = self.new_profile(dataset_name, dataset_timestamp, session_timestamp, tags, metadata)

        if dataset_profile is None:
            return None

        dataset_profile.track_dataframe(df)

        return dataset_profile

    def new_profile(
        self,
        dataset_name: Optional[str] = None,
        dataset_timestamp: Optional[datetime.datetime] = None,
        session_timestamp: Optional[datetime.datetime] = None,
        tags: Dict[str, str] = None,
        metadata: Dict[str, str] = None,
    ) -> Optional[DatasetProfile]:
        """
        Create an empty dataset profile with the metadata from the session.

        :param dataset_name: name of the dataset
        :param dataset_timestamp: the timestamp for the dataset
        :param session_timestamp: the timestamp for the session. Override the default one
        :param tags: the tags for the profile. Useful when merging
        :param metadata: information about this current profile. Can be discarded when merging
        :return: a dataset profile if the session is active
        """
        if not self.is_active():
            return None

        if dataset_name is None:
            # using the project name for the dataset name
            dataset_name = self.project
        if session_timestamp is None:
            session_timestamp = self._session_time

        if tags is None:
            tags = dict()
        if self.pipeline:
            tags["Pipeline"] = self.pipeline

        profile = DatasetProfile(
            dataset_name,
            dataset_timestamp=dataset_timestamp,
            session_timestamp=session_timestamp,
            tags=tags,
            metadata=metadata,
        )

        return profile

    def estimate_segments(
        self,
        df: pd.DataFrame,
        name: str,
        target_field: str = None,
        max_segments: int = 30,
        dry_run: bool = False,
    ) -> Optional[Union[List[Dict], List[str]]]:
        """
        Estimates the most important features and values on which to segment
        data profiling using entropy-based methods.

        :param df: the dataframe of data to profile
        :param name: name for discovery in the logger, automatically applied
        to loggers with same dataset_name
        :param target_field: target field (optional)
        :param max_segments: upper threshold for total combinations of segments,
        default 30
        :param dry_run: run calculation but do not write results to metadata
        :return: a list of segmentation feature names
        """
        segments = _estimate_segments(df=df, target_field=target_field, max_segments=max_segments)

        if not dry_run:
            self.metadata_writer.autosegmentation_write(name, segments)

        return segments

    def close(self):
        """
        Deactivate this session and flush all associated loggers
        """
        if not self._active:
            self._py_logger.warning("attempting to close an inactive session")
            return

        self._active = False
        loggers = list(self._loggers.items())
        with tqdm(loggers, disable=not self.report_progress) as t:
            for key, logger in t:
                t.set_description("Closing session")
                if logger.is_active():
                    logger.close()
                self.remove_logger(key)

        for w in self.writers:
            w.close()

        if self.use_whylabs_writer:
            from whylogs.whylabs_client.wrapper import end_session

            url = end_session()
            if url:
                print(f"You can explore your data in the WhyLabs Platform here: {url}")

    def remove_logger(self, dataset_name: str):
        """
        Remove a logger from the dataset. This is called by the logger when it's being closed

        Parameters
        ----------
        dataset_name : str
            The name of the dataset, used to identify the logger

        Returns
        -------
        None
        """
        if self._loggers.get(dataset_name) is None:
            raise KeyError("WARNING: logger {} is not present in the current Session".format(dataset_name))

        self._loggers.pop(dataset_name)
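A sketch of the logger caching above (hypothetical names): loggers are keyed by the stringified _LoggerKey, so only calls with identical arguments share a cached logger instance.

config = SessionConfig("proj", "pipe", writers=[])
session = session_from_config(config)

a = session.logger("data", cache_size=1)
b = session.logger("data", cache_size=1)  # identical key, same cached logger
c = session.logger("data", cache_size=2)  # different key, new logger
assert a is b and a is not c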
Example #27
def test_log_rotation_concurrency(tmpdir):
    log_rotation_interval = "1s"
    sleep_interval = 2

    test_path = tmpdir.mkdir("log_rotation_concurrency_repro")
    writer_config = WriterConfig(
        "local", ["json"],
        test_path.realpath(),
        filename_template="dataset_summary-$dataset_timestamp")

    # Load the full lending club 1000 csv, to get a chance at hitting the bug.
    csv_path = os.path.join(script_dir, "lending_club_1000.csv")
    full_df = pd.read_csv(csv_path)

    # full_df has shape (1000, 151); build a 4x test df by doubling it twice
    for _ in range(2):
        full_df = pd.concat([full_df, full_df])  # DataFrame.append was removed in pandas 2.0

    TEST_LOGGER.info(f"test dataframe has shape {full_df.shape}")

    # Create a whylogs logging session
    session_config = SessionConfig("project",
                                   "pipeline",
                                   writers=[writer_config])
    session = session_from_config(session_config)

    TEST_LOGGER.info(
        f"Running rotate log test with {log_rotation_interval} flush intervals and {sleep_interval}s pause"
    )
    profiler = cProfile.Profile()
    profiler.enable()
    with session.logger(tags={"datasetId": "model-1"},
                        with_rotation_time=log_rotation_interval) as ylog:
        ylog.log_dataframe(
            full_df
        )  # Log a larger dataframe to increase chance of rotation before seeing all columns
        sleep(sleep_interval)
        ylog.log_dataframe(
            full_df.head(n=2)
        )  # Log a smaller dataframe to get more features before rotation
        sleep(sleep_interval)
    profiler.disable()
    stats = pstats.Stats(profiler).sort_stats("cumulative")
    TEST_LOGGER.info(stats.print_stats(10))

    output_files = []
    for root, subdir, file_names in os.walk(test_path):
        if not file_names:
            continue
        if subdir:
            for directory in subdir:
                for file in file_names:
                    full_file_path = os.path.join(root, directory, file)
                    output_files += [full_file_path]
        else:
            for file in file_names:
                full_file_path = os.path.join(root, file)
                output_files += [full_file_path]

    assert len(output_files) > 0, "No output files were generated during stress test"
    TEST_LOGGER.debug(f"Generated {len(output_files)} dataset summary files.")

    feature_counts = []
    for filename in output_files:
        feature_count = count_features(filename)
        if feature_count > 0:
            feature_counts.append((feature_count, filename))

    assert len(feature_counts) > 0, "feature counts are all empty; we expect some empty files with aggressive log rotation, but not all of them"
    TEST_LOGGER.info(
        f"Feature counts all same, first file with features was {feature_counts[0]}"
    )
    TEST_LOGGER.debug(f"There were {len(feature_counts)} files with features.")
    assert_all_elements_equal(feature_counts)
    rmtree(test_path, ignore_errors=True)
    TEST_LOGGER.debug(f"End cleaning up test directory {test_path}")