def test_collect_feature_data_and_vis_attributes(self, df):
    """Unit test that attributes pre-logging are the correct format."""
    in_df, out_df = df.iloc[:, :-1], df.iloc[:, [-1]]
    feature_data_list = RegisteredModelVersion._compute_training_data_profile(
        in_df,
        out_df,
    )
    feature_data_attrs = (
        RegisteredModelVersion._collect_feature_data_and_vis_attributes(
            feature_data_list,
        )
    )

    for key, val in feature_data_attrs.items():
        if key.startswith(_deployable_entity._FEATURE_DATA_ATTR_PREFIX):
            feature_data = _utils.json_to_proto(val, FeatureDataInModelVersion)
            self.assert_feature_data_correctness(feature_data, in_df, out_df)

            if feature_data.profiler_name == "MissingValuesProfiler":
                sample_key = feature_data.feature_name + "MissingValues"
            else:
                sample_key = feature_data.feature_name + "Distribution"
            sample_key = (
                _deployable_entity._TRAINING_DATA_ATTR_PREFIX
                + RegisteredModelVersion._normalize_attribute_key(sample_key)
            )
            assert feature_data_attrs[sample_key] == json.loads(
                feature_data.content
            )
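# A minimal sketch (an assumption, not the suite's real conftest) of the `df`
# fixture the test above expects: a small frame whose last column is treated
# as the model output.
import numpy as np
import pandas as pd
import pytest

@pytest.fixture
def df():
    rng = np.random.default_rng(seed=0)
    return pd.DataFrame({
        "Continuous_Numeric": rng.random(20),
        "Discrete_Numeric": rng.integers(0, 5, size=20),
        "Output_Col": rng.integers(0, 2, size=20),  # last column = output
    })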
def _get_url_for_artifact(self, dataset_component_path, method, part_num=0):
    """
    Obtains a URL to use for accessing stored artifacts.

    Parameters
    ----------
    dataset_component_path : str
        Filepath in dataset component blob.
    method : {'GET', 'PUT'}
        HTTP method to request for the generated URL.
    part_num : int, optional
        If using Multipart Upload, number of part to be uploaded.

    Returns
    -------
    response_msg : `_DatasetVersionService.GetUrlForDatasetBlobVersioned.Response`
        Backend response.

    """
    if method.upper() not in ("GET", "PUT"):
        raise ValueError("`method` must be one of {'GET', 'PUT'}")

    Message = _DatasetVersionService.GetUrlForDatasetBlobVersioned
    msg = Message(
        path_dataset_component_blob_path=dataset_component_path,
        method=method,
        part_number=part_num,
    )
    data = _utils.proto_to_json(msg)
    endpoint = "{}://{}/api/v1/modeldb/dataset-version/dataset/{}/datasetVersion/{}/getUrlForDatasetBlobVersioned".format(
        self._conn.scheme,
        self._conn.socket,
        self.dataset_id,
        self.id,
    )
    response = _utils.make_request("POST", endpoint, self._conn, json=data)
    _utils.raise_for_http_error(response)

    response_msg = _utils.json_to_proto(response.json(), Message.Response)

    url = response_msg.url
    # accommodate port-forwarded NFS store
    if 'https://localhost' in url[:20]:
        url = 'http' + url[5:]
    if 'localhost%3a' in url[:20]:
        url = url.replace('localhost%3a', 'localhost:')
    if 'localhost%3A' in url[:20]:
        url = url.replace('localhost%3A', 'localhost:')
    response_msg.url = url

    return response_msg
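# Hedged usage sketch for the method above: fetch a presigned GET URL for one
# dataset component, then download its bytes through the same request helpers.
# `dataset_version` and `component_path` are assumed stand-ins, not names
# defined in this module.
url = dataset_version._get_url_for_artifact(component_path, "GET").url
response = _utils.make_request("GET", url, dataset_version._conn)
_utils.raise_for_http_error(response)
component_bytes = response.content  # raw contents of the stored component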
def _get_url_for_artifact(self, key, method, artifact_type=0, part_num=0):
    """
    Obtains a URL to use for accessing a stored artifact of this model version.

    Parameters
    ----------
    key : str
        Name of the artifact.
    method : {'GET', 'PUT'}
        HTTP method to request for the generated URL.
    artifact_type : int, optional
        Variant of the artifact proto.
    part_num : int, optional
        If using Multipart Upload, number of part to be uploaded.

    """
    if method.upper() not in ("GET", "PUT"):
        raise ValueError("`method` must be one of {'GET', 'PUT'}")

    Message = _RegistryService.GetUrlForArtifact
    msg = Message(
        model_version_id=self.id,
        key=key,
        method=method,
        artifact_type=artifact_type,
        part_number=part_num,
    )
    data = _utils.proto_to_json(msg)
    endpoint = "{}://{}/api/v1/registry/model_versions/{}/getUrlForArtifact".format(
        self._conn.scheme,
        self._conn.socket,
        self.id,
    )
    response = _utils.make_request("POST", endpoint, self._conn, json=data)
    _utils.raise_for_http_error(response)
    return _utils.json_to_proto(response.json(), Message.Response)
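# Hedged sketch of the registry variant above: request a presigned PUT URL for
# part 1 of a multipart artifact upload. `model_version` is an assumed
# RegisteredModelVersion instance and "model" an assumed artifact key.
response_msg = model_version._get_url_for_artifact(
    "model", "PUT", artifact_type=0, part_num=1,
)
upload_url = response_msg.url  # presigned URL valid for uploading this part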
def get_code(self):
    """
    Gets the code version.

    Returns
    -------
    dict or zipfile.ZipFile
        Either:
            - a dictionary containing Git snapshot information with at most
              the following items:
                - **filepaths** (*list of str*)
                - **repo_url** (*str*) – Remote repository URL
                - **commit_hash** (*str*) – Commit hash
                - **is_dirty** (*bool*)
            - a `ZipFile <https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile>`_
              containing Python source code files

    """
    # TODO: remove this circular dependency
    from ._project import Project
    from ._experiment import Experiment
    from ._experimentrun import ExperimentRun
    if isinstance(self, Project):  # TODO: not this
        Message = self._service.GetProjectCodeVersion
        endpoint = "getProjectCodeVersion"
    elif isinstance(self, Experiment):
        Message = self._service.GetExperimentCodeVersion
        endpoint = "getExperimentCodeVersion"
    elif isinstance(self, ExperimentRun):
        Message = self._service.GetExperimentRunCodeVersion
        endpoint = "getExperimentRunCodeVersion"
    msg = Message(id=self.id)
    data = _utils.proto_to_json(msg)
    response = _utils.make_request(
        "GET",
        self._request_url.format(endpoint),
        self._conn,
        params=data,
    )
    _utils.raise_for_http_error(response)

    response_msg = _utils.json_to_proto(
        _utils.body_to_json(response), Message.Response)
    code_ver_msg = response_msg.code_version
    which_code = code_ver_msg.WhichOneof('code')
    if which_code == 'git_snapshot':
        git_snapshot_msg = code_ver_msg.git_snapshot
        git_snapshot = {}
        if git_snapshot_msg.filepaths:
            git_snapshot['filepaths'] = git_snapshot_msg.filepaths
        if git_snapshot_msg.repo:
            git_snapshot['repo_url'] = git_snapshot_msg.repo
        if git_snapshot_msg.hash:
            git_snapshot['commit_hash'] = git_snapshot_msg.hash
        if git_snapshot_msg.is_dirty != _CommonCommonService.TernaryEnum.UNKNOWN:
            git_snapshot['is_dirty'] = (
                git_snapshot_msg.is_dirty == _CommonCommonService.TernaryEnum.TRUE)
        return git_snapshot
    elif which_code == 'code_archive':
        # download artifact from artifact store
        # pylint: disable=no-member
        # this method should only be called on ExperimentRun, which does have _get_url_for_artifact()
        url = self._get_url_for_artifact(
            "verta_code_archive", "GET", code_ver_msg.code_archive.artifact_type).url

        response = _utils.make_request("GET", url, self._conn)
        _utils.raise_for_http_error(response)

        code_archive = six.BytesIO(response.content)
        return zipfile.ZipFile(code_archive, 'r')  # TODO: return a util class instead, maybe
    else:
        raise RuntimeError("unable to find code in response")
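# Hedged usage sketch: get_code() returns either a Git snapshot dict or a
# ZipFile of captured sources, so callers should branch on the type. `run` is
# an assumed ExperimentRun.
code_version = run.get_code()
if isinstance(code_version, zipfile.ZipFile):
    print(code_version.namelist())  # source files captured in the archive
else:
    print(code_version.get("repo_url"), code_version.get("commit_hash"))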
def test_profile_training_data(self, model_version):
    """Integration test for logging attributes with correct structure."""
    pd = pytest.importorskip("pandas")
    np = pytest.importorskip("numpy")

    cont_col = np.random.random(100)
    discrete_col = np.random.choice(5, 100)
    string_discrete_col = np.random.choice(["a", "b", "c", "d", "e"], size=100)
    string_freeform_col = [uuid.uuid4().hex.upper()[0:10] for _ in range(100)]
    other_col = [datetime.datetime.now() for _ in range(100)]
    output_col = np.random.choice(2, 100)

    col_names = [
        "Continuous_Numeric",
        "Discrete_Numeric",
        "Discrete_String",
        "Freeform_String",
        "Other",
        "Output_Col",
    ]
    supported_col_names = [
        "Continuous_Numeric",
        "Discrete_Numeric",
        "Output_Col",
    ]

    # create dataframe
    df = pd.DataFrame(
        list(
            zip(
                cont_col,
                discrete_col,
                string_discrete_col,
                string_freeform_col,
                other_col,
                output_col,
            )
        ),
        columns=col_names,
    )

    # log to model version with new method
    model_version.log_training_data_profile(
        df.loc[:, df.columns != "Output_Col"],
        pd.DataFrame(df["Output_Col"]),
    )

    # get back attributes to validate
    attributes = model_version.get_attributes()
    key = _deployable_entity._FEATURE_DATA_ATTR_PREFIX + "{}"
    discrete_col_missing_summary = _utils.json_to_proto(
        model_version.get_attribute(key.format("2")),
        FeatureDataInModelVersion,  # missing values
    )
    discrete_col_distribution_summary = _utils.json_to_proto(
        model_version.get_attribute(key.format("3")),
        FeatureDataInModelVersion,  # distribution
    )

    # a missing-value summary and a distribution summary for each supported
    # column, plus an equal number of attributes for visualization
    assert len(attributes.keys()) == len(supported_col_names) * 2 * 2

    assert (discrete_col_distribution_summary.summary_type_name
            == "verta.discreteHistogram.v1")
    assert (discrete_col_distribution_summary.profiler_name
            == "BinaryHistogramProfiler")
    assert (len(
        json.loads(discrete_col_distribution_summary.content)
        ["discreteHistogram"]["buckets"]) <= 5)

    assert (discrete_col_missing_summary.summary_type_name
            == "verta.discreteHistogram.v1")
    assert discrete_col_missing_summary.profiler_name == "MissingValuesProfiler"
    assert (len(
        json.loads(discrete_col_missing_summary.content)
        ["discreteHistogram"]["buckets"]) == 2)

    # reference distribution attributes can be fetched back as histograms
    for col in supported_col_names:
        key = _deployable_entity._TRAINING_DATA_ATTR_PREFIX + col + "Distribution"
        histogram = model_version.get_attribute(key)
        assert isinstance(histogram, _verta_data_type._VertaDataType)
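# Hedged end-to-end sketch of the public API exercised by this test: create a
# model version and profile a training frame against it. `client` is an
# assumed verta.Client and the names are illustrative.
model_version = (
    client.get_or_create_registered_model("profiling-demo")
    .get_or_create_version("v1")
)
model_version.log_training_data_profile(
    df.loc[:, df.columns != "Output_Col"],  # input features
    pd.DataFrame(df["Output_Col"]),         # output column
)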