Exemple #1
0
    def emit_mce(self, mce: MetadataChangeEvent) -> None:
        """Emit a MetadataChangeEvent to DataHub GMS with an HTTP POST.

        The event's proposed snapshot is converted with ``_rest_li_ify``,
        wrapped under a ``"snapshot"`` key, serialized to JSON and posted
        through ``self._session`` to the URL returned by
        ``self._get_ingest_endpoint(mce)``.

        :param MetadataChangeEvent mce: the event whose ``proposedSnapshot`` is sent
        :raises OperationalError: if the request fails or the server answers
            with an HTTP error status. The second argument is the JSON error
            body of the response when it can be decoded, otherwise a
            ``{"message": ...}`` dict holding the text of the underlying error.
        """
        url = self._get_ingest_endpoint(mce)

        raw_mce_obj = mce.proposedSnapshot.to_obj()
        mce_obj = _rest_li_ify(raw_mce_obj)
        snapshot = {"snapshot": mce_obj}
        payload = json.dumps(snapshot)

        curl_command = _make_curl_command(self._session, "POST", url, payload)
        logger.debug(
            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
            curl_command,
        )
        try:
            response = self._session.post(url, data=payload)

            response.raise_for_status()
        except HTTPError as e:
            # An error response is not guaranteed to carry a JSON body. Fall
            # back to the HTTP error text instead of letting a decode error
            # mask the real failure. JSON decode errors are ValueErrors.
            try:
                info = response.json()
            except ValueError:
                info = {"message": str(e)}
            raise OperationalError(
                "Unable to emit metadata to DataHub GMS", info
            ) from e
        except RequestException as e:
            raise OperationalError(
                "Unable to emit metadata to DataHub GMS", {"message": str(e)}
            ) from e
Exemple #2
0
    def _emit_generic(self, url: str, payload: str) -> None:
        """POST an already-serialized payload to the given DataHub GMS URL.

        A curl command equivalent to the request is logged at debug level
        before the request is sent. The request uses the connect and read
        timeouts stored on the instance.

        :param str url: the endpoint to post to
        :param str payload: the request body, sent as-is
        :raises OperationalError: if the request fails or the server answers
            with an HTTP error status. The second argument is the decoded JSON
            body of the error response, or a ``{"message": ...}`` dict when
            the body is not JSON or the request itself failed.
        """
        logger.debug(
            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
            _make_curl_command(self._session, "POST", url, payload),
        )
        failure = "Unable to emit metadata to DataHub GMS"
        try:
            timeouts = (self._connect_timeout_sec, self._read_timeout_sec)
            http_response = self._session.post(url, data=payload, timeout=timeouts)
            http_response.raise_for_status()
        except HTTPError as http_err:
            try:
                details = http_response.json()
            except JSONDecodeError:
                # The error body is not JSON: report the HTTP error text.
                raise OperationalError(
                    failure, {"message": str(http_err)}
                ) from http_err
            else:
                raise OperationalError(failure, details) from http_err
        except RequestException as request_err:
            raise OperationalError(
                failure, {"message": str(request_err)}
            ) from request_err
Exemple #3
0
 def _get_generic(self, url: str) -> Dict:
     """GET the given URL through the session and return the decoded JSON body.

     :param str url: the URL to fetch
     :return: the JSON-decoded response body
     :raises OperationalError: if the server answers with an HTTP error
         status. The second argument is the decoded JSON error body, or a
         ``{"message": ...}`` dict when the error body is not JSON.
     """
     failure = "Unable to get metadata from DataHub"
     try:
         http_response = self._session.get(url)
         http_response.raise_for_status()
         return http_response.json()
     except HTTPError as http_err:
         try:
             details = http_response.json()
         except JSONDecodeError:
             # The error body is not JSON: report the HTTP error text.
             raise OperationalError(
                 failure, {"message": str(http_err)}
             ) from http_err
         else:
             raise OperationalError(failure, details) from http_err
Exemple #4
0
 def get_latest_timeseries_value(
     self,
     entity_urn: str,
     aspect_name: str,
     aspect_type: Type[Aspect],
     filter_criteria_map: Dict[str, str],
 ) -> Optional[Aspect]:
     """Query the ``getTimeseriesAspectValues`` action for the latest value.

     Every entry of ``filter_criteria_map`` becomes an ``EQUAL`` criterion,
     and all criteria are combined in a single ``and`` group.

     :param str entity_urn: urn of the entity being queried
     :param str aspect_name: name of the aspect to request
     :param Type[Aspect] aspect_type: class used to build the result via ``from_obj``
     :param Dict[str, str] filter_criteria_map: field name to required value
     :return: the aspect built from the response, or None when the response
         contains no values
     :raises OperationalError: if a value is returned without an aspect payload
     """
     criteria = []
     for field_name, required_value in filter_criteria_map.items():
         criteria.append(
             {"field": field_name, "value": required_value, "condition": "EQUAL"}
         )

     request_body = {
         "urn": entity_urn,
         "entity": self._guess_entity_type(entity_urn),
         "aspect": aspect_name,
         "latestValue": True,
         "filter": {"or": [{"and": criteria}]},
     }
     target = f"{self.config.server}/aspects?action=getTimeseriesAspectValues"
     reply: Dict = self._post_generic(target, request_body)

     matches = reply.get("value", {}).get("values")
     if not matches:
         return None

     # Exactly one value is expected in the reply.
     assert len(matches) == 1
     aspect_json = matches[0].get("aspect", {}).get("value")
     if not aspect_json:
         raise OperationalError(
             f"Failed to find {aspect_type} in response {aspect_json}"
         )
     return aspect_type.from_obj(json.loads(aspect_json), tuples=False)
Exemple #5
0
 def _post_generic(self, url: str, payload_dict: Dict) -> Dict:
     """POST a dict as JSON to the given URL and return the decoded JSON reply.

     The serialized payload is logged at debug level before it is sent.

     :param str url: the URL to post to
     :param Dict payload_dict: the request body, serialized with ``json.dumps``
     :return: the JSON-decoded response body
     :raises OperationalError: if the server answers with an HTTP error
         status. The second argument is the decoded JSON error body, or a
         ``{"message": ...}`` dict when the error body is not JSON.
     """
     body = json.dumps(payload_dict)
     logger.debug(body)
     try:
         reply: Response = self._session.post(url, body)
         reply.raise_for_status()
         return reply.json()
     except HTTPError as http_err:
         failure = "Unable to get metadata from DataHub"
         try:
             raise OperationalError(failure, reply.json()) from http_err
         except JSONDecodeError:
             # The error body is not JSON: report the HTTP error text.
             raise OperationalError(
                 failure, {"message": str(http_err)}
             ) from http_err
Exemple #6
0
    def _emit_generic(self, url: str, payload: str) -> None:
        """POST an already-serialized payload to the given DataHub GMS URL.

        A curl command equivalent to the request is logged at debug level
        before the request is sent through ``self._session``.

        :param str url: the endpoint to post to
        :param str payload: the request body, sent as-is
        :raises OperationalError: if the request fails or the server answers
            with an HTTP error status. The second argument is the JSON error
            body of the response when it can be decoded, otherwise a
            ``{"message": ...}`` dict holding the text of the underlying error.
        """
        curl_command = _make_curl_command(self._session, "POST", url, payload)
        logger.debug(
            "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
            curl_command,
        )
        try:
            response = self._session.post(url, data=payload)

            response.raise_for_status()
        except HTTPError as e:
            # An error response is not guaranteed to carry a JSON body. Fall
            # back to the HTTP error text instead of letting a decode error
            # mask the real failure. JSON decode errors are ValueErrors.
            try:
                info = response.json()
            except ValueError:
                info = {"message": str(e)}
            raise OperationalError("Unable to emit metadata to DataHub GMS",
                                   info) from e
        except RequestException as e:
            raise OperationalError("Unable to emit metadata to DataHub GMS",
                                   {"message": str(e)}) from e
Exemple #7
0
 def _write_done_callback(
     self,
     record_envelope: RecordEnvelope,
     write_callback: WriteCallback,
     future: concurrent.futures.Future,
 ) -> None:
     """Report the outcome of a write future and notify the write callback.

     * Cancelled future: a failure is reported and ``on_failure`` is called
       with an ``OperationalError``.
     * Finished without exception: the record is reported as written, the
       ``(start_time, end_time)`` pair returned by the future is reported
       as downstream latency, and ``on_success`` is called.
     * Finished with an ``OperationalError``: reported as a failure, or as
       a warning when ``self.treat_errors_as_warnings`` is set; in both
       cases ``on_failure`` is called with the error and its ``info``.
     * Finished with any other exception: reported as a failure and
       ``on_failure`` is called with the exception wrapped in ``Exception``.

     :param RecordEnvelope record_envelope: envelope of the record that was written
     :param WriteCallback write_callback: callback notified of success or failure
     :param concurrent.futures.Future future: the future tracking the write
     """
     if future.cancelled():
         self.report.report_failure({"error": "future was cancelled"})
         write_callback.on_failure(record_envelope,
                                   OperationalError("future was cancelled"),
                                   {})
     elif future.done():
         e = future.exception()
         # Compare against None explicitly: success means "no exception",
         # not "an exception object that happens to be falsy".
         if e is None:
             self.report.report_record_written(record_envelope)
             start_time, end_time = future.result()
             self.report.report_downstream_latency(start_time, end_time)
             write_callback.on_success(record_envelope, {})
         elif isinstance(e, OperationalError):
             # only OperationalErrors should be ignored
             if not self.treat_errors_as_warnings:
                 self.report.report_failure({
                     "error": e.message,
                     "info": e.info
                 })
             else:
                 # trim exception stacktraces when reporting warnings
                 if "stackTrace" in e.info:
                     try:
                         e.info["stackTrace"] = "\n".join(
                             e.info["stackTrace"].split("\n")[0:2])
                     except Exception:
                         # Trimming is best-effort; a failure here must not
                         # prevent the warning from being reported.
                         pass
                 record = record_envelope.record
                 if isinstance(record, MetadataChangeProposalWrapper):
                     # include information about the entity that failed
                     e.info["id"] = record.entityUrn
                 self.report.report_warning({
                     "warning": e.message,
                     "info": e.info
                 })
             write_callback.on_failure(record_envelope, e, e.info)
         else:
             self.report.report_failure({"e": e})
             write_callback.on_failure(record_envelope, Exception(e), {})
Exemple #8
0
    def emit_mce(self, mce: MetadataChangeEvent) -> None:
        """Emit a MetadataChangeEvent to DataHub GMS with an HTTP POST.

        The event's proposed snapshot is converted with ``_rest_li_ify``,
        wrapped under a ``"snapshot"`` key and posted as JSON with
        ``requests.post`` to the URL returned by
        ``self._get_ingest_endpoint(mce)``, with the
        ``X-RestLi-Protocol-Version: 2.0.0`` header set.

        :param MetadataChangeEvent mce: the event whose ``proposedSnapshot`` is sent
        :raises OperationalError: if the request fails or the server answers
            with an HTTP error status. The second argument is the JSON error
            body of the response when it can be decoded, otherwise a
            ``{"message": ...}`` dict holding the text of the underlying error.
        """
        url = self._get_ingest_endpoint(mce)
        headers = {"X-RestLi-Protocol-Version": "2.0.0"}

        raw_mce_obj = mce.proposedSnapshot.to_obj()
        mce_obj = _rest_li_ify(raw_mce_obj)
        snapshot = {"snapshot": mce_obj}

        try:
            response = requests.post(url, headers=headers, json=snapshot)

            response.raise_for_status()
        except HTTPError as e:
            # An error response is not guaranteed to carry a JSON body. Fall
            # back to the HTTP error text instead of letting a decode error
            # mask the real failure. JSON decode errors are ValueErrors.
            try:
                info = response.json()
            except ValueError:
                info = {"message": str(e)}
            raise OperationalError("Unable to emit metadata to DataHub GMS",
                                   info) from e
        except RequestException as e:
            raise OperationalError("Unable to emit metadata to DataHub GMS",
                                   {"message": str(e)}) from e
Exemple #9
0
    def get_aspect_v2(
        self,
        entity_urn: str,
        aspect_type: Type[Aspect],
        aspect: str,
        aspect_type_name: Optional[str] = None,
    ) -> Optional[Aspect]:
        """
        Get an aspect for an entity.

        :param str entity_urn: The urn of the entity
        :param Type[Aspect] aspect_type: The type class of the aspect being requested (e.g. datahub.metadata.schema_classes.DatasetProperties)
        :param str aspect: The name of the aspect being requested (e.g. schemaMetadata, datasetProperties, etc.)
        :param Optional[str] aspect_type_name: The fully qualified classname of the aspect being requested. Typically not needed and extracted automatically from the class directly. (e.g. com.linkedin.common.DatasetProperties)
        :return: the aspect built with ``aspect_type.from_obj`` if present, None if no aspect was found (HTTP status 404)
        :rtype: Optional[Aspect]
        :raises HTTPError: if the HTTP response has an error status other than 404 (raised by ``response.raise_for_status()``)
        :raises OperationalError: if the response does not contain an entry for the aspect type name
        """
        url: str = f"{self._gms_server}/aspects/{Urn.url_encode(entity_urn)}?aspect={aspect}&version=0"
        response = self._session.get(url)
        if response.status_code == 404:
            # not found
            return None
        response.raise_for_status()
        response_json = response.json()
        if not aspect_type_name:
            # Use getattr with a default so that a class without
            # RECORD_SCHEMA reaches the warning below instead of raising
            # AttributeError, and so that normal class attribute lookup
            # is used.
            record_schema: Optional[RecordSchema] = getattr(
                aspect_type, "RECORD_SCHEMA", None)
            if not record_schema:
                logger.warning(
                    f"Failed to infer type name of the aspect from the aspect type class {aspect_type}. Please provide an aspect_type_name. Continuing, but this will fail."
                )
            else:
                aspect_type_name = record_schema.fullname.replace(
                    ".pegasus2avro", "")
        aspect_json = response_json.get("aspect", {}).get(aspect_type_name)
        if aspect_json:
            return aspect_type.from_obj(aspect_json, tuples=True)
        else:
            raise OperationalError(
                f"Failed to find {aspect_type_name} in response {response_json}"
            )
Exemple #10
0
 def get_aspect(
     self,
     entity_urn: str,
     aspect: str,
     aspect_type_name: str,
     aspect_type: Type[Aspect],
 ) -> Optional[Aspect]:
     """Fetch version 0 of an aspect for an entity from the GMS server.

     :param str entity_urn: urn of the entity; quoted before being placed in the URL path
     :param str aspect: name of the aspect, sent as the ``aspect`` query parameter
     :param str aspect_type_name: key under ``"aspect"`` in the response that holds the payload
     :param Type[Aspect] aspect_type: class used to build the result via ``from_obj``
     :return: the aspect built from the response, or None on HTTP status 404
     :raises OperationalError: if the response has no entry for ``aspect_type_name``
     """
     server = self._gms_server
     quoted_urn = urllib.parse.quote(entity_urn)
     endpoint = f"{server}/aspects/{quoted_urn}?aspect={aspect}&version=0"

     reply = self._session.get(endpoint)
     if reply.status_code == 404:
         # The server has no such aspect for this entity.
         return None
     reply.raise_for_status()

     response_json = reply.json()
     aspect_json = response_json.get("aspect", {}).get(aspect_type_name)
     if not aspect_json:
         raise OperationalError(
             f"Failed to find {aspect_type_name} in response {response_json}"
         )
     return aspect_type.from_obj(aspect_json, tuples=True)