Exemple #1
0
 def get_usage_aspects_from_urn(
     self, entity_urn: str, start_timestamp: int, end_timestamp: int
 ) -> Optional[List[DatasetUsageStatisticsClass]]:
     """Fetch the ``datasetUsageStatistics`` timeseries aspects for a dataset.

     Posts a ``getTimeseriesAspectValues`` request to
     ``{self._gms_server}/aspects`` through ``self._session`` and decodes
     every returned aspect that carries a non-empty serialized ``value``.

     Args:
         entity_urn: Sent as the ``urn`` field of the request payload.
         start_timestamp: Sent as ``startTimeMillis``.
         end_timestamp: Sent as ``endTimeMillis``.

     Returns:
         The decoded usage aspects (possibly an empty list), or ``None``
         when the server answers with a non-200 status or when any error
         is raised while sending the request or decoding the response.
         This method is best-effort: errors are logged, never propagated.
     """
     payload = {
         "urn": entity_urn,
         "entity": "dataset",
         "aspect": "datasetUsageStatistics",
         "startTimeMillis": start_timestamp,
         "endTimeMillis": end_timestamp,
     }
     headers: Dict[str, Any] = {}
     url = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
     try:
         response = self._session.post(
             url, data=json.dumps(payload), headers=headers
         )
         if response.status_code != 200:
             logger.debug(
                 f"Non 200 status found while fetching usage aspects - {response.status_code}"
             )
             return None
         all_aspects = response.json().get("value", {}).get("values", [])
         usage_aspects: List[DatasetUsageStatisticsClass] = []
         for aspect in all_aspects:
             # Entries without an "aspect" envelope or without a serialized
             # "value" inside it are skipped.
             aspect_body = aspect.get("aspect")
             serialized = aspect_body.get("value") if aspect_body else None
             if serialized:
                 usage_aspects.append(
                     DatasetUsageStatisticsClass.from_obj(
                         json.loads(serialized), tuples=True
                     )
                 )
         return usage_aspects
     except Exception:
         # logger.exception attaches the active exception and traceback.
         # Passing the exception as a positional argument to a message with
         # no "%s" placeholder makes logging fail to format the record, so
         # the error detail would be lost.
         logger.exception("Error while getting usage aspects.")
     return None
    def make_usage_workunit(
        self,
        bucket_duration: BucketDuration,
        urn_builder: Callable[[ResourceType], str],
        top_n_queries: int,
        format_sql_queries: bool,
    ) -> MetadataWorkUnit:
        """Build the usage-statistics work unit for this bucket and resource.

        Assembles a ``DatasetUsageStatisticsClass`` aspect from the counters
        held on ``self`` (``userFreq``, ``queryFreq``, ``columnFreq``,
        ``queryCount``), wraps it in an UPSERT
        ``MetadataChangeProposalWrapper`` for the ``dataset`` entity and
        returns it as a ``MetadataWorkUnit``.

        Args:
            bucket_duration: Unit of the aspect's ``eventGranularity`` window
                (the multiple is always 1).
            urn_builder: Maps ``self.resource`` to the entity URN.
            top_n_queries: How many of the most frequent queries to include.
                A value of zero or less yields an empty ``topSqlQueries``
                list.
            format_sql_queries: When true, each query is passed through
                ``format_sql_query`` (upper-cased keywords, aligned
                re-indentation) before being trimmed.

        Returns:
            A work unit whose id is
            ``"<bucket_start_time ISO string>-<resource>"``.
        """
        if top_n_queries > 0:
            # total_budget_for_query_list is split evenly between the
            # requested queries and handed to trim_query for each of them
            # (presumably a maximum length per query; see trim_query).
            budget_per_query: int = int(
                self.total_budget_for_query_list / top_n_queries
            )
            top_sql_queries = [
                self.trim_query(
                    format_sql_query(query, keyword_case="upper", reindent_aligned=True)
                    if format_sql_queries
                    else query,
                    budget_per_query,
                )
                for query, _ in self.queryFreq.most_common(top_n_queries)
            ]
        else:
            # Guard: dividing the budget by zero would raise
            # ZeroDivisionError when top queries are disabled.
            top_sql_queries = []

        usageStats = DatasetUsageStatisticsClass(
            timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
            eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
            uniqueUserCount=len(self.userFreq),
            totalSqlQueries=self.queryCount,
            topSqlQueries=top_sql_queries,
            userCounts=[
                DatasetUserUsageCountsClass(
                    # The user URN is built from the part of the email
                    # before the "@".
                    user=builder.make_user_urn(user_email.split("@")[0]),
                    count=count,
                    userEmail=user_email,
                )
                for user_email, count in self.userFreq.most_common()
            ],
            fieldCounts=[
                DatasetFieldUsageCountsClass(
                    fieldPath=column,
                    count=count,
                )
                for column, count in self.columnFreq.most_common()
            ],
        )

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="datasetUsageStatistics",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=urn_builder(self.resource),
            aspect=usageStats,
        )

        return MetadataWorkUnit(
            id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=mcp
        )