Code example #1
    def process(self):
        """Convert to parquet."""
        msg = (
            f"Converting CSV files to Parquet.\n\tStart date: {str(self.start_date)}\n\tFile: {str(self.report_file)}"
        )
        LOG.info(msg)
        parquet_base_filename, daily_data_frames = self.convert_to_parquet()

        # Clean up the original downloaded file
        if (
            self.provider_type != Provider.PROVIDER_OCP
            and not enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name)
        ) or enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name):
            for f in self.file_list:
                if os.path.exists(f):
                    os.remove(f)

            for f in self.files_to_remove:
                if os.path.exists(f):
                    os.remove(f)

            if os.path.exists(self.report_file):
                os.remove(self.report_file)

        return parquet_base_filename, daily_data_frames
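The cleanup guard in example #1 is harder to read than it needs to be: by Boolean algebra, (A and not B) or B reduces to A or B, so the files are removed whenever the provider is not OCP or Trino processing is enabled. A minimal, logically equivalent sketch (the _cleanup_enabled helper name is hypothetical):

    def _cleanup_enabled(self):
        """Sketch of the equivalent cleanup condition; the helper name is hypothetical."""
        # (A and not B) or B  ==  A or B, where B is the Trino-processing flag.
        return self.provider_type != Provider.PROVIDER_OCP or enable_trino_processing(
            self.provider_uuid, self.provider_type, self.schema_name
        )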
Code example #2
    def _set_updater(self):
        """
        Create the report summary updater object.

        Object is specific to the report provider.

        Args:
            None

        Returns:
            (Object) : Provider-specific report summary updater

        """
        if self._provider.type in (Provider.PROVIDER_AWS,
                                   Provider.PROVIDER_AWS_LOCAL):
            report_summary_updater = (
                AWSReportParquetSummaryUpdater if enable_trino_processing(
                    self._provider_uuid, self._provider.type,
                    self._provider.customer.schema_name) else
                AWSReportSummaryUpdater)
        elif self._provider.type in (Provider.PROVIDER_AZURE,
                                     Provider.PROVIDER_AZURE_LOCAL):
            report_summary_updater = (
                AzureReportParquetSummaryUpdater if enable_trino_processing(
                    self._provider_uuid, self._provider.type,
                    self._provider.customer.schema_name) else
                AzureReportSummaryUpdater)
        elif self._provider.type in (Provider.PROVIDER_OCP, ):
            report_summary_updater = (
                OCPReportParquetSummaryUpdater if enable_trino_processing(
                    self._provider_uuid, self._provider.type,
                    self._provider.customer.schema_name) else
                OCPReportSummaryUpdater)
        elif self._provider.type in (Provider.PROVIDER_GCP,
                                     Provider.PROVIDER_GCP_LOCAL):
            report_summary_updater = (
                GCPReportParquetSummaryUpdater if enable_trino_processing(
                    self._provider_uuid, self._provider.type,
                    self._provider.customer.schema_name) else
                GCPReportSummaryUpdater)
        else:
            return (None, None)

        ocp_cloud_updater = (OCPCloudParquetReportSummaryUpdater
                             if enable_trino_processing(
                                 self._provider_uuid, self._provider.type,
                                 self._provider.customer.schema_name) else
                             OCPCloudReportSummaryUpdater)

        LOG.info(
            f"Set report_summary_updater = {report_summary_updater.__name__}")

        return (
            report_summary_updater(self._schema, self._provider,
                                   self._manifest),
            ocp_cloud_updater(self._schema, self._provider, self._manifest),
        )
Code example #3
def remove_files_not_in_set_from_s3_bucket(request_id, s3_path, manifest_id, context={}):
    """
    Remove all files under the given prefix that do not belong to the given manifest.
    """
    if not (
        settings.ENABLE_S3_ARCHIVING
        or enable_trino_processing(context.get("provider_uuid"), context.get("provider_type"), context.get("account"))
    ):
        return []

    removed = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest != manifest_id_str:
                    s3_resource.Object(settings.S3_BUCKET_NAME, key).delete()
                    removed.append(key)
            if removed:
                msg = f"Removed files from s3 bucket {settings.S3_BUCKET_NAME}: {','.join(removed)}."
                LOG.info(log_json(request_id, msg, context))
        except (EndpointConnectionError, ClientError) as err:
            msg = f"Unable to remove data in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return removed
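The context dict is what feeds enable_trino_processing here; if the provider_uuid, provider_type, or account keys are missing, the check receives None for those arguments. A hypothetical call sketch with placeholder values:

# Hypothetical invocation; the ids and prefix below are placeholders, not real values.
context = {
    "provider_uuid": "00000000-0000-0000-0000-000000000000",
    "provider_type": Provider.PROVIDER_AWS,
    "account": "10001",
}
removed = remove_files_not_in_set_from_s3_bucket(
    request_id="abc123",
    s3_path="data/csv/10001/AWS/source=00000000-0000-0000-0000-000000000000/",
    manifest_id=42,
    context=context,
)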
Code example #4
    def get_file_keys_from_s3_with_manifest_id(self,
                                               request_id,
                                               s3_path,
                                               manifest_id,
                                               context={}):
        """
        Get all files in a given prefix that match the given manifest_id.
        """
        if not enable_trino_processing(context.get("provider_uuid")):
            return []

        keys = []
        if s3_path:
            try:
                s3_resource = get_s3_resource()
                existing_objects = s3_resource.Bucket(
                    settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
                for obj_summary in existing_objects:
                    existing_object = obj_summary.Object()
                    metadata = existing_object.metadata
                    manifest = metadata.get("manifestid")
                    manifest_id_str = str(manifest_id)
                    key = existing_object.key
                    if manifest == manifest_id_str:
                        keys.append(key)
            except (EndpointConnectionError, ClientError) as err:
                msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
                LOG.info(log_json(request_id, msg, context))
        return keys
Code example #5
def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid):
        daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_OCP, provider_uuid, start_date, Config.CSV_DATA_TYPE
            )
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
Code example #6
File: common.py Project: project-koku/koku
def copy_data_to_s3_bucket(request_id,
                           path,
                           filename,
                           data,
                           manifest_id=None,
                           context={}):
    """
    Copy data to an S3 bucket file.
    """
    if not (settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            context.get("provider_uuid"), context.get("provider_type"),
            context.get("account"))):
        return None

    upload = None
    upload_key = f"{path}/{filename}"
    extra_args = {}
    if manifest_id:
        extra_args = {"Metadata": {"ManifestId": str(manifest_id)}}
    try:
        s3_resource = get_s3_resource()
        s3_obj = {"bucket_name": settings.S3_BUCKET_NAME, "key": upload_key}
        upload = s3_resource.Object(**s3_obj)
        upload.upload_fileobj(data, ExtraArgs=extra_args)
    except (EndpointConnectionError, ClientError) as err:
        msg = f"Unable to copy data to {upload_key} in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
        LOG.info(log_json(request_id, msg, context))
    return upload
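Because the data argument is handed straight to boto3's upload_fileobj, it must be a binary file-like object (an open file or an in-memory buffer). A minimal, hypothetical call sketch:

import io

# Hypothetical call; the path, filename, and manifest id are placeholders.
buffer = io.BytesIO(b"col_a,col_b\n1,2\n")
upload = copy_data_to_s3_bucket(
    request_id="abc123",
    path="data/csv/10001/AWS",
    filename="report.csv",
    data=buffer,
    manifest_id=42,
    context={"provider_uuid": "00000000-0000-0000-0000-000000000000", "provider_type": "AWS", "account": "10001"},
)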
Code example #7
File: report_processor.py Project: saroj3k/koku
    def _set_processor(self):
        """
        Create the report processor object.

        Processor is specific to the provider's cloud service.

        Args:
            None

        Returns:
            (Object) : Provider-specific report processor

        """
        if enable_trino_processing(self.provider_uuid):
            return ParquetReportProcessor(
                schema_name=self.schema_name,
                report_path=self.report_path,
                compression=self.compression,
                provider_uuid=self.provider_uuid,
                provider_type=self.provider_type,
                manifest_id=self.manifest_id,
                context=self.context,
            )
        if self.provider_type in (Provider.PROVIDER_AWS,
                                  Provider.PROVIDER_AWS_LOCAL):
            return AWSReportProcessor(
                schema_name=self.schema_name,
                report_path=self.report_path,
                compression=self.compression,
                provider_uuid=self.provider_uuid,
                manifest_id=self.manifest_id,
            )

        if self.provider_type in (Provider.PROVIDER_AZURE,
                                  Provider.PROVIDER_AZURE_LOCAL):
            return AzureReportProcessor(
                schema_name=self.schema_name,
                report_path=self.report_path,
                compression=self.compression,
                provider_uuid=self.provider_uuid,
                manifest_id=self.manifest_id,
            )

        if self.provider_type in (Provider.PROVIDER_OCP, ):
            return OCPReportProcessor(
                schema_name=self.schema_name,
                report_path=self.report_path,
                compression=self.compression,
                provider_uuid=self.provider_uuid,
            )
        if self.provider_type in (Provider.PROVIDER_GCP,
                                  Provider.PROVIDER_GCP_LOCAL):
            return GCPReportProcessor(
                schema_name=self.schema_name,
                report_path=self.report_path,
                compression=self.compression,
                provider_uuid=self.provider_uuid,
                manifest_id=self.manifest_id,
            )
        return None
Code example #8
def provider_post_delete_callback(*args, **kwargs):
    """
    Asynchronously delete this Provider's archived data.

    Note: Signal receivers must accept keyword arguments (**kwargs).
    """
    provider = kwargs["instance"]
    if provider.authentication_id:
        provider_auth_query = Provider.objects.exclude(
            uuid=provider.uuid).filter(
                authentication_id=provider.authentication_id)
        auth_count = provider_auth_query.count()
        if auth_count == 0:
            LOG.info("Deleting unreferenced ProviderAuthentication")
            auth_query = ProviderAuthentication.objects.filter(
                pk=provider.authentication_id)
            execute_delete_sql(auth_query)
    if provider.billing_source_id:
        provider_billing_query = Provider.objects.exclude(
            uuid=provider.uuid).filter(
                billing_source_id=provider.billing_source_id)
        billing_count = provider_billing_query.count()
        if billing_count == 0:
            LOG.info("Deleting unreferenced ProviderBillingSource")
            billing_source_query = ProviderBillingSource.objects.filter(
                pk=provider.billing_source_id)
            execute_delete_sql(billing_source_query)

    if not provider.customer:
        LOG.warning(
            "Provider %s has no Customer; we cannot call delete_archived_data.",
            provider.uuid)
        return

    customer = provider.customer
    customer.date_updated = DateHelper().now_utc
    customer.save()

    LOG.info("Deleting any related CostModelMap records")
    execute_delete_sql(
        CostModelMap.objects.filter(provider_uuid=provider.uuid))

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider.uuid, provider.type, provider.customer.schema_name):
        # Local import of task function to avoid potential import cycle.
        from masu.celery.tasks import delete_archived_data

        LOG.info("Deleting any archived data")
        delete_func = partial(delete_archived_data.delay,
                              provider.customer.schema_name, provider.type,
                              provider.uuid)
        transaction.on_commit(delete_func)

    LOG.info("Refreshing materialized views post-provider-delete uuid=%s.",
             provider.uuid)
    refresh_materialized_views(provider.customer.schema_name,
                               provider.type,
                               provider_uuid=provider.uuid,
                               synchronous=True)
Code example #9
File: tasks.py Project: saroj3k/koku
def delete_archived_data(schema_name, provider_type, provider_uuid):
    """
    Delete archived data from our S3 bucket for a given provider.

    This function chiefly follows the deletion of a provider.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to delete this archived data.

    Args:
        schema_name (str): Koku user account (schema) name.
        provider_type (str): Koku backend provider type identifier.
        provider_uuid (UUID): Koku backend provider UUID.

    """
    if not schema_name or not provider_type or not provider_uuid:
        # Sanity-check all of these inputs in case somehow any receives an
        # empty value such as None or '' because we need to minimize the risk
        # of deleting unrelated files from our S3 bucket.
        messages = []
        if not schema_name:
            message = "missing required argument: schema_name"
            LOG.error(message)
            messages.append(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
            messages.append(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
            messages.append(message)
        raise TypeError("delete_archived_data() %s", ", ".join(messages))

    if not (settings.ENABLE_S3_ARCHIVING
            or enable_trino_processing(provider_uuid)):
        LOG.info("Skipping delete_archived_data. Upload feature is disabled.")
        return
    else:
        message = f"Deleting S3 data for {provider_type} provider {provider_uuid} in account {schema_name}."
        LOG.info(message)

    # Strip the schema prefix (e.g. "acct") to get the account identifier.
    account = schema_name[4:]

    # Data in object storage does not use the "-local" dev designation.
    source_type = provider_type.replace("-local", "")
    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.CSV_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{source_type}/source={provider_uuid}/"
    LOG.info("Attempting to delete our archived data in S3 under %s", prefix)
    deleted_archived_with_prefix(settings.S3_BUCKET_NAME, prefix)

    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{source_type}/source={provider_uuid}/"
    LOG.info("Attempting to delete our archived data in S3 under %s", prefix)
    deleted_archived_with_prefix(settings.S3_BUCKET_NAME, prefix)
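To make the prefixes concrete: assuming a schema of "acct10001" and a dev provider type of "AWS-local", the code above derives account "10001" and source type "AWS". The sketch below shows the resulting key prefix, with the warehouse path and data-type segments written out as representative placeholders for the Config values:

# Illustrative only; "data" and "csv" stand in for Config.WAREHOUSE_PATH and Config.CSV_DATA_TYPE.
schema_name = "acct10001"
provider_type = "AWS-local"
provider_uuid = "00000000-0000-0000-0000-000000000000"

account = schema_name[4:]                           # "10001"
source_type = provider_type.replace("-local", "")   # "AWS"
prefix = f"data/csv/{account}/{source_type}/source={provider_uuid}/"
# -> "data/csv/10001/AWS/source=00000000-0000-0000-0000-000000000000/"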
Code example #10
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context={}
):
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
Code example #11
def copy_local_report_file_to_s3_bucket(
    request_id, s3_path, full_file_path, local_filename, manifest_id, start_date, context={}
):
    """
    Copy a local report file to the S3 bucket.
    """
    if s3_path and (
        settings.ENABLE_S3_ARCHIVING
        or enable_trino_processing(context.get("provider_uuid"), context.get("provider_type"), context.get("account"))
    ):
        LOG.info(f"copy_local_report_file_to_s3_bucket: {s3_path} {full_file_path}")
        with open(full_file_path, "rb") as fin:
            copy_data_to_s3_bucket(request_id, s3_path, local_filename, fin, manifest_id, context)
Code example #12
def provider_post_delete_callback(*args, **kwargs):
    """
    Asynchronously delete this Provider's archived data.

    Note: Signal receivers must accept keyword arguments (**kwargs).
    """
    provider = kwargs["instance"]
    if provider.authentication:
        auth_count = (
            Provider.objects.exclude(uuid=provider.uuid).filter(authentication=provider.authentication).count()
        )
        if auth_count == 0:
            provider.authentication.delete()
    if provider.billing_source:
        billing_count = (
            Provider.objects.exclude(uuid=provider.uuid).filter(billing_source=provider.billing_source).count()
        )
        if billing_count == 0:
            provider.billing_source.delete()

    provider_rate_objs = CostModelMap.objects.filter(provider_uuid=provider.uuid)
    if provider_rate_objs:
        provider_rate_objs.delete()

    if not provider.customer:
        LOG.warning("Provider %s has no Customer; we cannot call delete_archived_data.", provider.uuid)
        return

    customer = provider.customer
    customer.date_updated = DateHelper().now_utc
    customer.save()

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider.uuid):
        # Local import of task function to avoid potential import cycle.
        from masu.celery.tasks import delete_archived_data

        delete_func = partial(delete_archived_data.delay, provider.customer.schema_name, provider.type, provider.uuid)
        transaction.on_commit(delete_func)

    refresh_materialized_views(
        provider.customer.schema_name, provider.type, provider_uuid=provider.uuid, synchronous=True
    )
Code example #13
def create_daily_archives(tracing_id,
                          account,
                          provider_uuid,
                          filename,
                          filepath,
                          manifest_id,
                          start_date,
                          last_export_time,
                          context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time: The report's last export time; hashed into the daily file names when present
        context (Dict): Logging context dictionary
    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(
                f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            # daily_files = []
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {
                    "data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)],
                    "date": cur_day,
                }
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(account, Provider.PROVIDER_GCP,
                                          provider_uuid, start_of_invoice,
                                          Config.CSV_DATA_TYPE)
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(tracing_id, s3_csv_path,
                                                    day_filepath, day_file,
                                                    manifest_id, start_date,
                                                    context)
                daily_file_names.append(day_filepath)
        return daily_file_names
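The day-splitting above relies on partition_date being a string timestamp whose first ten characters are the date. A small standalone sketch of the same pandas technique on a toy frame (column names follow the GCP export used above):

import pandas as pd

# Toy frame mimicking the two columns the splitting logic depends on.
df = pd.DataFrame({
    "invoice.month": ["202101", "202101", "202101"],
    "partition_date": ["2021-01-01 00:00:00", "2021-01-01 12:00:00", "2021-01-02 00:00:00"],
})

days = sorted({ts[:10] for ts in df["partition_date"].unique()})
daily_frames = {day: df[df["partition_date"].str.contains(day)] for day in days}
# daily_frames["2021-01-01"] has two rows; daily_frames["2021-01-02"] has one.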
Code example #14
def update_summary_tables(  # noqa: C901
    schema_name,
    provider,
    provider_uuid,
    start_date,
    end_date=None,
    manifest_id=None,
    queue_name=None,
    synchronous=False,
    tracing_id=None,
):
    """Populate the summary tables for reporting.

    Args:
        schema_name (str) The DB schema name.
        provider    (str) The provider type.
        provider_uuid (str) The provider uuid.
        report_dict (dict) The report data dict from previous task.
        start_date  (str) The date to start populating the table.
        end_date    (str) The date to end on.

    Returns:
        None

    """
    worker_stats.REPORT_SUMMARY_ATTEMPTS_COUNTER.labels(
        provider_type=provider).inc()
    task_name = "masu.processor.tasks.update_summary_tables"
    cache_args = [schema_name, provider, provider_uuid]
    ocp_on_cloud_infra_map = {}

    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(log_json(tracing_id, msg))
            update_summary_tables.s(
                schema_name,
                provider,
                provider_uuid,
                start_date,
                end_date=end_date,
                manifest_id=manifest_id,
                queue_name=queue_name,
                tracing_id=tracing_id,
            ).apply_async(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE)
            return
        worker_cache.lock_single_task(task_name,
                                      cache_args,
                                      timeout=settings.WORKER_CACHE_TIMEOUT)

    stmt = (f"update_summary_tables called with args: "
            f" schema_name: {schema_name}, "
            f" provider: {provider}, "
            f" start_date: {start_date}, "
            f" end_date: {end_date}, "
            f" manifest_id: {manifest_id}, "
            f" tracing_id: {tracing_id}")
    LOG.info(log_json(tracing_id, stmt))

    try:
        updater = ReportSummaryUpdater(schema_name, provider_uuid, manifest_id,
                                       tracing_id)
        start_date, end_date = updater.update_daily_tables(
            start_date, end_date)
        updater.update_summary_tables(start_date, end_date, tracing_id)
        ocp_on_cloud_infra_map = updater.get_openshift_on_cloud_infra_map(
            start_date, end_date, tracing_id)
    except ReportSummaryUpdaterCloudError as ex:
        LOG.info(
            log_json(
                tracing_id,
                f"Failed to correlate OpenShift metrics for provider: {provider_uuid}. Error: {ex}"
            ))

    except ReportSummaryUpdaterProviderNotFoundError as pnf_ex:
        LOG.warning(
            log_json(
                tracing_id,
                (f"{pnf_ex} Possible source/provider delete during processing. "
                 + "Processing for this provier will halt."),
            ))
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        return
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex
    if not provider_uuid:
        refresh_materialized_views.s(
            schema_name,
            provider,
            manifest_id=manifest_id,
            queue_name=queue_name,
            tracing_id=tracing_id).apply_async(
                queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
        return

    if enable_trino_processing(provider_uuid, provider,
                               schema_name) and provider in (
                                   Provider.PROVIDER_AWS,
                                   Provider.PROVIDER_AWS_LOCAL,
                                   Provider.PROVIDER_AZURE,
                                   Provider.PROVIDER_AZURE_LOCAL,
                               ):
        cost_model = None
        stmt = (
            f"Markup for {provider} is calculated during summarization. No need to run update_cost_model_costs"
            f" schema_name: {schema_name}, "
            f" provider_uuid: {provider_uuid}")
        LOG.info(log_json(tracing_id, stmt))
    else:
        with CostModelDBAccessor(schema_name,
                                 provider_uuid) as cost_model_accessor:
            cost_model = cost_model_accessor.cost_model

    # Create queued tasks for each OpenShift on Cloud cluster
    signature_list = []
    for openshift_provider_uuid, infrastructure_tuple in ocp_on_cloud_infra_map.items():
        infra_provider_uuid = infrastructure_tuple[0]
        infra_provider_type = infrastructure_tuple[1]
        signature_list.append(
            update_openshift_on_cloud.s(
                schema_name,
                openshift_provider_uuid,
                infra_provider_uuid,
                infra_provider_type,
                str(start_date),
                str(end_date),
                manifest_id=manifest_id,
                queue_name=queue_name,
                synchronous=synchronous,
                tracing_id=tracing_id,
            ).set(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE))

    # Apply OCP on Cloud tasks
    if signature_list:
        if synchronous:
            group(signature_list).apply()
        else:
            group(signature_list).apply_async()

    if cost_model is not None:
        linked_tasks = update_cost_model_costs.s(
            schema_name,
            provider_uuid,
            start_date,
            end_date,
            tracing_id=tracing_id).set(
                queue=queue_name or UPDATE_COST_MODEL_COSTS_QUEUE
            ) | refresh_materialized_views.si(
                schema_name,
                provider,
                provider_uuid=provider_uuid,
                manifest_id=manifest_id,
                tracing_id=tracing_id).set(
                    queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
    else:
        stmt = f"update_cost_model_costs skipped. schema_name: {schema_name}, provider_uuid: {provider_uuid}"
        LOG.info(log_json(tracing_id, stmt))
        linked_tasks = refresh_materialized_views.s(
            schema_name,
            provider,
            provider_uuid=provider_uuid,
            manifest_id=manifest_id,
            tracing_id=tracing_id).set(
                queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)

    chain(linked_tasks).apply_async()

    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
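The | operator in example #14 composes Celery signatures into a chain, and .si() creates an immutable signature, so refresh_materialized_views does not receive the return value of update_cost_model_costs. A minimal sketch of the same pattern with hypothetical tasks, configured to run eagerly so it executes in-process:

from celery import Celery, chain

# Minimal sketch; the app and tasks are hypothetical and run eagerly for illustration.
app = Celery("sketch")
app.conf.task_always_eager = True

@app.task
def first_step(schema):
    return f"costs updated for {schema}"

@app.task
def second_step(schema):
    # .si() below makes this signature immutable, so first_step's return
    # value is NOT passed in; only the argument given here is used.
    return f"views refreshed for {schema}"

linked_tasks = first_step.s("acct10001") | second_step.si("acct10001")
chain(linked_tasks).apply_async()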
Code example #15
File: tasks.py Project: saroj3k/koku
def update_summary_tables(  # noqa: C901
    schema_name,
    provider,
    provider_uuid,
    start_date,
    end_date=None,
    manifest_id=None,
    queue_name=None,
    synchronous=False,
):
    """Populate the summary tables for reporting.

    Args:
        schema_name (str) The DB schema name.
        provider    (str) The provider type.
        provider_uuid (str) The provider uuid.
        report_dict (dict) The report data dict from previous task.
        start_date  (str) The date to start populating the table.
        end_date    (str) The date to end on.

    Returns:
        None

    """
    worker_stats.REPORT_SUMMARY_ATTEMPTS_COUNTER.labels(
        provider_type=provider).inc()
    task_name = "masu.processor.tasks.update_summary_tables"
    cache_args = [schema_name]

    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(msg)
            update_summary_tables.s(
                schema_name,
                provider,
                provider_uuid,
                start_date,
                end_date=end_date,
                manifest_id=manifest_id,
                queue_name=queue_name,
            ).apply_async(queue=queue_name or UPDATE_SUMMARY_TABLES_QUEUE)
            return
        worker_cache.lock_single_task(task_name, cache_args, timeout=3600)

    stmt = (f"update_summary_tables called with args:\n"
            f" schema_name: {schema_name},\n"
            f" provider: {provider},\n"
            f" start_date: {start_date},\n"
            f" end_date: {end_date},\n"
            f" manifest_id: {manifest_id}")
    LOG.info(stmt)

    try:
        updater = ReportSummaryUpdater(schema_name, provider_uuid, manifest_id)
        start_date, end_date = updater.update_daily_tables(
            start_date, end_date)
        updater.update_summary_tables(start_date, end_date)
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex

    if not provider_uuid:
        refresh_materialized_views.s(
            schema_name,
            provider,
            manifest_id=manifest_id,
            queue_name=queue_name).apply_async(
                queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
        return

    if enable_trino_processing(provider_uuid) and provider in (
            Provider.PROVIDER_AWS,
            Provider.PROVIDER_AWS_LOCAL,
            Provider.PROVIDER_AZURE,
            Provider.PROVIDER_AZURE_LOCAL,
    ):
        cost_model = None
        stmt = (
            f"\n Markup for {provider} is calculated during summarization. No need to run update_cost_model_costs\n"
            f" schema_name: {schema_name},\n"
            f" provider_uuid: {provider_uuid}")
        LOG.info(stmt)
    else:
        with CostModelDBAccessor(schema_name,
                                 provider_uuid) as cost_model_accessor:
            cost_model = cost_model_accessor.cost_model

    if cost_model is not None:
        linked_tasks = update_cost_model_costs.s(
            schema_name, provider_uuid, start_date, end_date).set(
                queue=queue_name or UPDATE_COST_MODEL_COSTS_QUEUE
            ) | refresh_materialized_views.si(
                schema_name,
                provider,
                provider_uuid=provider_uuid,
                manifest_id=manifest_id).set(
                    queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
    else:
        stmt = (f"\n update_cost_model_costs skipped.\n"
                f" schema_name: {schema_name},\n"
                f" provider_uuid: {provider_uuid}")
        LOG.info(stmt)
        linked_tasks = refresh_materialized_views.s(
            schema_name,
            provider,
            provider_uuid=provider_uuid,
            manifest_id=manifest_id).set(
                queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)

    dh = DateHelper(utc=True)
    prev_month_start_day = dh.last_month_start.replace(tzinfo=None).date()
    if isinstance(start_date, str):
        start_date = ciso8601.parse_datetime(start_date).date()
    if manifest_id and (start_date <= prev_month_start_day):
        # We want to make sure that the manifest_id is not None, because
        # we only want to trigger the line-item removal after the summarize_reports
        # task above.
        simulate = False
        line_items_only = True

        linked_tasks |= remove_expired_data.si(
            schema_name, provider, simulate, provider_uuid, line_items_only,
            queue_name).set(queue=queue_name or REMOVE_EXPIRED_DATA_QUEUE)

    chain(linked_tasks).apply_async()
    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
Code example #16
    def trino_enabled(self):
        """Return whether the source is enabled for Trino processing."""
        return enable_trino_processing(self.provider_uuid, self.provider_type, self.schema_name)
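Callers typically branch on this helper to pick between the Parquet/Trino path and the SQL-based path, much as example #7 does with a direct enable_trino_processing call. A short, hypothetical usage sketch:

# Hypothetical caller; whether trino_enabled is a plain method or a property
# depends on the surrounding class, so it is shown here as a plain call.
if processor.trino_enabled():
    processor.convert_to_parquet()  # Parquet/Trino path (see example #17)
else:
    LOG.info("Trino processing disabled; using SQL-based processing.")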
Code example #17
    def convert_to_parquet(self):
        """
        Convert archived CSV data from our S3 bucket for a given provider to Parquet.

        This function chiefly follows the download of a provider's data.

        This task is defined to attempt up to 10 retries using exponential backoff
        starting with a 10-second delay. This is intended to allow graceful handling
        of temporary AWS S3 connectivity issues because it is relatively important
        for us to convert the archived data.
        """
        parquet_base_filename = ""
        daily_data_frame = pd.DataFrame()
        if not enable_trino_processing(self.provider_uuid, self.provider_type,
                                       self.schema_name):
            msg = "Skipping convert_to_parquet. Parquet processing is disabled."
            LOG.info(log_json(self.request_id, msg, self.error_context))
            return "", pd.DataFrame()

        if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
            )
            LOG.error(log_json(self.request_id, msg, self.error_context))
            return "", pd.DataFrame()

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

        # OCP data arrives as daily chunked report files.
        # AWS and Azure reports are monthly, so previous reports must be removed to avoid duplicating data.
        if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
            Provider.PROVIDER_OCP,
            Provider.PROVIDER_GCP,
            Provider.PROVIDER_GCP_LOCAL,
        ):
            remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                   self.parquet_path_s3,
                                                   self.manifest_id,
                                                   self.error_context)
            remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                   self.parquet_daily_path_s3,
                                                   self.manifest_id,
                                                   self.error_context)
            manifest_accessor.mark_s3_parquet_cleared(manifest)

        failed_conversion = []
        for csv_filename in self.file_list:
            if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
                msg = f"Could not establish report type for {csv_filename}."
                LOG.warning(log_json(self.request_id, msg, self.error_context))
                failed_conversion.append(csv_filename)
                continue

            parquet_base_filename, daily_data_frame, success = self.convert_csv_to_parquet(
                csv_filename)
            if self.provider_type not in (Provider.PROVIDER_AZURE,
                                          Provider.PROVIDER_GCP):
                self.create_daily_parquet(parquet_base_filename,
                                          daily_data_frame)
            if not success:
                failed_conversion.append(csv_filename)

        if failed_conversion:
            msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
            LOG.warn(log_json(self.request_id, msg, self.error_context))
        return parquet_base_filename, daily_data_frame
Code example #18
    def convert_to_parquet(  # noqa: C901
            self,
            request_id,
            account,
            provider_uuid,
            provider_type,
            start_date,
            manifest_id,
            files=[],
            context={}):
        """
        Convert archived CSV data from our S3 bucket for a given provider to Parquet.

        This function chiefly follows the download of a provider's data.

        This task is defined to attempt up to 10 retries using exponential backoff
        starting with a 10-second delay. This is intended to allow graceful handling
        of temporary AWS S3 connectivity issues because it is relatively important
        for us to convert the archived data.

        Args:
            request_id (str): The associated request id (ingress or celery task id)
            account (str): The account string
            provider_uuid (UUID): The provider UUID
            provider_type (str): The provider type
            start_date (str): The report start time (YYYY-mm-dd)
            manifest_id (str): The identifier for the report manifest
            files (list): Optional list of CSV file names to convert
            context (dict): A context object for logging

        """
        if not context:
            context = {"account": account, "provider_uuid": provider_uuid}

        if not enable_trino_processing(provider_uuid):
            msg = "Skipping convert_to_parquet. Parquet processing is disabled."
            LOG.info(log_json(request_id, msg, context))
            return

        if not request_id or not account or not provider_uuid or not provider_type:
            if not request_id:
                message = "missing required argument: request_id"
                LOG.error(message)
            if not account:
                message = "missing required argument: account"
                LOG.error(message)
            if not provider_uuid:
                message = "missing required argument: provider_uuid"
                LOG.error(message)
            if not provider_type:
                message = "missing required argument: provider_type"
                LOG.error(message)
            return

        if not start_date:
            msg = "Parquet processing is enabled, but no start_date was given for processing."
            LOG.warning(log_json(request_id, msg, context))
            return

        try:
            cost_date = parser.parse(start_date)
        except ValueError:
            msg = "Parquet processing is enabled, but the start_date was not a valid date string ISO 8601 format."
            LOG.warn(log_json(request_id, msg, context))
            return

        s3_csv_path = get_path_prefix(account, provider_type, provider_uuid,
                                      cost_date, Config.CSV_DATA_TYPE)
        local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
        s3_parquet_path = get_path_prefix(account, provider_type,
                                          provider_uuid, cost_date,
                                          Config.PARQUET_DATA_TYPE)

        if not files:
            file_keys = self.get_file_keys_from_s3_with_manifest_id(
                request_id, s3_csv_path, manifest_id, context)
            files = [os.path.basename(file_key) for file_key in file_keys]
            if not files:
                msg = "Parquet processing is enabled, but no files to process."
                LOG.info(log_json(request_id, msg, context))
                return

        post_processor = None
        # OCP data arrives as daily chunked report files.
        # AWS and Azure reports are monthly, so previous reports must be removed to avoid duplicating data.
        if provider_type not in (Provider.PROVIDER_OCP, Provider.PROVIDER_GCP,
                                 Provider.PROVIDER_GCP_LOCAL):
            remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                                   manifest_id, context)

        if provider_type in [
                Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL
        ]:
            post_processor = aws_post_processor
        elif provider_type in [
                Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL
        ]:
            post_processor = azure_post_processor
        elif provider_type in [
                Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL
        ]:
            post_processor = gcp_post_processor

        failed_conversion = []
        for csv_filename in files:
            kwargs = {}
            parquet_path = s3_parquet_path
            parquet_report_type = None
            if provider_type == Provider.PROVIDER_OCP:
                for report_type in REPORT_TYPES.keys():
                    if report_type in csv_filename:
                        parquet_path = get_path_prefix(
                            account,
                            provider_type,
                            provider_uuid,
                            cost_date,
                            Config.PARQUET_DATA_TYPE,
                            report_type=report_type,
                        )
                        kwargs["report_type"] = report_type
                        parquet_report_type = report_type
                        break
                if parquet_report_type is None:
                    msg = f"Could not establish report type for {csv_filename}."
                    LOG.warning(log_json(request_id, msg, context))
                    continue

            converters = get_column_converters(provider_type, **kwargs)
            result = self.convert_csv_to_parquet(
                request_id,
                s3_csv_path,
                parquet_path,
                local_path,
                manifest_id,
                csv_filename,
                converters,
                post_processor,
                context,
                parquet_report_type,
            )
            if not result:
                failed_conversion.append(csv_filename)

        if failed_conversion:
            msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
            LOG.warn(log_json(request_id, msg, context))
            return