def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None):
        """Write Parquet file and send to S3."""
        if self._provider_type == Provider.PROVIDER_GCP:
            # We need to determine the parquet file path based on
            # the start of the invoice month and the usage start for GCP.
            s3_path = self._determin_s3_path_for_gcp(file_type, file_name)
        else:
            s3_path = self._determin_s3_path(file_type)
        data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False)
        try:
            with open(file_path, "rb") as fin:
                copy_data_to_s3_bucket(
                    self.tracing_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context
                )
                msg = f"{file_path} sent to S3."
                LOG.info(log_json(self.tracing_id, msg, self.error_context))
        except Exception as err:
            s3_key = f"{self.parquet_path_s3}/{file_path}"
            msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
            return False
        finally:
            self.files_to_remove.append(file_path)

        return True
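
# A minimal, self-contained sketch (illustration only, not the project's helper) of why the
# timestamp kwargs used above matter: pandas keeps timestamps at nanosecond precision, while
# the targeted Parquet layout stores millisecond precision, so pyarrow must coerce and be
# allowed to truncate without raising.
import pandas as pd

def write_frame_as_parquet(data_frame, file_path):
    """Write a DataFrame to Parquet, truncating timestamps to millisecond precision."""
    data_frame.to_parquet(
        file_path,
        allow_truncated_timestamps=True,  # do not error when sub-millisecond precision is dropped
        coerce_timestamps="ms",           # store timestamps at millisecond precision
        index=False,
    )

# Example usage with hypothetical data:
# frame = pd.DataFrame({"usage_start": [pd.Timestamp("2021-01-01 00:00:00.123456789")]})
# write_frame_as_parquet(frame, "/tmp/usage_0.parquet")
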
Example #2
    def _get_manifest(self, date_time):
        """
        Download and return the CUR manifest for the given date.

        Args:
            date_time (DateTime): The starting datetime object

        Returns:
            (Dict): A dict-like object serialized from JSON data.

        """
        manifest = "{}/{}-Manifest.json".format(
            self._get_report_path(date_time), self.report_name)
        msg = f"Will attempt to download manifest: {manifest}"
        LOG.info(log_json(self.request_id, msg, self.context))

        try:
            manifest_file, _, manifest_modified_timestamp, __ = self.download_file(
                manifest)
        except AWSReportDownloaderNoFileError as err:
            msg = f"Unable to get report manifest. Reason: {str(err)}"
            LOG.info(log_json(self.request_id, msg, self.context))
            return "", self.empty_manifest, None

        manifest_json = None
        with open(manifest_file, "r") as manifest_file_handle:
            manifest_json = json.load(manifest_file_handle)

        return manifest_file, manifest_json, manifest_modified_timestamp
    def update_summary_tables(self, start_date, end_date, tracing_id):
        """
        Update report summary tables.

        Args:
            start_date (str, datetime): When to start.
            end_date (str, datetime): When to end.
            tracing_id (str): The tracing_id.

        Returns:
            None

        """
        msg = f"Summary processing starting for source {self._provider_uuid}"
        LOG.info(log_json(self._tracing_id, msg))
        start_date, end_date = self._format_dates(start_date, end_date)
        LOG.info(log_json(tracing_id, f"Using start date: {start_date}"))
        LOG.info(log_json(tracing_id, f"Using end date: {end_date}"))

        start_date, end_date = self._updater.update_summary_tables(start_date, end_date)

        msg = f"Summary processing completed for source {self._provider_uuid}"
        LOG.info(log_json(self._tracing_id, msg))

        invalidate_view_cache_for_tenant_and_source_type(self._schema, self._provider.type)

        return start_date, end_date
Example #4
def remove_files_not_in_set_from_s3_bucket(request_id,
                                           s3_path,
                                           manifest_id,
                                           context={}):
    """
    Remove all files under the given prefix whose manifest id metadata does not match the given manifest_id.
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return []

    removed = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(
                settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest != manifest_id_str:
                    s3_resource.Object(settings.S3_BUCKET_NAME, key).delete()
                    removed.append(key)
            if removed:
                msg = f"Removed files from s3 bucket {settings.S3_BUCKET_NAME}: {','.join(removed)}."
                LOG.info(log_json(request_id, msg, context))
        except ClientError as err:
            msg = f"Unable to remove data in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return removed
Example #5
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
    def get_report_for(self, date_time):
        """
        Get OCP usage report files corresponding to a date.

        Args:
            date_time (DateTime): Start date of the usage report.

        Returns:
            ([]) List of file paths for a particular report.

        """
        dates = utils.month_date_range(date_time)
        msg = f"Looking for cluster {self.cluster_id} report for date {str(dates)}"
        LOG.debug(log_json(self.request_id, msg, self.context))
        directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"

        manifest = self._get_manifest(date_time)
        msg = f"manifest found: {str(manifest)}"
        LOG.info(log_json(self.request_id, msg, self.context))

        reports = []
        for file in manifest.get("files", []):
            report_full_path = os.path.join(directory, file)
            reports.append(report_full_path)

        return reports
Example #7
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        s3_filename = key.split("/")[-1]
        directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

        local_s3_filename = utils.get_local_file_name(key)
        msg = f"Local S3 filename: {local_s3_filename}"
        LOG.info(log_json(self.request_id, msg, self.context))
        full_file_path = f"{directory_path}/{local_s3_filename}"

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag = None
        try:
            s3_file = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"), Key=key)
            s3_etag = s3_file.get("ETag")
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                msg = "Unable to find {} in S3 Bucket: {}".format(
                    s3_filename, self.report.get("S3Bucket"))
                LOG.info(log_json(self.request_id, msg, self.context))
                raise AWSReportDownloaderNoFileError(msg)

            msg = f"Error downloading file: Error: {str(ex)}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderError(str(ex))

        if not self._check_size(key, check_inflate=True):
            raise AWSReportDownloaderError(
                f"Insufficient disk space to download file: {s3_file}")

        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            LOG.debug("Downloading key: %s to file path: %s", key,
                      full_file_path)
            self.s3_client.download_file(self.report.get("S3Bucket"), key,
                                         full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid,
                                          start_date, Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)
            utils.remove_files_not_in_set_from_s3_bucket(
                self.request_id, s3_csv_path, manifest_id)

        return full_file_path, s3_etag
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from GCP storage bucket.

        If we have a stored etag and it matches the current GCP blob, we can
        safely skip download since the blob/file content must not have changed.

        Args:
            key (str): name of the blob in the GCP storage bucket
            stored_etag (str): optional etag stored in our DB for comparison

        Returns:
            tuple(str, str) with the local filesystem path to file and GCP's etag.

        """
        blob = self._bucket_info.get_blob(key)
        if not blob:
            raise GCPReportDownloaderNoFileError(f'No blob found in bucket "{self.bucket_name}" with name "{key}"')

        if stored_etag is not None and stored_etag != blob.etag:
            # Should we abort download here? Just log a warning for now...
            msg = f"etag for {key} is {blob.etag}, but stored etag is {stored_etag}"
            LOG.warning(log_json(self.request_id, msg, self.context))

        directory_path = self._get_local_directory_path()
        full_local_path = self._get_local_file_path(directory_path, key)
        os.makedirs(directory_path, exist_ok=True)
        msg = f"Downloading {key} to {full_local_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        blob.download_to_filename(full_local_path)

        msg = f"Returning full_file_path: {full_local_path}, etag: {blob.etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_local_path, blob.etag
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        try:
            blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
            etag = blob.etag
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
Example #10
    def _write_parquet_to_file(self,
                               file_path,
                               file_name,
                               data_frame,
                               file_type=None):
        """Write Parquet file and send to S3."""
        s3_path = self._determin_s3_path(file_type)
        data_frame.to_parquet(file_path,
                              allow_truncated_timestamps=True,
                              coerce_timestamps="ms",
                              index=False)
        try:
            with open(file_path, "rb") as fin:
                copy_data_to_s3_bucket(self.request_id,
                                       s3_path,
                                       file_name,
                                       fin,
                                       manifest_id=self.manifest_id,
                                       context=self.error_context)
                msg = f"{file_path} sent to S3."
                LOG.info(log_json(self.request_id, msg, self.error_context))
        except Exception as err:
            s3_key = f"{self.parquet_path_s3}/{file_path}"
            msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warning(log_json(self.request_id, msg, self.error_context))
            return False
        finally:
            self.files_to_remove.append(file_path)

        return True
Example #11
    def process(self):
        """
        Process the current cost usage report.

        Args:
            None

        Returns:
            (List) List of filenames downloaded.

        """
        msg = f"Report processing started for {self.report_path}"
        LOG.info(log_json(self.tracing_id, msg))
        try:
            if self.trino_enabled:
                parquet_base_filename, daily_data_frames = self._processor.process()
                if self.ocp_on_cloud_processor:
                    self.ocp_on_cloud_processor.process(parquet_base_filename, daily_data_frames)
                return
            msg = f"Report processing completed for {self.report_path}"
            LOG.info(log_json(self.tracing_id, msg))
            if self._secondary_processor:
                try:
                    self._secondary_processor.process()
                except (ConnectTimeout, InvalidURL, ConnectionError):
                    pass
            return self._processor.process()
        except (InterfaceError, DjangoInterfaceError) as err:
            raise ReportProcessorDBError(str(err))
        except OperationalError as o_err:
            db_exc = get_extended_exception_by_type(o_err)
            LOG.error(log_json(self.tracing_id, str(db_exc), context=db_exc.as_dict()))
            raise db_exc
        except Exception as err:
            raise ReportProcessorError(str(err))
Example #12
def record_report_status(manifest_id, file_name, request_id, context={}):
    """
    Creates initial report status database entry for new report files.

    If a report has already been downloaded from the ingress service
    there is a chance that processing has already been completed.  The
    function returns the last completed date time to determine if the
    report processing should continue in extract_payload.

    Args:
        manifest_id (Integer): Manifest Identifier.
        file_name (String): Report file name
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        DateTime - Last completed date time for a given report file.

    """
    already_processed = False
    with ReportStatsDBAccessor(file_name, manifest_id) as db_accessor:
        already_processed = db_accessor.get_last_completed_datetime()
        if already_processed:
            msg = f"Report {file_name} has already been processed."
            LOG.info(log_json(request_id, msg, context))
        else:
            msg = f"Recording stats entry for {file_name}"
            LOG.info(log_json(request_id, msg, context))
    return already_processed
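
# A hedged usage sketch, not project code: the value returned by record_report_status above
# (the last completed datetime, or None) is what a caller such as extract_payload uses to
# decide whether the file still needs processing. The manifest id, file name, and context
# below are illustrative.
already_processed = record_report_status(
    manifest_id=42,
    file_name="cur-file-01.csv.gz",
    request_id="abc123",
    context={"account": "10001"},
)
if not already_processed:
    pass  # hand the file to the report processor here
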
    def _process_manifest_db_record(self, assembly_id, billing_start,
                                    num_of_files, manifest_modified_datetime,
                                    **kwargs):
        """Insert or update the manifest DB record."""
        msg = f"Inserting/updating manifest in database for assembly_id: {assembly_id}"
        LOG.info(log_json(self.tracing_id, msg))

        with ReportManifestDBAccessor() as manifest_accessor:
            manifest_entry = manifest_accessor.get_manifest(
                assembly_id, self._provider_uuid)

            if not manifest_entry:
                msg = f"No manifest entry found in database. Adding for bill period start: {billing_start}"
                LOG.info(log_json(self.tracing_id, msg, self.context))
                manifest_dict = {
                    "assembly_id": assembly_id,
                    "billing_period_start_datetime": billing_start,
                    "num_total_files": num_of_files,
                    "provider_uuid": self._provider_uuid,
                    "manifest_modified_datetime": manifest_modified_datetime,
                }
                manifest_dict.update(kwargs)
                try:
                    manifest_entry = manifest_accessor.add(**manifest_dict)
                except IntegrityError as error:
                    fk_violation = FKViolation(error)
                    if fk_violation:
                        LOG.warning(fk_violation)
                        raise ReportDownloaderError(
                            f"Method: _process_manifest_db_record :: {fk_violation}"
                        )
                    msg = (
                        f"Manifest entry uniqueness collision: Error {error}. "
                        "Manifest already added, getting manifest_entry_id.")
                    LOG.warning(log_json(self.tracing_id, msg, self.context))
                    with ReportManifestDBAccessor() as manifest_accessor:
                        manifest_entry = manifest_accessor.get_manifest(
                            assembly_id, self._provider_uuid)
            if not manifest_entry:
                msg = f"Manifest entry not found for given manifest {manifest_dict}."
                with ProviderDBAccessor(
                        self._provider_uuid) as provider_accessor:
                    provider = provider_accessor.get_provider()
                    if not provider:
                        msg = f"Provider entry not found for {self._provider_uuid}."
                        LOG.warning(
                            log_json(self.tracing_id, msg, self.context))
                        raise ReportDownloaderError(msg)
                LOG.warning(log_json(self.tracing_id, msg, self.context))
                raise IntegrityError(msg)
            else:
                if num_of_files != manifest_entry.num_total_files:
                    manifest_accessor.update_number_of_files_for_manifest(
                        manifest_entry)
                manifest_accessor.mark_manifest_as_updated(manifest_entry)
                manifest_id = manifest_entry.id

        return manifest_id
Example #14
def refresh_materialized_views(  # noqa: C901
    schema_name,
    provider_type,
    manifest_id=None,
    provider_uuid="",
    synchronous=False,
    queue_name=None,
    tracing_id=None,
):
    """Refresh the database's materialized views for reporting."""
    task_name = "masu.processor.tasks.refresh_materialized_views"
    cache_args = [schema_name, provider_type, provider_uuid]
    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(log_json(tracing_id, msg))
            refresh_materialized_views.s(
                schema_name,
                provider_type,
                manifest_id=manifest_id,
                provider_uuid=provider_uuid,
                synchronous=synchronous,
                queue_name=queue_name,
                tracing_id=tracing_id,
            ).apply_async(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
            return
        worker_cache.lock_single_task(task_name,
                                      cache_args,
                                      timeout=settings.WORKER_CACHE_TIMEOUT)
    materialized_views = ()
    try:
        with schema_context(schema_name):
            for view in materialized_views:
                table_name = view._meta.db_table
                with connection.cursor() as cursor:
                    cursor.execute(
                        f"REFRESH MATERIALIZED VIEW CONCURRENTLY {table_name}")
                    LOG.info(log_json(tracing_id, f"Refreshed {table_name}."))

        invalidate_view_cache_for_tenant_and_source_type(
            schema_name, provider_type)

        if provider_uuid:
            ProviderDBAccessor(provider_uuid).set_data_updated_timestamp()
        if manifest_id:
            # Processing for this manifest should be complete after this step
            with ReportManifestDBAccessor() as manifest_accessor:
                manifest = manifest_accessor.get_manifest_by_id(manifest_id)
                manifest_accessor.mark_manifest_as_completed(manifest)
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex

    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
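
# Background sketch, not code from this task: PostgreSQL only allows
# REFRESH MATERIALIZED VIEW CONCURRENTLY, as issued above, when the materialized view has at
# least one unique index covering all rows, so each refreshed view needs such an index created
# up front. The view and column names below are illustrative.
from django.db import connection

def ensure_unique_index_then_refresh(view_name, unique_column):
    """Create the unique index that CONCURRENTLY refresh requires, then refresh the view."""
    with connection.cursor() as cursor:
        cursor.execute(
            f"CREATE UNIQUE INDEX IF NOT EXISTS {view_name}_uix ON {view_name} ({unique_column})"
        )
        cursor.execute(f"REFRESH MATERIALIZED VIEW CONCURRENTLY {view_name}")
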
    def _check_size(self, s3key, check_inflate=False):
        """Check the size of an S3 file.

        Determine if there is enough local space to download and decompress the
        file.

        Args:
            s3key (str): the key name of the S3 object to check
            check_inflate (bool): if the file is compressed, evaluate the file's decompressed size.

        Returns:
            (bool): whether the file can be safely stored (and decompressed)

        """
        size_ok = False

        try:
            s3fileobj = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"), Key=s3key)
            size = int(s3fileobj.get("ContentLength", -1))
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "AccessDenied":
                msg = "Unable to access S3 Bucket {}: (AccessDenied)".format(
                    self.report.get("S3Bucket"))
                LOG.info(log_json(self.tracing_id, msg, self.context))
                raise AWSReportDownloaderNoFileError(msg)
            msg = f"Error downloading file: Error: {str(ex)}"
            LOG.error(log_json(self.tracing_id, msg, self.context))
            raise AWSReportDownloaderError(str(ex))

        if size < 0:
            raise AWSReportDownloaderError(
                f"Invalid size for S3 object: {s3fileobj}")

        free_space = shutil.disk_usage(self.download_path)[2]
        if size < free_space:
            size_ok = True

        LOG.debug("%s is %s bytes; Download path has %s free", s3key, size,
                  free_space)

        ext = os.path.splitext(s3key)[1]
        if ext == ".gz" and check_inflate and size_ok and size > 0:
            # isize block is the last 4 bytes of the file; see: RFC1952
            resp = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"),
                Key=s3key,
                Range=f"bytes={size - 4}-{size}")
            isize = struct.unpack("<I", resp["Body"].read(4))[0]
            if isize > free_space:
                size_ok = False

            LOG.debug("%s is %s bytes uncompressed; Download path has %s free",
                      s3key, isize, free_space)

        return size_ok
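
# A minimal local-file sketch (illustration only, not a helper from this code) of the
# RFC 1952 trick _check_size uses above: the last four bytes of a gzip member hold ISIZE,
# the uncompressed size modulo 2**32, stored as a little-endian unsigned 32-bit integer.
import os
import struct

def gzip_uncompressed_size(path):
    """Return the ISIZE field of a local .gz file (uncompressed size mod 2**32)."""
    with open(path, "rb") as gz_file:
        gz_file.seek(-4, os.SEEK_END)  # jump to the final four bytes
        (isize,) = struct.unpack("<I", gz_file.read(4))
    return isize
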
    def _remove_manifest_file(self, manifest_file):
        """Clean up the manifest file after extracting information."""
        try:
            os.remove(manifest_file)
            msg = f"Deleted manifest file at {manifest_file}"
            LOG.info(log_json(self.request_id, msg, self.context))
        except OSError:
            msg = f"Could not delete manifest file at {manifest_file}"
            LOG.info(log_json(self.request_id, msg, self.context))
        return None
    def convert_csv_to_parquet(self, csv_filename):  # noqa: C901
        """Convert CSV file to parquet and send to S3."""
        daily_data_frames = []
        converters = self._get_column_converters()
        csv_path, csv_name = os.path.split(csv_filename)
        unique_keys = set()
        parquet_file = None
        parquet_base_filename = csv_name.replace(self.file_extension, "")
        kwargs = {}
        if self.file_extension == CSV_GZIP_EXT:
            kwargs = {"compression": "gzip"}

        msg = f"Running convert_csv_to_parquet on file {csv_filename}."
        LOG.info(log_json(self.tracing_id, msg, self.error_context))

        try:
            col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
            csv_converters = {
                col_name: converters[col_name.lower()] for col_name in col_names if col_name.lower() in converters
            }
            csv_converters.update({col: str for col in col_names if col not in csv_converters})
            with pd.read_csv(
                csv_filename, converters=csv_converters, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, **kwargs
            ) as reader:
                for i, data_frame in enumerate(reader):
                    if data_frame.empty:
                        continue
                    parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                    parquet_file = f"{self.local_path}/{parquet_filename}"
                    if self.post_processor:
                        data_frame = self.post_processor(data_frame)
                        if isinstance(data_frame, tuple):
                            data_frame, data_frame_tag_keys = data_frame
                            LOG.info(f"Updating unique keys with {len(data_frame_tag_keys)} keys")
                            unique_keys.update(data_frame_tag_keys)
                            LOG.info(f"Total unique keys for file {len(unique_keys)}")
                    if self.daily_data_processor is not None:
                        daily_data_frames.append(self.daily_data_processor(data_frame))

                    success = self._write_parquet_to_file(parquet_file, parquet_filename, data_frame)
                    if not success:
                        return parquet_base_filename, daily_data_frames, False
            if self.create_table and not self.presto_table_exists.get(self.report_type):
                self.create_parquet_table(parquet_file)
            create_enabled_keys(self._schema_name, self.enabled_tags_model, unique_keys)
        except Exception as err:
            msg = (
                f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
            )
            LOG.warning(log_json(self.tracing_id, msg, self.error_context))
            return parquet_base_filename, daily_data_frames, False

        return parquet_base_filename, daily_data_frames, True
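
# A stripped-down sketch of the chunked CSV-to-Parquet conversion performed by
# convert_csv_to_parquet above, with the project-specific converters, post-processing, and
# S3 upload removed. The batch size and paths are illustrative; using the reader as a context
# manager requires pandas >= 1.2.
import pandas as pd

def csv_to_parquet_chunks(csv_path, output_dir, batch_size=200_000):
    """Convert a (possibly gzipped) CSV into one Parquet file per chunk."""
    parquet_files = []
    kwargs = {"compression": "gzip"} if csv_path.endswith(".csv.gz") else {}
    with pd.read_csv(csv_path, dtype=str, chunksize=batch_size, **kwargs) as reader:
        for i, chunk in enumerate(reader):
            if chunk.empty:
                continue
            parquet_path = f"{output_dir}/part_{i}.parquet"
            chunk.to_parquet(parquet_path, index=False)
            parquet_files.append(parquet_path)
    return parquet_files
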
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        file_creation_date = None
        try:
            blob = self._azure_client.get_cost_export_for_key(
                key, self.container_name)
            etag = blob.etag
            file_creation_date = blob.last_modified
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.tracing_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        blob = self._azure_client.download_cost_export(
            key, self.container_name, destination=full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE,
                                      self._provider_uuid, start_date,
                                      Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(self.tracing_id, s3_csv_path,
                                            full_file_path, local_filename,
                                            manifest_id, start_date,
                                            self.context)

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.tracing_id,
                                                   s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        file_creation_date = None
        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3
            s3_csv_path = get_path_prefix(self.account,
                                          Provider.PROVIDER_AZURE,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(self.request_id, s3_csv_path,
                                                full_file_path, local_filename,
                                                manifest_id, start_date,
                                                self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                       s3_csv_path,
                                                       manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
    def get_openshift_on_cloud_infra_map(self, start_date, end_date, tracing_id):
        """Get cloud infrastructure source and OpenShift source mapping."""
        infra_map = {}
        try:
            if self._provider.type in Provider.OPENSHIFT_ON_CLOUD_PROVIDER_LIST:
                msg = f"Getting OpenShift on Cloud infrastructure map for {self._provider_uuid}"
                LOG.info(log_json(self._tracing_id, msg))
                start_date, end_date = self._format_dates(start_date, end_date)
                LOG.info(log_json(tracing_id, f"Using start date: {start_date}"))
                LOG.info(log_json(tracing_id, f"Using end date: {end_date}"))
                infra_map = self._ocp_cloud_updater.get_infra_map(start_date, end_date)
        except Exception as ex:
            raise ReportSummaryUpdaterCloudError(str(ex))

        return infra_map
    def _remove_manifest_file(self, date_time):
        """Clean up the manifest file after extracting information."""
        dates = utils.month_date_range(date_time)
        directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"

        manifest_path = "{}/{}".format(directory, "manifest.json")
        try:
            os.remove(manifest_path)
            msg = f"Deleted manifest file at {directory}"
            LOG.debug(log_json(self.request_id, msg, self.context))
        except OSError:
            msg = f"Could not delete manifest file at {directory}"
            LOG.info(log_json(self.request_id, msg, self.context))

        return None
Example #22
    def download_report(self, date_time):
        """
        Download CUR for a given date.

        Args:
            date_time (DateTime): The starting datetime object

        Returns:
            ([{}]) List of dictionaries containing file path and compression.

        """
        msg = f"Attempting to get {self.provider_type,} manifest for {str(date_time)}..."
        LOG.info(log_json(self.request_id, msg, self.context))
        report_context = self._downloader.get_report_context_for_date(
            date_time)
        manifest_id = report_context.get("manifest_id")
        reports = report_context.get("files", [])
        cur_reports = []
        for report in reports:
            report_dictionary = {}
            local_file_name = self._downloader.get_local_file_for_report(
                report)

            if self.is_report_processed(local_file_name, manifest_id):
                msg = f"File has already been processed: {local_file_name}. Skipping..."
                LOG.info(log_json(self.request_id, msg, self.context))
                continue
            with ReportStatsDBAccessor(local_file_name,
                                       manifest_id) as stats_recorder:
                stored_etag = stats_recorder.get_etag()
                file_name, etag = self._downloader.download_file(
                    report,
                    stored_etag,
                    manifest_id=manifest_id,
                    start_date=date_time)
                stats_recorder.update(etag=etag)

            report_dictionary["file"] = file_name
            report_dictionary["compression"] = report_context.get(
                "compression")
            report_dictionary["start_date"] = date_time
            report_dictionary["assembly_id"] = report_context.get(
                "assembly_id")
            report_dictionary["manifest_id"] = manifest_id
            report_dictionary["provider_uuid"] = self.provider_uuid

            cur_reports.append(report_dictionary)
        return cur_reports
Example #23
def get_file_keys_from_s3_with_manifest_id(request_id,
                                           s3_path,
                                           manifest_id,
                                           context={}):
    """
    Get all files in a given prefix that match the given manifest_id.
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return []

    keys = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(
                settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest == manifest_id_str:
                    keys.append(key)
        except ClientError as err:
            msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return keys
Example #24
def copy_data_to_s3_bucket(request_id,
                           path,
                           filename,
                           data,
                           manifest_id=None,
                           context={}):
    """
    Copy data to an object in the S3 bucket.
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return None

    upload = None
    upload_key = f"{path}/{filename}"
    try:
        s3_resource = get_s3_resource()
        s3_obj = {"bucket_name": settings.S3_BUCKET_NAME, "key": upload_key}
        upload = s3_resource.Object(**s3_obj)
        put_value = {"Body": data}
        if manifest_id:
            put_value["Metadata"] = {"ManifestId": str(manifest_id)}
        upload.put(**put_value)
    except ClientError as err:
        msg = f"Unable to copy data to {upload_key} in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
        LOG.info(log_json(request_id, msg, context))
    return upload
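
# A plain-boto3 sketch (not one of this module's helpers) of the metadata round trip the S3
# helpers above rely on: objects are uploaded with a "ManifestId" user-metadata entry, and S3
# hands user metadata back with lowercased keys, which is why the lookup and cleanup helpers
# read metadata.get("manifestid"). The bucket and key names are illustrative.
import boto3

def tag_and_read_manifest_id(bucket_name, key, manifest_id):
    """Upload an object tagged with a manifest id, then read the tag back."""
    s3_resource = boto3.resource("s3")
    s3_object = s3_resource.Object(bucket_name, key)
    s3_object.put(Body=b"example-bytes", Metadata={"ManifestId": str(manifest_id)})
    s3_object.load()  # refresh attributes via a HEAD request
    return s3_object.metadata.get("manifestid")
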
Example #25
    def check_if_manifest_should_be_downloaded(self, assembly_id):
        """Check if we should download this manifest.

        We first check if we have a database record of this manifest.
        That would indicate that we have already downloaded and at least
        begun processing. We then check the last completed time for
        a file in this manifest. This second check is to cover the case
        when we did not complete processing and need to re-download and
        process the manifest.

        Returns True if the manifest should be downloaded and processed.
        """
        if self._cache_key and self.worker_cache.task_is_running(self._cache_key):
            msg = f"{self._cache_key} is currently running."
            LOG.info(log_json(self.request_id, msg, self.context))
            return False
        with ReportManifestDBAccessor() as manifest_accessor:
            manifest = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)

            if manifest:
                manifest_id = manifest.id
                # check if `last_completed_datetime` is null for any report in the manifest.
                # if nulls exist, report processing is not complete and reports should be downloaded.
                need_to_download = manifest_accessor.is_last_completed_datetime_null(manifest_id)
                if need_to_download:
                    self.worker_cache.add_task_to_cache(self._cache_key)
                return need_to_download

        # The manifest does not exist, this is the first time we are
        # downloading and processing it.
        self.worker_cache.add_task_to_cache(self._cache_key)
        return True
    def __init__(self, customer_name, data_source, **kwargs):
        """
        Constructor.

        Args:
            customer_name  (str): Name of the customer
            data_source    (dict): dict containing name of GCP storage bucket

        """
        super().__init__(**kwargs)

        self.customer_name = customer_name.replace(" ", "_")
        self.credentials = kwargs.get("credentials", {})
        self.data_source = data_source
        self._provider_uuid = kwargs.get("provider_uuid")
        self.gcp_big_query_columns = [
            "billing_account_id",
            "service.id",
            "service.description",
            "sku.id",
            "sku.description",
            "usage_start_time",
            "usage_end_time",
            "project.id",
            "project.name",
            "project.labels",
            "project.ancestry_numbers",
            "labels",
            "system_labels",
            "location.location",
            "location.country",
            "location.region",
            "location.zone",
            "export_time",
            "cost",
            "currency",
            "currency_conversion_rate",
            "usage.amount",
            "usage.unit",
            "usage.amount_in_pricing_units",
            "usage.pricing_unit",
            "credits",
            "invoice.month",
            "cost_type",
        ]
        self.table_name = ".".join([
            self.credentials.get("project_id"),
            self._get_dataset_name(),
            self.data_source.get("table_id")
        ])
        self.scan_start, self.scan_end = self._generate_default_scan_range()
        try:
            GCPProvider().cost_usage_source_is_reachable(
                self.credentials, self.data_source)
            self.etag = self._generate_etag()
        except ValidationError as ex:
            msg = f"GCP source ({self._provider_uuid}) for {customer_name} is not reachable. Error: {str(ex)}"
            LOG.warning(log_json(self.tracing_id, msg, self.context))
            raise GCPReportDownloaderError(str(ex))
        self.big_query_export_time = None
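
# A hedged sketch of how a column list like gcp_big_query_columns above could be turned into a
# BigQuery export query. The alias scheme (dots replaced with underscores), the _PARTITIONTIME
# filter, and the use of default credentials are assumptions for illustration, not this
# downloader's actual query builder.
from google.cloud import bigquery

def run_export_query(table_name, columns, scan_start, scan_end):
    """Run a parameterized query over the billing export table; scan_start/scan_end are dates."""
    select_list = ", ".join(f"{col} AS {col.replace('.', '_')}" for col in columns)
    sql = (
        f"SELECT {select_list} FROM `{table_name}` "
        "WHERE DATE(_PARTITIONTIME) BETWEEN @scan_start AND @scan_end"
    )
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("scan_start", "DATE", scan_start),
            bigquery.ScalarQueryParameter("scan_end", "DATE", scan_end),
        ]
    )
    client = bigquery.Client()  # assumes application-default credentials
    return list(client.query(sql, job_config=job_config).result())
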
    def _get_manifest(self, date_time):
        """Return the report manifest details found in the local report directory."""
        dates = utils.month_date_range(date_time)
        directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"
        msg = f"Looking for manifest at {directory}"
        LOG.info(log_json(self.request_id, msg, self.context))
        report_meta = utils.get_report_details(directory)
        return report_meta
    def __init__(self, customer_name, data_source, **kwargs):
        """
        Constructor.

        Args:
            customer_name  (str): Name of the customer
            data_source    (dict): dict containing name of GCP storage bucket

        """
        super().__init__(**kwargs)

        self.bucket_name = data_source.get("bucket")
        self.report_prefix = data_source.get("report_prefix", "")
        self.customer_name = customer_name.replace(" ", "_")
        self._provider_uuid = kwargs.get("provider_uuid")

        try:
            GCPProvider().cost_usage_source_is_reachable(None, data_source)
            self._storage_client = storage.Client()
            self._bucket_info = self._storage_client.lookup_bucket(
                self.bucket_name)
        except ValidationError as ex:
            msg = f"GCP bucket {self.bucket_name} for customer {customer_name} is not reachable. Error: {str(ex)}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise GCPReportDownloaderError(str(ex))
Example #29
def get_account(provider_uuid, request_id, context={}):
    """
    Retrieve a provider's account configuration needed for processing.

    Args:
        provider_uuid (String): Provider unique identifier.
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        (dict) - keys: value
                 authentication: String,
                 customer_name: String,
                 billing_source: String,
                 provider_type: String,
                 schema_name: String,
                 provider_uuid: String

    """
    all_accounts = []
    try:
        all_accounts = AccountsAccessor().get_accounts(provider_uuid)
    except AccountsAccessorError as error:
        msg = f"Unable to get accounts. Error: {str(error)}"
        LOG.warning(log_json(request_id, msg, context))
        return None

    return all_accounts.pop() if all_accounts else None
Example #30
def get_account_from_cluster_id(cluster_id, request_id, context={}):
    """
    Returns the provider details for a given OCP cluster id.

    Args:
        cluster_id (String): Cluster UUID.
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        (dict) - keys: value
                 authentication: String,
                 customer_name: String,
                 billing_source: String,
                 provider_type: String,
                 schema_name: String,
                 provider_uuid: String

    """
    account = None
    provider_uuid = utils.get_provider_uuid_from_cluster_id(cluster_id)
    if provider_uuid:
        msg = f"Found provider_uuid: {str(provider_uuid)} for cluster_id: {str(cluster_id)}"
        LOG.info(log_json(request_id, msg, context))
        if context:
            context["provider_uuid"] = provider_uuid
        account = get_account(provider_uuid, request_id, context)
    return account
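
# A hedged usage sketch, not project code: given a cluster id parsed from an OCP payload, the
# helpers above resolve the provider account that downstream processing needs. The cluster id,
# request id, and context values are illustrative.
account = get_account_from_cluster_id("example-cluster-0001", "abc123", context={"account": "10001"})
if account:
    schema_name = account.get("schema_name")  # routes processing to the tenant schema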