Example #1
    def test_remove_files_not_in_set_from_s3_bucket(self):
        """Test remove_files_not_in_set_from_s3_bucket."""
        removed = utils.remove_files_not_in_set_from_s3_bucket(
            "request_id", None, "manifest_id")
        self.assertEqual(removed, [])

        date_accessor = DateAccessor()
        start_date = date_accessor.today_with_timezone("utc").replace(day=1)
        s3_csv_path = get_path_prefix("account", Provider.PROVIDER_AWS,
                                      "provider_uuid", start_date,
                                      Config.CSV_DATA_TYPE)
        expected_key = "removed_key"
        mock_object = Mock(metadata={}, key=expected_key)
        mock_summary = Mock()
        mock_summary.Object.return_value = mock_object
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                mock_s3.return_value.Bucket.return_value.objects.filter.return_value = [
                    mock_summary
                ]
                removed = utils.remove_files_not_in_set_from_s3_bucket(
                    "request_id", s3_csv_path, "manifest_id")
                self.assertEqual(removed, [expected_key])

        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                mock_s3.side_effect = ClientError({}, "Error")
                removed = utils.remove_files_not_in_set_from_s3_bucket(
                    "request_id", s3_csv_path, "manifest_id")
                self.assertEqual(removed, [])
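For orientation, the sketch below shows roughly what a helper exercised by this test could look like. It is an illustration only, not the implementation in masu.util.aws.common: the bucket name and the "manifestid" metadata key are assumptions inferred from the mocks above, and the real helper is additionally gated on settings.ENABLE_S3_ARCHIVING, which the sketch omits.

import logging

import boto3
from botocore.exceptions import ClientError

LOG = logging.getLogger(__name__)

S3_BUCKET_NAME = "koku-report-bucket"  # placeholder; the real bucket name comes from configuration


def remove_files_not_in_set_from_s3_bucket(request_id, s3_path, manifest_id):
    """Delete S3 objects under a prefix whose manifest id differs (illustrative sketch)."""
    removed = []
    if not s3_path:
        return removed
    try:
        existing_objects = boto3.resource("s3").Bucket(S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
        for obj_summary in existing_objects:
            s3_obj = obj_summary.Object()
            # "manifestid" as a metadata key is inferred from the mocked metadata in the test above
            if s3_obj.metadata.get("manifestid") != str(manifest_id):
                s3_obj.delete()
                removed.append(s3_obj.key)
    except ClientError as err:
        LOG.warning("Unable to remove files for request %s: %s", request_id, err)
    return removed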
Example #2
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key to download.

        Returns:
            (tuple): The path of the saved file and the object's S3 ETag

        """
        s3_filename = key.split("/")[-1]
        directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

        local_s3_filename = utils.get_local_file_name(key)
        msg = f"Local S3 filename: {local_s3_filename}"
        LOG.info(log_json(self.request_id, msg, self.context))
        full_file_path = f"{directory_path}/{local_s3_filename}"

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag = None
        try:
            s3_file = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"), Key=key)
            s3_etag = s3_file.get("ETag")
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                msg = "Unable to find {} in S3 Bucket: {}".format(
                    s3_filename, self.report.get("S3Bucket"))
                LOG.info(log_json(self.request_id, msg, self.context))
                raise AWSReportDownloaderNoFileError(msg)

            msg = f"Error downloading file: Error: {str(ex)}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderError(str(ex))

        if not self._check_size(key, check_inflate=True):
            raise AWSReportDownloaderError(
                f"Insufficient disk space to download file: {s3_filename}")

        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            LOG.debug("Downloading key: %s to file path: %s", key,
                      full_file_path)
            self.s3_client.download_file(self.report.get("S3Bucket"), key,
                                         full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid,
                                          start_date, Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)
            utils.remove_files_not_in_set_from_s3_bucket(
                self.request_id, s3_csv_path, manifest_id)

        return full_file_path, s3_etag
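Side note on the ETag lookup above: get_object returns the object body along with the metadata, so when only the tag is needed, boto3's head_object is a lighter alternative. A small sketch (bucket and key are placeholders; note that head_object reports a missing key as a "404" error code rather than "NoSuchKey"):

import boto3
from botocore.exceptions import ClientError


def fetch_s3_etag(bucket, key):
    """Return an object's ETag without fetching its body, or None if the key does not exist (sketch)."""
    s3 = boto3.client("s3")
    try:
        return s3.head_object(Bucket=bucket, Key=key).get("ETag")
    except ClientError as err:
        if err.response["Error"]["Code"] == "404":
            return None
        raise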
Example #3
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key to download.

        Returns:
            (tuple): The path of the saved file, the ETag, the file creation date, and an empty list

        """
        local_s3_filename = utils.get_local_file_name(key)

        directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
        full_file_path = f"{directory_path}/{local_s3_filename}"

        if not os.path.isfile(key):
            log_msg = f"Unable to locate {key} in {self.bucket_path}"
            raise AWSReportDownloaderNoFileError(log_msg)

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag_hasher = hashlib.new("ripemd160")
        s3_etag_hasher.update(bytes(local_s3_filename, "utf-8"))
        s3_etag = s3_etag_hasher.hexdigest()

        file_creation_date = None
        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3

            s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AWS,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.tracing_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                utils.remove_files_not_in_set_from_s3_bucket(
                    self.tracing_id, s3_csv_path, manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)
        return full_file_path, s3_etag, file_creation_date, []
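Unlike the S3-backed downloader in the previous example, this -local variant has no real S3 ETag to compare against, so it derives a deterministic stand-in from the local file name. A stripped-down sketch of that idea; note that hashlib.new("ripemd160") only works when the underlying OpenSSL build still provides that digest:

import hashlib


def local_pseudo_etag(filename):
    """Deterministic ETag stand-in derived from a file name (illustration only)."""
    hasher = hashlib.new("ripemd160")  # requires an OpenSSL build that still ships ripemd160
    hasher.update(filename.encode("utf-8"))
    return hasher.hexdigest()


# The same file name always yields the same tag, so a file that was already downloaded
# (and is still present locally) is skipped by the etag comparison above.
assert local_pseudo_etag("report.csv.gz") == local_pseudo_etag("report.csv.gz")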
Example #4
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key to download.

        Returns:
            (tuple): The path of the saved file, the blob ETag, the file creation date, and an empty list

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        file_creation_date = None
        try:
            blob = self._azure_client.get_cost_export_for_key(
                key, self.container_name)
            etag = blob.etag
            file_creation_date = blob.last_modified
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.tracing_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        blob = self._azure_client.download_cost_export(
            key, self.container_name, destination=full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE,
                                      self._provider_uuid, start_date,
                                      Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(self.tracing_id, s3_csv_path,
                                            full_file_path, local_filename,
                                            manifest_id, start_date,
                                            self.context)

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.tracing_id,
                                                   s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
Example #5
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key to download.

        Returns:
            (tuple): The path of the saved file, the ETag, the file creation date, and an empty list

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        file_creation_date = None
        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3
            s3_csv_path = get_path_prefix(self.account,
                                          Provider.PROVIDER_AZURE,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(self.request_id, s3_csv_path,
                                                full_file_path, local_filename,
                                                manifest_id, start_date,
                                                self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                       s3_csv_path,
                                                       manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
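The get_s3_csv_cleared / mark_s3_csv_cleared pair used in the last few examples acts as a once-per-manifest guard, so stale CSVs are purged only when the first file of a manifest is processed rather than on every download. A minimal in-memory sketch of that idea; the real project keeps the flag on the manifest row via ReportManifestDBAccessor:

class ManifestCsvClearedFlag:
    """In-memory stand-in for the DB-backed 'csv cleared' flag (illustration only)."""

    def __init__(self):
        self._cleared = set()

    def get_s3_csv_cleared(self, manifest_id):
        return manifest_id in self._cleared

    def mark_s3_csv_cleared(self, manifest_id):
        self._cleared.add(manifest_id)


flags = ManifestCsvClearedFlag()
if not flags.get_s3_csv_cleared(42):
    # first file of manifest 42: stale CSVs under the prefix would be removed here
    flags.mark_s3_csv_cleared(42)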
Example #6
def convert_to_parquet(request_id,
                       account,
                       provider_uuid,
                       provider_type,
                       start_date,
                       manifest_id,
                       files=[],
                       context={}):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): Optional list of CSV file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date,
                                  Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date,
                                      Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(
            request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                               manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
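The retry policy described in the docstring (up to 10 retries with exponential backoff starting at a 10-second delay) is not visible in this snippet; with Celery it would typically be declared on the task decorator, roughly as sketched below. The app instance and the choice of ClientError as the retried exception are assumptions, not taken from the source.

from botocore.exceptions import ClientError
from celery import Celery

celery_app = Celery("masu")  # hypothetical app; the real project wires up its own Celery app and broker


@celery_app.task(
    autoretry_for=(ClientError,),  # assumed: retry on transient AWS S3 connectivity errors
    max_retries=10,                # "up to 10 retries"
    retry_backoff=10,              # exponential backoff starting with a 10-second delay
    retry_jitter=False,
)
def convert_to_parquet_task(request_id, account, provider_uuid, provider_type, start_date, manifest_id):
    """Sketch of how the documented retry behavior could be attached to the task."""
    return convert_to_parquet(request_id, account, provider_uuid, provider_type, start_date, manifest_id)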
Example #7
    def convert_to_parquet(self):  # noqa: C901
        """
        Convert archived CSV data from our S3 bucket for a given provider to Parquet.

        This function chiefly follows the download of a provider's data.

        This task is defined to attempt up to 10 retries using exponential backoff
        starting with a 10-second delay. This is intended to allow graceful handling
        of temporary AWS S3 connectivity issues because it is relatively important
        for us to convert the archived data.
        """
        parquet_base_filename = ""

        if self.csv_path_s3 is None or self.parquet_path_s3 is None or self.local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={self.csv_path_s3}, Parquet path={self.parquet_path_s3}, and local_path={self.local_path}."
            )
            LOG.error(log_json(self.tracing_id, msg, self.error_context))
            return "", pd.DataFrame()

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(self.manifest_id)

        # OCP data is daily chunked report files.
        # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
        if not manifest_accessor.get_s3_parquet_cleared(manifest) and self.provider_type not in (
            Provider.PROVIDER_OCP,
            Provider.PROVIDER_GCP,
            Provider.PROVIDER_GCP_LOCAL,
        ):
            remove_files_not_in_set_from_s3_bucket(
                self.tracing_id, self.parquet_path_s3, self.manifest_id, self.error_context
            )
            remove_files_not_in_set_from_s3_bucket(
                self.tracing_id, self.parquet_daily_path_s3, self.manifest_id, self.error_context
            )
            remove_files_not_in_set_from_s3_bucket(
                self.tracing_id, self.parquet_ocp_on_cloud_path_s3, self.manifest_id, self.error_context
            )
            manifest_accessor.mark_s3_parquet_cleared(manifest)

        failed_conversion = []
        daily_data_frames = []
        for csv_filename in self.file_list:
            if self.provider_type == Provider.PROVIDER_OCP and self.report_type is None:
                msg = f"Could not establish report type for {csv_filename}."
                LOG.warning(log_json(self.tracing_id, msg, self.error_context))
                failed_conversion.append(csv_filename)
                continue
            parquet_base_filename, daily_frame, success = self.convert_csv_to_parquet(csv_filename)
            daily_data_frames.extend(daily_frame)
            if self.provider_type not in (Provider.PROVIDER_AZURE,):
                self.create_daily_parquet(parquet_base_filename, daily_frame)
            if not success:
                failed_conversion.append(csv_filename)

        if failed_conversion:
            msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
            LOG.warn(log_json(self.tracing_id, msg, self.error_context))
        return parquet_base_filename, daily_data_frames
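For readers unfamiliar with the underlying step, the per-file convert_csv_to_parquet call referenced in both variants above boils down to reading a CSV and writing it back out as Parquet. A bare-bones sketch with pandas and pyarrow; file names are placeholders, and the project's version additionally applies column converters, post-processing, and S3 uploads:

import pandas as pd


def csv_to_parquet(csv_path, parquet_path, converters=None):
    """Minimal CSV-to-Parquet conversion (not the project's convert_csv_to_parquet)."""
    data_frame = pd.read_csv(csv_path, converters=converters or {})
    data_frame.to_parquet(parquet_path, engine="pyarrow", index=False)
    return parquet_path


# Example usage with placeholder file names:
# csv_to_parquet("aws_report.csv", "aws_report.parquet")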