Example #1
def convert_to_parquet(request_id,
                       account,
                       provider_uuid,
                       provider_type,
                       start_date,
                       manifest_id,
                       files=None,
                       context=None):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly runs after a provider's data has been downloaded.

    This task retries up to 10 times using exponential backoff, starting with a
    10-second delay. This allows graceful handling of temporary AWS S3
    connectivity issues, since converting the archived data is relatively
    important (a configuration sketch follows this example).

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start date (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): The list of CSV file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date,
                                  Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date,
                                      Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(
            request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is delivered as daily chunked report files.
    # AWS and Azure reports are monthly, so previously converted Parquet files
    # are removed to avoid duplicating data.
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                               manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
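
The docstring above describes a retry policy of up to 10 attempts with exponential backoff starting at a 10-second delay, but the task decorator is not part of this example. Below is a minimal, hypothetical sketch of how such a policy could be declared with Celery's built-in autoretry options; the celery_app instance and the choice of ClientError as the retried exception are assumptions, not taken from the example.

from botocore.exceptions import ClientError
from celery import Celery

celery_app = Celery("masu")  # assumed application instance, for illustration only


@celery_app.task(
    bind=True,
    autoretry_for=(ClientError,),  # assumed transient S3 error to retry on
    max_retries=10,                # up to 10 retries, per the docstring
    retry_backoff=10,              # exponential backoff starting at 10 seconds
)
def convert_to_parquet_task(self, *args, **kwargs):
    """Hypothetical wrapper that delegates to convert_to_parquet above."""
    return convert_to_parquet(*args, **kwargs)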
Example #2
    def test_convert_csv_to_parquet(self):
        """Test convert_csv_to_parquet."""
        result = utils.convert_csv_to_parquet("request_id", None,
                                              "s3_parquet_path", "local_path",
                                              "manifest_id", "csv_filename")
        self.assertFalse(result)

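        # All arguments are plain strings, but nothing is patched; the
        # conversion is still expected to fail.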
        result = utils.convert_csv_to_parquet("request_id", "s3_csv_path",
                                              "s3_parquet_path", "local_path",
                                              "manifest_id", "csv_filename")
        self.assertFalse(result)

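        # Simulate an S3 ClientError while getting the S3 resource.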
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        mock_s3.side_effect = ClientError({}, "Error")
                        result = utils.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv",
                        )
                        self.assertFalse(result)

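        # A gzipped CSV filename; the S3 and filesystem calls are patched, but
        # pandas is not, and the conversion is expected to fail.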
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        result = utils.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv.gz",
                        )
                        self.assertFalse(result)

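        # Simulate a ValueError raised from open() during the conversion.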
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        with patch("masu.util.aws.common.pd"):
                            with patch(
                                    "masu.util.aws.common.open") as mock_open:
                                mock_open.side_effect = ValueError()
                                result = utils.convert_csv_to_parquet(
                                    "request_id",
                                    "s3_csv_path",
                                    "s3_parquet_path",
                                    "local_path",
                                    "manifest_id",
                                    "csv_filename.csv.gz",
                                )
                                self.assertFalse(result)

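        # With the file, pandas, and S3 interactions all patched, the
        # conversion succeeds.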
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.Path"):
                    with patch("masu.util.aws.common.shutil.rmtree"):
                        with patch("masu.util.aws.common.pd"):
                            with patch("masu.util.aws.common.open"):
                                with patch("masu.util.aws.common.BytesIO"):
                                    with patch(
                                            "masu.util.aws.common.copy_data_to_s3_bucket"
                                    ):
                                        result = utils.convert_csv_to_parquet(
                                            "request_id",
                                            "s3_csv_path",
                                            "s3_parquet_path",
                                            "local_path",
                                            "manifest_id",
                                            "csv_filename.csv.gz",
                                        )
                                        self.assertTrue(result)
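
The nested "with patch(...)" blocks above grow one level per mock. The following is a minimal, hypothetical sketch of the final success-path scenario flattened with contextlib.ExitStack; the patch targets are copied from the test above, while the method name and the targets tuple are illustrative only.

    # Assumed module-level imports: ExitStack from contextlib and patch from unittest.mock.
    def test_convert_csv_to_parquet_success(self):
        """Success path, with the patches entered through a single ExitStack."""
        targets = (
            "masu.util.aws.common.get_s3_resource",
            "masu.util.aws.common.Path",
            "masu.util.aws.common.shutil.rmtree",
            "masu.util.aws.common.pd",
            "masu.util.aws.common.open",
            "masu.util.aws.common.BytesIO",
            "masu.util.aws.common.copy_data_to_s3_bucket",
        )
        with ExitStack() as stack:
            stack.enter_context(patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True))
            for target in targets:
                stack.enter_context(patch(target))
            result = utils.convert_csv_to_parquet(
                "request_id",
                "s3_csv_path",
                "s3_parquet_path",
                "local_path",
                "manifest_id",
                "csv_filename.csv.gz",
            )
            self.assertTrue(result)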