def _set_report_processor(self, parquet_file, daily=False):
        """Return the correct ReportParquetProcessor."""
        s3_hive_table_path = get_hive_table_path(
            self.account, self.provider_type, report_type=self.report_type, daily=daily
        )
        processor = None
        if self.provider_type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
            processor = AWSReportParquetProcessor(
                self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
            )
        elif self.provider_type in (Provider.PROVIDER_OCP,):
            processor = OCPReportParquetProcessor(
                self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file, self.report_type
            )
        elif self.provider_type in (Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL):
            processor = AzureReportParquetProcessor(
                self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
            )
        elif self.provider_type in (Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
            processor = GCPReportParquetProcessor(
                self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
            )
        if processor is None:
            msg = f"There is no ReportParquetProcessor for provider type {self.provider_type}"
            raise ParquetReportProcessorError(msg)

        return processor
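
The if/elif chain above can also be written as a lookup table. The sketch below is purely illustrative and is not the project's implementation; it reuses only names already visible in the snippet, and OCP would still need special handling because its constructor also receives self.report_type.

# Illustrative alternative (not the project's code): map provider types to
# processor classes, then instantiate the selected class. OCP is special-cased
# elsewhere because its constructor also takes report_type.
PROCESSOR_BY_PROVIDER = {
    Provider.PROVIDER_AWS: AWSReportParquetProcessor,
    Provider.PROVIDER_AWS_LOCAL: AWSReportParquetProcessor,
    Provider.PROVIDER_OCP: OCPReportParquetProcessor,
    Provider.PROVIDER_AZURE: AzureReportParquetProcessor,
    Provider.PROVIDER_AZURE_LOCAL: AzureReportParquetProcessor,
    Provider.PROVIDER_GCP: GCPReportParquetProcessor,
    Provider.PROVIDER_GCP_LOCAL: GCPReportParquetProcessor,
}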
Example #2
    def test_get_hive_table_path(self):
        """Test that we resolve the path for a Hive table."""
        account = "10001"
        provider_type = Provider.PROVIDER_AWS

        expected_path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
        expected_path = f"{expected_path_prefix}/{account}/{provider_type}"

        path = common_utils.get_hive_table_path(account, provider_type)
        self.assertEqual(path, expected_path)

        # Test with report_type
        report_type = "pod_report"
        expected_path = f"{expected_path_prefix}/{account}/{provider_type}/{report_type}"
        path = common_utils.get_hive_table_path(account, provider_type, report_type=report_type)
        self.assertEqual(path, expected_path)
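
For reference, the path layout that get_hive_table_path appears to produce, reconstructed only from the assertions above; this is a hypothetical sketch, not the common_utils implementation, and the daily handling is an assumption taken from the daily keyword seen earlier.

# Hypothetical reconstruction inferred from the test expectations above.
def get_hive_table_path(account, provider_type, report_type=None, daily=False):
    path = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}/{account}/{provider_type}"
    if report_type:
        path = f"{path}/{report_type}"
    if daily:
        # Assumption: a daily sub-directory; the test above does not cover this.
        path = f"{path}/daily"
    return path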
    def convert_csv_to_parquet(  # noqa: C901
        self,
        request_id,
        s3_csv_path,
        s3_parquet_path,
        local_path,
        manifest_id,
        csv_filename,
        converters=None,
        post_processor=None,
        context=None,
        report_type=None,
    ):
        """
        Convert CSV files to parquet on S3.
        """
        if s3_csv_path is None or s3_parquet_path is None or local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
            )
            LOG.error(log_json(request_id, msg, context))
            return False

        msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
        LOG.info(log_json(request_id, msg, context))

        kwargs = {}
        parquet_file = None
        csv_file = f"{s3_csv_path}/{csv_filename}"
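        # Derive the parquet file name from the CSV extension; gzipped CSVs
        # need a compression hint for pandas.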
        if csv_filename.lower().endswith(CSV_EXT):
            ext = -len(CSV_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
        elif csv_filename.lower().endswith(CSV_GZIP_EXT):
            ext = -len(CSV_GZIP_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
            kwargs = {"compression": "gzip"}
        else:
            msg = f"File {csv_filename} is not valid CSV. Conversion to parquet skipped."
            LOG.warn(log_json(request_id, msg, context))
            return False

        Path(local_path).mkdir(parents=True, exist_ok=True)
        tmpfile = f"{local_path}/{csv_filename}"
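        # Download the CSV object from S3 to the local scratch directory.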
        try:
            s3_resource = get_s3_resource()
            csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME,
                                         key=csv_file)
            csv_obj.download_file(tmpfile)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not obtained for parquet conversion. Reason: {str(err)}"
            LOG.warn(log_json(request_id, msg, context))
            return False

        output_file = f"{local_path}/{parquet_file}"
        try:
            # Force every column to str unless a converter was supplied, to
            # avoid pandas mixed-type inference on large billing files.
            col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
            converters.update(
                {col: str
                 for col in col_names if col not in converters})
            data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
            if post_processor:
                data_frame = post_processor(data_frame)
            data_frame.to_parquet(output_file,
                                  allow_truncated_timestamps=True,
                                  coerce_timestamps="ms")
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
            LOG.warning(log_json(request_id, msg, context))
            return False

        try:
            with open(output_file, "rb") as fin:
                data = BytesIO(fin.read())
                copy_data_to_s3_bucket(request_id,
                                       s3_parquet_path,
                                       parquet_file,
                                       data,
                                       manifest_id=manifest_id,
                                       context=context)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            s3_key = f"{s3_parquet_path}/{parquet_file}"
            msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warning(log_json(request_id, msg, context))
            return False

        s3_hive_table_path = get_hive_table_path(context.get("account"),
                                                 self._provider_type,
                                                 report_type=report_type)

        if not self.presto_table_exists.get(report_type):
            self.create_parquet_table(
                context.get("account"),
                context.get("provider_uuid"),
                manifest_id,
                s3_hive_table_path,
                output_file,
                report_type,
            )

        shutil.rmtree(local_path, ignore_errors=True)
        return True
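
Stripped of the S3 download/upload and the Hive table bookkeeping, the conversion core above is plain pandas plus pyarrow. A self-contained sketch, with file names purely illustrative:

import pandas as pd

def csv_to_parquet(csv_path, parquet_path, compression=None):
    """Minimal CSV -> parquet conversion mirroring the pandas calls above."""
    kwargs = {"compression": compression} if compression else {}
    # Read only the header, then force every column to str to avoid
    # mixed-type inference, matching the converters behaviour above.
    columns = pd.read_csv(csv_path, nrows=0, **kwargs).columns
    frame = pd.read_csv(csv_path, converters={col: str for col in columns}, **kwargs)
    frame.to_parquet(parquet_path, allow_truncated_timestamps=True, coerce_timestamps="ms")

# e.g. csv_to_parquet("report.csv.gz", "report.parquet", compression="gzip")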
Example #4
    def convert_csv_to_parquet(  # noqa: C901
        self,
        request_id,
        s3_csv_path,
        s3_parquet_path,
        local_path,
        manifest_id,
        csv_filename,
        converters=None,
        post_processor=None,
        context=None,
        report_type=None,
    ):
        """
        Convert CSV files to parquet on S3.
        """
        csv_name = os.path.basename(csv_filename)
        if s3_csv_path is None or s3_parquet_path is None or local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
            )
            LOG.error(log_json(request_id, msg, context))
            return False

        msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
        LOG.info(log_json(request_id, msg, context))

        kwargs = {}
        parquet_file = None
        if csv_name.lower().endswith(CSV_EXT):
            ext = -len(CSV_EXT)
            parquet_base_filename = f"{csv_name[:ext]}"
        elif csv_name.lower().endswith(CSV_GZIP_EXT):
            ext = -len(CSV_GZIP_EXT)
            parquet_base_filename = f"{csv_name[:ext]}"
            kwargs = {"compression": "gzip"}
        else:
            msg = f"File {csv_name} is not valid CSV. Conversion to parquet skipped."
            LOG.warn(log_json(request_id, msg, context))
            return False

        Path(local_path).mkdir(parents=True, exist_ok=True)

        try:
            # Force every column to str unless a converter was supplied, then
            # read the CSV in batches so large files are processed in
            # memory-bounded chunks.
            col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
            converters.update(
                {col: str
                 for col in col_names if col not in converters})
            with pd.read_csv(csv_filename,
                             converters=converters,
                             chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE,
                             **kwargs) as reader:
                for i, data_frame in enumerate(reader):
                    parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                    parquet_file = f"{local_path}/{parquet_filename}"
                    if post_processor:
                        data_frame = post_processor(data_frame)
                    data_frame.to_parquet(parquet_file,
                                          allow_truncated_timestamps=True,
                                          coerce_timestamps="ms")
                    try:
                        with open(parquet_file, "rb") as fin:
                            data = BytesIO(fin.read())
                            copy_data_to_s3_bucket(
                                request_id,
                                s3_parquet_path,
                                parquet_filename,
                                data,
                                manifest_id=manifest_id,
                                context=context,
                            )
                            msg = f"{parquet_file} sent to S3."
                            LOG.info(msg)
                    except Exception as err:
                        shutil.rmtree(local_path, ignore_errors=True)
                        s3_key = f"{s3_parquet_path}/{parquet_file}"
                        msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
                        LOG.warn(log_json(request_id, msg, context))
                        return False
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = (
                f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
            )
            LOG.warning(log_json(request_id, msg, context))
            return False

        s3_hive_table_path = get_hive_table_path(context.get("account"),
                                                 self._provider_type,
                                                 report_type=report_type)

        if not self.presto_table_exists.get(report_type):
            self.create_parquet_table(
                context.get("account"),
                context.get("provider_uuid"),
                manifest_id,
                s3_hive_table_path,
                parquet_file,
                report_type,
            )

        # Delete the local parquet files
        shutil.rmtree(local_path, ignore_errors=True)
        # Now we can delete the local CSV
        if os.path.exists(csv_filename):
            os.remove(csv_filename)
        return True
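
The second variant differs from the first mainly in batching. A minimal sketch of the chunked pattern (batch size illustrative; the context-manager form of read_csv requires pandas 1.2 or later):

import pandas as pd

def csv_to_parquet_chunks(csv_path, out_dir, base_name, batch_size=200_000):
    """Write one parquet file per CSV chunk, as the method above does."""
    with pd.read_csv(csv_path, chunksize=batch_size) as reader:
        for i, chunk in enumerate(reader):
            chunk.to_parquet(f"{out_dir}/{base_name}_{i}.parquet")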