def _set_report_processor(self, parquet_file, daily=False):
    """Return the correct ReportParquetProcessor."""
    s3_hive_table_path = get_hive_table_path(
        self.account, self.provider_type, report_type=self.report_type, daily=daily
    )
    processor = None
    if self.provider_type in (Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL):
        processor = AWSReportParquetProcessor(
            self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
        )
    elif self.provider_type in (Provider.PROVIDER_OCP,):
        processor = OCPReportParquetProcessor(
            self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file, self.report_type
        )
    elif self.provider_type in (Provider.PROVIDER_AZURE, Provider.PROVIDER_AZURE_LOCAL):
        processor = AzureReportParquetProcessor(
            self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
        )
    elif self.provider_type in (Provider.PROVIDER_GCP, Provider.PROVIDER_GCP_LOCAL):
        processor = GCPReportParquetProcessor(
            self.manifest_id, self.account, s3_hive_table_path, self.provider_uuid, parquet_file
        )

    if processor is None:
        msg = f"There is no ReportParquetProcessor for provider type {self.provider_type}"
        raise ParquetReportProcessorError(msg)

    return processor
def test_get_hive_table_path(self):
    """Test that we resolve the path for a Hive table."""
    account = "10001"
    provider_type = Provider.PROVIDER_AWS
    expected_path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
    expected_path = f"{expected_path_prefix}/{account}/{provider_type}"
    path = common_utils.get_hive_table_path(account, provider_type)
    self.assertEqual(path, expected_path)

    # Test with report_type
    report_type = "pod_report"
    expected_path = f"{expected_path_prefix}/{account}/{provider_type}/{report_type}"
    path = common_utils.get_hive_table_path(account, provider_type, report_type=report_type)
    self.assertEqual(path, expected_path)
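# For reference, a minimal sketch of a path helper consistent with the test above and with
# the calls in _set_report_processor and convert_csv_to_parquet. This is an illustrative
# assumption, not the project's actual get_hive_table_path; in particular, the handling of
# the daily flag is guessed. Config is the same settings object referenced by the test.
def get_hive_table_path(account, provider_type, report_type=None, daily=False):
    """Build the S3 prefix used as the external location of a Hive/Presto table."""
    path = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}/{account}/{provider_type}"
    if report_type:
        path = f"{path}/{report_type}"
    if daily:
        # Assumption: daily tables live under a separate "daily" suffix.
        path = f"{path}/daily"
    return path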
def convert_csv_to_parquet(  # noqa: C901
    self,
    request_id,
    s3_csv_path,
    s3_parquet_path,
    local_path,
    manifest_id,
    csv_filename,
    converters={},
    post_processor=None,
    context={},
    report_type=None,
):
    """Convert CSV files to parquet on S3."""
    if s3_csv_path is None or s3_parquet_path is None or local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
        )
        LOG.error(log_json(request_id, msg, context))
        return False

    msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
    LOG.info(log_json(request_id, msg, context))

    kwargs = {}
    parquet_file = None
    csv_file = f"{s3_csv_path}/{csv_filename}"
    if csv_filename.lower().endswith(CSV_EXT):
        ext = -len(CSV_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
    elif csv_filename.lower().endswith(CSV_GZIP_EXT):
        ext = -len(CSV_GZIP_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
        kwargs = {"compression": "gzip"}
    else:
        msg = f"File {csv_filename} is not valid CSV. Conversion to parquet skipped."
        LOG.warning(log_json(request_id, msg, context))
        return False

    Path(local_path).mkdir(parents=True, exist_ok=True)
    tmpfile = f"{local_path}/{csv_filename}"
    try:
        s3_resource = get_s3_resource()
        csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME, key=csv_file)
        csv_obj.download_file(tmpfile)
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be obtained for parquet conversion. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    output_file = f"{local_path}/{parquet_file}"
    try:
        col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
        converters.update({col: str for col in col_names if col not in converters})
        data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
        if post_processor:
            data_frame = post_processor(data_frame)
        data_frame.to_parquet(output_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    try:
        with open(output_file, "rb") as fin:
            data = BytesIO(fin.read())
            copy_data_to_s3_bucket(
                request_id, s3_parquet_path, parquet_file, data, manifest_id=manifest_id, context=context
            )
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        s3_key = f"{s3_parquet_path}/{parquet_file}"
        msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    s3_hive_table_path = get_hive_table_path(context.get("account"), self._provider_type, report_type=report_type)
    if not self.presto_table_exists.get(report_type):
        self.create_parquet_table(
            context.get("account"),
            context.get("provider_uuid"),
            manifest_id,
            s3_hive_table_path,
            output_file,
            report_type,
        )

    shutil.rmtree(local_path, ignore_errors=True)
    return True
def convert_csv_to_parquet(  # noqa: C901
    self,
    request_id,
    s3_csv_path,
    s3_parquet_path,
    local_path,
    manifest_id,
    csv_filename,
    converters={},
    post_processor=None,
    context={},
    report_type=None,
):
    """Convert CSV files to parquet on S3."""
    csv_path, csv_name = os.path.split(csv_filename)
    if s3_csv_path is None or s3_parquet_path is None or local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
        )
        LOG.error(log_json(request_id, msg, context))
        return False

    msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
    LOG.info(log_json(request_id, msg, context))

    kwargs = {}
    parquet_file = None
    if csv_name.lower().endswith(CSV_EXT):
        ext = -len(CSV_EXT)
        parquet_base_filename = f"{csv_name[:ext]}"
    elif csv_name.lower().endswith(CSV_GZIP_EXT):
        ext = -len(CSV_GZIP_EXT)
        parquet_base_filename = f"{csv_name[:ext]}"
        kwargs = {"compression": "gzip"}
    else:
        msg = f"File {csv_name} is not valid CSV. Conversion to parquet skipped."
        LOG.warning(log_json(request_id, msg, context))
        return False

    Path(local_path).mkdir(parents=True, exist_ok=True)

    try:
        col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
        converters.update({col: str for col in col_names if col not in converters})
        with pd.read_csv(
            csv_filename, converters=converters, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, **kwargs
        ) as reader:
            for i, data_frame in enumerate(reader):
                parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                parquet_file = f"{local_path}/{parquet_filename}"
                if post_processor:
                    data_frame = post_processor(data_frame)
                data_frame.to_parquet(parquet_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
                try:
                    with open(parquet_file, "rb") as fin:
                        data = BytesIO(fin.read())
                        copy_data_to_s3_bucket(
                            request_id,
                            s3_parquet_path,
                            parquet_filename,
                            data,
                            manifest_id=manifest_id,
                            context=context,
                        )
                        msg = f"{parquet_file} sent to S3."
                        LOG.info(msg)
                except Exception as err:
                    shutil.rmtree(local_path, ignore_errors=True)
                    s3_key = f"{s3_parquet_path}/{parquet_file}"
                    msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
                    LOG.warning(log_json(request_id, msg, context))
                    return False
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
        LOG.warning(log_json(request_id, msg, context))
        return False

    s3_hive_table_path = get_hive_table_path(context.get("account"), self._provider_type, report_type=report_type)
    if not self.presto_table_exists.get(report_type):
        self.create_parquet_table(
            context.get("account"),
            context.get("provider_uuid"),
            manifest_id,
            s3_hive_table_path,
            parquet_file,
            report_type,
        )

    # Delete the local parquet files
    shutil.rmtree(local_path, ignore_errors=True)
    # Now we can delete the local CSV
    if os.path.exists(csv_filename):
        os.remove(csv_filename)

    return True
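# A standalone sketch of the chunked CSV-to-parquet pattern used above, assuming pandas >= 1.2
# (where the chunked reader supports the context-manager protocol) and a parquet engine such as
# pyarrow installed. The function name, output naming, and default batch size are illustrative
# and not taken from the project's settings.
import pandas as pd


def csv_to_parquet_chunks(csv_path, out_dir, batch_size=200_000):
    """Write one numbered parquet file per chunk of the source CSV and return the paths."""
    written = []
    # Read every column as a string so dtype inference cannot differ between chunks.
    with pd.read_csv(csv_path, chunksize=batch_size, dtype=str) as reader:
        for i, chunk in enumerate(reader):
            out_file = f"{out_dir}/chunk_{i}.parquet"
            chunk.to_parquet(out_file)
            written.append(out_file)
    return written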