def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None):
    """Write Parquet file and send to S3."""
    s3_path = self._determin_s3_path(file_type)
    data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False)
    try:
        with open(file_path, "rb") as fin:
            copy_data_to_s3_bucket(
                self.request_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context
            )
            msg = f"{file_path} sent to S3."
            LOG.info(log_json(self.request_id, msg, self.error_context))
    except Exception as err:
        s3_key = f"{self.parquet_path_s3}/{file_path}"
        msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
        LOG.warn(log_json(self.request_id, msg, self.error_context))
        return False
    finally:
        self.files_to_remove.append(file_path)
    return True

def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None):
    """Write Parquet file and send to S3."""
    if self._provider_type == Provider.PROVIDER_GCP:
        # We need to determine the parquet file path based off
        # of the start of the invoice month and usage start for GCP.
        s3_path = self._determin_s3_path_for_gcp(file_type, file_name)
    else:
        s3_path = self._determin_s3_path(file_type)
    data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False)
    try:
        with open(file_path, "rb") as fin:
            copy_data_to_s3_bucket(
                self.tracing_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context
            )
            msg = f"{file_path} sent to S3."
            LOG.info(log_json(self.tracing_id, msg, self.error_context))
    except Exception as err:
        s3_key = f"{self.parquet_path_s3}/{file_path}"
        msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
        LOG.warn(log_json(self.tracing_id, msg, self.error_context))
        return False
    finally:
        self.files_to_remove.append(file_path)
    return True

def test_copy_data_to_s3_bucket(self):
    """Test copy_data_to_s3_bucket."""
    upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
    self.assertEqual(upload, None)

    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
            upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
            self.assertIsNotNone(upload)

    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
            mock_s3.side_effect = ClientError({}, "Error")
            upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
            self.assertEqual(upload, None)

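# A minimal sketch of a helper consistent with the test above -- NOT the real
# masu.util.aws.common implementation, only an illustration of the behavior the
# assertions imply: return None when S3 archiving is disabled or a ClientError
# is raised, otherwise return a handle to the uploaded object. The names
# `settings.ENABLE_S3_ARCHIVING`, `settings.S3_BUCKET_NAME`, and
# `get_s3_resource` are assumptions drawn from the surrounding code.
def copy_data_to_s3_bucket_sketch(request_id, path, filename, data, manifest_id=None, context=None):
    """Upload a file-like object to S3 when archiving is enabled (illustrative only)."""
    if not settings.ENABLE_S3_ARCHIVING:
        return None
    try:
        s3_resource = get_s3_resource()
        s3_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME, key=f"{path}/{filename}")
        s3_obj.upload_fileobj(data)
        return s3_obj
    except ClientError as err:
        LOG.warn(f"Unable to copy data to S3 bucket. Reason: {str(err)}")
        return None
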
def convert_csv_to_parquet(  # noqa: C901
    self,
    request_id,
    s3_csv_path,
    s3_parquet_path,
    local_path,
    manifest_id,
    csv_filename,
    converters={},
    post_processor=None,
    context={},
    report_type=None,
):
    """Convert CSV files to parquet on S3."""
    if s3_csv_path is None or s3_parquet_path is None or local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
        )
        LOG.error(log_json(request_id, msg, context))
        return False

    msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
    LOG.info(log_json(request_id, msg, context))

    kwargs = {}
    parquet_file = None
    csv_file = f"{s3_csv_path}/{csv_filename}"
    if csv_filename.lower().endswith(CSV_EXT):
        ext = -len(CSV_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
    elif csv_filename.lower().endswith(CSV_GZIP_EXT):
        ext = -len(CSV_GZIP_EXT)
        parquet_file = f"{csv_filename[:ext]}.parquet"
        kwargs = {"compression": "gzip"}
    else:
        msg = f"File {csv_filename} is not valid CSV. Conversion to parquet skipped."
        LOG.warn(log_json(request_id, msg, context))
        return False

    # Stage the CSV locally before converting it.
    Path(local_path).mkdir(parents=True, exist_ok=True)
    tmpfile = f"{local_path}/{csv_filename}"
    try:
        s3_resource = get_s3_resource()
        csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME, key=csv_file)
        csv_obj.download_file(tmpfile)
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be obtained for parquet conversion. Reason: {str(err)}"
        LOG.warn(log_json(request_id, msg, context))
        return False

    output_file = f"{local_path}/{parquet_file}"
    try:
        col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
        converters.update({col: str for col in col_names if col not in converters})
        data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
        if post_processor:
            data_frame = post_processor(data_frame)
        data_frame.to_parquet(output_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
        LOG.warn(log_json(request_id, msg, context))
        return False

    try:
        with open(output_file, "rb") as fin:
            data = BytesIO(fin.read())
            copy_data_to_s3_bucket(
                request_id, s3_parquet_path, parquet_file, data, manifest_id=manifest_id, context=context
            )
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        s3_key = f"{s3_parquet_path}/{parquet_file}"
        msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
        LOG.warn(log_json(request_id, msg, context))
        return False

    s3_hive_table_path = get_hive_table_path(context.get("account"), self._provider_type, report_type=report_type)
    if not self.presto_table_exists.get(report_type):
        self.create_parquet_table(
            context.get("account"),
            context.get("provider_uuid"),
            manifest_id,
            s3_hive_table_path,
            output_file,
            report_type,
        )

    shutil.rmtree(local_path, ignore_errors=True)
    return True

def convert_csv_to_parquet(  # noqa: C901
    self,
    request_id,
    s3_csv_path,
    s3_parquet_path,
    local_path,
    manifest_id,
    csv_filename,
    converters={},
    post_processor=None,
    context={},
    report_type=None,
):
    """Convert CSV files to parquet on S3."""
    csv_path, csv_name = os.path.split(csv_filename)
    if s3_csv_path is None or s3_parquet_path is None or local_path is None:
        msg = (
            f"Invalid paths provided to convert_csv_to_parquet. "
            f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
        )
        LOG.error(log_json(request_id, msg, context))
        return False

    msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
    LOG.info(log_json(request_id, msg, context))

    kwargs = {}
    parquet_file = None
    if csv_name.lower().endswith(CSV_EXT):
        ext = -len(CSV_EXT)
        parquet_base_filename = f"{csv_name[:ext]}"
    elif csv_name.lower().endswith(CSV_GZIP_EXT):
        ext = -len(CSV_GZIP_EXT)
        parquet_base_filename = f"{csv_name[:ext]}"
        kwargs = {"compression": "gzip"}
    else:
        msg = f"File {csv_name} is not valid CSV. Conversion to parquet skipped."
        LOG.warn(log_json(request_id, msg, context))
        return False

    Path(local_path).mkdir(parents=True, exist_ok=True)

    try:
        col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
        converters.update({col: str for col in col_names if col not in converters})
        # Read the CSV in batches so large files never have to fit in memory at once.
        with pd.read_csv(
            csv_filename, converters=converters, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, **kwargs
        ) as reader:
            for i, data_frame in enumerate(reader):
                parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                parquet_file = f"{local_path}/{parquet_filename}"
                if post_processor:
                    data_frame = post_processor(data_frame)
                data_frame.to_parquet(parquet_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
                try:
                    with open(parquet_file, "rb") as fin:
                        data = BytesIO(fin.read())
                        copy_data_to_s3_bucket(
                            request_id,
                            s3_parquet_path,
                            parquet_filename,
                            data,
                            manifest_id=manifest_id,
                            context=context,
                        )
                        msg = f"{parquet_file} sent to S3."
                        LOG.info(msg)
                except Exception as err:
                    shutil.rmtree(local_path, ignore_errors=True)
                    s3_key = f"{s3_parquet_path}/{parquet_file}"
                    msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
                    LOG.warn(log_json(request_id, msg, context))
                    return False
    except Exception as err:
        shutil.rmtree(local_path, ignore_errors=True)
        msg = f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
        LOG.warn(log_json(request_id, msg, context))
        return False

    s3_hive_table_path = get_hive_table_path(context.get("account"), self._provider_type, report_type=report_type)
    if not self.presto_table_exists.get(report_type):
        self.create_parquet_table(
            context.get("account"),
            context.get("provider_uuid"),
            manifest_id,
            s3_hive_table_path,
            parquet_file,
            report_type,
        )

    # Delete the local parquet files
    shutil.rmtree(local_path, ignore_errors=True)
    # Now we can delete the local CSV
    if os.path.exists(csv_filename):
        os.remove(csv_filename)
    return True

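# Standalone sketch of the chunked CSV -> Parquet pattern used above, with the
# S3 upload and Hive table pieces stripped out. The paths and default chunk size
# are illustrative assumptions; the real code batches on
# settings.PARQUET_PROCESSING_BATCH_SIZE and uploads each chunk as it is written.
import pandas as pd


def csv_to_parquet_chunks(csv_path, out_dir, chunk_size=100_000):
    """Split a large CSV into numbered parquet files without loading it all into memory."""
    written = []
    # pandas >= 1.2 allows the chunked reader to be used as a context manager.
    with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
        for i, chunk in enumerate(reader):
            out_file = f"{out_dir}/part_{i}.parquet"
            chunk.to_parquet(out_file, allow_truncated_timestamps=True, coerce_timestamps="ms")
            written.append(out_file)
    return written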