def test_get_path_prefix(self):
    """Test that path prefix is returned."""
    account = "10001"
    provider_type = Provider.PROVIDER_AWS
    provider_uuid = self.aws_provider_uuid
    start_date = datetime.utcnow().date()
    expected_path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
    expected_path = (
        f"{expected_path_prefix}/{account}/{provider_type}/"
        f"source={provider_uuid}/year={start_date.year}/month={start_date.month}"
    )
    path = common_utils.get_path_prefix(account, provider_type, provider_uuid, start_date, "parquet")
    self.assertEqual(path, expected_path)

    # Test with report_type
    report_type = "pod_report"
    expected_path = (
        f"{expected_path_prefix}/{account}/{provider_type}/{report_type}/"
        f"source={provider_uuid}/year={start_date.year}/month={start_date.month}"
    )
    path = common_utils.get_path_prefix(
        account, provider_type, provider_uuid, start_date, "parquet", report_type=report_type
    )
    self.assertEqual(path, expected_path)
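# Illustrative sketch, not the real helper: the test above pins down the prefix shape that
# masu.util.common.get_path_prefix is expected to return. "warehouse" below is a stand-in
# for Config.WAREHOUSE_PATH; everything else mirrors the assertions in the test.
from datetime import date


def _example_path_prefix(account, provider_type, provider_uuid, start_date, data_type, report_type=None):
    """Rebuild the Hive-style partition prefix asserted by test_get_path_prefix."""
    prefix = f"warehouse/{data_type}/{account}/{provider_type}/"
    if report_type:
        prefix += f"{report_type}/"
    return prefix + f"source={provider_uuid}/year={start_date.year}/month={start_date.month}"


# _example_path_prefix("10001", "AWS", "abc-123", date(2021, 6, 1), "parquet")
# -> "warehouse/parquet/10001/AWS/source=abc-123/year=2021/month=6"
# _example_path_prefix("10001", "AWS", "abc-123", date(2021, 6, 1), "parquet", report_type="pod_report")
# -> "warehouse/parquet/10001/AWS/pod_report/source=abc-123/year=2021/month=6"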
def _determin_s3_path_for_gcp(self, file_type, gcp_file_name):
    """Determine the s3 path based off of the invoice month."""
    invoice_month = gcp_file_name.split("_")[0]
    dh = DateHelper()
    start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
    if file_type == DAILY_FILE_TYPE:
        report_type = self.report_type
        if report_type is None:
            report_type = "raw"
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            start_of_invoice,
            Config.PARQUET_DATA_TYPE,
            report_type=report_type,
            daily=True,
        )
    if self.report_type == OPENSHIFT_REPORT_TYPE:
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            start_of_invoice,
            Config.PARQUET_DATA_TYPE,
            report_type=self.report_type,
            daily=True,
        )
    return get_path_prefix(
        self.account, self.provider_type, self.provider_uuid, start_of_invoice, Config.PARQUET_DATA_TYPE
    )
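# Minimal sketch of the invoice-month handling above, under the assumption that GCP report file
# names are prefixed with the invoice month ("YYYYMM") followed by an underscore, e.g.
# "202207_2022-07-05.csv". The file name and the strptime format here are illustrative
# assumptions; the real anchoring to the first day of the month is done by
# DateHelper.gcp_invoice_month_start.
from datetime import datetime


def _example_invoice_month_start(gcp_file_name):
    """Return the first day of the invoice month encoded in a GCP report file name."""
    invoice_month = gcp_file_name.split("_")[0]  # e.g. "202207"
    return datetime.strptime(invoice_month, "%Y%m").date()


# _example_invoice_month_start("202207_2022-07-05.csv") -> datetime.date(2022, 7, 1)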
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
        etag = blob.etag
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.request_id, msg, self.context))
        raise AzureReportDownloaderError(msg)

    if etag != stored_etag:
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
        )

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download an S3 object to file.

    Args:
        key (str): The S3 object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    s3_filename = key.split("/")[-1]
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

    local_s3_filename = utils.get_local_file_name(key)
    msg = f"Local S3 filename: {local_s3_filename}"
    LOG.info(log_json(self.request_id, msg, self.context))
    full_file_path = f"{directory_path}/{local_s3_filename}"

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)
    s3_etag = None
    try:
        s3_file = self.s3_client.get_object(Bucket=self.report.get("S3Bucket"), Key=key)
        s3_etag = s3_file.get("ETag")
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "NoSuchKey":
            msg = f"Unable to find {s3_filename} in S3 Bucket: {self.report.get('S3Bucket')}"
            LOG.info(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg)
        msg = f"Error downloading file: Error: {str(ex)}"
        LOG.error(log_json(self.request_id, msg, self.context))
        raise AWSReportDownloaderError(str(ex))

    if not self._check_size(key, check_inflate=True):
        raise AWSReportDownloaderError(f"Insufficient disk space to download file: {s3_file}")

    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        LOG.debug("Downloading key: %s to file path: %s", key, full_file_path)
        self.s3_client.download_file(self.report.get("S3Bucket"), key, full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        utils.copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_s3_filename, manifest_id, start_date, self.context
        )
        utils.remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)

    return full_file_path, s3_etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    etag_hasher = hashlib.new("ripemd160")
    etag_hasher.update(bytes(local_filename, "utf-8"))
    etag = etag_hasher.hexdigest()

    if etag != stored_etag:
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
        )

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag
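# Sketch of the stand-in ETag used by the local downloaders above: because there is no real
# object store to supply an ETag, a RIPEMD-160 digest of the local file name is used, so the
# "etag" only changes when the file name changes. Note that "ripemd160" must be available in
# the OpenSSL build backing hashlib for this to work.
import hashlib


def _example_local_etag(local_filename):
    """Return the deterministic pseudo-ETag for a local report file name."""
    hasher = hashlib.new("ripemd160")
    hasher.update(bytes(local_filename, "utf-8"))
    return hasher.hexdigest()


# _example_local_etag("koku-local.csv") is stable across runs for the same file name,
# so the copy and S3 push are skipped whenever the stored etag already matches.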
def test_remove_files_not_in_set_from_s3_bucket(self):
    """Test remove_files_not_in_set_from_s3_bucket."""
    removed = utils.remove_files_not_in_set_from_s3_bucket("request_id", None, "manifest_id")
    self.assertEqual(removed, [])

    date_accessor = DateAccessor()
    start_date = date_accessor.today_with_timezone("utc").replace(day=1)
    s3_csv_path = get_path_prefix(
        "account", Provider.PROVIDER_AWS, "provider_uuid", start_date, Config.CSV_DATA_TYPE
    )
    expected_key = "removed_key"
    mock_object = Mock(metadata={}, key=expected_key)
    mock_summary = Mock()
    mock_summary.Object.return_value = mock_object
    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
            mock_s3.return_value.Bucket.return_value.objects.filter.return_value = [mock_summary]
            removed = utils.remove_files_not_in_set_from_s3_bucket("request_id", s3_csv_path, "manifest_id")
            self.assertEqual(removed, [expected_key])

    with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
        with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
            mock_s3.side_effect = ClientError({}, "Error")
            removed = utils.remove_files_not_in_set_from_s3_bucket("request_id", s3_csv_path, "manifest_id")
            self.assertEqual(removed, [])
def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary

    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING:
        daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(account, provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filename"))
            os.remove(daily_file.get("filepath"))
    return daily_file_names
def parquet_path_s3(self):
    """The path in the S3 bucket where Parquet files are loaded."""
    return get_path_prefix(
        self.account,
        self.provider_type,
        self.provider_uuid,
        self.start_date,
        Config.PARQUET_DATA_TYPE,
        report_type=self.report_type,
    )
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download an S3 object to file.

    Args:
        key (str): The S3 object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_s3_filename = utils.get_local_file_name(key)
    directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
    full_file_path = f"{directory_path}/{local_s3_filename}"

    if not os.path.isfile(key):
        log_msg = f"Unable to locate {key} in {self.bucket_path}"
        raise AWSReportDownloaderNoFileError(log_msg)

    # Make sure the data directory exists
    os.makedirs(directory_path, exist_ok=True)
    s3_etag_hasher = hashlib.new("ripemd160")
    s3_etag_hasher.update(bytes(local_s3_filename, "utf-8"))
    s3_etag = s3_etag_hasher.hexdigest()

    file_creation_date = None
    if s3_etag != stored_etag or not os.path.isfile(full_file_path):
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))
        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account, Provider.PROVIDER_AWS, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        utils.copy_local_report_file_to_s3_bucket(
            self.tracing_id, s3_csv_path, full_file_path, local_s3_filename, manifest_id, start_date, self.context
        )

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            utils.remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)
    return full_file_path, s3_etag, file_creation_date, []
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context={}
):
    """Archive the monthly IBM report CSV to S3 under a day partition."""
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
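# Sketch of the day-partitioned CSV prefix built above for IBM: the monthly report is written
# under an extra "day=DD" partition so that two consecutive daily pulls of the same monthly
# report can be diffed. "s3_csv_path" here is an already-built prefix of the kind returned by
# get_path_prefix; the example value is hypothetical.
from datetime import date


def _example_ibm_daily_prefix(s3_csv_path, start_date):
    """Append the day partition used for IBM CSV archives."""
    return f"{s3_csv_path}/day={start_date.strftime('%d')}"


# _example_ibm_daily_prefix("data/csv/10001/IBM/source=abc/year=2021/month=6", date(2021, 6, 7))
# -> "data/csv/10001/IBM/source=abc/year=2021/month=6/day=07"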
def parquet_ocp_on_cloud_path_s3(self):
    """The path in the S3 bucket where Parquet files are loaded."""
    return get_path_prefix(
        self.account,
        self.provider_type,
        self.provider_uuid,
        self.start_date,
        Config.PARQUET_DATA_TYPE,
        report_type=OPENSHIFT_REPORT_TYPE,
        daily=True,
    )
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    file_creation_date = None
    try:
        blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
        etag = blob.etag
        file_creation_date = blob.last_modified
    except AzureCostReportNotFound as ex:
        msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AzureReportDownloaderError(msg)

    msg = f"Downloading {key} to {full_file_path}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
    # Push to S3
    s3_csv_path = get_path_prefix(
        self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
    )
    copy_local_report_file_to_s3_bucket(
        self.tracing_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
    )

    manifest_accessor = ReportManifestDBAccessor()
    manifest = manifest_accessor.get_manifest_by_id(manifest_id)

    if not manifest_accessor.get_s3_csv_cleared(manifest):
        remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id)
        manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.tracing_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from Azure bucket.

    Args:
        key (str): The object key identified.

    Returns:
        (String): The path and file name of the saved file

    """
    local_filename = utils.get_local_file_name(key)
    full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

    etag_hasher = hashlib.new("ripemd160")
    etag_hasher.update(bytes(local_filename, "utf-8"))
    etag = etag_hasher.hexdigest()

    file_creation_date = None
    if etag != stored_etag:
        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.request_id, msg, self.context))
        shutil.copy2(key, full_file_path)
        file_creation_date = datetime.datetime.fromtimestamp(os.path.getmtime(full_file_path))
        # Push to S3
        s3_csv_path = get_path_prefix(
            self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE
        )
        copy_local_report_file_to_s3_bucket(
            self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
        )

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

    msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
    LOG.info(log_json(self.request_id, msg, self.context))
    return full_file_path, etag, file_creation_date, []
def create_daily_archives(tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary

    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_OCP, account):
        if context.get("version"):
            daily_files = [{"filepath": filepath, "filename": filename}]
        else:
            daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_OCP, provider_uuid, start_date, Config.CSV_DATA_TYPE
            )
            copy_local_report_file_to_s3_bucket(
                tracing_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
def create_daily_archives(
    tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, last_export_time, context={}
):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time (str): The last export time for the report, used to salt the daily file names
        context (Dict): Logging context dictionary

    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_GCP, provider_uuid, start_of_invoice, Config.CSV_DATA_TYPE
            )
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(
                    tracing_id, s3_csv_path, day_filepath, day_file, manifest_id, start_date, context
                )
                daily_file_names.append(day_filepath)
    return daily_file_names
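# Condensed sketch of the pandas splitting logic above, using a tiny in-memory frame instead of
# a downloaded GCP report. Rows are grouped per invoice month and per calendar day by comparing
# the first ten characters of partition_date ("YYYY-MM-DD"), which is what the
# str.contains(cur_day) filter above relies on. The column values are made up for illustration.
import pandas as pd

frame = pd.DataFrame(
    {
        "invoice.month": ["202207", "202207", "202207"],
        "partition_date": ["2022-07-01 00:00:00", "2022-07-01 12:00:00", "2022-07-02 00:00:00"],
        "cost": [1.0, 2.0, 3.0],
    }
)

for invoice_month in frame["invoice.month"].unique():
    invoice_data = frame[frame["invoice.month"] == invoice_month]
    days = sorted({ts[:10] for ts in invoice_data.partition_date.unique()})
    for day in days:
        daily = invoice_data[invoice_data.partition_date.str.contains(day)]
        # each daily slice would be written out as f"{invoice_month}_{day}.csv" and copied to S3
        print(invoice_month, day, len(daily))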
def convert_to_parquet(request_id, account, provider_uuid, provider_type, start_date, manifest_id, files=[], context={}):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): Optional list of CSV file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warn(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date, Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date, Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated.
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path, manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet: {','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
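# Small sketch of the per-report-type Parquet path selection in convert_to_parquet: for OCP,
# the CSV file name is matched against the known report types and the matching type becomes a
# subdirectory of the Parquet prefix, while other providers keep the bare prefix. The report
# type names, file name, and prefix below are hypothetical stand-ins for REPORT_TYPES and
# the real S3 paths.
def _example_parquet_path(s3_parquet_path, provider_type, csv_filename, report_types=("pod_usage", "storage_usage")):
    """Return the Parquet prefix a given CSV file would be converted into."""
    if provider_type == "OCP":
        for report_type in report_types:
            if report_type in csv_filename:
                return f"{s3_parquet_path}/{report_type}"
    return s3_parquet_path


# _example_parquet_path("data/parquet/10001/OCP/source=abc", "OCP", "20210601_pod_usage.csv")
# -> "data/parquet/10001/OCP/source=abc/pod_usage"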
def csv_path_s3(self):
    """The path in the S3 bucket where CSV files are loaded."""
    return get_path_prefix(
        self.account, self.provider_type, self.provider_uuid, self.start_date, Config.CSV_DATA_TYPE
    )