def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None): """Write Parquet file and send to S3.""" if self._provider_type == Provider.PROVIDER_GCP: # We need to determine the parquet file path based off # of the start of the invoice month and usage start for GCP. s3_path = self._determin_s3_path_for_gcp(file_type, file_name) else: s3_path = self._determin_s3_path(file_type) data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False) try: with open(file_path, "rb") as fin: copy_data_to_s3_bucket( self.tracing_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context ) msg = f"{file_path} sent to S3." LOG.info(log_json(self.tracing_id, msg, self.error_context)) except Exception as err: s3_key = f"{self.parquet_path_s3}/{file_path}" msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}" LOG.warn(log_json(self.tracing_id, msg, self.error_context)) return False finally: self.files_to_remove.append(file_path) return True
def _get_manifest(self, date_time): """ Download and return the CUR manifest for the given date. Args: date_time (DateTime): The starting datetime object Returns: (Dict): A dict-like object serialized from JSON data. """ manifest = "{}/{}-Manifest.json".format( self._get_report_path(date_time), self.report_name) msg = f"Will attempt to download manifest: {manifest}" LOG.info(log_json(self.request_id, msg, self.context)) try: manifest_file, _, manifest_modified_timestamp, __ = self.download_file( manifest) except AWSReportDownloaderNoFileError as err: msg = f"Unable to get report manifest. Reason: {str(err)}" LOG.info(log_json(self.request_id, msg, self.context)) return "", self.empty_manifest, None manifest_json = None with open(manifest_file, "r") as manifest_file_handle: manifest_json = json.load(manifest_file_handle) return manifest_file, manifest_json, manifest_modified_timestamp
def update_summary_tables(self, start_date, end_date, tracing_id):
    """
    Update report summary tables.

    Args:
        start_date (str, datetime): When to start.
        end_date (str, datetime): When to end.
        tracing_id (str): The tracing_id.

    Returns:
        (start_date, end_date): The date range actually used for the summary update.

    """
    msg = f"Summary processing starting for source {self._provider_uuid}"
    LOG.info(log_json(self._tracing_id, msg))
    start_date, end_date = self._format_dates(start_date, end_date)
    LOG.info(log_json(tracing_id, f"Using start date: {start_date}"))
    LOG.info(log_json(tracing_id, f"Using end date: {end_date}"))

    start_date, end_date = self._updater.update_summary_tables(start_date, end_date)

    msg = f"Summary processing completed for source {self._provider_uuid}"
    LOG.info(log_json(self._tracing_id, msg))

    invalidate_view_cache_for_tenant_and_source_type(self._schema, self._provider.type)
    return start_date, end_date
def remove_files_not_in_set_from_s3_bucket(request_id, s3_path, manifest_id, context={}):
    """
    Removes all files in a given prefix if they are not within the given set.
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return []

    removed = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest != manifest_id_str:
                    s3_resource.Object(settings.S3_BUCKET_NAME, key).delete()
                    removed.append(key)
            if removed:
                msg = f"Removed files from s3 bucket {settings.S3_BUCKET_NAME}: {','.join(removed)}."
                LOG.info(log_json(request_id, msg, context))
        except ClientError as err:
            msg = f"Unable to remove data in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return removed
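# A minimal standalone sketch of the boto3 resource pattern used above: list objects
# under a prefix and read each object's user metadata before deciding whether to delete
# it. The bucket name, prefix, and manifest id here are illustrative placeholders, not
# values from this codebase.
import boto3


def list_keys_with_other_manifest(bucket_name, prefix, manifest_id):
    """Return keys under `prefix` whose 'manifestid' metadata differs from `manifest_id`."""
    s3_resource = boto3.resource("s3")
    stale_keys = []
    for obj_summary in s3_resource.Bucket(bucket_name).objects.filter(Prefix=prefix):
        # ObjectSummary.Object() fetches the full object resource; its .metadata dict
        # holds the user metadata with lower-cased keys (e.g. "manifestid").
        obj = obj_summary.Object()
        if obj.metadata.get("manifestid") != str(manifest_id):
            stale_keys.append(obj.key)
    return stale_keys


# Example (hypothetical values):
# stale = list_keys_with_other_manifest("my-cost-bucket", "data/csv/", 42)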
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download a file from Azure bucket. Args: key (str): The object key identified. Returns: (String): The path and file name of the saved file """ local_filename = utils.get_local_file_name(key) full_file_path = f"{self._get_exports_data_directory()}/{local_filename}" etag_hasher = hashlib.new("ripemd160") etag_hasher.update(bytes(local_filename, "utf-8")) etag = etag_hasher.hexdigest() if etag != stored_etag: msg = f"Downloading {key} to {full_file_path}" LOG.info(log_json(self.request_id, msg, self.context)) shutil.copy2(key, full_file_path) # Push to S3 s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE) copy_local_report_file_to_s3_bucket( self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context ) msg = f"Returning full_file_path: {full_file_path}, etag: {etag}" LOG.info(log_json(self.request_id, msg, self.context)) return full_file_path, etag
def get_report_for(self, date_time):
    """
    Get OCP usage report files corresponding to a date.

    Args:
        date_time (DateTime): Start date of the usage report.

    Returns:
        ([]) List of file paths for a particular report.

    """
    dates = utils.month_date_range(date_time)
    msg = f"Looking for cluster {self.cluster_id} report for date {str(dates)}"
    LOG.debug(log_json(self.request_id, msg, self.context))
    directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"

    manifest = self._get_manifest(date_time)
    msg = f"manifest found: {str(manifest)}"
    LOG.info(log_json(self.request_id, msg, self.context))

    reports = []
    for file in manifest.get("files", []):
        report_full_path = os.path.join(directory, file)
        reports.append(report_full_path)

    return reports
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download an S3 object to file. Args: key (str): The S3 object key identified. Returns: (String): The path and file name of the saved file """ s3_filename = key.split("/")[-1] directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}" local_s3_filename = utils.get_local_file_name(key) msg = f"Local S3 filename: {local_s3_filename}" LOG.info(log_json(self.request_id, msg, self.context)) full_file_path = f"{directory_path}/{local_s3_filename}" # Make sure the data directory exists os.makedirs(directory_path, exist_ok=True) s3_etag = None try: s3_file = self.s3_client.get_object( Bucket=self.report.get("S3Bucket"), Key=key) s3_etag = s3_file.get("ETag") except ClientError as ex: if ex.response["Error"]["Code"] == "NoSuchKey": msg = "Unable to find {} in S3 Bucket: {}".format( s3_filename, self.report.get("S3Bucket")) LOG.info(log_json(self.request_id, msg, self.context)) raise AWSReportDownloaderNoFileError(msg) msg = f"Error downloading file: Error: {str(ex)}" LOG.error(log_json(self.request_id, msg, self.context)) raise AWSReportDownloaderError(str(ex)) if not self._check_size(key, check_inflate=True): raise AWSReportDownloaderError( f"Insufficient disk space to download file: {s3_file}") if s3_etag != stored_etag or not os.path.isfile(full_file_path): LOG.debug("Downloading key: %s to file path: %s", key, full_file_path) self.s3_client.download_file(self.report.get("S3Bucket"), key, full_file_path) # Push to S3 s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE) utils.copy_local_report_file_to_s3_bucket( self.request_id, s3_csv_path, full_file_path, local_s3_filename, manifest_id, start_date, self.context) utils.remove_files_not_in_set_from_s3_bucket( self.request_id, s3_csv_path, manifest_id) return full_file_path, s3_etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download a file from GCP storage bucket. If we have a stored etag and it matches the current GCP blob, we can safely skip download since the blob/file content must not have changed. Args: key (str): name of the blob in the GCP storage bucket stored_etag (str): optional etag stored in our DB for comparison Returns: tuple(str, str) with the local filesystem path to file and GCP's etag. """ blob = self._bucket_info.get_blob(key) if not blob: raise GCPReportDownloaderNoFileError(f'No blob found in bucket "{self.bucket_name}" with name "{key}"') if stored_etag is not None and stored_etag != blob.etag: # Should we abort download here? Just log a warning for now... msg = f"etag for {key} is {blob.etag}, but stored etag is {stored_etag}" LOG.warning(log_json(self.request_id, msg, self.context)) directory_path = self._get_local_directory_path() full_local_path = self._get_local_file_path(directory_path, key) os.makedirs(directory_path, exist_ok=True) msg = f"Downloading {key} to {full_local_path}" LOG.info(log_json(self.request_id, msg, self.context)) blob.download_to_filename(full_local_path) msg = f"Returning full_file_path: {full_local_path}, etag: {blob.etag}" LOG.info(log_json(self.request_id, msg, self.context)) return full_local_path, blob.etag
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download a file from Azure bucket. Args: key (str): The object key identified. Returns: (String): The path and file name of the saved file """ local_filename = utils.get_local_file_name(key) full_file_path = f"{self._get_exports_data_directory()}/{local_filename}" try: blob = self._azure_client.get_cost_export_for_key(key, self.container_name) etag = blob.etag except AzureCostReportNotFound as ex: msg = f"Error when downloading Azure report for key: {key}. Error {ex}" LOG.error(log_json(self.request_id, msg, self.context)) raise AzureReportDownloaderError(msg) if etag != stored_etag: msg = f"Downloading {key} to {full_file_path}" LOG.info(log_json(self.request_id, msg, self.context)) blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path) # Push to S3 s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE) copy_local_report_file_to_s3_bucket( self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context ) msg = f"Returning full_file_path: {full_file_path}, etag: {etag}" LOG.info(log_json(self.request_id, msg, self.context)) return full_file_path, etag
def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None): """Write Parquet file and send to S3.""" s3_path = self._determin_s3_path(file_type) data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False) try: with open(file_path, "rb") as fin: copy_data_to_s3_bucket(self.request_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context) msg = f"{file_path} sent to S3." LOG.info(log_json(self.request_id, msg, self.error_context)) except Exception as err: s3_key = f"{self.parquet_path_s3}/{file_path}" msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}" LOG.warn(log_json(self.request_id, msg, self.error_context)) return False finally: self.files_to_remove.append(file_path) return True
def process(self):
    """
    Process the current cost usage report.

    Args:
        None

    Returns:
        None, or the result of the underlying processor's process() call.

    """
    msg = f"Report processing started for {self.report_path}"
    LOG.info(log_json(self.tracing_id, msg))

    try:
        if self.trino_enabled:
            parquet_base_filename, daily_data_frames = self._processor.process()
            if self.ocp_on_cloud_processor:
                self.ocp_on_cloud_processor.process(parquet_base_filename, daily_data_frames)
            return
        msg = f"Report processing completed for {self.report_path}"
        LOG.info(log_json(self.tracing_id, msg))
        if self._secondary_processor:
            try:
                self._secondary_processor.process()
            except (ConnectTimeout, InvalidURL, ConnectionError):
                pass
        return self._processor.process()
    except (InterfaceError, DjangoInterfaceError) as err:
        raise ReportProcessorDBError(str(err))
    except OperationalError as o_err:
        db_exc = get_extended_exception_by_type(o_err)
        LOG.error(log_json(self.tracing_id, str(db_exc), context=db_exc.as_dict()))
        raise db_exc
    except Exception as err:
        raise ReportProcessorError(str(err))
def record_report_status(manifest_id, file_name, request_id, context={}):
    """
    Creates initial report status database entry for new report files.

    If a report has already been downloaded from the ingress service
    there is a chance that processing has already been completed.  The
    function returns the last completed date time to determine if the
    report processing should continue in extract_payload.

    Args:
        manifest_id (Integer): Manifest Identifier.
        file_name (String): Report file name
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        DateTime - Last completed date time for a given report file.

    """
    already_processed = False
    with ReportStatsDBAccessor(file_name, manifest_id) as db_accessor:
        already_processed = db_accessor.get_last_completed_datetime()
        if already_processed:
            msg = f"Report {file_name} has already been processed."
            LOG.info(log_json(request_id, msg, context))
        else:
            msg = f"Recording stats entry for {file_name}"
            LOG.info(log_json(request_id, msg, context))
    return already_processed
def _process_manifest_db_record(
    self, assembly_id, billing_start, num_of_files, manifest_modified_datetime, **kwargs
):
    """Insert or update the manifest DB record."""
    msg = f"Inserting/updating manifest in database for assembly_id: {assembly_id}"
    LOG.info(log_json(self.tracing_id, msg))

    with ReportManifestDBAccessor() as manifest_accessor:
        manifest_entry = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)

        if not manifest_entry:
            msg = f"No manifest entry found in database. Adding for bill period start: {billing_start}"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            manifest_dict = {
                "assembly_id": assembly_id,
                "billing_period_start_datetime": billing_start,
                "num_total_files": num_of_files,
                "provider_uuid": self._provider_uuid,
                "manifest_modified_datetime": manifest_modified_datetime,
            }
            manifest_dict.update(kwargs)
            try:
                manifest_entry = manifest_accessor.add(**manifest_dict)
            except IntegrityError as error:
                fk_violation = FKViolation(error)
                if fk_violation:
                    LOG.warning(fk_violation)
                    raise ReportDownloaderError(f"Method: _process_manifest_db_record :: {fk_violation}")
                msg = (
                    f"Manifest entry uniqueness collision: Error {error}. "
                    "Manifest already added, getting manifest_entry_id."
                )
                LOG.warning(log_json(self.tracing_id, msg, self.context))
                with ReportManifestDBAccessor() as manifest_accessor:
                    manifest_entry = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)
                if not manifest_entry:
                    msg = f"Manifest entry not found for given manifest {manifest_dict}."
                    with ProviderDBAccessor(self._provider_uuid) as provider_accessor:
                        provider = provider_accessor.get_provider()
                        if not provider:
                            msg = f"Provider entry not found for {self._provider_uuid}."
                            LOG.warning(log_json(self.tracing_id, msg, self.context))
                            raise ReportDownloaderError(msg)
                    LOG.warning(log_json(self.tracing_id, msg, self.context))
                    raise IntegrityError(msg)
        else:
            if num_of_files != manifest_entry.num_total_files:
                manifest_accessor.update_number_of_files_for_manifest(manifest_entry)
            manifest_accessor.mark_manifest_as_updated(manifest_entry)

        manifest_id = manifest_entry.id

    return manifest_id
def refresh_materialized_views(  # noqa: C901
    schema_name,
    provider_type,
    manifest_id=None,
    provider_uuid="",
    synchronous=False,
    queue_name=None,
    tracing_id=None,
):
    """Refresh the database's materialized views for reporting."""
    task_name = "masu.processor.tasks.refresh_materialized_views"
    cache_args = [schema_name, provider_type, provider_uuid]
    if not synchronous:
        worker_cache = WorkerCache()
        if worker_cache.single_task_is_running(task_name, cache_args):
            msg = f"Task {task_name} already running for {cache_args}. Requeuing."
            LOG.info(log_json(tracing_id, msg))
            refresh_materialized_views.s(
                schema_name,
                provider_type,
                manifest_id=manifest_id,
                provider_uuid=provider_uuid,
                synchronous=synchronous,
                queue_name=queue_name,
                tracing_id=tracing_id,
            ).apply_async(queue=queue_name or REFRESH_MATERIALIZED_VIEWS_QUEUE)
            return
        worker_cache.lock_single_task(task_name, cache_args, timeout=settings.WORKER_CACHE_TIMEOUT)
    materialized_views = ()
    try:
        with schema_context(schema_name):
            for view in materialized_views:
                table_name = view._meta.db_table
                with connection.cursor() as cursor:
                    cursor.execute(f"REFRESH MATERIALIZED VIEW CONCURRENTLY {table_name}")
                    LOG.info(log_json(tracing_id, f"Refreshed {table_name}."))

        invalidate_view_cache_for_tenant_and_source_type(schema_name, provider_type)

        if provider_uuid:
            ProviderDBAccessor(provider_uuid).set_data_updated_timestamp()
        if manifest_id:
            # Processing for this manifest should be complete after this step
            with ReportManifestDBAccessor() as manifest_accessor:
                manifest = manifest_accessor.get_manifest_by_id(manifest_id)
                manifest_accessor.mark_manifest_as_completed(manifest)
    except Exception as ex:
        if not synchronous:
            worker_cache.release_single_task(task_name, cache_args)
        raise ex

    if not synchronous:
        worker_cache.release_single_task(task_name, cache_args)
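# For reference, a hedged sketch of the refresh step in isolation: given a Django model
# backed by a materialized view, the refresh is a single raw SQL statement executed
# through the default connection. "MyMaterializedSummary" is a hypothetical model name,
# and CONCURRENTLY requires the view to have a unique index.
from django.db import connection


def refresh_materialized_view(view_model):
    """Refresh one materialized view identified by its model's db_table name."""
    table_name = view_model._meta.db_table
    with connection.cursor() as cursor:
        cursor.execute(f"REFRESH MATERIALIZED VIEW CONCURRENTLY {table_name}")


# Example (hypothetical model):
# refresh_materialized_view(MyMaterializedSummary)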
def _check_size(self, s3key, check_inflate=False):
    """Check the size of an S3 file.

    Determine if there is enough local space to download and decompress the
    file.

    Args:
        s3key (str): the key name of the S3 object to check
        check_inflate (bool): if the file is compressed, evaluate the file's decompressed size.

    Returns:
        (bool): whether the file can be safely stored (and decompressed)

    """
    size_ok = False

    try:
        s3fileobj = self.s3_client.get_object(Bucket=self.report.get("S3Bucket"), Key=s3key)
        size = int(s3fileobj.get("ContentLength", -1))
    except ClientError as ex:
        if ex.response["Error"]["Code"] == "AccessDenied":
            msg = "Unable to access S3 Bucket {}: (AccessDenied)".format(self.report.get("S3Bucket"))
            LOG.info(log_json(self.tracing_id, msg, self.context))
            raise AWSReportDownloaderNoFileError(msg)
        msg = f"Error downloading file: Error: {str(ex)}"
        LOG.error(log_json(self.tracing_id, msg, self.context))
        raise AWSReportDownloaderError(str(ex))

    if size < 0:
        raise AWSReportDownloaderError(f"Invalid size for S3 object: {s3fileobj}")

    free_space = shutil.disk_usage(self.download_path)[2]
    if size < free_space:
        size_ok = True

    LOG.debug("%s is %s bytes; Download path has %s free", s3key, size, free_space)

    ext = os.path.splitext(s3key)[1]
    if ext == ".gz" and check_inflate and size_ok and size > 0:
        # isize block is the last 4 bytes of the file; see: RFC1952
        resp = self.s3_client.get_object(
            Bucket=self.report.get("S3Bucket"), Key=s3key, Range=f"bytes={size - 4}-{size}"
        )
        isize = struct.unpack("<I", resp["Body"].read(4))[0]
        if isize > free_space:
            size_ok = False

        LOG.debug("%s is %s bytes uncompressed; Download path has %s free", s3key, isize, free_space)

    return size_ok
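# The gzip check above relies on RFC 1952: the last four bytes of a .gz member are ISIZE,
# the uncompressed size modulo 2**32, stored little-endian. A local-file sketch of the
# same idea (the S3 version reads the same bytes via a ranged GET); the path is an
# illustrative placeholder.
import struct


def gzip_uncompressed_size(path):
    """Return the uncompressed size recorded in a .gz file's ISIZE footer."""
    with open(path, "rb") as gz_file:
        gz_file.seek(-4, 2)  # whence=2: seek relative to end of file
        return struct.unpack("<I", gz_file.read(4))[0]


# Note: ISIZE wraps at 4 GiB, so this is a hint rather than an exact size for very large
# inputs.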
def _remove_manifest_file(self, manifest_file):
    """Clean up the manifest file after extracting information."""
    try:
        os.remove(manifest_file)
        msg = f"Deleted manifest file at {manifest_file}"
        LOG.info(log_json(self.request_id, msg, self.context))
    except OSError:
        msg = f"Could not delete manifest file at {manifest_file}"
        LOG.info(log_json(self.request_id, msg, self.context))
    return None
def convert_csv_to_parquet(self, csv_filename):  # noqa: C901
    """Convert CSV file to parquet and send to S3."""
    daily_data_frames = []
    converters = self._get_column_converters()
    csv_path, csv_name = os.path.split(csv_filename)
    unique_keys = set()
    parquet_file = None
    parquet_base_filename = csv_name.replace(self.file_extension, "")
    kwargs = {}
    if self.file_extension == CSV_GZIP_EXT:
        kwargs = {"compression": "gzip"}

    msg = f"Running convert_csv_to_parquet on file {csv_filename}."
    LOG.info(log_json(self.tracing_id, msg, self.error_context))

    try:
        col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
        csv_converters = {
            col_name: converters[col_name.lower()] for col_name in col_names if col_name.lower() in converters
        }
        csv_converters.update({col: str for col in col_names if col not in csv_converters})
        with pd.read_csv(
            csv_filename, converters=csv_converters, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, **kwargs
        ) as reader:
            for i, data_frame in enumerate(reader):
                if data_frame.empty:
                    continue
                parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                parquet_file = f"{self.local_path}/{parquet_filename}"
                if self.post_processor:
                    data_frame = self.post_processor(data_frame)
                    if isinstance(data_frame, tuple):
                        data_frame, data_frame_tag_keys = data_frame
                        LOG.info(f"Updating unique keys with {len(data_frame_tag_keys)} keys")
                        unique_keys.update(data_frame_tag_keys)
                        LOG.info(f"Total unique keys for file {len(unique_keys)}")
                if self.daily_data_processor is not None:
                    daily_data_frames.append(self.daily_data_processor(data_frame))

                success = self._write_parquet_to_file(parquet_file, parquet_filename, data_frame)
                if not success:
                    return parquet_base_filename, daily_data_frames, False
        if self.create_table and not self.presto_table_exists.get(self.report_type):
            self.create_parquet_table(parquet_file)
        create_enabled_keys(self._schema_name, self.enabled_tags_model, unique_keys)
    except Exception as err:
        msg = (
            f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
        )
        LOG.warning(log_json(self.tracing_id, msg, self.error_context))
        return parquet_base_filename, daily_data_frames, False

    return parquet_base_filename, daily_data_frames, True
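# A reduced sketch of the chunked CSV-to-Parquet conversion, independent of the
# downloader/processor classes above. Assumes pandas >= 1.2 (context-manager support on
# the chunked reader) and a Parquet engine such as pyarrow installed; paths and the
# chunk size are illustrative.
import pandas as pd


def csv_to_parquet_chunks(csv_path, parquet_prefix, chunk_size=100_000):
    """Convert a CSV to one Parquet file per chunk; returns the files written."""
    written = []
    with pd.read_csv(csv_path, chunksize=chunk_size) as reader:
        for i, chunk in enumerate(reader):
            if chunk.empty:
                continue
            parquet_path = f"{parquet_prefix}_{i}.parquet"
            chunk.to_parquet(parquet_path, index=False)
            written.append(parquet_path)
    return written


# Example (hypothetical paths):
# files = csv_to_parquet_chunks("/tmp/report.csv", "/tmp/report")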
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download a file from Azure bucket. Args: key (str): The object key identified. Returns: (String): The path and file name of the saved file """ local_filename = utils.get_local_file_name(key) full_file_path = f"{self._get_exports_data_directory()}/{local_filename}" file_creation_date = None try: blob = self._azure_client.get_cost_export_for_key( key, self.container_name) etag = blob.etag file_creation_date = blob.last_modified except AzureCostReportNotFound as ex: msg = f"Error when downloading Azure report for key: {key}. Error {ex}" LOG.error(log_json(self.tracing_id, msg, self.context)) raise AzureReportDownloaderError(msg) msg = f"Downloading {key} to {full_file_path}" LOG.info(log_json(self.tracing_id, msg, self.context)) blob = self._azure_client.download_cost_export( key, self.container_name, destination=full_file_path) # Push to S3 s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE) copy_local_report_file_to_s3_bucket(self.tracing_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context) manifest_accessor = ReportManifestDBAccessor() manifest = manifest_accessor.get_manifest_by_id(manifest_id) if not manifest_accessor.get_s3_csv_cleared(manifest): remove_files_not_in_set_from_s3_bucket(self.tracing_id, s3_csv_path, manifest_id) manifest_accessor.mark_s3_csv_cleared(manifest) msg = f"Returning full_file_path: {full_file_path}, etag: {etag}" LOG.info(log_json(self.tracing_id, msg, self.context)) return full_file_path, etag, file_creation_date, []
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None): """ Download a file from Azure bucket. Args: key (str): The object key identified. Returns: (String): The path and file name of the saved file """ local_filename = utils.get_local_file_name(key) full_file_path = f"{self._get_exports_data_directory()}/{local_filename}" etag_hasher = hashlib.new("ripemd160") etag_hasher.update(bytes(local_filename, "utf-8")) etag = etag_hasher.hexdigest() file_creation_date = None if etag != stored_etag: msg = f"Downloading {key} to {full_file_path}" LOG.info(log_json(self.request_id, msg, self.context)) shutil.copy2(key, full_file_path) file_creation_date = datetime.datetime.fromtimestamp( os.path.getmtime(full_file_path)) # Push to S3 s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE, self._provider_uuid, start_date, Config.CSV_DATA_TYPE) copy_local_report_file_to_s3_bucket(self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context) manifest_accessor = ReportManifestDBAccessor() manifest = manifest_accessor.get_manifest_by_id(manifest_id) if not manifest_accessor.get_s3_csv_cleared(manifest): remove_files_not_in_set_from_s3_bucket(self.request_id, s3_csv_path, manifest_id) manifest_accessor.mark_s3_csv_cleared(manifest) msg = f"Returning full_file_path: {full_file_path}, etag: {etag}" LOG.info(log_json(self.request_id, msg, self.context)) return full_file_path, etag, file_creation_date, []
def get_openshift_on_cloud_infra_map(self, start_date, end_date, tracing_id):
    """Get cloud infrastructure source and OpenShift source mapping."""
    infra_map = {}
    try:
        if self._provider.type in Provider.OPENSHIFT_ON_CLOUD_PROVIDER_LIST:
            msg = f"Getting OpenShift on Cloud infrastructure map for {self._provider_uuid}"
            LOG.info(log_json(self._tracing_id, msg))
            start_date, end_date = self._format_dates(start_date, end_date)
            LOG.info(log_json(tracing_id, f"Using start date: {start_date}"))
            LOG.info(log_json(tracing_id, f"Using end date: {end_date}"))
            infra_map = self._ocp_cloud_updater.get_infra_map(start_date, end_date)
    except Exception as ex:
        raise ReportSummaryUpdaterCloudError(str(ex))
    return infra_map
def _remove_manifest_file(self, date_time):
    """Clean up the manifest file after extracting information."""
    dates = utils.month_date_range(date_time)
    directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"

    manifest_path = "{}/{}".format(directory, "manifest.json")
    try:
        os.remove(manifest_path)
        msg = f"Deleted manifest file at {directory}"
        LOG.debug(log_json(self.request_id, msg, self.context))
    except OSError:
        msg = f"Could not delete manifest file at {directory}"
        LOG.info(log_json(self.request_id, msg, self.context))

    return None
def download_report(self, date_time):
    """
    Download CUR for a given date.

    Args:
        date_time (DateTime): The starting datetime object

    Returns:
        ([{}]) List of dictionaries containing file path and compression.

    """
    msg = f"Attempting to get {self.provider_type} manifest for {str(date_time)}..."
    LOG.info(log_json(self.request_id, msg, self.context))
    report_context = self._downloader.get_report_context_for_date(date_time)
    manifest_id = report_context.get("manifest_id")
    reports = report_context.get("files", [])
    cur_reports = []
    for report in reports:
        report_dictionary = {}
        local_file_name = self._downloader.get_local_file_for_report(report)

        if self.is_report_processed(local_file_name, manifest_id):
            msg = f"File has already been processed: {local_file_name}. Skipping..."
            LOG.info(log_json(self.request_id, msg, self.context))
            continue

        with ReportStatsDBAccessor(local_file_name, manifest_id) as stats_recorder:
            stored_etag = stats_recorder.get_etag()
            file_name, etag = self._downloader.download_file(
                report, stored_etag, manifest_id=manifest_id, start_date=date_time
            )
            stats_recorder.update(etag=etag)

        report_dictionary["file"] = file_name
        report_dictionary["compression"] = report_context.get("compression")
        report_dictionary["start_date"] = date_time
        report_dictionary["assembly_id"] = report_context.get("assembly_id")
        report_dictionary["manifest_id"] = manifest_id
        report_dictionary["provider_uuid"] = self.provider_uuid

        cur_reports.append(report_dictionary)
    return cur_reports
def get_file_keys_from_s3_with_manifest_id(request_id, s3_path, manifest_id, context={}):
    """
    Get all files in a given prefix that match the given manifest_id.
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return []

    keys = []
    if s3_path:
        try:
            s3_resource = get_s3_resource()
            existing_objects = s3_resource.Bucket(settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
            for obj_summary in existing_objects:
                existing_object = obj_summary.Object()
                metadata = existing_object.metadata
                manifest = metadata.get("manifestid")
                manifest_id_str = str(manifest_id)
                key = existing_object.key
                if manifest == manifest_id_str:
                    keys.append(key)
        except ClientError as err:
            msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
            LOG.info(log_json(request_id, msg, context))
    return keys
def copy_data_to_s3_bucket(request_id, path, filename, data, manifest_id=None, context={}):
    """
    Copies data to s3 bucket file
    """
    if not settings.ENABLE_S3_ARCHIVING:
        return None

    upload = None
    upload_key = f"{path}/{filename}"
    try:
        s3_resource = get_s3_resource()
        s3_obj = {"bucket_name": settings.S3_BUCKET_NAME, "key": upload_key}
        upload = s3_resource.Object(**s3_obj)
        put_value = {"Body": data}
        if manifest_id:
            put_value["Metadata"] = {"ManifestId": str(manifest_id)}
        upload.put(**put_value)
    except ClientError as err:
        msg = f"Unable to copy data to {upload_key} in bucket {settings.S3_BUCKET_NAME}. Reason: {str(err)}"
        LOG.info(log_json(request_id, msg, context))
    return upload
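# A minimal sketch of the upload call used above: boto3's Object.put() accepts the body
# plus optional user metadata, which S3 stores as x-amz-meta-* headers and later returns
# (lower-cased) in Object.metadata. Bucket and key names below are placeholders.
import boto3


def upload_with_manifest_id(bucket_name, key, data, manifest_id=None):
    """Upload bytes or a file-like object to S3, tagging it with a manifest id."""
    obj = boto3.resource("s3").Object(bucket_name, key)
    put_kwargs = {"Body": data}
    if manifest_id is not None:
        put_kwargs["Metadata"] = {"ManifestId": str(manifest_id)}
    return obj.put(**put_kwargs)


# Example (hypothetical values):
# with open("/tmp/report.parquet", "rb") as fin:
#     upload_with_manifest_id("my-cost-bucket", "data/parquet/report.parquet", fin, 42)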
def check_if_manifest_should_be_downloaded(self, assembly_id):
    """Check if we should download this manifest.

    We first check if we have a database record of this manifest.
    That would indicate that we have already downloaded and at least
    begun processing. We then check the last completed time for
    a file in this manifest. This second check is to cover the case
    when we did not complete processing and need to re-download and
    process the manifest.

    Returns True if the manifest should be downloaded and processed.
    """
    if self._cache_key and self.worker_cache.task_is_running(self._cache_key):
        msg = f"{self._cache_key} is currently running."
        LOG.info(log_json(self.request_id, msg, self.context))
        return False
    with ReportManifestDBAccessor() as manifest_accessor:
        manifest = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)

        if manifest:
            manifest_id = manifest.id
            # check if `last_completed_datetime` is null for any report in the manifest.
            # if nulls exist, report processing is not complete and reports should be downloaded.
            need_to_download = manifest_accessor.is_last_completed_datetime_null(manifest_id)
            if need_to_download:
                self.worker_cache.add_task_to_cache(self._cache_key)
            return need_to_download

    # The manifest does not exist, this is the first time we are
    # downloading and processing it.
    self.worker_cache.add_task_to_cache(self._cache_key)
    return True
def __init__(self, customer_name, data_source, **kwargs): """ Constructor. Args: customer_name (str): Name of the customer data_source (dict): dict containing name of GCP storage bucket """ super().__init__(**kwargs) self.customer_name = customer_name.replace(" ", "_") self.credentials = kwargs.get("credentials", {}) self.data_source = data_source self._provider_uuid = kwargs.get("provider_uuid") self.gcp_big_query_columns = [ "billing_account_id", "service.id", "service.description", "sku.id", "sku.description", "usage_start_time", "usage_end_time", "project.id", "project.name", "project.labels", "project.ancestry_numbers", "labels", "system_labels", "location.location", "location.country", "location.region", "location.zone", "export_time", "cost", "currency", "currency_conversion_rate", "usage.amount", "usage.unit", "usage.amount_in_pricing_units", "usage.pricing_unit", "credits", "invoice.month", "cost_type", ] self.table_name = ".".join([ self.credentials.get("project_id"), self._get_dataset_name(), self.data_source.get("table_id") ]) self.scan_start, self.scan_end = self._generate_default_scan_range() try: GCPProvider().cost_usage_source_is_reachable( self.credentials, self.data_source) self.etag = self._generate_etag() except ValidationError as ex: msg = f"GCP source ({self._provider_uuid}) for {customer_name} is not reachable. Error: {str(ex)}" LOG.warning(log_json(self.tracing_id, msg, self.context)) raise GCPReportDownloaderError(str(ex)) self.big_query_export_time = None
def _get_manifest(self, date_time):
    """Return the report manifest details found on disk for the given date."""
    dates = utils.month_date_range(date_time)
    directory = f"{REPORTS_DIR}/{self.cluster_id}/{dates}"
    msg = f"Looking for manifest at {directory}"
    LOG.info(log_json(self.request_id, msg, self.context))
    report_meta = utils.get_report_details(directory)
    return report_meta
def __init__(self, customer_name, data_source, **kwargs): """ Constructor. Args: customer_name (str): Name of the customer data_source (dict): dict containing name of GCP storage bucket """ super().__init__(**kwargs) self.bucket_name = data_source.get("bucket") self.report_prefix = data_source.get("report_prefix", "") self.customer_name = customer_name.replace(" ", "_") self._provider_uuid = kwargs.get("provider_uuid") try: GCPProvider().cost_usage_source_is_reachable(None, data_source) self._storage_client = storage.Client() self._bucket_info = self._storage_client.lookup_bucket( self.bucket_name) except ValidationError as ex: msg = f"GCP bucket {self.bucket_name} for customer {customer_name} is not reachable. Error: {str(ex)}" LOG.error(log_json(self.request_id, msg, self.context)) raise GCPReportDownloaderError(str(ex))
def get_account(provider_uuid, request_id, context={}):
    """
    Retrieve a provider's account configuration needed for processing.

    Args:
        provider_uuid (String): Provider unique identifier.
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        (dict) - keys: value
                 authentication: String,
                 customer_name: String,
                 billing_source: String,
                 provider_type: String,
                 schema_name: String,
                 provider_uuid: String

    """
    all_accounts = []
    try:
        all_accounts = AccountsAccessor().get_accounts(provider_uuid)
    except AccountsAccessorError as error:
        msg = f"Unable to get accounts. Error: {str(error)}"
        LOG.warning(log_json(request_id, msg, context))
        return None

    return all_accounts.pop() if all_accounts else None
def get_account_from_cluster_id(cluster_id, request_id, context={}):
    """
    Returns the provider details for a given OCP cluster id.

    Args:
        cluster_id (String): Cluster UUID.
        request_id (String): Identifier associated with the payload
        context (Dict): Context for logging (account, etc)

    Returns:
        (dict) - keys: value
                 authentication: String,
                 customer_name: String,
                 billing_source: String,
                 provider_type: String,
                 schema_name: String,
                 provider_uuid: String

    """
    account = None
    provider_uuid = utils.get_provider_uuid_from_cluster_id(cluster_id)
    if provider_uuid:
        msg = f"Found provider_uuid: {str(provider_uuid)} for cluster_id: {str(cluster_id)}"
        LOG.info(log_json(request_id, msg, context))
        if context:
            context["provider_uuid"] = provider_uuid
        account = get_account(provider_uuid, request_id, context)
    return account