def _determin_s3_path_for_gcp(self, file_type, gcp_file_name):
    """Determine the S3 path based on the invoice month."""
    # GCP file names are prefixed with their invoice month, e.g. "202208_...".
    invoice_month = gcp_file_name.split("_")[0]
    dh = DateHelper()
    start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
    if file_type == DAILY_FILE_TYPE:
        # Daily files always land under a report_type segment; default to "raw".
        report_type = self.report_type
        if report_type is None:
            report_type = "raw"
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            start_of_invoice,
            Config.PARQUET_DATA_TYPE,
            report_type=report_type,
            daily=True,
        )
    elif self.report_type == OPENSHIFT_REPORT_TYPE:
        # OpenShift-on-GCP data keeps its report_type segment even for
        # non-daily files.
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            start_of_invoice,
            Config.PARQUET_DATA_TYPE,
            report_type=self.report_type,
            daily=True,
        )
    else:
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            start_of_invoice,
            Config.PARQUET_DATA_TYPE,
        )
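
# Illustrative usage (hypothetical values; `downloader` stands in for an
# instance of this class). A GCP file name begins with its invoice month, so
# a daily file for August 2022 resolves to that month's Parquet prefix:
#
#   >>> downloader.report_type = None
#   >>> downloader._determin_s3_path_for_gcp(DAILY_FILE_TYPE, "202208_2022-08-31_etag.csv")
#   # a daily Parquet path with report_type defaulted to "raw"
#
# Non-daily files only carry a report_type segment for OpenShift-on-GCP data.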
def _get_or_create_cost_entry_bill(self, row, report_db_accessor):
    """Get or create a GCP cost entry bill object.

    Args:
        row (OrderedDict): A dictionary representation of a CSV file row.

    Returns:
        (str): The id of a GCP bill.

    """
    table_name = GCPCostEntryBill
    dh = DateHelper()
    invoice_month = row["invoice.month"]
    start_time = dh.gcp_invoice_month_start(invoice_month)
    report_date_range = utils.month_date_range(start_time)
    start_date, end_date = report_date_range.split("-")

    start_date_utc = parser.parse(start_date).replace(hour=0, minute=0, tzinfo=pytz.UTC)
    end_date_utc = parser.parse(end_date).replace(hour=0, minute=0, tzinfo=pytz.UTC)

    billing_period_start = datetime.strftime(start_date_utc, "%Y-%m-%d %H:%M%z")
    # Check for an existing bill keyed by the formatted period-start string.
    key = (billing_period_start, self._provider_uuid)
    if key in self.existing_bill_map:
        return self.existing_bill_map[key]

    data = {
        "billing_period_start": billing_period_start,
        "billing_period_end": datetime.strftime(end_date_utc, "%Y-%m-%d %H:%M%z"),
        "provider_id": self._provider_uuid,
    }

    # Check the in-memory caches keyed by the datetime before touching the DB.
    key = (start_date_utc, self._provider_uuid)
    if key in self.processed_report.bills:
        return self.processed_report.bills[key]
    if key in self.existing_bill_map:
        return self.existing_bill_map[key]

    with transaction.atomic():
        bill_id = report_db_accessor.insert_on_conflict_do_nothing(
            table_name, data, conflict_columns=["billing_period_start", "provider_id"]
        )
    self.processed_report.bills[key] = bill_id
    return bill_id
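
# Illustrative sketch (hypothetical `processor` and `accessor` objects):
# repeated rows from the same invoice month resolve to the same bill id,
# because the in-memory caches are consulted before the conflict-safe insert
# is attempted.
#
#   >>> row = {"invoice.month": "202208"}
#   >>> bill_id = processor._get_or_create_cost_entry_bill(row, accessor)
#   >>> bill_id == processor._get_or_create_cost_entry_bill(row, accessor)
#   True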
def _generate_monthly_pseudo_manifest(self, start_date):
    """
    Generate a dict representing an analog to other providers' "manifest" files.

    GCP does not produce a manifest file for monthly periods, so we check for
    files in the bucket that match dates within the monthly period starting on
    the requested start_date.

    Args:
        start_date (datetime.datetime): when to start gathering reporting data.

    Returns:
        Manifest-like dict with a list of relevant found files.

    """
    etag = None
    invoice_month = start_date.strftime("%Y%m")
    etags = self.file_mapping.get(str(invoice_month), {})
    for etag_key in etags.keys():
        with ReportManifestDBAccessor() as manifest_accessor:
            assembly_id = ":".join([str(self._provider_uuid), etag_key, str(invoice_month)])
            manifest = manifest_accessor.get_manifest(assembly_id, self._provider_uuid)
        # Skip etags that already have a manifest; they have been processed.
        if manifest:
            continue
        etag = etag_key
        break
    if not etag:
        return {}
    dh = DateHelper()
    start_date = dh.gcp_invoice_month_start(str(invoice_month))
    end_date = self.file_mapping[invoice_month][etag]["end"]
    file_names = [self.file_mapping[invoice_month][etag]["filename"]]
    manifest_data = {
        "assembly_id": assembly_id,
        "compression": UNCOMPRESSED,
        "start_date": start_date,
        "end_date": end_date,  # inclusive end date
        "file_names": file_names,
    }
    return manifest_data
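
# Illustrative shape of the inputs and output (values are made up). Based on
# the lookups above, file_mapping is assumed to look like:
#
#   {"202208": {"<etag>": {"filename": "202208_....csv", "end": <end datetime>}}}
#
# and a successful call returns a manifest-like dict:
#
#   {
#       "assembly_id": "<provider_uuid>:<etag>:202208",
#       "compression": UNCOMPRESSED,
#       "start_date": <invoice month start datetime>,
#       "end_date": <end datetime>,  # inclusive
#       "file_names": ["202208_....csv"],
#   }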
def create_daily_archives(
    tracing_id, account, provider_uuid, filename, filepath, manifest_id, start_date, last_export_time, context={}
):
    """
    Create daily CSVs from an incoming report and archive them to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The GCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of the incoming report
        last_export_time (str): Export time used to uniquify the daily file names
        context (Dict): Logging context dictionary

    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        # Hash the export time so re-downloads produce distinct daily file names.
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            # The first 10 characters of partition_date are the YYYY-MM-DD day.
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(
                account, Provider.PROVIDER_GCP, provider_uuid, start_of_invoice, Config.CSV_DATA_TYPE
            )
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(
                    tracing_id, s3_csv_path, day_filepath, day_file, manifest_id, start_date, context
                )
                daily_file_names.append(day_filepath)
    return daily_file_names
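
# Minimal standalone sketch of the day-splitting step above (pandas only,
# made-up data): rows are bucketed by the first 10 characters of
# partition_date, i.e. the YYYY-MM-DD day, before each bucket is written out
# as its own CSV.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame(
#   ...     {"invoice.month": ["202208"] * 2,
#   ...      "partition_date": ["2022-08-01 00:00", "2022-08-02 00:00"]}
#   ... )
#   >>> sorted({d[:10] for d in df.partition_date.unique()})
#   ['2022-08-01', '2022-08-02']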