Code Example #1
def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of the incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING:
        daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(account, provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filename"))
            os.remove(daily_file.get("filepath"))
    return daily_file_names
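
Example #1 depends on a divide_csv_daily helper that is not shown in this listing. A minimal sketch of what such a splitter could look like, assuming the report CSV carries an interval_start timestamp column (the column name and the output naming scheme are assumptions, not the project's actual implementation):

import os

import pandas as pd


def divide_csv_daily(filepath, filename):
    """Split a report CSV into one file per day (hypothetical sketch)."""
    daily_files = []
    directory = os.path.dirname(filepath)
    data_frame = pd.read_csv(filepath)
    # Derive a date key per row, then write one CSV per distinct day.
    data_frame["usage_date"] = pd.to_datetime(data_frame["interval_start"]).dt.date
    for usage_date, day_frame in data_frame.groupby("usage_date"):
        day_filename = f"{usage_date}_{filename}"
        day_filepath = f"{directory}/{day_filename}"
        day_frame.drop(columns=["usage_date"]).to_csv(day_filepath, index=False)
        daily_files.append({"filename": day_filename, "filepath": day_filepath})
    return daily_files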
Code Example #2
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key identifier.

        Returns:
            (tuple): The full file path of the saved file and its S3 etag

        """
        s3_filename = key.split("/")[-1]
        directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

        local_s3_filename = utils.get_local_file_name(key)
        msg = f"Local S3 filename: {local_s3_filename}"
        LOG.info(log_json(self.request_id, msg, self.context))
        full_file_path = f"{directory_path}/{local_s3_filename}"

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag = None
        try:
            s3_file = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"), Key=key)
            s3_etag = s3_file.get("ETag")
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                msg = "Unable to find {} in S3 Bucket: {}".format(
                    s3_filename, self.report.get("S3Bucket"))
                LOG.info(log_json(self.request_id, msg, self.context))
                raise AWSReportDownloaderNoFileError(msg)

            msg = f"Error downloading file: Error: {str(ex)}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderError(str(ex))

        if not self._check_size(key, check_inflate=True):
            raise AWSReportDownloaderError(
                f"Insufficient disk space to download file: {s3_filename}")

        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            LOG.debug("Downloading key: %s to file path: %s", key,
                      full_file_path)
            self.s3_client.download_file(self.report.get("S3Bucket"), key,
                                         full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid,
                                          start_date, Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)
            utils.remove_files_not_in_set_from_s3_bucket(
                self.request_id, s3_csv_path, manifest_id)

        return full_file_path, s3_etag
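
A caller is expected to persist the returned etag and pass it back as stored_etag on the next poll, so an unchanged report is not fetched again. A hedged illustration of that contract (the etag_store dict and poll_report function are stand-ins for illustration, not koku code):

etag_store = {}


def poll_report(downloader, key, manifest_id, start_date):
    # Feed the previously seen etag back in; download_file skips the
    # S3 fetch when the remote etag matches and the file already exists.
    stored_etag = etag_store.get(key)
    file_path, new_etag = downloader.download_file(
        key, stored_etag=stored_etag, manifest_id=manifest_id, start_date=start_date
    )
    etag_store[key] = new_etag  # remember for the next poll
    return file_path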
Code Example #3
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from an Azure bucket.

        Args:
            key (str): The object key identifier.

        Returns:
            (tuple): The full file path of the saved file and its etag

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        try:
            blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
            etag = blob.etag
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
Code Example #4
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from an Azure bucket.

        Args:
            key (str): The object key identifier.

        Returns:
            (tuple): The full file path of the saved file and its etag

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
Code Example #5
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key identifier.

        Returns:
            (tuple): The full file path of the saved file, its etag, the file creation date, and an empty list

        """
        local_s3_filename = utils.get_local_file_name(key)

        directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
        full_file_path = f"{directory_path}/{local_s3_filename}"

        if not os.path.isfile(key):
            log_msg = f"Unable to locate {key} in {self.bucket_path}"
            raise AWSReportDownloaderNoFileError(log_msg)

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag_hasher = hashlib.new("ripemd160")
        s3_etag_hasher.update(bytes(local_s3_filename, "utf-8"))
        s3_etag = s3_etag_hasher.hexdigest()

        file_creation_date = None
        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3

            s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AWS,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.tracing_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                utils.remove_files_not_in_set_from_s3_bucket(
                    self.tracing_id, s3_csv_path, manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)
        return full_file_path, s3_etag, file_creation_date, []
Code Example #6
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context={}
):
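    """
    Archive an incoming IBM report CSV to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The IBM report file name
        file_path (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of the incoming report
        context (Dict): Logging context dictionary
    """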
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
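
The day suffix appended above is a plain Hive-style partition segment. A minimal check of what it evaluates to for a sample date (illustration only, not project code):

import datetime

start_date = datetime.date(2021, 7, 5)
# The monthly IBM prefix gains a zero-padded day partition.
print(f"day={start_date.strftime('%d')}")  # -> day=05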
Code Example #7
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from an Azure bucket.

        Args:
            key (str): The object key identifier.

        Returns:
            (tuple): The full file path of the saved file, its etag, the file creation date, and an empty list

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        file_creation_date = None
        try:
            blob = self._azure_client.get_cost_export_for_key(
                key, self.container_name)
            etag = blob.etag
            file_creation_date = blob.last_modified
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.tracing_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        blob = self._azure_client.download_cost_export(
            key, self.container_name, destination=full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE,
                                      self._provider_uuid, start_date,
                                      Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(self.tracing_id, s3_csv_path,
                                            full_file_path, local_filename,
                                            manifest_id, start_date,
                                            self.context)

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.tracing_id,
                                                   s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
Code Example #8
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from an Azure bucket.

        Args:
            key (str): The object key identifier.

        Returns:
            (tuple): The full file path of the saved file, its etag, the file creation date, and an empty list

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        file_creation_date = None
        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3
            s3_csv_path = get_path_prefix(self.account,
                                          Provider.PROVIDER_AZURE,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(self.request_id, s3_csv_path,
                                                full_file_path, local_filename,
                                                manifest_id, start_date,
                                                self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                       s3_csv_path,
                                                       manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
Code Example #9
def create_daily_archives(tracing_id,
                          account,
                          provider_uuid,
                          filename,
                          filepath,
                          manifest_id,
                          start_date,
                          context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of the incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider_uuid, Provider.PROVIDER_OCP, account):
        if context.get("version"):
            daily_files = [{"filepath": filepath, "filename": filename}]
        else:
            daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(account, Provider.PROVIDER_OCP,
                                          provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                tracing_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
Code Example #10
File: csv_file_handler.py  Project: project-koku/koku
    def write_csv_to_s3(self, date, data, cols, tracing_id=None):
        """
        Generates an HCS CSV from the specified schema and provider.
        :param date
        :param data
        :param cols
        :param tracing_id

        :return none
        """
        my_df = pd.DataFrame(data)
        filename = f"hcs_{date}.csv"
        month = date.strftime("%m")
        year = date.strftime("%Y")
        s3_csv_path = (
            f"hcs/csv/{self._schema_name}/{self._provider}/source={self._provider_uuid}/year={year}/month={month}"
        )

        LOG.info(log_json(tracing_id, "preparing to write file to object storage"))
        my_df.to_csv(filename, header=cols, index=False)
        copy_local_report_file_to_s3_bucket(tracing_id, s3_csv_path, filename, filename, "", date)
        os.remove(filename)
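
A hedged usage sketch for write_csv_to_s3; the handler class name, its constructor arguments, and the column list below are illustrative assumptions, and only the call shape comes from the method above:

import datetime

# Illustrative only: the real handler class and its constructor may differ.
handler = CSVFileHandler(schema_name="acct10001", provider="AWS", provider_uuid="abc-123")
cols = ["usage_start", "usage_end", "cost"]
rows = [{"usage_start": "2021-07-01", "usage_end": "2021-07-02", "cost": 1.23}]
handler.write_csv_to_s3(datetime.date(2021, 7, 1), rows, cols, tracing_id="demo-tracing-id")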
Code Example #11
def create_daily_archives(tracing_id,
                          account,
                          provider_uuid,
                          filename,
                          filepath,
                          manifest_id,
                          start_date,
                          last_export_time,
                          context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of the incoming report
        last_export_time (Datetime): The last export time, hashed into the daily file names
        context (Dict): Logging context dictionary
    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(
                f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(account, Provider.PROVIDER_GCP,
                                          provider_uuid, start_of_invoice,
                                          Config.CSV_DATA_TYPE)
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(tracing_id, s3_csv_path,
                                                    day_filepath, day_file,
                                                    manifest_id, start_date,
                                                    context)
                daily_file_names.append(day_filepath)
    return daily_file_names
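
The per-day filtering above rescans the frame once per day via str.contains. An equivalent sketch using a single groupby on the first ten characters of partition_date, assuming (as the [:10] slice already implies) that the column holds ISO-8601 strings:

# Sketch: group rows by calendar day instead of filtering day by day.
day_key = invoice_data["partition_date"].str[:10]
for cur_day, df in invoice_data.groupby(day_key):
    day_file = f"{invoice_month}_{cur_day}_{download_hash}.csv" if download_hash else f"{invoice_month}_{cur_day}.csv"
    day_filepath = f"{directory}/{day_file}"
    df.to_csv(day_filepath, index=False, header=True)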