Example 1
    def test_get_path_prefix(self):
        """Test that path prefix is returned."""
        account = "10001"
        provider_type = Provider.PROVIDER_AWS
        provider_uuid = self.aws_provider_uuid
        start_date = datetime.utcnow().date()
        expected_path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.PARQUET_DATA_TYPE}"
        expected_path = (
            f"{expected_path_prefix}/{account}/{provider_type}/"
            f"source={provider_uuid}/year={start_date.year}/month={start_date.month}"
        )

        path = common_utils.get_path_prefix(account, provider_type,
                                            provider_uuid, start_date,
                                            "parquet")
        self.assertEqual(path, expected_path)

        # Test with report_type
        report_type = "pod_report"
        expected_path = (
            f"{expected_path_prefix}/{account}/{provider_type}/{report_type}/"
            f"source={provider_uuid}/year={start_date.year}/month={start_date.month}"
        )
        path = common_utils.get_path_prefix(account,
                                            provider_type,
                                            provider_uuid,
                                            start_date,
                                            "parquet",
                                            report_type=report_type)
        self.assertEqual(path, expected_path)
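
As a reading aid for the examples that follow, here is a minimal, hypothetical sketch of get_path_prefix reconstructed only from what this test asserts. The Config values are stand-ins, and the daily flag seen in later examples is omitted because the test does not exercise it.

class Config:  # stand-in for masu.config.Config; values are illustrative
    WAREHOUSE_PATH = "data"
    PARQUET_DATA_TYPE = "parquet"
    CSV_DATA_TYPE = "csv"


def get_path_prefix(account, provider_type, provider_uuid, start_date, data_type, report_type=None):
    """Build <WAREHOUSE_PATH>/<data_type>/<account>/<provider_type>[/<report_type>]/source=<uuid>/year=<Y>/month=<M>."""
    path = f"{Config.WAREHOUSE_PATH}/{data_type}/{account}/{provider_type}"
    if report_type:
        path = f"{path}/{report_type}"
    return f"{path}/source={provider_uuid}/year={start_date.year}/month={start_date.month}"


# get_path_prefix("10001", "AWS", "uuid", date(2021, 7, 1), Config.CSV_DATA_TYPE)
# -> "data/csv/10001/AWS/source=uuid/year=2021/month=7"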
    def _determin_s3_path_for_gcp(self, file_type, gcp_file_name):
        """Determine the S3 path based on the invoice month."""
        invoice_month = gcp_file_name.split("_")[0]
        dh = DateHelper()
        start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
        if file_type == DAILY_FILE_TYPE:
            report_type = self.report_type
            if report_type is None:
                report_type = "raw"
            return get_path_prefix(
                self.account,
                self.provider_type,
                self.provider_uuid,
                start_of_invoice,
                Config.PARQUET_DATA_TYPE,
                report_type=report_type,
                daily=True,
            )
        else:
            if self.report_type == OPENSHIFT_REPORT_TYPE:
                return get_path_prefix(
                    self.account,
                    self.provider_type,
                    self.provider_uuid,
                    start_of_invoice,
                    Config.PARQUET_DATA_TYPE,
                    report_type=self.report_type,
                    daily=True,
                )
            else:
                return get_path_prefix(
                    self.account, self.provider_type, self.provider_uuid, start_of_invoice, Config.PARQUET_DATA_TYPE
                )
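
A tiny, self-contained illustration of how the invoice month is pulled out of the GCP file name above (the file name is made up):

gcp_file_name = "202207_2022-07-05_2022-07-06.csv"  # illustrative GCP export file name
invoice_month = gcp_file_name.split("_")[0]
print(invoice_month)  # "202207"
# DateHelper().gcp_invoice_month_start("202207") would then yield the first day of that
# invoice month, which becomes the start_date handed to get_path_prefix.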
Example 3
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        try:
            blob = self._azure_client.get_cost_export_for_key(key, self.container_name)
            etag = blob.etag
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            blob = self._azure_client.download_cost_export(key, self.container_name, destination=full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
Example 4
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        s3_filename = key.split("/")[-1]
        directory_path = f"{DATA_DIR}/{self.customer_name}/aws/{self.bucket}"

        local_s3_filename = utils.get_local_file_name(key)
        msg = f"Local S3 filename: {local_s3_filename}"
        LOG.info(log_json(self.request_id, msg, self.context))
        full_file_path = f"{directory_path}/{local_s3_filename}"

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag = None
        try:
            s3_file = self.s3_client.get_object(
                Bucket=self.report.get("S3Bucket"), Key=key)
            s3_etag = s3_file.get("ETag")
        except ClientError as ex:
            if ex.response["Error"]["Code"] == "NoSuchKey":
                msg = "Unable to find {} in S3 Bucket: {}".format(
                    s3_filename, self.report.get("S3Bucket"))
                LOG.info(log_json(self.request_id, msg, self.context))
                raise AWSReportDownloaderNoFileError(msg)

            msg = f"Error downloading file: Error: {str(ex)}"
            LOG.error(log_json(self.request_id, msg, self.context))
            raise AWSReportDownloaderError(str(ex))

        if not self._check_size(key, check_inflate=True):
            raise AWSReportDownloaderError(
                f"Insufficient disk space to download file: {s3_file}")

        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            LOG.debug("Downloading key: %s to file path: %s", key,
                      full_file_path)
            self.s3_client.download_file(self.report.get("S3Bucket"), key,
                                         full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid,
                                          start_date, Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)
            utils.remove_files_not_in_set_from_s3_bucket(
                self.request_id, s3_csv_path, manifest_id)

        return full_file_path, s3_etag
Example 5
    def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            # Push to S3
            s3_csv_path = get_path_prefix(self.account, self._provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                self.request_id, s3_csv_path, full_file_path, local_filename, manifest_id, start_date, self.context
            )
        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag
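
A standalone illustration of the etag scheme used in this example, which hashes only the local file name; whether ripemd160 is available depends on the local OpenSSL build (the file name is made up):

import hashlib

local_filename = "costreport_2021-07.csv"  # illustrative
etag = hashlib.new("ripemd160", local_filename.encode("utf-8")).hexdigest()
print(etag)  # 40-character hex digest; the same file name always yields the same etag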
Example 6
    def test_remove_files_not_in_set_from_s3_bucket(self):
        """Test remove_files_not_in_set_from_s3_bucket."""
        removed = utils.remove_files_not_in_set_from_s3_bucket(
            "request_id", None, "manifest_id")
        self.assertEqual(removed, [])

        date_accessor = DateAccessor()
        start_date = date_accessor.today_with_timezone("utc").replace(day=1)
        s3_csv_path = get_path_prefix("account", Provider.PROVIDER_AWS,
                                      "provider_uuid", start_date,
                                      Config.CSV_DATA_TYPE)
        expected_key = "removed_key"
        mock_object = Mock(metadata={}, key=expected_key)
        mock_summary = Mock()
        mock_summary.Object.return_value = mock_object
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                mock_s3.return_value.Bucket.return_value.objects.filter.return_value = [
                    mock_summary
                ]
                removed = utils.remove_files_not_in_set_from_s3_bucket(
                    "request_id", s3_csv_path, "manifest_id")
                self.assertEqual(removed, [expected_key])

        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                mock_s3.side_effect = ClientError({}, "Error")
                removed = utils.remove_files_not_in_set_from_s3_bucket(
                    "request_id", s3_csv_path, "manifest_id")
                self.assertEqual(removed, [])
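
From the behavior this test pins down, here is a rough, hypothetical sketch of the helper under test. The bucket name, the metadata key name, and the direct use of boto3 are assumptions; the real helper also honors settings.ENABLE_S3_ARCHIVING (patched in the test) and goes through get_s3_resource(), both simplified away here.

import boto3
from botocore.exceptions import ClientError


def remove_files_not_in_set_from_s3_bucket(request_id, s3_path, manifest_id, bucket_name="cost-usage-bucket"):
    """Delete objects under s3_path whose metadata does not reference manifest_id; return the removed keys."""
    removed = []
    if s3_path is None:
        return removed
    try:
        bucket = boto3.resource("s3").Bucket(bucket_name)  # bucket name is a placeholder
        for summary in bucket.objects.filter(Prefix=s3_path):
            obj = summary.Object()
            if obj.metadata.get("manifestid") != str(manifest_id):  # metadata key name is a guess
                removed.append(obj.key)
                obj.delete()
    except ClientError:
        return []
    return removed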
Example 7
def create_daily_archives(request_id, account, provider_uuid, filename, filepath, manifest_id, start_date, context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        request_id (str): The request id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []
    if settings.ENABLE_S3_ARCHIVING:
        daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(account, provider_uuid, start_date, Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                request_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filename"))
            os.remove(daily_file.get("filepath"))
    return daily_file_names
    def parquet_path_s3(self):
        """The path in the S3 bucket where Parquet files are loaded."""
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            self.start_date,
            Config.PARQUET_DATA_TYPE,
            report_type=self.report_type,
        )
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download an S3 object to file.

        Args:
            key (str): The S3 object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_s3_filename = utils.get_local_file_name(key)

        directory_path = f"{DATA_DIR}/{self.customer_name}/aws-local/{self.bucket}"
        full_file_path = f"{directory_path}/{local_s3_filename}"

        if not os.path.isfile(key):
            log_msg = f"Unable to locate {key} in {self.bucket_path}"
            raise AWSReportDownloaderNoFileError(log_msg)

        # Make sure the data directory exists
        os.makedirs(directory_path, exist_ok=True)
        s3_etag_hasher = hashlib.new("ripemd160")
        s3_etag_hasher.update(bytes(local_s3_filename, "utf-8"))
        s3_etag = s3_etag_hasher.hexdigest()

        file_creation_date = None
        if s3_etag != stored_etag or not os.path.isfile(full_file_path):
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.tracing_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3

            s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AWS,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            utils.copy_local_report_file_to_s3_bucket(
                self.tracing_id, s3_csv_path, full_file_path,
                local_s3_filename, manifest_id, start_date, self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                utils.remove_files_not_in_set_from_s3_bucket(
                    self.tracing_id, s3_csv_path, manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)
        return full_file_path, s3_etag, file_creation_date, []
Example 10
def create_daily_archives(
    request_id, account, provider_uuid, filename, file_path, manifest_id, start_date, context={}
):
    """Copy the monthly IBM report file to a day-partitioned CSV path in S3."""
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(provider_uuid, Provider.PROVIDER_IBM, account):
        s3_csv_path = get_path_prefix(account, Provider.PROVIDER_IBM, provider_uuid, start_date, Config.CSV_DATA_TYPE)
        # add day to S3 CSV path because the IBM report is monthly and we want to diff between two days
        s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
        copy_local_report_file_to_s3_bucket(
            request_id, s3_csv_path, file_path, filename, manifest_id, start_date, context
        )
    return [file_path]
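
A small worked example of the day-suffix comment above; the date and prefix are illustrative, with the prefix shape following the earlier test:

from datetime import date

start_date = date(2021, 7, 1)
s3_csv_path = "data/csv/10001/IBM/source=uuid/year=2021/month=7"  # illustrative prefix
s3_csv_path = f"{s3_csv_path}/day={start_date.strftime('%d')}"
print(s3_csv_path)  # "data/csv/10001/IBM/source=uuid/year=2021/month=7/day=01"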
    def parquet_ocp_on_cloud_path_s3(self):
        """The path in the S3 bucket where Parquet files are loaded."""
        return get_path_prefix(
            self.account,
            self.provider_type,
            self.provider_uuid,
            self.start_date,
            Config.PARQUET_DATA_TYPE,
            report_type=OPENSHIFT_REPORT_TYPE,
            daily=True,
        )
Example 12
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        file_creation_date = None
        try:
            blob = self._azure_client.get_cost_export_for_key(
                key, self.container_name)
            etag = blob.etag
            file_creation_date = blob.last_modified
        except AzureCostReportNotFound as ex:
            msg = f"Error when downloading Azure report for key: {key}. Error {ex}"
            LOG.error(log_json(self.tracing_id, msg, self.context))
            raise AzureReportDownloaderError(msg)

        msg = f"Downloading {key} to {full_file_path}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        blob = self._azure_client.download_cost_export(
            key, self.container_name, destination=full_file_path)
        # Push to S3
        s3_csv_path = get_path_prefix(self.account, Provider.PROVIDER_AZURE,
                                      self._provider_uuid, start_date,
                                      Config.CSV_DATA_TYPE)
        copy_local_report_file_to_s3_bucket(self.tracing_id, s3_csv_path,
                                            full_file_path, local_filename,
                                            manifest_id, start_date,
                                            self.context)

        manifest_accessor = ReportManifestDBAccessor()
        manifest = manifest_accessor.get_manifest_by_id(manifest_id)

        if not manifest_accessor.get_s3_csv_cleared(manifest):
            remove_files_not_in_set_from_s3_bucket(self.tracing_id,
                                                   s3_csv_path, manifest_id)
            manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.tracing_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
    def download_file(self,
                      key,
                      stored_etag=None,
                      manifest_id=None,
                      start_date=None):
        """
        Download a file from Azure bucket.

        Args:
            key (str): The object key identified.

        Returns:
            (String): The path and file name of the saved file

        """
        local_filename = utils.get_local_file_name(key)
        full_file_path = f"{self._get_exports_data_directory()}/{local_filename}"

        etag_hasher = hashlib.new("ripemd160")
        etag_hasher.update(bytes(local_filename, "utf-8"))
        etag = etag_hasher.hexdigest()

        file_creation_date = None
        if etag != stored_etag:
            msg = f"Downloading {key} to {full_file_path}"
            LOG.info(log_json(self.request_id, msg, self.context))
            shutil.copy2(key, full_file_path)
            file_creation_date = datetime.datetime.fromtimestamp(
                os.path.getmtime(full_file_path))
            # Push to S3
            s3_csv_path = get_path_prefix(self.account,
                                          Provider.PROVIDER_AZURE,
                                          self._provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(self.request_id, s3_csv_path,
                                                full_file_path, local_filename,
                                                manifest_id, start_date,
                                                self.context)

            manifest_accessor = ReportManifestDBAccessor()
            manifest = manifest_accessor.get_manifest_by_id(manifest_id)

            if not manifest_accessor.get_s3_csv_cleared(manifest):
                remove_files_not_in_set_from_s3_bucket(self.request_id,
                                                       s3_csv_path,
                                                       manifest_id)
                manifest_accessor.mark_s3_csv_cleared(manifest)

        msg = f"Returning full_file_path: {full_file_path}, etag: {etag}"
        LOG.info(log_json(self.request_id, msg, self.context))
        return full_file_path, etag, file_creation_date, []
Example 14
def create_daily_archives(tracing_id,
                          account,
                          provider_uuid,
                          filename,
                          filepath,
                          manifest_id,
                          start_date,
                          context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        context (Dict): Logging context dictionary
    """
    daily_file_names = []

    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider_uuid, Provider.PROVIDER_OCP, account):
        if context.get("version"):
            daily_files = [{"filepath": filepath, "filename": filename}]
        else:
            daily_files = divide_csv_daily(filepath, filename)
        for daily_file in daily_files:
            # Push to S3
            s3_csv_path = get_path_prefix(account, Provider.PROVIDER_OCP,
                                          provider_uuid, start_date,
                                          Config.CSV_DATA_TYPE)
            copy_local_report_file_to_s3_bucket(
                tracing_id,
                s3_csv_path,
                daily_file.get("filepath"),
                daily_file.get("filename"),
                manifest_id,
                start_date,
                context,
            )
            daily_file_names.append(daily_file.get("filepath"))
    return daily_file_names
Example 15
def create_daily_archives(tracing_id,
                          account,
                          provider_uuid,
                          filename,
                          filepath,
                          manifest_id,
                          start_date,
                          last_export_time,
                          context={}):
    """
    Create daily CSVs from incoming report and archive to S3.

    Args:
        tracing_id (str): The tracing id
        account (str): The account number
        provider_uuid (str): The uuid of a provider
        filename (str): The OCP file name
        filepath (str): The full path name of the file
        manifest_id (int): The manifest identifier
        start_date (Datetime): The start datetime of incoming report
        last_export_time: The last export time for the report; used to make the daily file names unique
        context (Dict): Logging context dictionary
    """
    download_hash = None
    daily_file_names = []
    if last_export_time:
        download_hash = hashlib.md5(str(last_export_time).encode())
        download_hash = download_hash.hexdigest()
    if settings.ENABLE_S3_ARCHIVING or enable_trino_processing(
            provider_uuid, Provider.PROVIDER_GCP, account):
        dh = DateHelper()
        directory = os.path.dirname(filepath)
        try:
            data_frame = pd.read_csv(filepath)
        except Exception as error:
            LOG.error(
                f"File {filepath} could not be parsed. Reason: {str(error)}")
            raise error
        for invoice_month in data_frame["invoice.month"].unique():
            invoice_filter = data_frame["invoice.month"] == invoice_month
            invoice_data = data_frame[invoice_filter]
            unique_times = invoice_data.partition_date.unique()
            days = list({cur_dt[:10] for cur_dt in unique_times})
            daily_data_frames = [
                {"data_frame": invoice_data[invoice_data.partition_date.str.contains(cur_day)], "date": cur_day}
                for cur_day in days
            ]
            start_of_invoice = dh.gcp_invoice_month_start(invoice_month)
            s3_csv_path = get_path_prefix(account, Provider.PROVIDER_GCP,
                                          provider_uuid, start_of_invoice,
                                          Config.CSV_DATA_TYPE)
            for daily_data in daily_data_frames:
                day = daily_data.get("date")
                df = daily_data.get("data_frame")
                if download_hash:
                    day_file = f"{invoice_month}_{day}_{download_hash}.csv"
                else:
                    day_file = f"{invoice_month}_{day}.csv"
                day_filepath = f"{directory}/{day_file}"
                df.to_csv(day_filepath, index=False, header=True)
                copy_local_report_file_to_s3_bucket(tracing_id, s3_csv_path,
                                                    day_filepath, day_file,
                                                    manifest_id, start_date,
                                                    context)
                daily_file_names.append(day_filepath)
    return daily_file_names
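
A self-contained illustration of the daily file naming built above (the export time, invoice month, and day are made up):

import hashlib

last_export_time = "2022-07-15 06:00:00"  # illustrative
download_hash = hashlib.md5(str(last_export_time).encode()).hexdigest()
invoice_month, day = "202207", "2022-07-15"
day_file = f"{invoice_month}_{day}_{download_hash}.csv"
print(day_file)  # "202207_2022-07-15_<32-char md5 hex digest>.csv"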
Example 16
def convert_to_parquet(request_id,
                       account,
                       provider_uuid,
                       provider_type,
                       start_date,
                       manifest_id,
                       files=[],
                       context={}):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly follows the download of a provider's data.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to convert the archived data.

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start time (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): An optional list of report file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warn(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date,
                                  Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date,
                                      Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(
            request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is daily chunked report files.
    # AWS and Azure are monthly reports. Previous reports should be removed so data isn't duplicated
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                               manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
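
To make the OCP branch above concrete, a small hypothetical walk-through of the per-report-type Parquet path routing; the prefix shape follows the earlier test, and the report type names are assumptions rather than values taken from this code:

s3_parquet_path = "data/parquet/10001/OCP/source=uuid/year=2021/month=7"  # illustrative prefix
csv_filename = "pod_usage_report.1.csv"                                   # illustrative file name
parquet_path = s3_parquet_path
for report_type in ("pod_usage", "storage_usage"):                        # assumed REPORT_TYPES keys
    if report_type in csv_filename:
        parquet_path = f"{s3_parquet_path}/{report_type}"
        break
print(parquet_path)  # "data/parquet/10001/OCP/source=uuid/year=2021/month=7/pod_usage"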
    def csv_path_s3(self):
        """The path in the S3 bucket where CSV files are loaded."""
        return get_path_prefix(
            self.account, self.provider_type, self.provider_uuid, self.start_date, Config.CSV_DATA_TYPE
        )