Example #1
def convert_to_parquet(request_id,
                       account,
                       provider_uuid,
                       provider_type,
                       start_date,
                       manifest_id,
                       files=None,
                       context=None):
    """
    Convert archived CSV data from our S3 bucket for a given provider to Parquet.

    This function chiefly runs after a provider's data has been downloaded.

    This task retries up to 10 times using exponential backoff, starting with a
    10-second delay. This allows graceful handling of temporary AWS S3
    connectivity issues, since converting the archived data is relatively
    important (a configuration sketch follows this example).

    Args:
        request_id (str): The associated request id (ingress or celery task id)
        account (str): The account string
        provider_uuid (UUID): The provider UUID
        provider_type (str): The provider type
        start_date (str): The report start date (YYYY-mm-dd)
        manifest_id (str): The identifier for the report manifest
        files (list): The list of CSV file names to convert
        context (dict): A context object for logging

    """
    if not context:
        context = {"account": account, "provider_uuid": provider_uuid}

    if not settings.ENABLE_S3_ARCHIVING:
        msg = "Skipping convert_to_parquet. S3 archiving feature is disabled."
        LOG.info(log_json(request_id, msg, context))
        return

    if not request_id or not account or not provider_uuid or not provider_type:
        if not request_id:
            message = "missing required argument: request_id"
            LOG.error(message)
        if not account:
            message = "missing required argument: account"
            LOG.error(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
        return

    if not start_date:
        msg = "S3 archiving feature is enabled, but no start_date was given for processing."
        LOG.warning(log_json(request_id, msg, context))
        return

    try:
        cost_date = parser.parse(start_date)
    except ValueError:
        msg = "S3 archiving feature is enabled, but the start_date was not a valid date string ISO 8601 format."
        LOG.warn(log_json(request_id, msg, context))
        return

    s3_csv_path = get_path_prefix(account, provider_uuid, cost_date,
                                  Config.CSV_DATA_TYPE)
    local_path = f"{Config.TMP_DIR}/{account}/{provider_uuid}"
    s3_parquet_path = get_path_prefix(account, provider_uuid, cost_date,
                                      Config.PARQUET_DATA_TYPE)

    if not files:
        file_keys = get_file_keys_from_s3_with_manifest_id(
            request_id, s3_csv_path, manifest_id, context)
        files = [os.path.basename(file_key) for file_key in file_keys]
        if not files:
            msg = "S3 archiving feature is enabled, but no files to process."
            LOG.info(log_json(request_id, msg, context))
            return

    post_processor = None
    # OCP data is delivered as daily chunked report files.
    # AWS and Azure reports are monthly, so previously converted Parquet files
    # are removed to avoid duplicating data.
    if provider_type != Provider.PROVIDER_OCP:
        remove_files_not_in_set_from_s3_bucket(request_id, s3_parquet_path,
                                               manifest_id, context)

    if provider_type in [Provider.PROVIDER_AWS, Provider.PROVIDER_AWS_LOCAL]:
        post_processor = aws_post_processor

    failed_conversion = []
    for csv_filename in files:
        kwargs = {}
        parquet_path = s3_parquet_path
        if provider_type == Provider.PROVIDER_OCP:
            for report_type in REPORT_TYPES.keys():
                if report_type in csv_filename:
                    parquet_path = f"{s3_parquet_path}/{report_type}"
                    kwargs["report_type"] = report_type
                    break
        converters = get_column_converters(provider_type, **kwargs)
        result = convert_csv_to_parquet(
            request_id,
            s3_csv_path,
            parquet_path,
            local_path,
            manifest_id,
            csv_filename,
            converters,
            post_processor,
            context,
        )
        if not result:
            failed_conversion.append(csv_filename)

    if failed_conversion:
        msg = f"Failed to convert the following files to parquet:{','.join(failed_conversion)}."
        LOG.warn(log_json(request_id, msg, context))
        return
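
The docstring above describes a retry policy of up to 10 attempts with exponential backoff starting at a 10-second delay, but the task decorator is not part of this example. Below is a minimal, hypothetical sketch of how such a policy could be declared with Celery's built-in autoretry options; the celery_app instance and the choice of ClientError as the retried exception are assumptions, not taken from the example.

from botocore.exceptions import ClientError
from celery import Celery

celery_app = Celery("masu")  # assumed application instance, for illustration only


@celery_app.task(
    bind=True,
    autoretry_for=(ClientError,),  # assumed transient S3 error to retry on
    max_retries=10,                # up to 10 retries, per the docstring
    retry_backoff=10,              # exponential backoff starting at 10 seconds
)
def convert_to_parquet_task(self, *args, **kwargs):
    """Hypothetical wrapper that delegates to convert_to_parquet above."""
    return convert_to_parquet(*args, **kwargs)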
Example #2
    def test_convert_csv_to_parquet(self):
        """Test convert_csv_to_parquet."""
        result = utils.convert_csv_to_parquet("request_id", None,
                                              "s3_parquet_path", "local_path",
                                              "manifest_id", "csv_filename")
        self.assertFalse(result)

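        # All arguments are plain strings, but nothing is patched; the
        # conversion is still expected to fail.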
        result = utils.convert_csv_to_parquet("request_id", "s3_csv_path",
                                              "s3_parquet_path", "local_path",
                                              "manifest_id", "csv_filename")
        self.assertFalse(result)

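        # Simulate an S3 ClientError while getting the S3 resource.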
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        mock_s3.side_effect = ClientError({}, "Error")
                        result = utils.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv",
                        )
                        self.assertFalse(result)

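        # A gzipped CSV filename; the S3 and filesystem calls are patched, but
        # pandas is not, and the conversion is expected to fail.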
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        result = utils.convert_csv_to_parquet(
                            "request_id",
                            "s3_csv_path",
                            "s3_parquet_path",
                            "local_path",
                            "manifest_id",
                            "csv_filename.csv.gz",
                        )
                        self.assertFalse(result)

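        # Simulate a ValueError raised from open() during the conversion.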
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.shutil.rmtree"):
                    with patch("masu.util.aws.common.Path"):
                        with patch("masu.util.aws.common.pd"):
                            with patch(
                                    "masu.util.aws.common.open") as mock_open:
                                mock_open.side_effect = ValueError()
                                result = utils.convert_csv_to_parquet(
                                    "request_id",
                                    "s3_csv_path",
                                    "s3_parquet_path",
                                    "local_path",
                                    "manifest_id",
                                    "csv_filename.csv.gz",
                                )
                                self.assertFalse(result)

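        # With the file, pandas, and S3 interactions all patched, the
        # conversion succeeds.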
        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource"):
                with patch("masu.util.aws.common.Path"):
                    with patch("masu.util.aws.common.shutil.rmtree"):
                        with patch("masu.util.aws.common.pd"):
                            with patch("masu.util.aws.common.open"):
                                with patch("masu.util.aws.common.BytesIO"):
                                    with patch(
                                            "masu.util.aws.common.copy_data_to_s3_bucket"
                                    ):
                                        result = utils.convert_csv_to_parquet(
                                            "request_id",
                                            "s3_csv_path",
                                            "s3_parquet_path",
                                            "local_path",
                                            "manifest_id",
                                            "csv_filename.csv.gz",
                                        )
                                        self.assertTrue(result)
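
The nested "with patch(...)" blocks above grow one level per mock. The following is a minimal, hypothetical sketch of the final success-path scenario flattened with contextlib.ExitStack; the patch targets are copied from the test above, while the method name and the targets tuple are illustrative only.

    # Assumed module-level imports: ExitStack from contextlib and patch from unittest.mock.
    def test_convert_csv_to_parquet_success(self):
        """Success path, with the patches entered through a single ExitStack."""
        targets = (
            "masu.util.aws.common.get_s3_resource",
            "masu.util.aws.common.Path",
            "masu.util.aws.common.shutil.rmtree",
            "masu.util.aws.common.pd",
            "masu.util.aws.common.open",
            "masu.util.aws.common.BytesIO",
            "masu.util.aws.common.copy_data_to_s3_bucket",
        )
        with ExitStack() as stack:
            stack.enter_context(patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True))
            for target in targets:
                stack.enter_context(patch(target))
            result = utils.convert_csv_to_parquet(
                "request_id",
                "s3_csv_path",
                "s3_parquet_path",
                "local_path",
                "manifest_id",
                "csv_filename.csv.gz",
            )
            self.assertTrue(result)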