Example #1
    def _write_parquet_to_file(self,
                               file_path,
                               file_name,
                               data_frame,
                               file_type=None):
        """Write Parquet file and send to S3."""
        s3_path = self._determin_s3_path(file_type)
        data_frame.to_parquet(file_path,
                              allow_truncated_timestamps=True,
                              coerce_timestamps="ms",
                              index=False)
        try:
            with open(file_path, "rb") as fin:
                copy_data_to_s3_bucket(self.request_id,
                                       s3_path,
                                       file_name,
                                       fin,
                                       manifest_id=self.manifest_id,
                                       context=self.error_context)
                msg = f"{file_path} sent to S3."
                LOG.info(log_json(self.request_id, msg, self.error_context))
        except Exception as err:
            s3_key = f"{self.parquet_path_s3}/{file_path}"
            msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warn(log_json(self.request_id, msg, self.error_context))
            return False
        finally:
            self.files_to_remove.append(file_path)

        return True
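
Example #1 writes the DataFrame to a local Parquet file with pyarrow's timestamp options, then re-opens the file and streams it to S3 through the project's copy_data_to_s3_bucket helper. Below is a minimal, self-contained sketch of the same write-then-upload pattern using pandas and boto3 directly; the function name, bucket, and key are placeholders, not values from the source.

    import boto3
    import pandas as pd


    def write_parquet_and_upload(data_frame, local_path, bucket, key):
        """Write a DataFrame to a local Parquet file, then stream it to S3."""
        # Same pyarrow options as the example above: coerce timestamps to "ms"
        # (truncating instead of raising) and drop the DataFrame index.
        data_frame.to_parquet(
            local_path,
            allow_truncated_timestamps=True,
            coerce_timestamps="ms",
            index=False,
        )
        s3 = boto3.resource("s3")
        with open(local_path, "rb") as fin:
            # Stream the open file handle to the target key.
            s3.Bucket(bucket).upload_fileobj(fin, key)

    # Hypothetical usage:
    # write_parquet_and_upload(df, "/tmp/report.parquet", "my-cost-bucket", "parquet/report.parquet")
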
Example #2
    def _write_parquet_to_file(self, file_path, file_name, data_frame, file_type=None):
        """Write Parquet file and send to S3."""
        if self._provider_type == Provider.PROVIDER_GCP:
            # For GCP the parquet file path is based on the start of the
            # invoice month and the usage start date.
            s3_path = self._determin_s3_path_for_gcp(file_type, file_name)
        else:
            s3_path = self._determin_s3_path(file_type)
        data_frame.to_parquet(file_path, allow_truncated_timestamps=True, coerce_timestamps="ms", index=False)
        try:
            with open(file_path, "rb") as fin:
                copy_data_to_s3_bucket(
                    self.tracing_id, s3_path, file_name, fin, manifest_id=self.manifest_id, context=self.error_context
                )
                msg = f"{file_path} sent to S3."
                LOG.info(log_json(self.tracing_id, msg, self.error_context))
        except Exception as err:
            s3_key = f"{self.parquet_path_s3}/{file_path}"
            msg = f"File {file_name} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warn(log_json(self.tracing_id, msg, self.error_context))
            return False
        finally:
            self.files_to_remove.append(file_path)

        return True
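
Example #2 is a later revision of the same method: it branches to a GCP-specific S3 path helper and logs under tracing_id instead of request_id, but keeps the same contract of returning False on upload failure and appending the local file to self.files_to_remove in the finally block whether or not the upload succeeded. A hedged sketch of how a caller might consume that contract; flush_local_files and its cleanup loop are illustrative, not taken from the project.

    import os


    def flush_local_files(processor):
        """Remove every temp file the processor queued, tolerating missing ones."""
        for path in processor.files_to_remove:
            try:
                os.remove(path)
            except FileNotFoundError:
                # The finally block queues the path even when the write or
                # upload failed, so the file may already be gone.
                pass
        processor.files_to_remove.clear()
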
Example #3
    def test_copy_data_to_s3_bucket(self):
        """Test copy_data_to_s3_bucket."""
        upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
        self.assertEqual(upload, None)

        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
                self.assertIsNotNone(upload)

        with patch("masu.util.aws.common.settings", ENABLE_S3_ARCHIVING=True):
            with patch("masu.util.aws.common.get_s3_resource") as mock_s3:
                mock_s3.side_effect = ClientError({}, "Error")
                upload = utils.copy_data_to_s3_bucket("request_id", "path", "filename", "data", "manifest_id")
                self.assertEqual(upload, None)
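
The test above pins down three behaviours of copy_data_to_s3_bucket: it returns None while S3 archiving is disabled, returns a non-None upload result when ENABLE_S3_ARCHIVING is patched to True, and returns None again when the S3 resource raises ClientError. The sketch below is a rough reconstruction consistent with those assertions, not the project's actual implementation; settings and get_s3_resource are the masu.util.aws.common names the test patches.

    from botocore.exceptions import ClientError
    from masu.util.aws.common import get_s3_resource, settings  # the names the test patches


    def copy_data_to_s3_bucket_sketch(request_id, path, filename, data, manifest_id=None, context=None):
        """Illustrative reconstruction of the behaviour the test above asserts."""
        # request_id, manifest_id, and context are passed through for logging in
        # the examples above; this sketch ignores them.
        if not settings.ENABLE_S3_ARCHIVING:
            return None  # archiving disabled: nothing uploaded
        try:
            s3_resource = get_s3_resource()
            s3_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME, key=f"{path}/{filename}")
            s3_obj.upload_fileobj(data)
            return s3_obj  # non-None on success
        except ClientError:
            return None  # S3 errors are swallowed, matching the final assertion
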
Example #4
    def convert_csv_to_parquet(  # noqa: C901
        self,
        request_id,
        s3_csv_path,
        s3_parquet_path,
        local_path,
        manifest_id,
        csv_filename,
        converters={},
        post_processor=None,
        context={},
        report_type=None,
    ):
        """
        Convert CSV files to parquet on S3.
        """
        if s3_csv_path is None or s3_parquet_path is None or local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
            )
            LOG.error(log_json(request_id, msg, context))
            return False

        msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
        LOG.info(log_json(request_id, msg, context))

        kwargs = {}
        parquet_file = None
        csv_file = f"{s3_csv_path}/{csv_filename}"
        if csv_filename.lower().endswith(CSV_EXT):
            ext = -len(CSV_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
        elif csv_filename.lower().endswith(CSV_GZIP_EXT):
            ext = -len(CSV_GZIP_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
            kwargs = {"compression": "gzip"}
        else:
            msg = f"File {csv_filename} is not valid CSV. Conversion to parquet skipped."
            LOG.warn(log_json(request_id, msg, context))
            return False

        Path(local_path).mkdir(parents=True, exist_ok=True)
        tmpfile = f"{local_path}/{csv_filename}"
        try:
            s3_resource = get_s3_resource()
            csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME,
                                         key=csv_file)
            csv_obj.download_file(tmpfile)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not obtained for parquet conversion. Reason: {str(err)}"
            LOG.warn(log_json(request_id, msg, context))
            return False

        output_file = f"{local_path}/{parquet_file}"
        try:
            col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
            converters.update(
                {col: str
                 for col in col_names if col not in converters})
            data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
            if post_processor:
                data_frame = post_processor(data_frame)
            data_frame.to_parquet(output_file,
                                  allow_truncated_timestamps=True,
                                  coerce_timestamps="ms")
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
            LOG.warn(log_json(request_id, msg, context))
            return False

        try:
            with open(output_file, "rb") as fin:
                data = BytesIO(fin.read())
                copy_data_to_s3_bucket(request_id,
                                       s3_parquet_path,
                                       parquet_file,
                                       data,
                                       manifest_id=manifest_id,
                                       context=context)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            s3_key = f"{s3_parquet_path}/{parquet_file}"
            msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warn(log_json(request_id, msg, context))
            return False

        s3_hive_table_path = get_hive_table_path(context.get("account"),
                                                 self._provider_type,
                                                 report_type=report_type)

        if not self.presto_table_exists.get(report_type):
            self.create_parquet_table(
                context.get("account"),
                context.get("provider_uuid"),
                manifest_id,
                s3_hive_table_path,
                output_file,
                report_type,
            )

        shutil.rmtree(local_path, ignore_errors=True)
        return True
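
The core of this conversion is the converters trick: read only the header row to learn the column names, then force every column without an explicit converter to str so pandas never has to guess mixed dtypes across report files. Below is a minimal, local-only sketch of that step, leaving out the S3 download/upload and the Hive/Presto table creation; the function name and its defaults are illustrative.

    import pandas as pd


    def csv_to_parquet(csv_path, parquet_path, converters=None, compression=None):
        """Convert a single local CSV (optionally gzipped) to a Parquet file."""
        converters = dict(converters or {})
        kwargs = {"compression": compression} if compression else {}
        # Read only the header row to learn the column names cheaply.
        col_names = pd.read_csv(csv_path, nrows=0, **kwargs).columns
        # Default every unmapped column to str, mirroring the example above.
        converters.update({col: str for col in col_names if col not in converters})
        data_frame = pd.read_csv(csv_path, converters=converters, **kwargs)
        data_frame.to_parquet(parquet_path, allow_truncated_timestamps=True, coerce_timestamps="ms")
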
Example #5
    def convert_csv_to_parquet(  # noqa: C901
        self,
        request_id,
        s3_csv_path,
        s3_parquet_path,
        local_path,
        manifest_id,
        csv_filename,
        converters={},
        post_processor=None,
        context={},
        report_type=None,
    ):
        """
        Convert CSV files to parquet on S3.
        """
        csv_path, csv_name = os.path.split(csv_filename)
        if s3_csv_path is None or s3_parquet_path is None or local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
            )
            LOG.error(log_json(request_id, msg, context))
            return False

        msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
        LOG.info(log_json(request_id, msg, context))

        kwargs = {}
        parquet_file = None
        if csv_name.lower().endswith(CSV_EXT):
            ext = -len(CSV_EXT)
            parquet_base_filename = f"{csv_name[:ext]}"
        elif csv_name.lower().endswith(CSV_GZIP_EXT):
            ext = -len(CSV_GZIP_EXT)
            parquet_base_filename = f"{csv_name[:ext]}"
            kwargs = {"compression": "gzip"}
        else:
            msg = f"File {csv_name} is not valid CSV. Conversion to parquet skipped."
            LOG.warn(log_json(request_id, msg, context))
            return False

        Path(local_path).mkdir(parents=True, exist_ok=True)

        try:
            col_names = pd.read_csv(csv_filename, nrows=0, **kwargs).columns
            converters.update(
                {col: str
                 for col in col_names if col not in converters})
            with pd.read_csv(csv_filename,
                             converters=converters,
                             chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE,
                             **kwargs) as reader:
                for i, data_frame in enumerate(reader):
                    parquet_filename = f"{parquet_base_filename}_{i}{PARQUET_EXT}"
                    parquet_file = f"{local_path}/{parquet_filename}"
                    if post_processor:
                        data_frame = post_processor(data_frame)
                    data_frame.to_parquet(parquet_file,
                                          allow_truncated_timestamps=True,
                                          coerce_timestamps="ms")
                    try:
                        with open(parquet_file, "rb") as fin:
                            data = BytesIO(fin.read())
                            copy_data_to_s3_bucket(
                                request_id,
                                s3_parquet_path,
                                parquet_filename,
                                data,
                                manifest_id=manifest_id,
                                context=context,
                            )
                            msg = f"{parquet_file} sent to S3."
                            LOG.info(msg)
                    except Exception as err:
                        shutil.rmtree(local_path, ignore_errors=True)
                        s3_key = f"{s3_parquet_path}/{parquet_file}"
                        msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
                        LOG.warn(log_json(request_id, msg, context))
                        return False
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = (
                f"File {csv_filename} could not be written as parquet to temp file {parquet_file}. Reason: {str(err)}"
            )
            LOG.warn(log_json(request_id, msg, context))
            return False

        s3_hive_table_path = get_hive_table_path(context.get("account"),
                                                 self._provider_type,
                                                 report_type=report_type)

        if not self.presto_table_exists.get(report_type):
            self.create_parquet_table(
                context.get("account"),
                context.get("provider_uuid"),
                manifest_id,
                s3_hive_table_path,
                parquet_file,
                report_type,
            )

        # Delete the local parquet files
        shutil.rmtree(local_path, ignore_errors=True)
        # Now we can delete the local CSV
        if os.path.exists(csv_filename):
            os.remove(csv_filename)
        return True
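
Example #5 reworks the conversion to stream the CSV in batches of settings.PARQUET_PROCESSING_BATCH_SIZE rows, writing one numbered Parquet file per batch and uploading each as it is produced, so large reports never have to fit in memory at once. A minimal local sketch of that chunked loop, without the S3 upload or table creation; BATCH_SIZE and the function name are placeholders.

    import pandas as pd

    BATCH_SIZE = 200_000  # stand-in for settings.PARQUET_PROCESSING_BATCH_SIZE


    def csv_to_parquet_chunks(csv_path, output_dir, base_name, converters=None):
        """Stream a CSV in batches and emit one numbered Parquet file per batch."""
        converters = dict(converters or {})
        col_names = pd.read_csv(csv_path, nrows=0).columns
        converters.update({col: str for col in col_names if col not in converters})
        written = []
        # read_csv with chunksize yields DataFrames lazily; the context manager
        # (pandas >= 1.2) closes the underlying file when the loop finishes.
        with pd.read_csv(csv_path, converters=converters, chunksize=BATCH_SIZE) as reader:
            for i, chunk in enumerate(reader):
                parquet_path = f"{output_dir}/{base_name}_{i}.parquet"
                chunk.to_parquet(parquet_path, allow_truncated_timestamps=True, coerce_timestamps="ms")
                written.append(parquet_path)
        return written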