Example #1
def deleted_archived_with_prefix(s3_bucket_name, prefix):
    """
    Delete data from archive with given prefix.

    Args:
        s3_bucket_name (str): The s3 bucket name
        prefix (str): The prefix for deletion
    """
    s3_resource = get_s3_resource()
    s3_bucket = s3_resource.Bucket(s3_bucket_name)
    object_keys = [{"Key": s3_object.key} for s3_object in s3_bucket.objects.filter(Prefix=prefix)]
    batch_size = 1000  # AWS S3 delete API limits to 1000 objects per request.
    for batch_number in range(math.ceil(len(object_keys) / batch_size)):
        batch_start = batch_size * batch_number
        batch_end = batch_start + batch_size
        object_keys_batch = object_keys[batch_start:batch_end]
        s3_bucket.delete_objects(Delete={"Objects": object_keys_batch})

    remaining_objects = list(s3_bucket.objects.filter(Prefix=prefix))
    if remaining_objects:
        LOG.warning(
            "Found %s objects after attempting to delete all objects with prefix %s",
            len(remaining_objects), prefix)
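All of the examples on this page rely on a project helper named get_s3_resource() that is not shown. A minimal sketch of what it might look like, assuming plain boto3 with credentials resolved from the environment, is:

import boto3


def get_s3_resource():
    # Hypothetical helper: the real project may cache this object or
    # configure credentials and endpoints explicitly.
    return boto3.resource("s3")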
Example #2
    def get_file_keys_from_s3_with_manifest_id(self,
                                               request_id,
                                               s3_path,
                                               manifest_id,
                                               context={}):
        """
        Get all files in a given prefix that match the given manifest_id.
        """
        if not settings.ENABLE_PARQUET_PROCESSING:
            return []

        keys = []
        if s3_path:
            try:
                s3_resource = get_s3_resource()
                existing_objects = s3_resource.Bucket(
                    settings.S3_BUCKET_NAME).objects.filter(Prefix=s3_path)
                for obj_summary in existing_objects:
                    existing_object = obj_summary.Object()
                    metadata = existing_object.metadata
                    manifest = metadata.get("manifestid")
                    manifest_id_str = str(manifest_id)
                    key = existing_object.key
                    if manifest == manifest_id_str:
                        keys.append(key)
            except (EndpointConnectionError, ClientError) as err:
                msg = f"Unable to find data in bucket {settings.S3_BUCKET_NAME}.  Reason: {str(err)}"
                LOG.info(log_json(request_id, msg, context))
        return keys
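The method above filters objects by a "manifestid" entry in their S3 object metadata, so that metadata must be attached at upload time. A hedged sketch of such an upload using only standard boto3 calls (the bucket name, key, and manifest id are made up):

import boto3

s3_resource = boto3.resource("s3")
# Attach the manifest id as object metadata so the lookup above can match
# obj.metadata.get("manifestid") against str(manifest_id).
s3_resource.Object("my-bucket", "data/prefix/report.csv").put(
    Body=b"...csv bytes...",
    Metadata={"manifestid": "42"},
)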
Example #3
    def convert_csv_to_parquet(  # noqa: C901
        self,
        request_id,
        s3_csv_path,
        s3_parquet_path,
        local_path,
        manifest_id,
        csv_filename,
        converters={},
        post_processor=None,
        context={},
        report_type=None,
    ):
        """
        Convert CSV files to parquet on S3.
        """
        if s3_csv_path is None or s3_parquet_path is None or local_path is None:
            msg = (
                f"Invalid paths provided to convert_csv_to_parquet."
                f"CSV path={s3_csv_path}, Parquet path={s3_parquet_path}, and local_path={local_path}."
            )
            LOG.error(log_json(request_id, msg, context))
            return False

        msg = f"Running convert_csv_to_parquet on file {csv_filename} in S3 path {s3_csv_path}."
        LOG.info(log_json(request_id, msg, context))

        kwargs = {}
        parquet_file = None
        csv_file = f"{s3_csv_path}/{csv_filename}"
        if csv_filename.lower().endswith(CSV_EXT):
            ext = -len(CSV_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
        elif csv_filename.lower().endswith(CSV_GZIP_EXT):
            ext = -len(CSV_GZIP_EXT)
            parquet_file = f"{csv_filename[:ext]}.parquet"
            kwargs = {"compression": "gzip"}
        else:
            msg = f"File {csv_filename} is not valid CSV. Conversion to parquet skipped."
            LOG.warning(log_json(request_id, msg, context))
            return False

        Path(local_path).mkdir(parents=True, exist_ok=True)
        tmpfile = f"{local_path}/{csv_filename}"
        try:
            s3_resource = get_s3_resource()
            csv_obj = s3_resource.Object(bucket_name=settings.S3_BUCKET_NAME,
                                         key=csv_file)
            csv_obj.download_file(tmpfile)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not obtained for parquet conversion. Reason: {str(err)}"
            LOG.warn(log_json(request_id, msg, context))
            return False

        output_file = f"{local_path}/{parquet_file}"
        try:
            col_names = pd.read_csv(tmpfile, nrows=0, **kwargs).columns
            converters.update(
                {col: str
                 for col in col_names if col not in converters})
            data_frame = pd.read_csv(tmpfile, converters=converters, **kwargs)
            if post_processor:
                data_frame = post_processor(data_frame)
            data_frame.to_parquet(output_file,
                                  allow_truncated_timestamps=True,
                                  coerce_timestamps="ms")
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            msg = f"File {csv_filename} could not be written as parquet to temp file {output_file}. Reason: {str(err)}"
            LOG.warning(log_json(request_id, msg, context))
            return False

        try:
            with open(output_file, "rb") as fin:
                data = BytesIO(fin.read())
                copy_data_to_s3_bucket(request_id,
                                       s3_parquet_path,
                                       parquet_file,
                                       data,
                                       manifest_id=manifest_id,
                                       context=context)
        except Exception as err:
            shutil.rmtree(local_path, ignore_errors=True)
            s3_key = f"{s3_parquet_path}/{parquet_file}"
            msg = f"File {csv_filename} could not be written as parquet to S3 {s3_key}. Reason: {str(err)}"
            LOG.warning(log_json(request_id, msg, context))
            return False

        s3_hive_table_path = get_hive_table_path(context.get("account"),
                                                 self._provider_type,
                                                 report_type=report_type)

        if not self.presto_table_exists.get(report_type):
            self.create_parquet_table(
                context.get("account"),
                context.get("provider_uuid"),
                manifest_id,
                s3_hive_table_path,
                output_file,
                report_type,
            )

        shutil.rmtree(local_path, ignore_errors=True)
        return True
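At its core, the conversion above reads the CSV with pandas and writes it back out through the pyarrow Parquet engine. A stripped-down sketch of that central step (the file name and column converter are made up; the method above additionally defaults every remaining column to str):

import pandas as pd

df = pd.read_csv("report.csv", converters={"usage_amount": str})
df.to_parquet(
    "report.parquet",
    allow_truncated_timestamps=True,  # forwarded to the pyarrow writer
    coerce_timestamps="ms",
)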
Example #4
def delete_archived_data(schema_name, provider_type, provider_uuid):
    """
    Delete archived data from our S3 bucket for a given provider.

    This function chiefly follows the deletion of a provider.

    This task is defined to attempt up to 10 retries using exponential backoff
    starting with a 10-second delay. This is intended to allow graceful handling
    of temporary AWS S3 connectivity issues because it is relatively important
    for us to delete this archived data.

    Args:
        schema_name (str): Koku user account (schema) name.
        provider_type (str): Koku backend provider type identifier.
        provider_uuid (UUID): Koku backend provider UUID.

    """
    if not schema_name or not provider_type or not provider_uuid:
        # Sanity-check all of these inputs in case somehow any receives an
        # empty value such as None or '' because we need to minimize the risk
        # of deleting unrelated files from our S3 bucket.
        messages = []
        if not schema_name:
            message = "missing required argument: schema_name"
            LOG.error(message)
            messages.append(message)
        if not provider_type:
            message = "missing required argument: provider_type"
            LOG.error(message)
            messages.append(message)
        if not provider_uuid:
            message = "missing required argument: provider_uuid"
            LOG.error(message)
            messages.append(message)
        raise TypeError("delete_archived_data() " + ", ".join(messages))

    if not settings.ENABLE_S3_ARCHIVING:
        LOG.info("Skipping delete_archived_data. Upload feature is disabled.")
        return

    # We need to normalize capitalization and "-local" dev providers.
    account = schema_name[4:]
    path_prefix = f"{Config.WAREHOUSE_PATH}/{Config.CSV_DATA_TYPE}"
    prefix = f"{path_prefix}/{account}/{provider_uuid}/"
    LOG.info("attempting to delete our archived data in S3 under %s", prefix)

    s3_resource = get_s3_resource()
    s3_bucket = s3_resource.Bucket(settings.S3_BUCKET_NAME)
    object_keys = [{"Key": s3_object.key} for s3_object in s3_bucket.objects.filter(Prefix=prefix)]
    batch_size = 1000  # AWS S3 delete API limits to 1000 objects per request.
    for batch_number in range(math.ceil(len(object_keys) / batch_size)):
        batch_start = batch_size * batch_number
        batch_end = batch_start + batch_size
        object_keys_batch = object_keys[batch_start:batch_end]
        s3_bucket.delete_objects(Delete={"Objects": object_keys_batch})

    remaining_objects = list(s3_bucket.objects.filter(Prefix=prefix))
    if remaining_objects:
        LOG.warning(
            "Found %s objects after attempting to delete all objects with prefix %s", len(remaining_objects), prefix
        )
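The docstring above mentions up to 10 retries with exponential backoff starting at a 10-second delay, but the task decorator is not part of the excerpt. One way to express that policy with Celery's built-in retry options (the app name and the exception tuple are assumptions, not taken from the project):

from botocore.exceptions import ClientError, EndpointConnectionError
from celery import Celery

app = Celery("tasks")  # placeholder; the real project configures its own app


# Hypothetical registration illustrating the retry policy described in the
# docstring; the project's actual decorator may differ.
@app.task(
    autoretry_for=(ClientError, EndpointConnectionError),
    max_retries=10,
    retry_backoff=10,  # exponential backoff starting at a 10-second delay
)
def delete_archived_data(schema_name, provider_type, provider_uuid):
    ...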