def split_and_zip_data_files(zip_file_path, source_path, data_file_name, file_format, download_job=None):
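    """Split a delimited text file into EXCEL_ROW_LIMIT-row partitions and append them to the zip archive."""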
    try:
        # Split data files into separate files
        # e.g. `Assistance_prime_transactions_delta_%s.csv`
        log_time = time.perf_counter()
        delim = FILE_FORMATS[file_format]["delimiter"]
        extension = FILE_FORMATS[file_format]["extension"]

        output_template = f"{data_file_name}_%s.{extension}"
        write_to_log(message="Beginning the delimited text file partition", download_job=download_job)
        list_of_files = partition_large_delimited_file(
            file_path=source_path, delimiter=delim, row_limit=EXCEL_ROW_LIMIT, output_name_template=output_template
        )

        msg = f"Partitioning data into {len(list_of_files)} files took {time.perf_counter() - log_time:.4f}s"
        write_to_log(message=msg, download_job=download_job)

        # Zip the split files into one zipfile
        write_to_log(message="Beginning zipping and compression", download_job=download_job)
        log_time = time.perf_counter()
        append_files_to_zip_file(list_of_files, zip_file_path)

        write_to_log(
            message=f"Writing to zipfile took {time.perf_counter() - log_time:.4f}s", download_job=download_job
        )

    except Exception as e:
        message = "Exception while partitioning text file"
        if download_job:
            fail_download(download_job, e, message)
            write_to_log(message=message, download_job=download_job, is_error=True)
        logger.error(e)
        raise e

def add_data_dictionary_to_zip(working_dir, zip_file_path):
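    """Download the data dictionary crosswalk into the working directory and append it to the zip archive."""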
    write_to_log(message="Adding data dictionary to zip file")
    data_dictionary_file_name = "Data_Dictionary_Crosswalk.xlsx"
    data_dictionary_file_path = os.path.join(working_dir, data_dictionary_file_name)
    data_dictionary_url = settings.DATA_DICTIONARY_DOWNLOAD_URL
    RetrieveFileFromUri(data_dictionary_url).copy(data_dictionary_file_path)
    append_files_to_zip_file([data_dictionary_file_path], zip_file_path)

def split_and_zip_csvs(zip_file_path, source_path, source_name, download_job=None):
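    """Split a CSV into EXCEL_ROW_LIMIT-row partitions and append them to the zip archive."""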
    try:
        # Split CSV into separate files
        # e.g. `Assistance_prime_transactions_delta_%s.csv`
        log_time = time.perf_counter()

        output_template = '{}_%s.csv'.format(source_name)
        write_to_log(message='Beginning the CSV file partition', download_job=download_job)
        list_of_csv_files = partition_large_csv_file(source_path, row_limit=EXCEL_ROW_LIMIT,
                                                     output_name_template=output_template)

        if download_job:
            write_to_log(
                message='Partitioning CSV file into {} files took {:.4f} seconds'.format(
                    len(list_of_csv_files),
                    time.perf_counter() - log_time
                ),
                download_job=download_job
            )

        # Zip the split CSVs into one zipfile
        write_to_log(message="Beginning zipping and compression", download_job=download_job)
        log_time = time.perf_counter()
        append_files_to_zip_file(list_of_csv_files, zip_file_path)

        if download_job:
            write_to_log(message='Writing to zipfile took {:.4f} seconds'.format(time.perf_counter() - log_time),
                         download_job=download_job)

    except Exception as e:
        message = "Exception while partitioning CSV"
        if download_job:
            fail_download(download_job, e, message)
            write_to_log(message=message, download_job=download_job, is_error=True)
        logger.error(e)
        raise e

def test_append_files_to_zip_file():
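    """Appending two temp files to a fresh archive should list them under their basenames, in order."""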
    with NamedTemporaryFile() as zip_file:
        with NamedTemporaryFile() as include_file_1:
            with NamedTemporaryFile() as include_file_2:
                include_file_1.write(b"this is a test")
                include_file_1.flush()
                include_file_2.write(b"this is also a test")
                include_file_2.flush()
                append_files_to_zip_file([include_file_1.name, include_file_2.name], zip_file.name)

                with zipfile.ZipFile(zip_file.name, "r") as zf:
                    assert [z.filename for z in zf.filelist] == [
                        os.path.basename(include_file_1.name),
                        os.path.basename(include_file_2.name)
                    ]
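
# append_files_to_zip_file itself is not shown among these examples. Purely as an
# illustration, here is a minimal sketch of what it could look like, assuming it opens
# the archive in append mode and stores each file under its basename (which is what the
# test above asserts); the compression choice is an assumption, not the real implementation.
import os
import zipfile


def append_files_to_zip_file(file_paths, zip_file_path):
    # "a" (append) mode lets successive calls keep adding members to the same archive.
    with zipfile.ZipFile(zip_file_path, "a", compression=zipfile.ZIP_DEFLATED) as zip_file:
        for file_path in file_paths:
            # Store each file under its basename, as the test above expects.
            zip_file.write(file_path, os.path.basename(file_path))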

    def finalize_zip_contents(self):
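        """Add the data dictionary and file description to the zip, mark them for cleanup, and record the final archive size."""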
        self.filepaths_to_delete.append(self.working_dir_path /
                                        "Data_Dictionary_Crosswalk.xlsx")

        add_data_dictionary_to_zip(str(self.zip_file_path.parent),
                                   str(self.zip_file_path))

        file_description = build_file_description(str(self.readme_path),
                                                  dict())
        file_description_path = save_file_description(
            str(self.zip_file_path.parent), self.readme_path.name,
            file_description)
        self.filepaths_to_delete.append(Path(file_description_path))
        append_files_to_zip_file([file_description_path],
                                 str(self.zip_file_path))
        self.total_download_size = self.zip_file_path.stat().st_size

def create_empty_data_file(
    source: DownloadSource,
    download_job: DownloadJob,
    working_dir: str,
    piid: str,
    assistance_id: str,
    zip_file_path: str,
    file_format: str,
) -> None:
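    """Create an empty placeholder data file for a source with no valid columns and append it to the zip archive."""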
    data_file_name = build_data_file_name(source, download_job, piid, assistance_id)
    extension = FILE_FORMATS[file_format]["extension"]
    source.file_name = f"{data_file_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)
    write_to_log(
        message=f"Skipping download of {source.file_name} due to no valid columns provided", download_job=download_job
    )
    Path(source_path).touch()
    append_files_to_zip_file([source_path], zip_file_path)

def generate_download(download_job: DownloadJob,
                      origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}",
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)
        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(source, download_job, working_dir, piid,
                                       assistance_id, zip_file_path,
                                       file_format)
            else:
                download_job.number_of_columns += source_column_count
                parse_source(source, columns, download_job, working_dir, piid,
                             assistance_id, zip_file_path, limit, file_format)
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(
                include_file_description["source"], sources)
            file_description = file_description.replace(
                "[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"],
                file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path,
                             os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s",
                download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)

def generate_csvs(download_job):
    """Derive the relevant file location and write CSVs to it"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)
    piid = json_request.get('piid', None)

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)
        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, piid, zip_file_path, limit)
        include_file_description = json_request.get('include_file_description')
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(message='Uploading took {} seconds'.format(time.perf_counter() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return finish_download(download_job)

def generate_download(download_job: DownloadJob,
                      origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")
    request_type = json_request.get("request_type")

    span = tracer.current_span()
    if span and request_type:
        span.resource = request_type

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}",
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)
        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(source, download_job, working_dir, piid,
                                       assistance_id, zip_file_path,
                                       file_format)
            else:
                download_job.number_of_columns += source_column_count
                parse_source(source, columns, download_job, working_dir, piid,
                             assistance_id, zip_file_path, limit, file_format)
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(
                include_file_description["source"], sources)
            file_description = file_description.replace(
                "[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"],
                file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    # push file to S3 bucket, if not local
    if not settings.IS_LOCAL:
        with tracer.trace(
                name=f"job.{JOB_TYPE}.download.s3",
                service="bulk-download",
                resource=f"s3://{settings.BULK_DOWNLOAD_S3_BUCKET_NAME}",
                span_type=SpanTypes.WORKER,
        ) as span, tracer.trace(
                name="s3.command",
                service="aws.s3",
                resource=".".join([
                    multipart_upload.__module__,
                    (multipart_upload.__qualname__
                     or multipart_upload.__name__)
                ]),
                span_type=SpanTypes.WEB,
        ) as s3_span:
            # NOTE: Traces still not auto-picking-up aws.s3 service upload activity
            # Could be that the patches for boto and botocore don't cover the newer boto3 S3Transfer upload approach
            span.set_tag("file_name", file_name)
            try:
                bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
                region = settings.USASPENDING_AWS_REGION
                s3_span.set_tags({
                    "bucket": bucket,
                    "region": region,
                    "file": zip_file_path
                })
                start_uploading = time.perf_counter()
                multipart_upload(bucket, region, zip_file_path,
                                 os.path.basename(zip_file_path))
                write_to_log(
                    message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s",
                    download_job=download_job)
            except Exception as e:
                # Set error message; job_status_id will be set in download_sqs_worker.handle()
                exc_msg = "An exception was raised while attempting to upload the file"
                fail_download(download_job, e, exc_msg)
                if isinstance(e, InvalidParameterException):
                    raise InvalidParameterException(e)
                else:
                    raise Exception(download_job.error_message) from e
            finally:
                # Remove generated file
                if os.path.exists(zip_file_path):
                    os.remove(zip_file_path)
                _kill_spawned_processes(download_job)

    return finish_download(download_job)