def upload_placeholder(self, file_name, empty_file):
        bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
        region = settings.BULK_DOWNLOAD_AWS_REGION

        logger.info('Uploading {}'.format(file_name))
        multipart_upload(bucket, region, empty_file, file_name, acl='public-read',
                         parallel_processes=multiprocessing.cpu_count())
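All of the snippets on this page funnel through the project's multipart_upload helper, whose call shape (bucket, region, source path, key name, plus optional acl and parallel_processes) can be read off the calls above and below. For comparison, here is a minimal, hedged sketch of the same upload done directly with boto3; boto3 is an assumption here and is not what these snippets use, and its transfer layer switches to multipart automatically once the file crosses the configured threshold.

import boto3
from boto3.s3.transfer import TransferConfig

def upload_file_multipart(bucket, region, file_path, key_name, public=False):
    # boto3 uses multipart upload automatically once the file exceeds
    # multipart_threshold; max_concurrency parallelizes the part uploads,
    # roughly analogous to the parallel_processes argument above.
    s3 = boto3.client("s3", region_name=region)
    config = TransferConfig(multipart_threshold=8 * 1024 * 1024, max_concurrency=4)
    extra_args = {"ACL": "public-read"} if public else None
    s3.upload_file(file_path, bucket, key_name, ExtraArgs=extra_args, Config=config)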
Example #2
def generate_csvs(download_job, sqs_message=None):
    """Derive the relevant file location and write CSVs to it"""
    start_time = time.time()

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)
        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, start_time, sqs_message, file_path, limit)
        download_job.file_size = os.stat(file_path).st_size
    except Exception as e:
        # Set error message; job_status_id will be set in generate_zip.handle()
        download_job.error_message = 'An exception was raised while attempting to write the file:\n{}'.format(str(e))
        download_job.save()
        raise type(e)(download_job.error_message)
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.time()
            multipart_upload(bucket, region, file_path, os.path.basename(file_path),
                             parallel_processes=multiprocessing.cpu_count())
            write_to_log(message='Uploading took {} seconds'.format(time.time() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in generate_zip.handle()
        download_job.error_message = 'An exception was raised while attempting to upload the file:\n{}'.format(str(e))
        download_job.save()
        raise type(e)(download_job.error_message)
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(file_path):
            os.remove(file_path)

    return finish_download(download_job)
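One subtlety in the error handling above: raise type(e)(download_job.error_message) rebuilds the caught exception's class from a single message string, which works for most built-ins but fails for exception types with mandatory extra constructor arguments (UnicodeDecodeError, for instance). The later revisions shown further down sidestep this with raise Exception(download_job.error_message) from e. A small standalone illustration of the difference:

def reraise_with_message(exc, message):
    # Fragile: assumes type(exc) accepts a single string argument.
    return type(exc)(message)

try:
    raise ValueError("original failure")
except Exception as e:
    rebuilt = reraise_with_message(e, "An exception was raised while attempting to write the file:\n" + str(e))
    print(type(rebuilt), rebuilt)  # <class 'ValueError'> with the augmented message

# UnicodeDecodeError requires five constructor arguments, so the same trick
# raises a TypeError instead of rebuilding the original exception.
try:
    b"\xff".decode("utf-8")
except Exception as e:
    try:
        reraise_with_message(e, "augmented message")
    except TypeError as construction_error:
        print("could not rebuild:", construction_error)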
Example #3
    def download(self, award_type, agency='all', generate_since=None):
        """ Create a delta file based on award_type, and agency_code (or all agencies) """
        logger.info('Starting generation. {}, Agency: {}'.format(
            award_type, agency if agency == 'all' else agency['name']))
        award_map = AWARD_MAPPINGS[award_type]

        # Create Source and update fields to include correction_delete_ind
        source = CsvSource(
            'transaction', award_map['letter_name'].lower(), 'transactions',
            'all' if agency == 'all' else agency['toptier_agency_id'])
        source.query_paths.update(
            {'correction_delete_ind': award_map['correction_delete_ind']})
        if award_type == 'Contracts':
            # Add the agency_id column to the mappings
            source.query_paths.update(
                {'agency_id': 'transaction__contract_data__agency_id'})
            source.query_paths.move_to_end('agency_id', last=False)
        source.query_paths.move_to_end('correction_delete_ind', last=False)
        source.human_names = list(source.query_paths.keys())

        # Apply filters to the queryset
        filters, agency_code = self.parse_filters(award_map['award_types'],
                                                  agency)
        source.queryset = VALUE_MAPPINGS['transactions']['filter_function'](
            filters)
        if award_type == 'Contracts':
            # Derive the correction_delete_ind from the created_at of the records
            source.queryset = source.queryset.annotate(
                correction_delete_ind=Case(
                    When(transaction__contract_data__created_at__lt=generate_since, then=Value('C')),
                    default=Value(''),
                    output_field=CharField(),
                )
            )
        source.queryset = source.queryset.filter(
            **{
                'transaction__{}__{}__gte'.format(award_map['model'], award_map['date_filter']):
                generate_since
            })

        # Generate file
        file_path = self.create_local_file(award_type, source, agency_code,
                                           generate_since)
        if file_path is None:
            logger.info('No new, modified, or deleted data; discarding file')
        elif not settings.IS_LOCAL:
            # Upload file to S3 and delete local version
            logger.info('Uploading file to S3 bucket and deleting local copy')
            multipart_upload(settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
                             settings.USASPENDING_AWS_REGION, file_path,
                             os.path.basename(file_path))
            os.remove(file_path)

        logger.info('Finished generation. {}, Agency: {}'.format(
            award_type, agency if agency == 'all' else agency['name']))
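The move_to_end(..., last=False) calls above rely on query_paths being an OrderedDict, so correction_delete_ind ends up as the first output column and agency_id as the second for contract files. A standalone illustration of that reordering; apart from the agency_id path, the keys and path strings below are illustrative placeholders, not values from the source:

from collections import OrderedDict

query_paths = OrderedDict([
    ("piid", "transaction__contract_data__piid"),              # illustrative placeholder column
    ("agency_id", "transaction__contract_data__agency_id"),
    ("correction_delete_ind", "placeholder_correction_delete_path"),
])

# last=False moves a key to the front; applying it to agency_id first and
# correction_delete_ind second leaves correction_delete_ind in column 1.
query_paths.move_to_end("agency_id", last=False)
query_paths.move_to_end("correction_delete_ind", last=False)

print(list(query_paths))  # ['correction_delete_ind', 'agency_id', 'piid']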
    def download(self, award_type, agency='all', generate_since=None):
        """ Create a delta file based on award_type, and agency_code (or all agencies) """
        logger.info('Starting generation. {}, Agency: {}'.format(award_type, agency if agency == 'all' else
                                                                 agency['name']))
        award_map = AWARD_MAPPINGS[award_type]

        # Create Source and update fields to include correction_delete_ind
        source = CsvSource('transaction', award_map['letter_name'].lower(), 'transactions',
                           'all' if agency == 'all' else agency['toptier_agency_id'])
        source.query_paths.update({
            'correction_delete_ind': award_map['correction_delete_ind']
        })
        if award_type == 'Contracts':
            # Add the agency_id column to the mappings
            source.query_paths.update({'agency_id': 'transaction__contract_data__agency_id'})
            source.query_paths.move_to_end('agency_id', last=False)
        source.query_paths.move_to_end('correction_delete_ind', last=False)
        source.human_names = list(source.query_paths.keys())

        # Apply filters to the queryset
        filters, agency_code = self.parse_filters(award_map['award_types'], agency)
        source.queryset = VALUE_MAPPINGS['transactions']['filter_function'](filters)
        if award_type == 'Contracts':
            # Derive the correction_delete_ind from the created_at of the records
            source.queryset = source.queryset.annotate(
                correction_delete_ind=Case(
                    When(transaction__contract_data__created_at__lt=generate_since, then=Value('C')),
                    default=Value(''),
                    output_field=CharField(),
                )
            )
        source.queryset = source.queryset.filter(**{
            'transaction__{}__{}__gte'.format(award_map['model'], award_map['date_filter']): generate_since
        })

        # Generate file
        file_path = self.create_local_file(award_type, source, agency_code, generate_since)
        if file_path is None:
            logger.info('No new, modified, or deleted data; discarding file')
        elif not settings.IS_LOCAL:
            # Upload file to S3 and delete local version
            logger.info('Uploading file to S3 bucket and deleting local copy')
            multipart_upload(settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME, settings.USASPENDING_AWS_REGION, file_path,
                             os.path.basename(file_path))
            os.remove(file_path)

        logger.info('Finished generation. {}, Agency: {}'.format(award_type, agency if agency == 'all' else
                                                                 agency['name']))
Example #5
def generate_download(download_job: DownloadJob,
                      origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}",
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)
        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(source, download_job, working_dir, piid,
                                       assistance_id, zip_file_path,
                                       file_format)
            else:
                download_job.number_of_columns += source_column_count
                parse_source(source, columns, download_job, working_dir, piid,
                             assistance_id, zip_file_path, limit, file_format)
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(
                include_file_description["source"], sources)
            file_description = file_description.replace(
                "[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"],
                file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path,
                             os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s",
                download_job=download_job,
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
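The mkdir()/rmtree() pairing around the working directory is what these examples actually do; a hedged alternative sketch using tempfile.TemporaryDirectory, which guarantees cleanup even when CSV generation raises (the file names below are illustrative, not from the source):

import os
import tempfile
import zipfile

# Sketch only: the context manager removes the working directory on exit,
# so no explicit shutil.rmtree() call is needed in a finally block.
with tempfile.TemporaryDirectory(prefix="download_job_") as working_dir:
    csv_path = os.path.join(working_dir, "part_1.csv")
    with open(csv_path, "w") as handle:
        handle.write("column_a,column_b\n")
    # The zip is written outside working_dir so it survives the cleanup.
    with zipfile.ZipFile("download.zip", "w", allowZip64=True) as archive:
        archive.write(csv_path, arcname="part_1.csv")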
    def upload_placeholder(self, file_name, empty_file):
        bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
        region = settings.USASPENDING_AWS_REGION

        logger.info('Uploading {}'.format(file_name))
        multipart_upload(bucket, region, empty_file, file_name)
Example #7
    def download(self, award_type, agency="all", generate_since=None):
        """ Create a delta file based on award_type, and agency_code (or all agencies) """
        logger.info("Starting generation. {}, Agency: {}".format(
            award_type, agency if agency == "all" else agency["name"]))
        award_map = AWARD_MAPPINGS[award_type]

        # Create Source and update fields to include correction_delete_ind
        source = DownloadSource(
            "transaction",
            award_map["letter_name"].lower(),
            "transactions",
            "all" if agency == "all" else agency["toptier_agency_id"],
        )
        source.query_paths.update(
            {"correction_delete_ind": award_map["correction_delete_ind"]})
        if award_type == "Contracts":
            # Add the agency_id column to the mappings
            source.query_paths.update(
                {"agency_id": "transaction__contract_data__agency_id"})
            source.query_paths.move_to_end("agency_id", last=False)
        source.query_paths.move_to_end("correction_delete_ind", last=False)
        source.human_names = list(source.query_paths.keys())

        # Apply filters to the queryset
        filters, agency_code = self.parse_filters(award_map["award_types"],
                                                  agency)
        source.queryset = VALUE_MAPPINGS["transactions"]["filter_function"](
            filters)

        if award_type == "Contracts":
            source.queryset = source.queryset.annotate(
                correction_delete_ind=Case(
                    When(transaction__contract_data__created_at__lt=generate_since, then=Value("C")),
                    default=Value(""),
                    output_field=CharField(),
                )
            )
        else:
            indicator_field = F("transaction__assistance_data__correction_delete_indicatr")
            source.queryset = source.queryset.annotate(
                correction_delete_ind=Case(
                    When(transaction__assistance_data__updated_at__gt=generate_since, then=indicator_field),
                    When(transaction__transactiondelta__isnull=False, then=Value("C")),
                    default=indicator_field,
                    output_field=CharField(),
                )
            )

        transaction_delta_queryset = source.queryset

        _filter = {
            "transaction__{}__{}__gte".format(award_map["model"], award_map["date_filter"]):
            generate_since
        }
        if self.debugging_end_date:
            _filter["transaction__{}__{}__lt".format(
                award_map["model"],
                award_map["date_filter"])] = self.debugging_end_date

        source.queryset = source.queryset.filter(**_filter)

        # UNION the normal results to the transaction_delta results.
        source.queryset = source.queryset.union(
            transaction_delta_queryset.filter(
                transaction__transactiondelta__isnull=False))

        # Generate file
        file_path = self.create_local_file(award_type, source, agency_code,
                                           generate_since)
        if file_path is None:
            logger.info("No new, modified, or deleted data; discarding file")
        elif not settings.IS_LOCAL:
            # Upload file to S3 and delete local version
            logger.info("Uploading file to S3 bucket and deleting local copy")
            multipart_upload(
                settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
                settings.USASPENDING_AWS_REGION,
                file_path,
                os.path.basename(file_path),
            )
            os.remove(file_path)

        logger.info("Finished generation. {}, Agency: {}".format(
            award_type, agency if agency == "all" else agency["name"]))
def generate_csvs(download_job):
    """Derive the relevant file location and write CSVs to it"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)
    piid = json_request.get('piid', None)

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)
        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, piid, zip_file_path, limit)
        include_file_description = json_request.get('include_file_description')
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(message='Uploading took {} seconds'.format(time.perf_counter() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return finish_download(download_job)
Example #9
def generate_csvs(download_job, sqs_message=None):
    """Derive the relevant file location and write CSVs to it"""
    start_time = time.time()

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)

    file_name = start_download(download_job)
    try:
        # Create temporary files and working directory
        file_path = settings.BULK_DOWNLOAD_LOCAL_PATH + file_name
        working_dir = os.path.splitext(file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        zipped_csvs = zipfile.ZipFile(file_path, 'w', allowZip64=True)

        write_to_log(message='Generating {}'.format(file_name),
                     download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)
        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(
                download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir,
                         start_time, sqs_message, zipped_csvs, limit)

        # Remove temporary files and working directory
        shutil.rmtree(working_dir)
        zipped_csvs.close()
        download_job.file_size = os.stat(file_path).st_size
    except Exception as e:
        logger.error(e)
        handle_file_generation_exception(file_path, download_job, 'write',
                                         str(e))

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.BULK_DOWNLOAD_AWS_REGION
            start_uploading = time.time()
            multipart_upload(bucket,
                             region,
                             file_path,
                             os.path.basename(file_path),
                             acl='public-read',
                             parallel_processes=multiprocessing.cpu_count())
            os.remove(file_path)

            write_to_log(
                message='Uploading took {} seconds'.format(time.time() - start_uploading),
                download_job=download_job)
    except Exception as e:
        logger.error(e)
        handle_file_generation_exception(file_path, download_job, 'upload',
                                         str(e))

    return finish_download(download_job)
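Note that this revision times the upload with time.time(), while the newer generate_download above uses time.perf_counter(); the latter is monotonic and is the better fit for measuring elapsed durations. A standalone comparison:

import time

start_wall = time.time()          # wall-clock; can jump if the system clock is adjusted
start_perf = time.perf_counter()  # monotonic; preferred for measuring durations

time.sleep(0.1)                   # stand-in for the multipart upload

print('time.time() delta:    {:.3f} seconds'.format(time.time() - start_wall))
print('perf_counter() delta: {:.3f} seconds'.format(time.perf_counter() - start_perf))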
    def upload_placeholder(self, file_name, empty_file):
        bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
        region = settings.USASPENDING_AWS_REGION

        logger.info('Uploading {}'.format(file_name))
        multipart_upload(bucket, region, empty_file, file_name)