def upload_placeholder(self, file_name, empty_file):
    bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
    region = settings.BULK_DOWNLOAD_AWS_REGION

    logger.info('Uploading {}'.format(file_name))
    multipart_upload(bucket, region, empty_file, file_name, acl='public-read',
                     parallel_processes=multiprocessing.cpu_count())
def generate_csvs(download_job, sqs_message=None):
    """Derive the relevant file location and write CSVs to it"""
    start_time = time.time()

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)

    file_name = start_download(download_job)
    try:
        # Create temporary files and working directory
        file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)

        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, start_time, sqs_message, file_path, limit)
        download_job.file_size = os.stat(file_path).st_size
    except Exception as e:
        # Set error message; job_status_id will be set in generate_zip.handle()
        download_job.error_message = 'An exception was raised while attempting to write the file:\n{}'.format(str(e))
        download_job.save()
        raise type(e)(download_job.error_message)
    finally:
        # Remove working directory
        if os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.time()
            multipart_upload(bucket, region, file_path, os.path.basename(file_path),
                             parallel_processes=multiprocessing.cpu_count())
            write_to_log(message='Uploading took {} seconds'.format(time.time() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in generate_zip.handle()
        download_job.error_message = 'An exception was raised while attempting to upload the file:\n{}'.format(str(e))
        download_job.save()
        raise type(e)(download_job.error_message)
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(file_path):
            os.remove(file_path)

    return finish_download(download_job)
def download(self, award_type, agency='all', generate_since=None):
    """ Create a delta file based on award_type, and agency_code (or all agencies) """
    logger.info('Starting generation. {}, Agency: {}'.format(award_type,
                                                             agency if agency == 'all' else agency['name']))
    award_map = AWARD_MAPPINGS[award_type]

    # Create Source and update fields to include correction_delete_ind
    source = CsvSource('transaction', award_map['letter_name'].lower(), 'transactions',
                       'all' if agency == 'all' else agency['toptier_agency_id'])
    source.query_paths.update({'correction_delete_ind': award_map['correction_delete_ind']})
    if award_type == 'Contracts':
        # Add the agency_id column to the mappings
        source.query_paths.update({'agency_id': 'transaction__contract_data__agency_id'})
        source.query_paths.move_to_end('agency_id', last=False)
    source.query_paths.move_to_end('correction_delete_ind', last=False)
    source.human_names = list(source.query_paths.keys())

    # Apply filters to the queryset
    filters, agency_code = self.parse_filters(award_map['award_types'], agency)
    source.queryset = VALUE_MAPPINGS['transactions']['filter_function'](filters)
    if award_type == 'Contracts':
        # Derive the correction_delete_ind from the created_at of the records
        source.queryset = source.queryset.annotate(
            correction_delete_ind=Case(
                When(transaction__contract_data__created_at__lt=generate_since, then=Value('C')),
                default=Value(''),
                output_field=CharField()))
    source.queryset = source.queryset.filter(**{
        'transaction__{}__{}__gte'.format(award_map['model'], award_map['date_filter']): generate_since
    })

    # Generate file
    file_path = self.create_local_file(award_type, source, agency_code, generate_since)
    if file_path is None:
        logger.info('No new, modified, or deleted data; discarding file')
    elif not settings.IS_LOCAL:
        # Upload file to S3 and delete local version
        logger.info('Uploading file to S3 bucket and deleting local copy')
        multipart_upload(settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME, settings.USASPENDING_AWS_REGION, file_path,
                         os.path.basename(file_path))
        os.remove(file_path)

    logger.info('Finished generation. {}, Agency: {}'.format(award_type,
                                                             agency if agency == 'all' else agency['name']))
def generate_download(download_job: DownloadJob, origination: Optional[str] = None):
    """Create data archive files from the download job object"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get("columns", None)
    limit = json_request.get("limit", None)
    piid = json_request.get("piid", None)
    award_id = json_request.get("award_id")
    assistance_id = json_request.get("assistance_id")
    file_format = json_request.get("file_format")

    file_name = start_download(download_job)
    working_dir = None
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            # Clean up a zip file that might exist from a prior attempt at this download
            os.remove(zip_file_path)
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message=f"Generating {file_name}", download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_download_sources(json_request, origination)

        for source in sources:
            # Parse and write data to the file; if there are no matching columns for a source then add an empty file
            source_column_count = len(source.columns(columns))
            if source_column_count == 0:
                create_empty_data_file(
                    source, download_job, working_dir, piid, assistance_id, zip_file_path, file_format
                )
            else:
                download_job.number_of_columns += source_column_count
                parse_source(
                    source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, file_format
                )
        include_data_dictionary = json_request.get("include_data_dictionary")
        if include_data_dictionary:
            add_data_dictionary_to_zip(working_dir, zip_file_path)
        include_file_description = json_request.get("include_file_description")
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description = file_description.replace("[AWARD_ID]", str(award_id))
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description
            )
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir)
        _kill_spawned_processes(download_job)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(
                message=f"Uploading took {time.perf_counter() - start_uploading:.2f}s", download_job=download_job
            )
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)
        _kill_spawned_processes(download_job)

    return finish_download(download_job)
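# Illustrative sketch only (not from the source): how a worker might invoke
# generate_download for a queued job. The import path, the "download_job_id"
# lookup field, and the "bulk_download" origination value are assumptions.
from usaspending_api.download.models import DownloadJob  # import path assumed


def run_download(job_id):
    job = DownloadJob.objects.get(download_job_id=job_id)  # field name assumed
    return generate_download(job, origination="bulk_download")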
def upload_placeholder(self, file_name, empty_file):
    bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
    region = settings.USASPENDING_AWS_REGION

    logger.info('Uploading {}'.format(file_name))
    multipart_upload(bucket, region, empty_file, file_name)
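# Illustrative sketch only (not from the source): creating a zero-byte local
# file and pushing it to S3 as a placeholder via upload_placeholder. The
# temporary-file handling and the "command" instance are assumptions.
import tempfile


def push_placeholder(command, file_name):
    with tempfile.NamedTemporaryFile(suffix='.zip') as placeholder:
        command.upload_placeholder(file_name=file_name, empty_file=placeholder.name)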
def download(self, award_type, agency="all", generate_since=None): """ Create a delta file based on award_type, and agency_code (or all agencies) """ logger.info("Starting generation. {}, Agency: {}".format( award_type, agency if agency == "all" else agency["name"])) award_map = AWARD_MAPPINGS[award_type] # Create Source and update fields to include correction_delete_ind source = DownloadSource( "transaction", award_map["letter_name"].lower(), "transactions", "all" if agency == "all" else agency["toptier_agency_id"], ) source.query_paths.update( {"correction_delete_ind": award_map["correction_delete_ind"]}) if award_type == "Contracts": # Add the agency_id column to the mappings source.query_paths.update( {"agency_id": "transaction__contract_data__agency_id"}) source.query_paths.move_to_end("agency_id", last=False) source.query_paths.move_to_end("correction_delete_ind", last=False) source.human_names = list(source.query_paths.keys()) # Apply filters to the queryset filters, agency_code = self.parse_filters(award_map["award_types"], agency) source.queryset = VALUE_MAPPINGS["transactions"]["filter_function"]( filters) if award_type == "Contracts": source.queryset = source.queryset.annotate( correction_delete_ind=Case( When(transaction__contract_data__created_at__lt= generate_since, then=Value("C")), default=Value(""), output_field=CharField(), )) else: indicator_field = F( "transaction__assistance_data__correction_delete_indicatr") source.queryset = source.queryset.annotate( correction_delete_ind=Case( When(transaction__assistance_data__updated_at__gt= generate_since, then=indicator_field), When(transaction__transactiondelta__isnull=False, then=Value("C")), default=indicator_field, output_field=CharField(), )) transaction_delta_queryset = source.queryset _filter = { "transaction__{}__{}__gte".format(award_map["model"], award_map["date_filter"]): generate_since } if self.debugging_end_date: _filter["transaction__{}__{}__lt".format( award_map["model"], award_map["date_filter"])] = self.debugging_end_date source.queryset = source.queryset.filter(**_filter) # UNION the normal results to the transaction_delta results. source.queryset = source.queryset.union( transaction_delta_queryset.filter( transaction__transactiondelta__isnull=False)) # Generate file file_path = self.create_local_file(award_type, source, agency_code, generate_since) if file_path is None: logger.info("No new, modified, or deleted data; discarding file") elif not settings.IS_LOCAL: # Upload file to S3 and delete local version logger.info("Uploading file to S3 bucket and deleting local copy") multipart_upload( settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME, settings.USASPENDING_AWS_REGION, file_path, os.path.basename(file_path), ) os.remove(file_path) logger.info("Finished generation. {}, Agency: {}".format( award_type, agency if agency == "all" else agency["name"]))
def generate_csvs(download_job):
    """Derive the relevant file location and write CSVs to it"""

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)
    piid = json_request.get('piid', None)

    file_name = start_download(download_job)
    try:
        # Create temporary files and working directory
        zip_file_path = settings.CSV_LOCAL_PATH + file_name
        working_dir = os.path.splitext(zip_file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)

        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, piid, zip_file_path, limit)
        include_file_description = json_request.get('include_file_description')
        if include_file_description:
            write_to_log(message="Adding file description to zip file")
            file_description = build_file_description(include_file_description["source"], sources)
            file_description_path = save_file_description(
                working_dir, include_file_description["destination"], file_description)
            append_files_to_zip_file([file_description_path], zip_file_path)
        download_job.file_size = os.stat(zip_file_path).st_size
    except InvalidParameterException as e:
        exc_msg = "InvalidParameterException was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise InvalidParameterException(e)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to process the DownloadJob"
        fail_download(download_job, e, exc_msg)
        raise Exception(download_job.error_message) from e
    finally:
        # Remove working directory
        if os.path.exists(working_dir):
            shutil.rmtree(working_dir)

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.USASPENDING_AWS_REGION
            start_uploading = time.perf_counter()
            multipart_upload(bucket, region, zip_file_path, os.path.basename(zip_file_path))
            write_to_log(message='Uploading took {} seconds'.format(time.perf_counter() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        # Set error message; job_status_id will be set in download_sqs_worker.handle()
        exc_msg = "An exception was raised while attempting to upload the file"
        fail_download(download_job, e, exc_msg)
        if isinstance(e, InvalidParameterException):
            raise InvalidParameterException(e)
        else:
            raise Exception(download_job.error_message) from e
    finally:
        # Remove generated file
        if not settings.IS_LOCAL and os.path.exists(zip_file_path):
            os.remove(zip_file_path)

    return finish_download(download_job)
def generate_csvs(download_job, sqs_message=None):
    """Derive the relevant file location and write CSVs to it"""
    start_time = time.time()

    # Parse data from download_job
    json_request = json.loads(download_job.json_request)
    columns = json_request.get('columns', None)
    limit = json_request.get('limit', None)

    file_name = start_download(download_job)
    try:
        # Create temporary files and working directory
        file_path = settings.BULK_DOWNLOAD_LOCAL_PATH + file_name
        working_dir = os.path.splitext(file_path)[0]
        if not os.path.exists(working_dir):
            os.mkdir(working_dir)
        zipped_csvs = zipfile.ZipFile(file_path, 'w', allowZip64=True)

        write_to_log(message='Generating {}'.format(file_name), download_job=download_job)

        # Generate sources from the JSON request object
        sources = get_csv_sources(json_request)

        for source in sources:
            # Parse and write data to the file
            download_job.number_of_columns = max(download_job.number_of_columns, len(source.columns(columns)))
            parse_source(source, columns, download_job, working_dir, start_time, sqs_message, zipped_csvs, limit)

        # Remove temporary files and working directory
        shutil.rmtree(working_dir)
        zipped_csvs.close()
        download_job.file_size = os.stat(file_path).st_size
    except Exception as e:
        logger.error(e)
        handle_file_generation_exception(file_path, download_job, 'write', str(e))

    try:
        # push file to S3 bucket, if not local
        if not settings.IS_LOCAL:
            bucket = settings.BULK_DOWNLOAD_S3_BUCKET_NAME
            region = settings.BULK_DOWNLOAD_AWS_REGION
            start_uploading = time.time()
            multipart_upload(bucket, region, file_path, os.path.basename(file_path), acl='public-read',
                             parallel_processes=multiprocessing.cpu_count())
            os.remove(file_path)
            write_to_log(message='Uploading took {} seconds'.format(time.time() - start_uploading),
                         download_job=download_job)
    except Exception as e:
        logger.error(e)
        handle_file_generation_exception(file_path, download_job, 'upload', str(e))

    return finish_download(download_job)