Example #1
def get_file_path(file_name: str) -> str:
    if settings.IS_LOCAL:
        file_path = settings.CSV_LOCAL_PATH + file_name
    else:
        s3_handler = S3Handler(
            bucket_name=settings.BULK_DOWNLOAD_S3_BUCKET_NAME,
            redirect_dir=settings.BULK_DOWNLOAD_S3_REDIRECT_DIR)
        file_path = s3_handler.get_simple_url(file_name=file_name)

    return file_path
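
A quick sketch of the two branches above, with hypothetical values (only the branching logic mirrors get_file_path; the path and URL shape are assumptions). Note that the local branch uses plain string concatenation, so settings.CSV_LOCAL_PATH is expected to already end with a path separator:

# Hypothetical values; only the branching mirrors get_file_path above.
csv_local_path = '/data/csv_downloads/'   # stand-in for settings.CSV_LOCAL_PATH
file_name = 'awards_20200105.zip'

# settings.IS_LOCAL == True: simple concatenation, no os.path.join
assert csv_local_path + file_name == '/data/csv_downloads/awards_20200105.zip'

# settings.IS_LOCAL == False: S3Handler.get_simple_url(file_name=...) is assumed
# to return a direct URL to the object under the configured bucket/redirect dir.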
Example #2
class ListMonthlyDownloadsViewset(APIDocumentationView):
    """
    Returns a list of the current versions of generated archive files for a given fiscal year and agency.

    endpoint_doc: /download/list_downloads.md
    """
    s3_handler = S3Handler(
        bucket_name=settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
        redirect_dir=settings.MONTHLY_DOWNLOAD_S3_REDIRECT_DIR)

    # This is intentionally not cached so that the latest updates to these monthly generated files are always returned
    def post(self, request):
        """Return list of downloads that match the requested params"""
        agency_id = request.data.get('agency', None)
        fiscal_year = request.data.get('fiscal_year', None)
        type_param = request.data.get('type', None)

        # Check required params
        required_params = {
            'agency': agency_id,
            'fiscal_year': fiscal_year,
            'type': type_param
        }
        for required, param_value in required_params.items():
            if param_value is None:
                raise InvalidParameterException(
                    'Missing one or more required query parameters: {}'.format(
                        required))

        # Capitalize type_param and retrieve agency information from agency ID
        download_type = type_param.capitalize()
        if agency_id == 'all':
            agency = {'cgac_code': 'all', 'name': 'All', 'abbreviation': None}
        else:
            agency_check = ToptierAgency.objects.filter(
                toptier_agency_id=agency_id).values('cgac_code', 'name',
                                                    'abbreviation')
            if agency_check:
                agency = agency_check[0]
            else:
                raise InvalidParameterException(
                    '{} agency not found'.format(agency_id))

        # Populate regex
        monthly_download_prefixes = '{}_{}_{}'.format(fiscal_year,
                                                      agency['cgac_code'],
                                                      download_type)
        monthly_download_regex = r'{}_Full_.*\.zip'.format(
            monthly_download_prefixes)
        delta_download_prefixes = '{}_{}'.format(agency['cgac_code'],
                                                 download_type)
        delta_download_regex = r'{}_Delta_.*\.zip'.format(
            delta_download_prefixes)

        # Retrieve and filter the files we need
        bucket = boto3.resource('s3',
                                region_name=self.s3_handler.region).Bucket(
                                    self.s3_handler.bucketRoute)
        monthly_download_names = list(
            filter(
                re.compile(monthly_download_regex).search, [
                    key.key for key in bucket.objects.filter(
                        Prefix=monthly_download_prefixes)
                ]))
        delta_download_names = list(
            filter(
                re.compile(delta_download_regex).search, [
                    key.key for key in bucket.objects.filter(
                        Prefix=delta_download_prefixes)
                ]))

        # Generate response
        downloads = []
        for filename in monthly_download_names:
            downloads.append(
                self.create_download_response_obj(filename, fiscal_year,
                                                  type_param, agency))
        for filename in delta_download_names:
            downloads.append(
                self.create_download_response_obj(filename,
                                                  None,
                                                  type_param,
                                                  agency,
                                                  is_delta=True))

        return Response({'monthly_files': downloads})

    def create_download_response_obj(self,
                                     filename,
                                     fiscal_year,
                                     type_param,
                                     agency,
                                     is_delta=False):
        """Return a """
        regex = r'(.*)_(.*)_Delta_(.*)\.zip' if is_delta else r'(.*)_(.*)_(.*)_Full_(.*)\.zip'
        filename_data = re.findall(regex, filename)[0]

        # Simply adds dashes for the date, 20180101 -> 2018-01-01, could also use strftime
        unformatted_date = filename_data[2 if is_delta else 3]
        updated_date = '-'.join([
            unformatted_date[:4], unformatted_date[4:6], unformatted_date[6:]
        ])

        return {
            'fiscal_year': fiscal_year,
            'agency_name': agency['name'],
            'agency_acronym': agency['abbreviation'],
            'type': type_param,
            'updated_date': updated_date,
            'file_name': filename,
            'url': self.s3_handler.get_simple_url(file_name=filename)
        }
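
Based on the required_params check above, a request to this endpoint needs 'agency', 'fiscal_year', and 'type'. A hypothetical payload (the agency ID is made up, and the accepted type values are assumed to be 'contracts' and 'assistance', matching the '(Contracts|Assistance)' pattern used in a later example):

# Hypothetical request bodies; the field names come from required_params above.
list_all_agencies = {'agency': 'all', 'fiscal_year': 2019, 'type': 'contracts'}
list_one_agency = {'agency': 123, 'fiscal_year': 2019, 'type': 'assistance'}  # 123 is a made-up toptier_agency_id

The response is {'monthly_files': [...]}, where each entry carries the keys built in create_download_response_obj: fiscal_year, agency_name, agency_acronym, type, updated_date, file_name, and url.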
Example #3
class BaseDownloadViewSet(APIDocumentationView):
    s3_handler = S3Handler(bucket_name=settings.BULK_DOWNLOAD_S3_BUCKET_NAME,
                           redirect_dir=settings.BULK_DOWNLOAD_S3_REDIRECT_DIR)

    def post(self, request, request_type='award'):
        """Push a message to SQS with the validated request JSON"""
        json_request = (self.validate_award_request(request.data)
                        if request_type == 'award' else
                        self.validate_account_request(request.data))
        json_request['request_type'] = request_type
        ordered_json_request = json.dumps(order_nested_object(json_request))

        # Check if the same request has been called today
        updated_date_timestamp = datetime.datetime.strftime(
            datetime.datetime.utcnow(), '%Y-%m-%d')
        cached_download = DownloadJob.objects. \
            filter(json_request=ordered_json_request, update_date__gte=updated_date_timestamp). \
            exclude(job_status_id=4).values('download_job_id', 'file_name')
        if cached_download and not settings.IS_LOCAL:
            # By returning the cached files, there should be no duplicates on a daily basis
            write_to_log(
                message='Generating file from cached download job ID: {}'.
                format(cached_download[0]['download_job_id']))
            cached_filename = cached_download[0]['file_name']
            return self.get_download_response(file_name=cached_filename)

        # Create download name and timestamped name for uniqueness
        toptier_agency_filter = ToptierAgency.objects.filter(
            toptier_agency_id=json_request.get('filters', {}).get(
                'agency', None)).first()
        download_name = '{}_{}'.format(
            toptier_agency_filter.cgac_code if toptier_agency_filter else
            'all', '_'.join(VALUE_MAPPINGS[award_level]['download_name']
                            for award_level in json_request['download_types']))
        timestamped_file_name = self.s3_handler.get_timestamped_filename(
            download_name + '.zip')

        download_job = DownloadJob.objects.create(
            job_status_id=JOB_STATUS_DICT['ready'],
            file_name=timestamped_file_name,
            json_request=ordered_json_request)

        write_to_log(message='Starting new download job {}'.format(
            download_job.download_job_id),
                     download_job=download_job,
                     other_params={'request_addr': get_remote_addr(request)})
        self.process_request(download_job)

        return self.get_download_response(file_name=timestamped_file_name)

    def validate_award_request(self, request_data):
        """Analyze request and raise any formatting errors as Exceptions"""
        json_request = {}
        constraint_type = request_data.get('constraint_type', None)

        # Validate required parameters
        for required_param in ['award_levels', 'filters']:
            if required_param not in request_data:
                raise InvalidParameterException(
                    'Missing one or more required query parameters: {}'.format(
                        required_param))

        if not isinstance(request_data['award_levels'], list):
            raise InvalidParameterException(
                'Award levels parameter not provided as a list')
        elif len(request_data['award_levels']) == 0:
            raise InvalidParameterException(
                'At least one award level is required.')
        for award_level in request_data['award_levels']:
            if award_level not in VALUE_MAPPINGS:
                raise InvalidParameterException(
                    'Invalid award_level: {}'.format(award_level))
        json_request['download_types'] = request_data['award_levels']

        # Overriding all other filters if the keyword filter is provided in year-constraint download
        # Make sure this is after checking the award_levels
        if constraint_type == 'year' and 'elasticsearch_keyword' in request_data[
                'filters']:
            json_request['filters'] = {
                'elasticsearch_keyword':
                request_data['filters']['elasticsearch_keyword'],
                'award_type_codes':
                list(award_type_mapping.keys())
            }
            json_request['limit'] = settings.MAX_DOWNLOAD_LIMIT
            return json_request

        if not isinstance(request_data['filters'], dict):
            raise InvalidParameterException(
                'Filters parameter not provided as a dict')
        elif len(request_data['filters']) == 0:
            raise InvalidParameterException('At least one filter is required.')
        json_request['filters'] = {}

        # Set defaults of non-required parameters
        json_request['columns'] = request_data.get('columns', [])
        json_request['file_format'] = request_data.get('file_format', 'csv')

        # Validate shared filter types and assign defaults
        filters = request_data['filters']
        check_types_and_assign_defaults(filters, json_request['filters'],
                                        SHARED_AWARD_FILTER_DEFAULTS)

        # Validate award type codes
        if not filters.get('award_type_codes',
                           None) or len(filters['award_type_codes']) < 1:
            filters['award_type_codes'] = list(award_type_mapping.keys())
        for award_type_code in filters['award_type_codes']:
            if award_type_code not in award_type_mapping:
                raise InvalidParameterException(
                    'Invalid award_type: {}'.format(award_type_code))
        json_request['filters']['award_type_codes'] = filters[
            'award_type_codes']

        # Validate locations
        for location_filter in [
                'place_of_performance_locations', 'recipient_locations'
        ]:
            if filters.get(location_filter):
                for location_dict in filters[location_filter]:
                    if not isinstance(location_dict, dict):
                        raise InvalidParameterException(
                            'Location is not a dictionary: {}'.format(
                                location_dict))
                    location_error_handling(location_dict.keys())
                json_request['filters'][location_filter] = filters[
                    location_filter]

        # Validate time periods
        total_range_count = validate_time_periods(filters, json_request)

        if constraint_type == 'row_count':
            # Validate limit exists and is below MAX_DOWNLOAD_LIMIT
            json_request['limit'] = parse_limit(request_data)

            # Validate row_count-constrained filter types and assign defaults
            check_types_and_assign_defaults(filters, json_request['filters'],
                                            ROW_CONSTRAINT_FILTER_DEFAULTS)
        elif constraint_type == 'year':
            # Validate combined total dates within one year (allow for leap years)
            if total_range_count > 366:
                raise InvalidParameterException(
                    'Invalid Parameter: time_period total days must be within a year'
                )

            # Validate year-constrained filter types and assign defaults
            check_types_and_assign_defaults(filters, json_request['filters'],
                                            YEAR_CONSTRAINT_FILTER_DEFAULTS)
        else:
            raise InvalidParameterException(
                'Invalid parameter: constraint_type must be "row_count" or "year"'
            )

        return json_request

    def validate_account_request(self, request_data):
        json_request = {}

        json_request['columns'] = request_data.get('columns', [])

        # Validate required parameters
        for required_param in ["account_level", "filters"]:
            if required_param not in request_data:
                raise InvalidParameterException(
                    'Missing one or more required query parameters: {}'.format(
                        required_param))

        # Validate account_level parameters
        if request_data.get('account_level', None) not in [
                "federal_account", "treasury_account"
        ]:
            raise InvalidParameterException(
                'Invalid Parameter: account_level must be either "federal_account" or '
                '"treasury_account"')
        json_request['account_level'] = request_data['account_level']

        # Validate the filters parameter and its contents
        json_request['filters'] = {}
        filters = request_data['filters']
        if not isinstance(filters, dict):
            raise InvalidParameterException(
                'Filters parameter not provided as a dict')
        elif len(filters) == 0:
            raise InvalidParameterException('At least one filter is required.')

        # Validate required filters
        for required_filter in ["fy", "quarter"]:
            if required_filter not in filters:
                raise InvalidParameterException(
                    'Missing one or more required filters: {}'.format(
                        required_filter))
            else:
                try:
                    filters[required_filter] = int(filters[required_filter])
                except (TypeError, ValueError):
                    raise InvalidParameterException(
                        '{} filter not provided as an integer'.format(
                            required_filter))
            json_request['filters'][required_filter] = filters[required_filter]

        # Validate fiscal_quarter
        if json_request['filters']['quarter'] not in [1, 2, 3, 4]:
            raise InvalidParameterException(
                'quarter filter must be a valid fiscal quarter (1, 2, 3, or 4)'
            )

        # Validate submission_type parameters
        if filters.get('submission_type', None) not in [
                "account_balances", "object_class_program_activity",
                "award_financial"
        ]:
            raise InvalidParameterException(
                'Invalid Parameter: submission_type must be "account_balances", '
                '"object_class_program_activity", or "award_financial"')
        json_request['download_types'] = [filters['submission_type']]

        # Validate the rest of the filters
        check_types_and_assign_defaults(filters, json_request['filters'],
                                        ACCOUNT_FILTER_DEFAULTS)

        return json_request

    def process_request(self, download_job):
        if settings.IS_LOCAL:
            # Locally, we do not use SQS
            csv_generation.generate_csvs(download_job=download_job)
        else:
            # Send a SQS message that will be processed by another server which will eventually run
            # csv_generation.write_csvs(**kwargs) (see generate_zip.py)
            write_to_log(message='Passing download_job {} to SQS'.format(
                download_job.download_job_id),
                         download_job=download_job)
            queue = sqs_queue(queue_name=settings.BULK_DOWNLOAD_SQS_QUEUE_NAME)
            queue.send_message(MessageBody=str(download_job.download_job_id))

    def get_download_response(self, file_name):
        """Generate download response which encompasses various elements to provide accurate status for state of a
        download job"""
        download_job = DownloadJob.objects.filter(file_name=file_name).first()
        if not download_job:
            raise NotFound(
                'Download job with filename {} does not exist.'.format(
                    file_name))

        # Compile url to file
        file_path = settings.CSV_LOCAL_PATH + file_name if settings.IS_LOCAL else \
            self.s3_handler.get_simple_url(file_name=file_name)

        # Add additional response elements that should be part of anything calling this function
        response = {
            'status':
            download_job.job_status.name,
            'url':
            file_path,
            'message':
            download_job.error_message,
            'file_name':
            file_name,
            # converting size from bytes to kilobytes if file_size isn't None
            'total_size':
            download_job.file_size / 1000 if download_job.file_size else None,
            'total_columns':
            download_job.number_of_columns,
            'total_rows':
            download_job.number_of_rows,
            'seconds_elapsed':
            download_job.seconds_elapsed()
        }

        return Response(response)
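
For reference, a hypothetical request body for the award path of post() above, assembled from validate_award_request. The award_levels entry and the filter contents are placeholders: valid award levels are the keys of VALUE_MAPPINGS and valid award_type_codes the keys of award_type_mapping (neither shown in these examples), and the exact time_period shape is checked by validate_time_periods, also not shown:

# Hypothetical award download request; only the top-level structure comes from
# validate_award_request above, the concrete values are placeholders.
award_download_request = {
    'constraint_type': 'row_count',        # must be 'row_count' or 'year'
    'award_levels': ['prime_awards'],      # placeholder; must be keys of VALUE_MAPPINGS
    'columns': [],                         # optional, defaults to []
    'file_format': 'csv',                  # optional, defaults to 'csv'
    'limit': 500000,                       # checked by parse_limit for 'row_count' requests
    'filters': {
        'award_type_codes': ['A', 'B'],    # placeholder; must be keys of award_type_mapping
        'time_period': [                   # shape assumed; validated by validate_time_periods
            {'start_date': '2018-10-01', 'end_date': '2019-09-30'}
        ],
    },
}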
Example #4
class ListMonthlyDownloadsViewset(APIDocumentationView):
    """
    Returns a list of the current versions of generated archive files for a given fiscal year and agency.

    endpoint_doc: /download/list_downloads.md
    """
    s3_handler = S3Handler(name=settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
                           region=settings.BULK_DOWNLOAD_AWS_REGION)

    # This is intentionally not cached so that the latest updates to these monthly generated files are always returned
    def post(self, request):
        """Return list of downloads that match the requested params"""
        response_data = {}

        post_data = request.data
        agency_id = post_data.get('agency', None)
        fiscal_year = post_data.get('fiscal_year', None)
        download_type = post_data.get('type', None)

        required_params = {
            'agency': agency_id,
            'fiscal_year': fiscal_year,
            'type': download_type
        }
        for required_param, param_value in required_params.items():
            if param_value is None:
                raise InvalidParameterException(
                    'Missing one or more required query parameters: {}'.format(
                        required_param))

        # Populate regex
        fiscal_year_regex = str(fiscal_year) if fiscal_year else r'\d{4}'
        download_type_regex = (download_type.capitalize()
                               if download_type else '(Contracts|Assistance)')

        cgac_regex = '.*'
        if agency_id and agency_id == 'all':
            cgac_regex = 'all'
        elif agency_id:
            cgac_codes = ToptierAgency.objects.filter(
                toptier_agency_id=agency_id).values('cgac_code')
            if cgac_codes:
                cgac_regex = cgac_codes[0]['cgac_code']
            else:
                raise InvalidParameterException(
                    '{} agency not found'.format(agency_id))
        monthly_dl_regex = r'{}_{}_{}_Full_.*\.zip'.format(
            fiscal_year_regex, cgac_regex, download_type_regex)

        # Generate regex possible prefix
        prefixes = []
        for regex, add_regex in [(fiscal_year_regex, fiscal_year),
                                 (cgac_regex, agency_id),
                                 (download_type_regex, download_type)]:
            if not add_regex:
                break
            prefixes.append(regex)
        prefix = '_'.join(prefixes)

        # Get and filter the files we need
        bucket_name = self.s3_handler.bucketRoute
        region_name = S3Handler.REGION
        bucket = boto.s3.connect_to_region(region_name).get_bucket(bucket_name)
        monthly_dls_names = list(
            filter(
                re.compile(monthly_dl_regex).search,
                [key.name for key in bucket.list(prefix=prefix)]))
        # Generate response
        downloads = []
        for name in monthly_dls_names:
            name_data = re.findall(r'(.*)_(.*)_(.*)_Full_(.*)\.zip', name)[0]
            agency_name = None
            agency_abbr = None
            agency_cgac = name_data[1]
            if agency_cgac != 'all':
                agency = ToptierAgency.objects.filter(
                    cgac_code=agency_cgac).values('name', 'abbreviation')
                if agency:
                    agency_name = agency[0]['name']
                    agency_abbr = agency[0]['abbreviation']
            else:
                agency_name = 'All'
            # Simply adds dashes for the date, 20180101 -> 2018-01-01, could also use strftime
            updated_date = '-'.join(
                [name_data[3][:4], name_data[3][4:6], name_data[3][6:]])
            downloads.append({
                'fiscal_year':
                name_data[0],
                'agency_name':
                agency_name,
                'agency_acronym':
                agency_abbr,
                'type':
                name_data[2].lower(),
                'updated_date':
                updated_date,
                'file_name':
                name,
                'url':
                self.s3_handler.get_simple_url(file_name=name)
            })
        response_data['monthly_files'] = downloads
        return Response(response_data)
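
The prefix-building loop above keeps concrete values only up to the first missing parameter, since an S3 key prefix has to match the filename from the left; a small runnable sketch of that behavior with made-up inputs:

# Mirrors the prefix loop above: fiscal_year and agency provided, type omitted.
fiscal_year_regex, cgac_regex, download_type_regex = '2019', '012', '(Contracts|Assistance)'
fiscal_year, agency_id, download_type = 2019, 123, None

prefixes = []
for regex, provided in [(fiscal_year_regex, fiscal_year),
                        (cgac_regex, agency_id),
                        (download_type_regex, download_type)]:
    if not provided:
        break
    prefixes.append(regex)

assert '_'.join(prefixes) == '2019_012'   # the S3 listing is narrowed as far as the params allow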
Example #5
class BaseDownloadViewSet(APIDocumentationView):
    s3_handler = S3Handler(bucket_name=settings.BULK_DOWNLOAD_S3_BUCKET_NAME,
                           redirect_dir=settings.BULK_DOWNLOAD_S3_REDIRECT_DIR)

    def post(self, request, request_type='award'):
        if request_type == 'award':
            json_request = validate_award_request(request.data)
        elif request_type == 'idv':
            json_request = validate_idv_request(request.data)
        else:
            json_request = validate_account_request(request.data)

        json_request['request_type'] = request_type
        ordered_json_request = json.dumps(order_nested_object(json_request))

        # Check if the same request has been called today
        # TODO!!! Use external_data_load_date to determine data freshness
        updated_date_timestamp = datetime.strftime(datetime.now(timezone.utc),
                                                   "%Y-%m-%d")
        cached_download = (DownloadJob.objects.filter(
            json_request=ordered_json_request,
            update_date__gte=updated_date_timestamp).exclude(
                job_status_id=JOB_STATUS_DICT["failed"]).values(
                    "download_job_id", "file_name").first())

        if cached_download and not settings.IS_LOCAL:
            # By returning the cached files, there should be no duplicates on a daily basis
            write_to_log(
                message='Generating file from cached download job ID: {}'.
                format(cached_download['download_job_id']))
            cached_filename = cached_download['file_name']
            return self.get_download_response(file_name=cached_filename)

        request_agency = json_request.get('filters', {}).get('agency', None)
        final_output_zip_name = create_unique_filename(json_request,
                                                       request_agency)
        download_job = DownloadJob.objects.create(
            job_status_id=JOB_STATUS_DICT['ready'],
            file_name=final_output_zip_name,
            json_request=ordered_json_request)

        log_new_download_job(request, download_job)
        self.process_request(download_job)

        return self.get_download_response(file_name=final_output_zip_name)

    def process_request(self, download_job):
        if settings.IS_LOCAL:
            # Locally, we do not use SQS
            csv_generation.generate_csvs(download_job=download_job)
        else:
            # Send a SQS message that will be processed by another server which will eventually run
            # csv_generation.write_csvs(**kwargs) (see download_sqs_worker.py)
            write_to_log(message='Passing download_job {} to SQS'.format(
                download_job.download_job_id),
                         download_job=download_job)
            queue = get_sqs_queue_resource(
                queue_name=settings.BULK_DOWNLOAD_SQS_QUEUE_NAME)
            queue.send_message(MessageBody=str(download_job.download_job_id))

    def get_download_response(self, file_name):
        """Generate download response which encompasses various elements to provide accurate status for state of a
        download job"""
        download_job = DownloadJob.objects.filter(file_name=file_name).first()
        if not download_job:
            raise NotFound(
                'Download job with filename {} does not exist.'.format(
                    file_name))

        # Compile url to file
        if settings.IS_LOCAL:
            file_path = settings.CSV_LOCAL_PATH + file_name
        else:
            file_path = self.s3_handler.get_simple_url(file_name=file_name)

        # Add additional response elements that should be part of anything calling this function
        response = {
            'status':
            download_job.job_status.name,
            'url':
            file_path,
            'message':
            download_job.error_message,
            'file_name':
            file_name,
            # converting size from bytes to kilobytes if file_size isn't None
            'total_size':
            download_job.file_size / 1000 if download_job.file_size else None,
            'total_columns':
            download_job.number_of_columns,
            'total_rows':
            download_job.number_of_rows,
            'seconds_elapsed':
            download_job.seconds_elapsed(),
        }

        return Response(response)
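
order_nested_object is not shown in these examples, but the daily-cache lookup above only works if logically identical requests serialize to the same json_request string. A minimal stand-in for the idea (an assumption about its behavior, including treating lists as order-insensitive):

import json

def order_nested(obj):
    # Hypothetical stand-in: recursively sort dict keys (and, here, list
    # elements) so equivalent payloads produce identical JSON strings.
    if isinstance(obj, dict):
        return {key: order_nested(obj[key]) for key in sorted(obj)}
    if isinstance(obj, list):
        return sorted((order_nested(item) for item in obj), key=repr)
    return obj

a = json.dumps(order_nested({'filters': {'fy': 2019, 'quarter': 2}, 'columns': []}))
b = json.dumps(order_nested({'columns': [], 'filters': {'quarter': 2, 'fy': 2019}}))
assert a == b   # both requests would hit the same cached DownloadJob row that day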
Example #6
class ListMonthlyDownloadsViewSet(APIView):
    """
    Returns a list of the current versions of generated archive files for a given fiscal year and agency.
    """

    endpoint_doc = "usaspending_api/api_contracts/contracts/v2/bulk_download/list_monthly_files.md"

    s3_handler = S3Handler(
        bucket_name=settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME,
        redirect_dir=settings.MONTHLY_DOWNLOAD_S3_REDIRECT_DIR)

    # This is intentionally not cached so that the latest updates to these monthly generated files are always returned
    def post(self, request):
        """Return list of downloads that match the requested params"""
        agency_id = request.data.get("agency", None)
        fiscal_year = request.data.get("fiscal_year", None)
        type_param = request.data.get("type", None)

        # Check required params
        required_params = {
            "agency": agency_id,
            "fiscal_year": fiscal_year,
            "type": type_param
        }
        for required, param_value in required_params.items():
            if param_value is None:
                raise InvalidParameterException(
                    "Missing one or more required body parameters: {}".format(
                        required))

        # Capitalize type_param and retrieve agency information from agency ID
        download_type = type_param.capitalize()
        if agency_id == "all":
            agency = {
                "toptier_code": "All",
                "name": "All",
                "abbreviation": None
            }
        else:
            agency_check = ToptierAgency.objects.filter(
                toptier_agency_id=agency_id).values("toptier_code", "name",
                                                    "abbreviation")
            if agency_check:
                agency = agency_check[0]
            else:
                raise InvalidParameterException(
                    "{} agency not found".format(agency_id))

        # Populate regex
        monthly_download_prefixes = f"FY{fiscal_year}_{agency['toptier_code']}_{download_type}"
        monthly_download_regex = r"{}_Full_.*\.zip".format(
            monthly_download_prefixes)
        delta_download_prefixes = f"FY(All)_{agency['toptier_code']}_{download_type}"
        delta_download_regex = r"FY\(All\)_{}_{}_Delta_.*\.zip".format(
            agency["toptier_code"], download_type)

        # Retrieve and filter the files we need
        bucket = boto3.resource("s3",
                                region_name=self.s3_handler.region).Bucket(
                                    self.s3_handler.bucketRoute)
        monthly_download_names = list(
            filter(
                re.compile(monthly_download_regex).search,
                [
                    key.key for key in bucket.objects.filter(
                        Prefix=monthly_download_prefixes)
                ],
            ))
        delta_download_names = list(
            filter(
                re.compile(delta_download_regex).search,
                [
                    key.key for key in bucket.objects.filter(
                        Prefix=delta_download_prefixes)
                ],
            ))

        ##########################################
        # TEMPORARY 2019/12/12. REMOVE after 2020/01/15
        # KEEP old_* prefix  and regex around until monthly files using the new format are
        # generated and accessible in S3
        if agency["toptier_code"] == "All":
            agency["toptier_code"] = "all"
        old_monthly_download_prefixes = "{}_{}_{}".format(
            fiscal_year, agency["toptier_code"], download_type)
        old_monthly_download_regex = r"{}_Full_.*\.zip".format(
            old_monthly_download_prefixes)
        old_delta_download_prefixes = "{}_{}".format(agency["toptier_code"],
                                                     download_type)
        old_delta_download_regex = r"{}_Delta_.*\.zip".format(
            old_delta_download_prefixes)

        monthly_download_names.extend(
            list(
                filter(
                    re.compile(old_monthly_download_regex).search,
                    [
                        key.key for key in bucket.objects.filter(
                            Prefix=old_monthly_download_prefixes)
                    ],
                )))
        delta_download_names.extend(
            list(
                filter(
                    re.compile(old_delta_download_regex).search,
                    [
                        key.key for key in bucket.objects.filter(
                            Prefix=old_delta_download_prefixes)
                    ],
                )))
        ##########################################
        ##########################################

        # Generate response
        downloads = []
        for filename in monthly_download_names:
            downloads.append(
                self.create_download_response_obj(filename, fiscal_year,
                                                  type_param, agency))
        for filename in delta_download_names:
            downloads.append(
                self.create_download_response_obj(filename,
                                                  None,
                                                  type_param,
                                                  agency,
                                                  is_delta=True))

        return Response({"monthly_files": downloads})

    def create_download_response_obj(self,
                                     filename,
                                     fiscal_year,
                                     type_param,
                                     agency,
                                     is_delta=False):
        """Return a """
        regex = r"(.*)_(.*)_Delta_(.*)\.zip" if is_delta else r"(.*)_(.*)_(.*)_Full_(.*)\.zip"
        filename_data = re.findall(regex, filename)[0]

        # Simply adds dashes for the date, 20180101 -> 2018-01-01, could also use strftime
        unformatted_date = filename_data[2 if is_delta else 3]
        updated_date = "-".join([
            unformatted_date[:4], unformatted_date[4:6], unformatted_date[6:]
        ])

        return {
            "fiscal_year": fiscal_year,
            "agency_name": agency["name"],
            "agency_acronym": agency["abbreviation"],
            "type": type_param,
            "updated_date": updated_date,
            "file_name": filename,
            "url": self.s3_handler.get_simple_url(file_name=filename),
        }
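
To make the filename parsing above concrete, here is a hypothetical monthly file name run through the same regex and date-dashing logic (the agency code and date are made up):

import re

filename = "FY2020_012_Contracts_Full_20200105.zip"   # hypothetical key in the monthly bucket
filename_data = re.findall(r"(.*)_(.*)_(.*)_Full_(.*)\.zip", filename)[0]
assert filename_data == ("FY2020", "012", "Contracts", "20200105")

unformatted_date = filename_data[3]
updated_date = "-".join([unformatted_date[:4], unformatted_date[4:6], unformatted_date[6:]])
assert updated_date == "2020-01-05"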
Example #7
class BaseDownloadViewSet(APIView):
    s3_handler = S3Handler(name=settings.BULK_DOWNLOAD_S3_BUCKET_NAME,
                           region=settings.BULK_DOWNLOAD_AWS_REGION)

    def post(self, request):
        """Push a message to SQS with the validated request JSON"""
        json_request = self.validate_request(request.data)
        ordered_json_request = json.dumps(order_nested_object(json_request))

        # Check if the same request has been called today
        updated_date_timestamp = datetime.datetime.strftime(
            datetime.datetime.utcnow(), '%Y-%m-%d')
        cached_download = DownloadJob.objects.filter(
            json_request=ordered_json_request,
            update_date__gte=updated_date_timestamp).exclude(
                job_status_id=4).values('file_name')
        if cached_download:
            # By returning the cached files, there should be no duplicates on a daily basis
            cached_filename = cached_download[0]['file_name']
            return self.get_download_response(file_name=cached_filename)

        # Create download name and timestamped name for uniqueness
        download_name = '_'.join(
            VALUE_MAPPINGS[award_level]['download_name']
            for award_level in json_request['award_levels'])
        timestamped_file_name = self.s3_handler.get_timestamped_filename(
            download_name + '.zip')
        download_job = DownloadJob.objects.create(
            job_status_id=JOB_STATUS_DICT['ready'],
            file_name=timestamped_file_name,
            json_request=ordered_json_request)

        write_to_log(message='Starting new download job {}'.format(
            download_job.download_job_id),
                     download_job=download_job,
                     other_params={'request_addr': get_remote_addr(request)})
        self.process_request(download_job)

        return self.get_download_response(file_name=timestamped_file_name)

    def validate_request(self, json_request):
        """Analyze request and raise any formatting errors as Exceptions"""
        constraint_type = json_request.get('constraint_type', None)

        # Overriding all other filters if the keyword filter is provided in year-constraint download
        if constraint_type == 'year' and 'elasticsearch_keyword' in json_request[
                'filters']:
            json_request['filters'] = {
                'elasticsearch_keyword':
                json_request['filters']['elasticsearch_keyword'],
                'award_type_codes':
                list(award_type_mapping.keys())
            }
            json_request['limit'] = settings.MAX_DOWNLOAD_LIMIT
            return json_request

        # Validate required parameters
        for required_param in ['award_levels', 'filters']:
            if required_param not in json_request:
                raise InvalidParameterException(
                    'Missing one or more required query parameters: {}'.format(
                        required_param))

        if not isinstance(json_request['award_levels'], list):
            raise InvalidParameterException(
                'Award levels parameter not provided as a list')
        elif len(json_request['award_levels']) == 0:
            raise InvalidParameterException(
                'At least one award level is required.')
        for award_level in json_request['award_levels']:
            if award_level not in VALUE_MAPPINGS:
                raise InvalidParameterException(
                    'Invalid award_level: {}'.format(award_level))

        if not isinstance(json_request['filters'], dict):
            raise InvalidParameterException(
                'Filters parameter not provided as a dict')
        elif len(json_request['filters']) == 0:
            raise InvalidParameterException('At least one filter is required.')

        # Set defaults of non-required parameters
        json_request['columns'] = json_request.get('columns', [])
        json_request['file_format'] = json_request.get('file_format', 'csv')

        # Validate shared filter types and assign defaults
        filters = json_request['filters']
        check_types_and_assign_defaults(filters, SHARED_FILTER_DEFAULTS)

        # Validate award type codes
        if not filters.get('award_type_codes',
                           None) or len(filters['award_type_codes']) < 1:
            filters['award_type_codes'] = list(award_type_mapping.keys())
        for award_type_code in filters['award_type_codes']:
            if award_type_code not in award_type_mapping:
                raise InvalidParameterException(
                    'Invalid award_type: {}'.format(award_type_code))

        # Validate time periods
        total_range_count = validate_time_periods(filters)

        if constraint_type == 'row_count':
            # Validate limit exists and is below MAX_DOWNLOAD_LIMIT
            json_request['limit'] = parse_limit(json_request)

            # Validate row_count-constrained filter types and assign defaults
            check_types_and_assign_defaults(filters,
                                            ROW_CONSTRAINT_FILTER_DEFAULTS)
        elif constraint_type == 'year':
            # Validate combined total dates within one year (allow for leap years)
            if total_range_count > 366:
                raise InvalidParameterException(
                    'Invalid Parameter: time_period total days must be within a year'
                )

            # Validate year-constrained filter types and assign defaults
            check_types_and_assign_defaults(filters,
                                            YEAR_CONSTRAINT_FILTER_DEFAULTS)
        else:
            raise InvalidParameterException(
                'Invalid parameter: constraint_type must be "row_count" or "year"'
            )

        return json_request

    def process_request(self, download_job):
        if settings.IS_LOCAL:
            # Locally, we do not use SQS
            csv_generation.generate_csvs(download_job=download_job)
        else:
            # Send a SQS message that will be processed by another server which will eventually run
            # csv_generation.write_csvs(**kwargs) (see generate_zip.py)
            write_to_log(message='Passing download_job {} to SQS'.format(
                download_job.download_job_id),
                         download_job=download_job)
            queue = sqs_queue(region_name=settings.BULK_DOWNLOAD_AWS_REGION,
                              QueueName=settings.BULK_DOWNLOAD_SQS_QUEUE_NAME)
            queue.send_message(MessageBody=str(download_job.download_job_id))

    def get_download_response(self, file_name):
        """Generate download response which encompasses various elements to provide accurate status for state of a
        download job"""
        download_job = DownloadJob.objects.filter(file_name=file_name).first()
        if not download_job:
            raise NotFound(
                'Download job with filename {} does not exist.'.format(
                    file_name))

        # Compile url to file
        file_path = settings.CSV_LOCAL_PATH + file_name if settings.IS_LOCAL else \
            self.s3_handler.get_simple_url(file_name=file_name)

        # Add additional response elements that should be part of anything calling this function
        response = {
            'status':
            download_job.job_status.name,
            'url':
            file_path,
            'message':
            download_job.error_message,
            'file_name':
            file_name,
            # converting size from bytes to kilobytes if file_size isn't None
            'total_size':
            download_job.file_size / 1000 if download_job.file_size else None,
            'total_columns':
            download_job.number_of_columns,
            'total_rows':
            download_job.number_of_rows,
            'seconds_elapsed':
            download_job.seconds_elapsed()
        }

        return Response(response)
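
For reference, the shape of the payload returned by get_download_response above, shown with made-up values; total_size is kilobytes (file_size / 1000) and url is either the local CSV path or the S3 URL depending on settings.IS_LOCAL:

# Hypothetical response body; keys mirror the dict built in get_download_response,
# every value here is made up.
download_status_response = {
    'status': 'ready',                 # download_job.job_status.name
    'url': 'https://example-bucket.s3.amazonaws.com/awards_20200105.zip',
    'message': None,                   # download_job.error_message
    'file_name': 'awards_20200105.zip',
    'total_size': 1024.5,              # kilobytes, or None while the file is still being generated
    'total_columns': 42,
    'total_rows': 100000,
    'seconds_elapsed': 12.3,           # whatever download_job.seconds_elapsed() returns
}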