Code example #1
def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit):
    """Write to csv and zip files using the source data"""
    d_map = {
        "d1": "contracts",
        "d2": "assistance",
        "treasury_account": "treasury_account",
        "federal_account": "federal_account",
    }
    if download_job and download_job.monthly_download:
        # Use existing detailed filename from parent file for monthly files
        # e.g. `019_Assistance_Delta_20180917_%s.csv`
        source_name = strip_file_extension(download_job.file_name)
    elif source.is_for_idv or source.is_for_contract:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50))
    else:
        source_name = "{}_{}_{}".format(
            source.agency_code, d_map[source.file_type], VALUE_MAPPINGS[source.source_type]["download_name"]
        )
    source_query = source.row_emitter(columns)
    source.file_name = "{}.csv".format(source_name)
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message="Preparing to download data as {}".format(source_name), download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(source_query, limit, source, download_job, columns)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql, args=(temp_file_path, source_path, download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        # Log how many rows we have
        write_to_log(message="Counting rows in CSV", download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_csv_file(filename=source_path, has_header=True)
        except Exception:
            write_to_log(message="Unable to obtain CSV line count", is_error=True, download_job=download_job)
        download_job.save()

        # Create a separate process to split the large csv into smaller csvs and write to zip; wait
        zip_process = multiprocessing.Process(
            target=split_and_zip_csvs, args=(zip_file_path, source_path, source_name, download_job)
        )
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
Code example #2
def build_data_file_name(source, download_job, piid, assistance_id):
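    """Build the base (extension-less) file name for a download's data file."""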
    d_map = {
        "d1": "Contracts",
        "d2": "Assistance",
        "treasury_account": "TAS",
        "federal_account": "FA"
    }

    if download_job and download_job.monthly_download:
        # For monthly archives, use the existing detailed zip filename for the data files
        # e.g. FY(All)-012_Contracts_Delta_20191108.zip -> FY(All)-012_Contracts_Delta_20191108_%.csv
        return strip_file_extension(download_job.file_name)

    file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
    timestamp = datetime.strftime(datetime.now(timezone.utc),
                                  "%Y-%m-%d_H%HM%MS%S")

    if source.is_for_idv or source.is_for_contract:
        data_file_name = file_name_pattern.format(
            piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        data_file_name = file_name_pattern.format(
            assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN",
                                                      50))
    elif source.source_type == "disaster_recipient":
        data_file_name = file_name_pattern.format(
            award_category=source.award_category, timestamp=timestamp)
    else:
        if source.agency_code == "all":
            agency = "All"
        else:
            agency = str(source.agency_code)

        request = json.loads(download_job.json_request)
        filters = request["filters"]
        if request.get("limit"):
            agency = ""
        elif source.file_type not in ("treasury_account", "federal_account"):
            agency = f"{agency}_"

        data_file_name = file_name_pattern.format(
            agency=agency,
            data_quarters=construct_data_date_range(filters),
            level=d_map[source.file_type],
            timestamp=timestamp,
            type=d_map[source.file_type],
        )

    return data_file_name
Code example #3
def create_unique_filename(json_request, origination=None):
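    """Build a unique, timestamped zip file name for a download request."""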
    timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S%f")
    request_agency = json_request.get("agency", "all")

    if json_request.get("is_for_idv"):
        download_name = f"IDV_{slugify_text_for_file_names(json_request.get('piid'), 'UNKNOWN', 50)}_{timestamp}.zip"
    elif json_request.get("is_for_contract"):
        download_name = f"CONT_{slugify_text_for_file_names(json_request.get('piid'), 'UNKNOWN', 50)}_{timestamp}.zip"
    elif json_request.get("is_for_assistance"):
        slug_text = slugify_text_for_file_names(json_request.get("assistance_id"), "UNKNOWN", 50)
        download_name = f"ASST_{slug_text}_{timestamp}.zip"
    elif json_request["request_type"] == "account":
        file_name_template = obtain_zip_filename_format(json_request["download_types"])
        agency = obtain_filename_prefix_from_agency_id(request_agency)
        level = "FA" if json_request["account_level"] == "federal_account" else "TAS"
        data_quarters = construct_data_date_range(json_request["filters"])

        download_name = file_name_template.format(
            agency=agency, data_quarters=data_quarters, level=level, timestamp=timestamp,
        )
    else:  # "award" downloads
        agency = ""

        # Keyword Search downloads are routed through the "Bulk Download" endpoint for unknown reasons;
        # check for the elasticsearch_keyword filter so the filename mimics the Advanced Search download
        if origination == "bulk_download" and "elasticsearch_keyword" not in json_request["filters"]:
            agency = obtain_filename_prefix_from_agency_id(request_agency) + "_"

        award_type_name = create_award_level_string(json_request["download_types"])
        download_name = f"{agency}{award_type_name}_{timestamp}.zip"

    return download_name
Code example #4
def create_unique_filename(json_request, request_agency=None):
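    """Build a timestamped zip file name for a download request."""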
    if json_request.get("is_for_idv"):
        download_name = "IDV_" + slugify_text_for_file_names(
            json_request.get("piid"), "UNKNOWN", 50)
    elif json_request.get("is_for_contract"):
        download_name = "CONT_" + slugify_text_for_file_names(
            json_request.get("piid"), "UNKNOWN", 50)
    elif json_request.get("is_for_assistance"):
        download_name = "ASST_" + slugify_text_for_file_names(
            json_request.get("assistance_id"), "UNKNOWN", 50)
    else:
        download_types = json_request["download_types"]
        prefix = obtain_filename_prefix_from_agency_id(request_agency)
        award_type_name = create_award_level_string(download_types)
        download_name = "{}_{}".format(prefix, award_type_name)
    timestamped_file_name = get_timestamped_filename(
        "{}.zip".format(download_name))
    return timestamped_file_name
Code example #5
def parse_source(source, columns, download_job, working_dir, piid,
                 assistance_id, zip_file_path, limit, extension):
    """Write to delimited text file(s) and zip file(s) using the source data"""
    d_map = {
        "d1": "Contracts",
        "d2": "Assistance",
        "treasury_account": "TAS",
        "federal_account": "FA",
    }
    if download_job and download_job.monthly_download:
        # For monthly archives, use the existing detailed zip filename for the data files
        # e.g. FY(All)-012_Contracts_Delta_20191108.zip -> FY(All)-012_Contracts_Delta_20191108_%.csv
        source_name = strip_file_extension(download_job.file_name)
    elif source.is_for_idv or source.is_for_contract:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(
            piid=slugify_text_for_file_names(piid, "UNKNOWN", 50))
    elif source.is_for_assistance:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]
        source_name = file_name_pattern.format(
            assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN",
                                                      50))
    else:
        file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"]

        if source.agency_code == "all":
            agency = "All"
        else:
            agency = str(source.agency_code)

        request = json.loads(download_job.json_request)
        filters = request["filters"]
        if request.get("limit"):
            agency = ""
        elif source.file_type not in ("treasury_account", "federal_account"):
            agency = f"{agency}_"
        timestamp = datetime.strftime(datetime.now(timezone.utc),
                                      "%Y-%m-%d_H%HM%MS%S")
        source_name = file_name_pattern.format(
            agency=agency,
            data_quarters=construct_data_date_range(filters),
            level=d_map[source.file_type],
            timestamp=timestamp,
            type=d_map[source.file_type],
        )

    source_query = source.row_emitter(columns)
    source.file_name = f"{source_name}.{extension}"
    source_path = os.path.join(working_dir, source.file_name)

    write_to_log(message=f"Preparing to download data as {source.file_name}",
                 download_job=download_job)

    # Generate the query file; values, limits, dates fixed
    temp_file, temp_file_path = generate_temp_query_file(
        source_query, limit, source, download_job, columns, extension)

    start_time = time.perf_counter()
    try:
        # Create a separate process to run the PSQL command; wait
        psql_process = multiprocessing.Process(target=execute_psql,
                                               args=(temp_file_path,
                                                     source_path,
                                                     download_job))
        psql_process.start()
        wait_for_process(psql_process, start_time, download_job)

        delim = FILE_FORMATS[extension]["delimiter"]

        # Log how many rows we have
        write_to_log(message="Counting rows in delimited text file",
                     download_job=download_job)
        try:
            download_job.number_of_rows += count_rows_in_delimited_file(
                filename=source_path, has_header=True, delimiter=delim)
        except Exception:
            write_to_log(
                message="Unable to obtain delimited text file line count",
                is_error=True,
                download_job=download_job)
        download_job.save()

        # Create a separate process to split the large data files into smaller files and write them to the zip; wait
        zip_process = multiprocessing.Process(target=split_and_zip_data_files,
                                              args=(zip_file_path, source_path,
                                                    source_name, extension,
                                                    download_job))
        zip_process.start()
        wait_for_process(zip_process, start_time, download_job)
        download_job.save()
    except Exception as e:
        raise e
    finally:
        # Remove temporary files
        os.close(temp_file)
        os.remove(temp_file_path)
Code example #6
def test_slugify_text_for_file_names():
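    """Exercise slugify_text_for_file_names across defaults, truncation, and non-ASCII input."""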
    assert slugify_text_for_file_names("test") == "test"
    assert slugify_text_for_file_names("test", "default") == "test"
    assert slugify_text_for_file_names("test", "default", 50) == "test"
    assert slugify_text_for_file_names("test", "default", 2) == "te"

    assert slugify_text_for_file_names(None) is None
    assert slugify_text_for_file_names(None, None) is None
    assert slugify_text_for_file_names(None, "default") == "default"
    assert slugify_text_for_file_names(None, "default", 50) == "default"
    assert slugify_text_for_file_names(None, "default", 2) == "default"

    assert slugify_text_for_file_names("---") is None
    assert slugify_text_for_file_names("---", "default") == "default"
    assert slugify_text_for_file_names("---", "default", 50) == "default"
    assert slugify_text_for_file_names("---", "default", 2) == "default"

    garbage = r")*(&*()THIS(#@$@^*IS<,>.?/:;\"'{[}]|\+=_-)(*&^%$#@!A><?>\":}TEST()(*&(*&*(*"
    assert slugify_text_for_file_names(garbage) == "THIS_IS_A_TEST"
    assert slugify_text_for_file_names(garbage, "default") == "THIS_IS_A_TEST"
    assert slugify_text_for_file_names(garbage, "default",
                                       50) == "THIS_IS_A_TEST"
    assert slugify_text_for_file_names(garbage, "default", 2) == "TH"

    assert slugify_text_for_file_names("áéíóúüñ") == "aeiouun"
    assert slugify_text_for_file_names("áéíóúüñ", "default") == "aeiouun"
    assert slugify_text_for_file_names("áéíóúüñ", "default", 50) == "aeiouun"
    assert slugify_text_for_file_names("áéíóúüñ", "default", 2) == "ae"

    assert slugify_text_for_file_names("buenos días") == "buenos_dias"
    assert slugify_text_for_file_names("buenos días",
                                       "default") == "buenos_dias"
    assert slugify_text_for_file_names("buenos días", "default",
                                       50) == "buenos_dias"
    assert slugify_text_for_file_names("buenos días", "default", 2) == "bu"

    # None of these contain any ASCII-compatible characters.
    assert slugify_text_for_file_names("Καλημέρα") is None
    assert slugify_text_for_file_names("Καλημέρα", "default") == "default"
    assert slugify_text_for_file_names("Καλημέρα", "default", 50) == "default"
    assert slugify_text_for_file_names("Καλημέρα", "default", 2) == "default"

    assert slugify_text_for_file_names("Доброе утро") is None
    assert slugify_text_for_file_names("Доброе утро", "default") == "default"
    assert slugify_text_for_file_names("Доброе утро", "default",
                                       50) == "default"
    assert slugify_text_for_file_names("Доброе утро", "default",
                                       2) == "default"

    assert slugify_text_for_file_names("早上好/ 早") is None
    assert slugify_text_for_file_names("早上好/ 早", "default") == "default"
    assert slugify_text_for_file_names("早上好/ 早", "default", 50) == "default"
    assert slugify_text_for_file_names("早上好/ 早", "default", 2) == "default"
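
The assertions above pin down the expected behavior of slugify_text_for_file_names: transliterate to ASCII, collapse separators into underscores, truncate to an optional maximum length, and fall back to a default when nothing usable remains. A minimal standard-library sketch consistent with those assertions follows; it is inferred from the tests, not taken from the project's implementation, and reuses the helper's name only for illustration.

import re
import unicodedata


def slugify_text_for_file_names(text, default=None, max_length=None):
    # Minimal sketch inferred from the test expectations above; not the project's implementation.
    if not text:
        return default

    # Transliterate accented characters and drop anything that is not ASCII.
    ascii_text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

    # Collapse runs of non-alphanumeric characters into single underscores.
    slug = re.sub(r"[^A-Za-z0-9]+", "_", ascii_text).strip("_")

    if not slug:
        return default
    if max_length:
        return slug[:max_length]
    return slug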