def test_http_copy(temp_file_path):

    RetrieveFileFromUri(URL).copy(temp_file_path)

    # Read back the copied file to confirm the HTTP copy succeeded.
    with open(temp_file_path) as f:
        c = f.read()
        assert type(c) is str
        assert len(c) > 0
def test_retrieve_from_http():

    with RetrieveFileFromUri(URL).get_file_object() as f:
        c = f.read()
        assert type(c) is bytes
        assert len(c) > 0

    with RetrieveFileFromUri(URL).get_file_object(True) as f:
        c = f.read()
        assert type(c) is str
        assert len(c) > 0
def test_file_copy(temp_file_path):

    RetrieveFileFromUri(__file__).copy(temp_file_path)

    with RetrieveFileFromUri(__file__).get_file_object(True) as f:
        c1 = f.read()

    with open(__file__) as f:
        c2 = f.read()

    assert c1 == c2
    @contextmanager
    def _retrieve_file(file_path):
        """
        Remote file context manager.  If the file is not local, copies it to a local temporary
        location, yields the temporary file path, and cleans up afterwards.  Otherwise, simply
        yields the original path.
        """
        file = RetrieveFileFromUri(file_path)
        is_local = file.parsed_url_obj.scheme in ("file", "")  # Our best guess.

        if not is_local:
            with TemporaryDirectory() as temp_dir:
                temp_file_path = str(Path(temp_dir) / "local_file_copy")
                with Timer("Download file"):
                    file.copy(temp_file_path)
                yield temp_file_path
        else:
            yield file_path
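A minimal usage sketch for the helper above, assuming _retrieve_file is reachable in scope and decorated with contextlib.contextmanager as shown; the URI is a placeholder:

# Hedged usage sketch; the URI below is a placeholder, not a real endpoint.
# Whether the URI is remote or local, the block receives a usable local path,
# and any temporary copy is removed automatically when the block exits.
with _retrieve_file("https://example.com/data.csv") as local_path:
    with open(local_path, "rb") as f:
        print(f"Read {len(f.read())} bytes from {local_path}")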
Example 5
    def handle(self, *args, **options):

        source_location = options["source_location"]
        logger.info(f"SOURCE CSV LOCATION: {source_location}")

        with psycopg2.connect(get_database_dsn_string()) as connection:
            with connection.cursor() as cursor:
                self.connection = connection
                self.cursor = cursor
                for file_name, table_name in FILE_TO_TABLE_MAPPING.items():
                    with Timer(f"Copy {file_name}"):
                        uri = os.path.join(source_location, file_name)
                        file_path = RetrieveFileFromUri(uri).copy_to_temporary_file()
                    with Timer(f"Get CSV headers from {file_name}"):
                        headers = self._get_headers(file_path)
                    with Timer(f"Create temporary table {table_name}"):
                        self._create_temporary_table(table_name, headers)
                    with Timer(f"Import {file_name}"):
                        self._import_file(file_path, table_name)
                    os.remove(file_path)

                destination_table_name = HistoricalAppropriationAccountBalances._meta.db_table
                with Timer(f"Empty {destination_table_name}"):
                    cursor.execute(f"delete from {destination_table_name}")
                with Timer(f"Import into {destination_table_name}"):
                    self._import_data()
Example 6
def read_file_for_database_ids(provided_uri,
                               chunk_count,
                               is_numeric: bool = True):
    """wrapped generator to read file and stream IDs"""

    # Numeric IDs are parsed with int(), which needs text lines;
    # non-numeric IDs are kept as raw bytes and merely stripped.
    use_text = is_numeric
    try:
        with RetrieveFileFromUri(provided_uri).get_file_object(text=use_text) as file:
            chunk_list = []
            while True:
                line = file.readline()
                if not line:
                    break
                elif is_numeric:
                    chunk_list.append(int(line))
                else:
                    chunk_list.append(line.strip())

                if len(chunk_list) >= chunk_count:
                    yield chunk_list
                    chunk_list = []

            yield chunk_list

    except Exception as e:
        raise RuntimeError(f"Issue with reading/parsing file: {e}")
def add_data_dictionary_to_zip(working_dir, zip_file_path):
    write_to_log(message="Adding data dictionary to zip file")
    data_dictionary_file_name = "Data_Dictionary_Crosswalk.xlsx"
    data_dictionary_file_path = os.path.join(working_dir, data_dictionary_file_name)
    data_dictionary_url = settings.DATA_DICTIONARY_DOWNLOAD_URL
    RetrieveFileFromUri(data_dictionary_url).copy(data_dictionary_file_path)
    append_files_to_zip_file([data_dictionary_file_path], zip_file_path)
def test_s3_copy(temp_file_path):

    # Again, need an actual file to test against.

    with pytest.raises(NotImplementedError):
        with RetrieveFileFromUri("s3://whatever/file.txt").copy(
                temp_file_path):
            pass
Example 9
    def read_schedules_from_csv(self, file_path):

        logger.info("Reading from file: {}".format(file_path))

        with RetrieveFileFromUri(file_path).get_file_object(True) as file:
            csv_reader = csv.DictReader(file)
            submission_schedule_objs = [DABSSubmissionWindowSchedule(**values) for values in csv_reader]
            return submission_schedule_objs
def build_file_description(source_file_template, sources):

    # Read in file description ignoring lines that start with #.
    with RetrieveFileFromUri(source_file_template).get_file_object() as f:
        file_description_text = "".join(tuple(line.decode("utf-8") for line in f if not line.startswith(b"#")))

    # Replace source.source_type keys with source.file_name values.
    return file_description_text.format(**{source.source_type: source.file_name for source in sources})
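A small, self-contained sketch of the substitution performed above; the template text and source objects are hypothetical stand-ins with the source_type and file_name attributes the function expects:

# Hedged sketch of the str.format() substitution; the template and sources
# below are invented for illustration and are not part of the original module.
from collections import namedtuple

Source = namedtuple("Source", ["source_type", "file_name"])
sources = [Source("contracts", "contracts_2020.csv")]

file_description_text = "Contract data can be found in {contracts}"
print(file_description_text.format(**{s.source_type: s.file_name for s in sources}))
# -> Contract data can be found in contracts_2020.csv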
Example 11
def test_retrieve_from_s3():
    # Can't currently think of a good way to test this.  We'd need a public,
    # always available S3 file.

    # Can test the not implemented bit, though.
    with pytest.raises(NotImplementedError):
        with RetrieveFileFromUri("s3://whatever/file.txt").get_file_object(
                True):
            pass
Example 12
def load_glossary(path, append):

    logger = logging.getLogger("script")

    file_object = RetrieveFileFromUri(path).get_file_object()
    wb = load_workbook(filename=BytesIO(file_object.read()))
    ws = wb.active
    rows = ws.rows

    headers = [c.value for c in next(rows)[:6]]
    expected_headers = [
        "Term",
        "Plain Language Descriptions",
        "DATA Act Schema Term",
        "DATA Act Schema Definition",
        "More Resources",
        "Markdown for More Resources",
    ]
    if headers != expected_headers:
        raise Exception("Expected headers of {} in {}".format(
            expected_headers, path))

    if append:
        logging.info("Appending definitions to existing guide")
    else:
        logging.info("Deleting existing definitions from guide")
        Definition.objects.all().delete()

    field_names = ("term", "plain", "data_act_term", "official", None,
                   "resources")
    row_count = 0
    for row in rows:
        if not row[0].value:
            break  # Reads file only until a line with blank `term`
        definition = Definition()
        for (i, field_name) in enumerate(field_names):
            if field_name:
                setattr(definition, field_name, row[i].value)
        definition.save()
        row_count += 1
    logger.info("{} definitions loaded from {}".format(row_count, path))
Example 13
    def load_fpds_from_file(self, file_path: str) -> None:
        """Loads arbitrary set of ids, WITHOUT checking for deletes"""
        total_count = 0
        with RetrieveFileFromUri(file_path).get_file_object() as file:
            logger.info(f"Loading transactions from IDs in {file_path}")
            for next_batch in self.gen_read_file_for_ids(file):
                id_list = [int(re.search(r"\d+", x).group()) for x in next_batch]
                total_count += len(id_list)
                logger.info(f"Loading next batch (size: {len(id_list)}, ids {id_list[0]}-{id_list[-1]})...")
                self.modified_award_ids.extend(load_fpds_transactions(id_list))

        logger.info(f"Total transaction IDs in file: {total_count}")
Example 14
def test_iobase_api(temp_file_path):
    """Testing IOBase API https://docs.python.org/3/library/io.html#io.IOBase
    using a well-written standard library function, like open() will pass these
    with flying colors. tempfile.SpooledTemporaryFile() is missing some expected
    API which can cause issues. If you see an attribute error like below then
    it might be good to leverage the custom class

    ```AttributeError: 'SpooledTemporaryFile' object has no attribute 'readable'```
    """

    sources = (__file__, SMALL_FILE_URL)

    def test_methods(f):
        assert f.tell() == 0
        assert f.readable() is True
        assert f.seekable() is True

    for source in sources:
        with RetrieveFileFromUri(source).get_file_object() as f:
            test_methods(f)
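A hedged sketch of the SpooledTemporaryFile gap described in the docstring above. On older Python versions (before 3.11, where SpooledTemporaryFile gained the full IOBase interface) readable() can be missing entirely, so a defensive caller can probe for it:

# Hedged sketch, not part of the test module; probes for the attribute instead
# of assuming the full io.IOBase interface is present.
import tempfile

with tempfile.SpooledTemporaryFile() as spooled:
    readable = getattr(spooled, "readable", None)
    if readable is None:
        print("readable() not implemented on this Python version")
    else:
        print(f"readable() -> {readable()}")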
    def handle(self, *args, **options):
        script_start_time = perf_counter()
        logger.info("Starting load_rosetta management command")
        logger.info("Loading data from {}".format(options["path"]))

        try:
            local_filepath = Path("temp_local_data_dictionary.xlsx")
            RetrieveFileFromUri(options["path"]).copy(str(local_filepath))
            rosetta_object = extract_data_from_source_file(filepath=local_filepath)
        except Exception:
            # Re-raise after logging; otherwise rosetta_object would be undefined below.
            logger.exception("Exception during file retrieval or parsing")
            raise
        finally:
            if local_filepath.exists():
                local_filepath.unlink()

        rosetta_object["metadata"]["download_location"] = options["path"]

        load_xlsx_data_to_model(rosetta_object)

        logger.info("Script completed in {:.2f}s".format(perf_counter() -
                                                         script_start_time))
Example 16
def read_afa_ids_from_file(afa_id_file_path):
    with RetrieveFileFromUri(afa_id_file_path).get_file_object() as f:
        return tuple(line.decode("utf-8").rstrip() for line in f if line)
Example 17
def load_from_url(rfc_path_string):
    with RetrieveFileFromUri(rfc_path_string).get_file_object() as data_file_handle:
        return load_cfda_csv_into_pandas(data_file_handle)
def test_retrieve_from_s3():
    # Can't currently think of a good way to test this.  We'd need a public,
    # always available S3 file.
    with RetrieveFileFromUri("s3://whatever/file.txt").get_file_object(True):
        pass
Example 19
    def csv_tas_loader(self, file_path):
        field_map = {
            "treasury_account_identifier": "ACCT_NUM",
            "allocation_transfer_agency_id": "ATA",
            "agency_id": "AID",
            "beginning_period_of_availability": "BPOA",
            "ending_period_of_availability": "EPOA",
            "availability_type_code": "A",
            "main_account_code": "MAIN",
            "sub_account_code": "SUB",
            "account_title": "GWA_TAS_NAME",
            "reporting_agency_id": "Agency AID",
            "reporting_agency_name": "Agency Name",
            "budget_bureau_code": "ADMIN_ORG",
            "budget_bureau_name": "Admin Org Name",
            "fr_entity_code": "FR Entity Type",
            "fr_entity_description": "FR Entity Description",
            "budget_function_code": "Function Code",
            "budget_function_title": "Function Description",
            "budget_subfunction_code": "Sub Function Code",
            "budget_subfunction_title": "Sub Function Description",
        }

        value_map = {
            "data_source": "USA",
            "tas_rendering_label": self.generate_tas_rendering_label,
            "awarding_toptier_agency": None,
            "funding_toptier_agency": None,
            "internal_start_date": lambda row: datetime.strftime(
                datetime.strptime(row["DT_TM_ESTAB"], "%m/%d/%Y  %H:%M:%S"), "%Y-%m-%d"
            ),
            "internal_end_date": lambda row: datetime.strftime(
                datetime.strptime(row["DT_END"], "%m/%d/%Y  %H:%M:%S"), "%Y-%m-%d"
            )
            if row["DT_END"]
            else None,
        }

        with RetrieveFileFromUri(file_path).get_file_object(True) as tas_list_file_object:
            # Get a total count for print out
            tas_list_reader = csv.DictReader(tas_list_file_object)
            total_count = len(list(tas_list_reader))

            # Reset the reader back to the beginning of the file
            tas_list_file_object.seek(0)
            tas_list_reader = csv.DictReader(tas_list_file_object)

            for count, row in enumerate(tas_list_reader, 1):
                for key, value in row.items():
                    row[key] = value.strip() or None

                # Check to see if we need to update or create a TreasuryAppropriationAccount record
                current_record = TreasuryAppropriationAccount.objects.filter(
                    treasury_account_identifier=row["ACCT_NUM"]
                ).first()
                taa_instance = current_record or TreasuryAppropriationAccount()

                # Don't load Financing TAS
                if row["financial_indicator_type2"] == "F":
                    if taa_instance.treasury_account_identifier:
                        taa_instance.delete()
                    logger.info("   Row contains Financing TAS, Skipping...")
                    continue

                load_data_into_model(taa_instance, row, field_map=field_map, value_map=value_map, save=True)

                if count % 1000 == 0:
                    logger.info("   Loaded {} rows of {}".format(count, total_count))
Example 20
def read_csv_file_as_list_of_dictionaries(file_path):
    """
    Read in the specified CSV file and return as a list of dictionaries ("records").
    """
    with RetrieveFileFromUri(file_path).get_file_object(True) as f:
        return list(csv.DictReader(f))
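A brief usage sketch for the helper above; the file path and the column names in the comment are placeholders:

# Hedged usage sketch; "accounts.csv" is a placeholder path.  Each CSV row
# becomes a dict keyed by the header row, e.g. {"agency_id": "012", "name": "Treasury"}.
records = read_csv_file_as_list_of_dictionaries("accounts.csv")
print(f"Loaded {len(records)} records")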
Example 21
def read_afa_ids_from_file(afa_id_file_path):
    with RetrieveFileFromUri(afa_id_file_path).get_file_object() as f:
        return {line.decode("utf-8").rstrip() for line in f if line}