def _perform_load(self):
    """
    Load disaster emergency fund (DEF) codes: stage the raw CSV in a temp
    table, validate it, then sync changed/missing rows into the destination.
    """
    # Stamp both timestamps at insert time; refresh only update_date on update.
    def_code_table = ETLTable(
        "disaster_emergency_fund_code",
        insert_overrides={"create_date": SQL("now()"), "update_date": SQL("now()")},
        update_overrides={"update_date": SQL("now()")},
    )
    staging_table = ETLTemporaryTable("temp_load_disaster_emergency_fund_codes")

    self._execute_dml_sql(CREATE_TEMP_TABLE, "Create disaster_emergency_fund_code temp table")
    self._execute_function_and_log(self._read_raw_def_code_csv, "Read raw DEF Code csv")
    self._execute_function(self._validate_raw_def_codes, "Validate raw DEF Codes")
    self._execute_function_and_log(self._import_def_codes, "Import DEF Codes")
    self._execute_function_and_log(update_changed_rows, "Update changed DEF Codes", staging_table, def_code_table)
    self._execute_function_and_log(insert_missing_rows, "Insert missing DEF Codes", staging_table, def_code_table)
def test_stage_table(operations_fixture):
    """stage_table should copy the source table's rows verbatim into the staging table."""
    source, destination, staging = ETLTable("t1"), ETLTable("t2"), ETLTemporaryTable("t3")
    operations.stage_table(source, destination, staging)
    staged_rows = execute_sql("select id1, id2 from t3 order by id1")
    assert staged_rows == [(1, 2), (4, 5), (8, 8)]
def _perform_load(self):
    """
    Sync city/county/state codes from the staged input file into the
    destination table.

    Returns:
        Total number of changed rows (deleted + updated + inserted).

    Raises:
        RuntimeError: if the change volume exceeds the MAX_CHANGE_PERCENT
            threshold and the --force switch was not provided.
    """
    self._create_staging_table()
    self._import_input_file()

    # Rows are matched on the feature's natural key, not the surrogate id.
    natural_key = ["feature_id", "state_alpha", "county_sequence", "county_numeric"]
    destination = ETLTable(self.destination_table_name, key_overrides=natural_key)
    source = ETLTemporaryTable(self.staging_table_name)

    with Timer("Delete obsolete rows"):
        deleted = operations.delete_obsolete_rows(source, destination)
        logger.info(f"{deleted:,} rows deleted")

    with Timer("Update changed rows"):
        updated = operations.update_changed_rows(source, destination)
        logger.info(f"{updated:,} rows updated")

    with Timer("Insert missing rows"):
        inserted = operations.insert_missing_rows(source, destination)
        logger.info(f"{inserted:,} rows inserted")

    with Timer("Remove staging table"):
        execute_dml_sql(f'drop table if exists "{self.staging_table_name}"')

    change_count = deleted + updated + inserted

    # Sanity guard: refuse a suspiciously large change volume (e.g. from a bad
    # input file) unless the caller explicitly forced the load.
    current_record_count = CityCountyStateCode.objects.count()
    max_change_threshold = int(current_record_count * MAX_CHANGE_PERCENT / 100.0)
    if current_record_count > 0 and change_count > max_change_threshold and not self.force:
        raise RuntimeError(
            f"Exceeded maximum number of allowed changes {max_change_threshold:,} ({MAX_CHANGE_PERCENT}%). "
            f"Use --force switch if this was intentional."
        )

    return change_count
def _perform_load(self):
    """
    Load object classes: stage the raw CSV, validate and derive fields, then
    sync changed/missing rows into the object_class table.
    """
    # Stamp both timestamps at insert time; refresh only update_date on update.
    # Rows are keyed on (object_class, direct_reimbursable).
    object_class_table = ETLTable(
        "object_class",
        insert_overrides={"create_date": SQL("now()"), "update_date": SQL("now()")},
        update_overrides={"update_date": SQL("now()")},
        key_overrides=["object_class", "direct_reimbursable"],
    )
    staging_table = ETLTemporaryTable("temp_load_object_classes")

    self._execute_dml_sql(CREATE_TEMP_TABLE, "Create object_class temp table")
    self._execute_function_and_log(self._read_raw_object_classes_csv, "Read raw object class csv")
    self._execute_function(self._validate_raw_object_classes, "Validate raw object classes")
    self._execute_function(self._add_unknown_object_classes, 'Add "unknown" object classes')
    self._execute_function(self._derive_remaining_fields, "Derive remaining fields")
    self._execute_function_and_log(self._import_object_classes, "Import object classes")

    # NOT deleting object classes is intentional for historical reasons.
    self._execute_function_and_log(
        update_changed_rows, "Update changed object classes", staging_table, object_class_table
    )
    self._execute_function_and_log(
        insert_missing_rows, "Insert missing object classes", staging_table, object_class_table
    )
def test_identify_new_or_updated(operations_fixture):
    """Only rows that are new or differ from the destination should land in staging."""
    operations.identify_new_or_updated(ETLTable("t1"), ETLTable("t2"), ETLTemporaryTable("t3"))
    staged_rows = execute_sql("select id1, id2 from t3 order by id1")
    assert staged_rows == [(4, 5), (8, 8)]
def test_etl_table(make_a_table):
    """Exercise ETLTable column metadata, overrides, error cases, and representation."""
    expected_types = OrderedDict(
        (
            ("id1", primatives.ColumnDefinition("id1", "integer", True)),
            ("id2", primatives.ColumnDefinition("id2", "integer", True)),
            ("name", primatives.ColumnDefinition("name", "text", False)),
            ("description", primatives.ColumnDefinition("description", "text", False)),
        )
    )

    # Happy path.
    tbl = ETLTable("totally_tubular_testing_table_for_two", "public")
    assert tbl.columns == ["id1", "id2", "name", "description"]
    assert dict(tbl.data_types) == expected_types
    assert tbl.update_overrides == {}
    assert tbl.insert_overrides == {}
    assert tbl.key_columns == [
        primatives.ColumnDefinition("id1", "integer", True),
        primatives.ColumnDefinition("id2", "integer", True),
    ]
    assert cc(tbl.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # No schema supplied — representation still qualifies with "public".
    tbl = ETLTable("totally_tubular_testing_table_for_two")
    assert cc(tbl.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # Happy overrides.
    tbl = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["id1"],
        insert_overrides={"name": Literal("hello")},
        update_overrides={"description": Literal("o hi")},
    )
    assert tbl.columns == ["id1", "id2", "name", "description"]
    assert dict(tbl.data_types) == expected_types
    assert tbl.insert_overrides == {"name": Literal("hello")}
    assert tbl.update_overrides == {"description": Literal("o hi")}
    assert tbl.key_columns == [primatives.ColumnDefinition("id1", "integer", True)]
    assert cc(tbl.object_representation) == '"public"."totally_tubular_testing_table_for_two"'

    # Nonexistent overrides — each affected property raises on access.
    tbl = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        key_overrides=["bogus"],
        insert_overrides={"bogus": SQL("now()")},
        update_overrides={"bogus": SQL("now()")},
    )
    for prop in ("update_overrides", "insert_overrides", "key_columns"):
        with pytest.raises(RuntimeError):
            assert getattr(tbl, prop) is not None

    # Attempt to override primary keys.
    tbl = ETLTable(
        "totally_tubular_testing_table_for_two",
        "public",
        insert_overrides={"id1": SQL("now()")},
        update_overrides={"id1": SQL("now()")},
    )
    for prop in ("update_overrides", "insert_overrides"):
        with pytest.raises(RuntimeError):
            assert getattr(tbl, prop) is not None

    # Everything for ETLTemporaryTable is the same as ETLTable just no schema.
    tbl = ETLTemporaryTable("totally_tubular_testing_table_for_two")
    assert cc(tbl.object_representation) == '"totally_tubular_testing_table_for_two"'
def _perform_load(self):
    """ Grab the Broker subaward table and use it to update ours. """
    broker_subaward = ETLTable("broker_subaward")
    subaward = ETLTable("subaward")
    remote_subaward = ETLDBLinkTable("subaward", settings.DATA_BROKER_DBLINK_NAME, broker_subaward.data_types)

    staging_broker_subaward = ETLTemporaryTable("temp_load_subawards_broker_subaward")
    staging_new_or_updated = ETLTemporaryTable("temp_load_subawards_new_or_updated")
    staging_subaward = ETLTemporaryTable("temp_load_subawards_subaward")

    if self.full_reload:
        logger.info("--full-reload switch provided. Emptying subaward tables.")
        self._execute_dml_sql("delete from broker_subaward", "Empty broker_subaward table")
        self._execute_dml_sql("delete from subaward", "Empty subaward table")

    # Pull the remote Broker table across the dblink into our local copy.
    self._execute_function_and_log(
        operations.stage_table,
        "Copy Broker subaward table",
        source=remote_subaward,
        destination=broker_subaward,
        staging=staging_broker_subaward,
    )

    # To help performance.
    self._execute_dml_sql(
        "create index idx_temp_load_subawards_broker_subaward on temp_load_subawards_broker_subaward(id)",
        "Create temporary index",
    )

    # Create a list of new or updated subawards that we can use to filter down
    # subsequent operations.
    self._execute_function_and_log(
        operations.identify_new_or_updated,
        "Identify new/updated rows",
        source=staging_broker_subaward,
        destination=broker_subaward,
        staging=staging_new_or_updated,
    )

    self._delete_update_insert_rows("broker_subawards", staging_broker_subaward, broker_subaward)

    # The Broker subaward table takes up a good bit of space so let's explicitly
    # free it up before continuing.
    self._execute_dml_sql("drop table if exists temp_load_subawards_broker_subaward", "Drop staging table")

    # NOTE(review): some script names below carry an explicit ".sql" suffix while
    # most do not — presumably the helper normalizes the extension, but confirm
    # and make these consistent.
    self._execute_etl_dml_sql_directory_file("030_frame_out_subawards")
    self._execute_etl_dml_sql_directory_file("040_enhance_with_awards_data")
    self._execute_etl_dml_sql_directory_file("050_enhance_with_transaction_fpds_data.sql")
    self._execute_etl_dml_sql_directory_file("051_enhance_with_transaction_fabs_data.sql")
    self._execute_etl_dml_sql_directory_file("060_enhance_with_cfda_data")
    self._execute_etl_dml_sql_directory_file("070_enhance_with_awarding_agency")
    self._execute_etl_dml_sql_directory_file("080_enhance_with_funding_agency")

    with Timer("Enhance with city and county information"):
        update_subaward_city_county("temp_load_subawards_subaward")

    self._execute_etl_dml_sql_directory_file("110_enhance_with_pop_country")
    self._execute_etl_dml_sql_directory_file("130_enhance_with_recipient_location_country")

    # Delete from broker_subaward because temp_subaward only has new/updated rows.
    self._execute_function_and_log(
        operations.delete_obsolete_rows, "Delete obsolete subawards", broker_subaward, subaward
    )

    # But update and insert from temp_subaward.
    self._execute_function_and_log(
        operations.update_changed_rows, "Update changed subawards", staging_subaward, subaward
    )
    self._execute_function_and_log(
        operations.insert_missing_rows, "Insert missing subawards", staging_subaward, subaward
    )

    self._execute_etl_dml_sql_directory_file("140_link_awards")
    self._execute_etl_dml_sql_directory_file("150_update_awards")

    # This is for unit tests. Temporary tables go away on their own once the
    # transaction/session drops, but because unit tests run in a transaction,
    # our temporary tables do not go away between loads... which is problematic.
    self._execute_dml_sql("drop table if exists temp_load_subawards_new_or_updated", "Drop new/updated temp table")
    self._execute_dml_sql("drop table if exists temp_load_subawards_subaward", "Drop subaward temp table")