def create_direct_ingest_process_job_task(self, region: Region, ingest_args: IngestArgs):
    """Enqueues a Cloud Task that triggers the process_job endpoint for |region|
    with the serialized |ingest_args| as the request body.
    """
    queue_name = region.get_queue_name()
    task_name = self._build_task_name_for_queue_and_region(
        queue_name, region.region_code, ingest_args.task_id_tag())
    # The handler deserializes this JSON body to reconstruct the ingest args.
    request_body = json.dumps(self._get_body_from_args(ingest_args)).encode()
    task = tasks.types.Task(
        name=task_name,
        app_engine_http_request={
            'relative_uri': f'/direct/process_job?region={region.region_code}',
            'body': request_body,
        })
    self._queue_task(queue_name, task)
def check_is_region_launched_in_env(region: Region) -> None:
    """Checks if direct ingest has been launched for the provided |region| in the
    current GAE env and throws if it has not.
    """
    # Guard clause: nothing to do when ingest is launched for this env.
    if region.is_ingest_launched_in_env():
        return
    gae_env = environment.get_gae_environment()
    error_msg = f'Bad environment [{gae_env}] for region [{region.region_code}].'
    logging.error(error_msg)
    raise DirectIngestError(
        msg=error_msg,
        error_type=DirectIngestErrorType.ENVIRONMENT_ERROR)
def _get_process_job_queue_manager(
        self, region: Region
) -> CloudTaskQueueManager[ProcessIngestJobCloudTaskQueueInfo]:
    """Returns the process-job queue manager for |region|, creating and caching
    it on first use.
    """
    region_code = region.region_code
    manager = self.region_process_job_queue_managers.get(region_code)
    if manager is None:
        # Lazily build one manager per region and memoize it for later calls.
        manager = CloudTaskQueueManager(
            queue_info_cls=ProcessIngestJobCloudTaskQueueInfo,
            queue_name=region.get_queue_name(),
        )
        self.region_process_job_queue_managers[region_code] = manager
    return manager
def mock_region(region_code, queue_name=None, is_stoppable=False):
    """Builds a Region fixture for tests with canned default attributes.

    `queue_name or None` normalizes a falsy queue name (e.g. '') to None, and
    `is_stoppable or False` normalizes falsy values to False, matching the
    original construction exactly.
    """
    region_kwargs = dict(
        region_code=region_code,
        shared_queue=queue_name or None,
        agency_name='the agency',
        agency_type='benevolent',
        base_url='localhost:3000',
        names_file='names.txt',
        timezone='America/New_York',
        environment='production',
        jurisdiction_id='jurisdiction_id',
        is_stoppable=is_stoppable or False,
    )
    return Region(**region_kwargs)
def _mock_region():
    """Returns a canned Region fixture for the module's _REGION_CODE."""
    fixture_attrs = {
        'region_code': _REGION_CODE,
        'shared_queue': 'queue',
        'agency_name': 'the agency',
        'agency_type': 'benevolent',
        'base_url': 'localhost:3000',
        'names_file': 'names.txt',
        'timezone': 'America/Chicago',
        'environment': 'production',
        'jurisdiction_id': '01071001',
        'is_stoppable': False,
    }
    return Region(**fixture_attrs)
def _mock_region():
    """Builds the standard test Region for this module's _REGION_CODE."""
    return Region(
        region_code=_REGION_CODE,
        shared_queue="queue",
        agency_name="the agency",
        agency_type="benevolent",
        base_url="localhost:3000",
        names_file="names.txt",
        timezone="America/Chicago",
        environment="production",
        jurisdiction_id="01071001",
        is_stoppable=False,
    )
def mock_region(region_code, queue_name=None, is_stoppable=False):
    """Builds a Region test fixture with defaults for everything but the code.

    Falsy `queue_name` collapses to None and falsy `is_stoppable` collapses to
    False, preserving the original normalization.
    """
    defaults = dict(
        agency_name="the agency",
        agency_type="benevolent",
        base_url="localhost:3000",
        names_file="names.txt",
        timezone="America/New_York",
        environment="production",
        jurisdiction_id="00000000",  # must be 8 character numeric string
    )
    return Region(
        region_code=region_code,
        shared_queue=queue_name or None,
        is_stoppable=is_stoppable or False,
        **defaults,
    )
def create_direct_ingest_process_job_task(self, region: Region,
                                          ingest_args: IngestArgs):
    """Schedules a process_job Cloud Task for |region| via the cloud task client."""
    region_code = region.region_code
    # Full (non-prefix) task id ties the task to this region + args tag.
    task_id = _build_task_id(region_code,
                             ingest_args.task_id_tag(),
                             prefix_only=False)
    self.cloud_task_client.create_task(
        task_id=task_id,
        queue_name=region.get_queue_name(),
        relative_uri=f'/direct/process_job?region={region_code}',
        body=self._get_body_from_args(ingest_args),
    )
def run_scraper_for_region(region: regions.Region, args: argparse.Namespace) -> None:
    """Runs the scraper for the given region

    Creates and manages an in-memory FIFO queue to replicate production.

    Args:
        region: the region whose scraper should be exercised locally.
        args: parsed CLI namespace; reads num_tasks, run_forever, lifo,
            fail_fast, and sleep_between_requests.
    """
    # Don't initialize a ScraperCloudTaskManager when running locally.
    scraper_module.ScraperCloudTaskManager = lambda: None  # type: ignore
    scraper = region.get_scraper()
    scraper.BATCH_WRITES = False

    task_queue: deque = deque()

    # We use this to bind the method to the instance.
    scraper.add_task = types.MethodType(partial(add_task, task_queue), scraper)
    scraper.start_scrape = types.MethodType(partial(start_scrape, task_queue), scraper)
    scraper.start_scrape(constants.ScrapeType.BACKGROUND)

    num_tasks_run = 0
    while task_queue and (num_tasks_run < args.num_tasks or args.run_forever):
        logging.info("***")
        logging.info(
            "Running task [%d] of [%s] tasks",
            num_tasks_run,
            "infinite" if args.run_forever else args.num_tasks,
        )

        # run the task; --lifo pops newest-first, default is FIFO
        if args.lifo:
            method, request = task_queue.pop()
        else:
            method, request = task_queue.popleft()
        try:
            getattr(scraper, method)(request)
        except Exception as e:
            # BUG FIX: the original tested `e is KeyboardInterrupt`, which
            # compares an exception *instance* against the class and is always
            # False. Use isinstance to express the intent. (KeyboardInterrupt
            # is not an Exception subclass, so it usually propagates past this
            # handler anyway; isinstance keeps the check meaningful if an
            # interrupt is ever re-raised as a wrapped Exception.)
            if args.fail_fast or isinstance(e, KeyboardInterrupt):
                raise
            traceback.print_exc()

        # increment and sleep
        num_tasks_run += 1
        logging.info(
            "Sleeping [%s] seconds before sending another request",
            args.sleep_between_requests,
        )
        time.sleep(args.sleep_between_requests)

    logging.info("Completed the test run!")
def fake_region(*, region_code: str = 'us_ca', agency_type: str = 'prison',
                environment: str = 'local', jurisdiction_id: str = 'unknown',
                ingestor: Optional[Union[BaseScraper, BaseDirectIngestController]] = None):
    """Builds an autospecced Region mock with the given attributes applied."""
    region = create_autospec(Region)
    region.region_code = region_code
    region.agency_type = agency_type
    region.environment = environment
    region.jurisdiction_id = jurisdiction_id
    # Truthiness check (not `is None`) mirrors the original fallback behavior.
    if not ingestor:
        ingestor = create_autospec(BaseDirectIngestController)
    region.get_ingestor.return_value = ingestor
    # Evaluate the real launch check once against the mock's attributes and
    # freeze the result on the mock.
    region.is_ingest_launched_in_env.return_value = (
        Region.is_ingest_launched_in_env(region))
    return region
def make_sql_preprocessing_flag_region(
        raw_vs_ingest_file_name_differentiation_enabled_env: Optional[str] = None,
        raw_data_bq_imports_enabled_env: Optional[str] = None,
        ingest_view_exports_enabled_env: Optional[str] = None):
    """Builds a us_mo direct-ingest Region with the given SQL-preprocessing
    environment flags overriding the manifest values.
    """
    region_code = 'us_mo'
    # Copy the manifest so the overrides never mutate a shared dict.
    kwargs = dict(get_region_manifest(region_code, True))
    kwargs['raw_vs_ingest_file_name_differentiation_enabled_env'] = (
        raw_vs_ingest_file_name_differentiation_enabled_env)
    kwargs['raw_data_bq_imports_enabled_env'] = raw_data_bq_imports_enabled_env
    kwargs['ingest_view_exports_enabled_env'] = ingest_view_exports_enabled_env
    return Region(region_code=region_code, is_direct_ingest=True, **kwargs)
def make_sql_preprocessing_flag_region(
    raw_vs_ingest_file_name_differentiation_enabled_env: Optional[str] = None,
    raw_data_bq_imports_enabled_env: Optional[str] = None,
    ingest_view_exports_enabled_env: Optional[str] = None,
):
    """Builds a us_mo direct-ingest Region whose SQL-preprocessing env flags
    are overridden with the given values.
    """
    region_code = "us_mo"
    # Overrides are applied last so they win over the manifest entries.
    kwargs = dict(get_region_manifest(region_code, direct_ingest_regions_module))
    kwargs.update(
        raw_vs_ingest_file_name_differentiation_enabled_env=(
            raw_vs_ingest_file_name_differentiation_enabled_env
        ),
        raw_data_bq_imports_enabled_env=raw_data_bq_imports_enabled_env,
        ingest_view_exports_enabled_env=ingest_view_exports_enabled_env,
    )
    return Region(region_code=region_code, is_direct_ingest=True, **kwargs)
def fake_region(*, region_code: str = 'us_xx', agency_type: str = 'prison',
                environment: str = 'local', jurisdiction_id: str = 'unknown',
                ingestor: Optional[Union[BaseScraper, BaseDirectIngestController]] = None,
                is_raw_vs_ingest_file_name_detection_enabled: bool = False,
                are_raw_data_bq_imports_enabled_in_env: bool = False,
                are_ingest_view_exports_enabled_in_env: bool = False):
    """Builds an autospecced Region mock with the given attribute values and
    canned return values for its feature-flag / launch-check methods.
    """
    region = create_autospec(Region)
    region.region_code = region_code
    region.agency_type = agency_type
    region.environment = environment
    region.jurisdiction_id = jurisdiction_id
    # Fall back to a mocked direct-ingest controller when no ingestor is given
    # (truthiness check mirrors the original).
    if not ingestor:
        ingestor = create_autospec(BaseDirectIngestController)
    region.get_ingestor.return_value = ingestor
    # Run the real launch check once against this mock and pin the answer.
    region.is_ingest_launched_in_env.return_value = (
        Region.is_ingest_launched_in_env(region))
    # Pin each feature-flag query to its requested boolean.
    region.is_raw_vs_ingest_file_name_detection_enabled.return_value = (
        is_raw_vs_ingest_file_name_detection_enabled)
    region.are_raw_data_bq_imports_enabled_in_env.return_value = (
        are_raw_data_bq_imports_enabled_in_env)
    region.are_ingest_view_exports_enabled_in_env.return_value = (
        are_ingest_view_exports_enabled_in_env)
    return region
def fake_is_launched_in_env():
    # Delegates to the real Region.is_ingest_launched_in_env, evaluated against
    # `region` — a free variable captured from an enclosing scope not visible
    # here. NOTE(review): presumably `region` is a mock built by the enclosing
    # fixture function; confirm against the surrounding code.
    return Region.is_ingest_launched_in_env(region)
def get_process_job_queue_info(self, region: Region) -> CloudTaskQueueInfo:
    """Fetches queue info for |region|'s process-job queue."""
    queue_name = region.get_queue_name()
    return self._get_queue_info(queue_name, region.region_code)
def get_parent_class(region: regions.Region) -> type:
    """Returns the immediate superclass of |region|'s ingestor class.

    getmro()[0] is the class itself, so index 1 is its first base.
    """
    mro = inspect.getmro(region.get_ingestor_class())
    return mro[1]