def check_all_paths_processed(
        test_case: unittest.TestCase,
        controller: GcsfsDirectIngestController,
        file_tags: List[str],
        unexpected_tags: List[str]):
    """Checks that all non-directory paths with expected tags have been
    processed and moved to storage.
    """
    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    file_tags_processed = set()
    for path in controller.fs.all_paths:
        if isinstance(path, GcsfsDirectoryPath):
            continue

        file_tag = filename_parts_from_path(path).file_tag

        if file_tag not in unexpected_tags:
            # Test all expected files have been moved to storage
            test_case.assertTrue(
                path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()),
                f'{path} has not been moved to correct storage directory')

            file_tags_processed.add(file_tag)
        else:
            test_case.assertTrue(path.file_name.startswith('unprocessed'))

    # Test that each expected file tag has been processed
    test_case.assertEqual(file_tags_processed,
                          set(file_tags).difference(set(unexpected_tags)))
def test_process_file_that_needs_splitting(self):
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=True)

    # Set line limit to 1
    controller.file_split_line_limit = 1

    # pylint:disable=protected-access
    file_tags = list(sorted(controller._get_file_tag_rank_list()))

    add_paths_with_tags_and_process(self,
                                    controller,
                                    file_tags,
                                    pre_normalize_filename=True)

    processed_split_file_paths = defaultdict(list)
    for path in controller.fs.all_paths:
        if self._path_in_split_file_storage_subdir(path, controller):
            file_tag = filename_parts_from_path(path).file_tag
            processed_split_file_paths[file_tag].append(path)

    self.assertEqual(1, len(processed_split_file_paths.keys()))
    self.assertEqual(2, len(processed_split_file_paths['tagC']))

    found_suffixes = {
        filename_parts_from_path(p).filename_suffix
        for p in processed_split_file_paths['tagC']
    }
    self.assertEqual(found_suffixes,
                     {'00001_file_split_size1', '00002_file_split_size1'})
def add_paths_with_tags_and_process(test_case: unittest.TestCase,
                                    controller: GcsfsDirectIngestController,
                                    file_tags: List[str],
                                    unexpected_tags: Optional[List[str]] = None):
    """Runs a test that queues files for all the provided file tags, waits
    for the controller to finish processing everything, then makes sure that
    all files not in |unexpected_tags| have been moved to storage.
    """
    if unexpected_tags is None:
        unexpected_tags = []

    for file_tag in file_tags:
        args = ingest_args_for_fixture_file(controller, f'{file_tag}.csv')
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")
        controller.fs.test_add_path(args.file_path)
        controller.kick_scheduler(just_finished_job=False)
        time.sleep(.05)

    if isinstance(controller.cloud_task_manager,
                  FakeAsyncDirectIngestCloudTaskManager):
        controller.cloud_task_manager.wait_for_all_tasks_to_run()
    elif isinstance(controller.cloud_task_manager,
                    FakeSynchronousDirectIngestCloudTaskManager):
        tm = controller.cloud_task_manager
        while tm.get_scheduler_queue_info(controller.region).size() \
                or tm.get_process_job_queue_info(controller.region).size():
            if tm.get_scheduler_queue_info(controller.region).size():
                tm.test_run_next_scheduler_task()
                tm.test_pop_finished_scheduler_task()
            if tm.get_process_job_queue_info(controller.region).size():
                tm.test_run_next_process_job_task()
                tm.test_pop_finished_process_job_task()
    else:
        raise ValueError(f"Unexpected type for cloud task manager: "
                         f"[{type(controller.cloud_task_manager)}]")

    file_tags_processed = set()
    for path in controller.fs.all_paths:
        file_tag = filename_parts_from_path(path).file_tag

        if file_tag not in unexpected_tags:
            # Test all expected files have been moved to storage
            test_case.assertTrue(
                path.startswith(controller.storage_directory_path),
                f'{path} does not start with expected prefix')

            file_tags_processed.add(file_tag)
        else:
            _, file_name = os.path.split(path)
            test_case.assertTrue(file_name.startswith('unprocessed'))

    # Test that each expected file tag has been processed
    test_case.assertEqual(file_tags_processed,
                          set(file_tags).difference(set(unexpected_tags)))
def _is_last_job_for_day(self, args: GcsfsIngestArgs) -> bool:
    """Returns True if the file handled in |args| is the last file for that
    upload date."""
    parts = filename_parts_from_path(args.file_path)
    upload_date, date_str = parts.utc_upload_datetime, parts.date_str

    more_jobs_expected = \
        self.file_prioritizer.are_more_jobs_expected_for_day(date_str)
    if more_jobs_expected:
        return False

    next_job_args = self.file_prioritizer.get_next_job_args(date_str)
    if next_job_args:
        next_job_date = filename_parts_from_path(
            next_job_args.file_path).utc_upload_datetime
        return next_job_date > upload_date

    return True
def has_file_been_discovered(self, path: GcsfsFilePath) -> bool:
    parts = filename_parts_from_path(path)
    try:
        metadata = self.get_file_metadata(path)
    except ValueError as e:
        if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
            raise e
        return False

    if not metadata:
        raise ValueError(
            f"Metadata unexpectedly None for path [{path.abs_path()}]")

    # TODO(#3020): Design/handle/write tests for the case where this is a file
    #  we've moved from storage for a rerun. How do we accurately detect when
    #  this is happening?
    if isinstance(metadata, DirectIngestRawFileMetadata):
        return True

    if isinstance(metadata, DirectIngestIngestFileMetadata):
        if metadata.discovery_time is None:
            return False
        return True

    raise ValueError(
        f"Unexpected metadata type [{type(metadata)}] for path "
        f"[{path.abs_path()}]")
def _ls_with_file_prefix(
        self,
        directory_path: GcsfsDirectoryPath,
        file_prefix: str,
        file_type_filter: Optional[GcsfsDirectIngestFileType],
) -> List[GcsfsFilePath]:
    """Returns absolute paths of files in the directory with the given
    |file_prefix|.
    """
    blob_prefix = os.path.join(directory_path.relative_path, file_prefix)
    blob_paths = self.gcs_file_system.ls_with_blob_prefix(
        directory_path.bucket_name, blob_prefix)

    result = []
    for path in blob_paths:
        if not isinstance(path, GcsfsFilePath):
            continue

        if not file_type_filter:
            result.append(path)
            continue

        file_type = filename_parts_from_path(path).file_type
        if file_type == GcsfsDirectIngestFileType.UNSPECIFIED:
            raise ValueError(
                f"Found path {path.abs_path()} with unexpected UNSPECIFIED "
                f"type.")

        if file_type == file_type_filter:
            result.append(path)

    return result
def _to_normalized_file_path_from_normalized_path(
        original_normalized_file_path: str,
        build_function: Callable,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None,
) -> str:
    """Moves any normalized path back to an unprocessed/processed path with
    the same information embedded in the file name. If |file_type_override| is
    provided, we will always overwrite the original path file type with the
    override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime(
        "%Y-%m-%dT%H:%M:%S:%f")

    suffix_str = f"_{parts.filename_suffix}" if parts.filename_suffix else ""
    base_file_name = f"{parts.file_tag}{suffix_str}"

    path_to_return = build_function(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension,
    )

    return os.path.join(directory, path_to_return)
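# A minimal sketch of what a `build_function` passed to the helper above might
# produce. The real builders (e.g. _build_unprocessed_file_name) are not shown
# in this section, so the "{prefix}_{timestamp}_{file_type}_{base_file_name}
# .{extension}" layout and the 'raw' type string below are assumptions for
# illustration only.
def _example_build_unprocessed_file_name(*, utc_iso_timestamp_str: str,
                                         file_type_str: str,
                                         base_file_name: str,
                                         extension: str) -> str:
    # Hypothetical layout: prefix, upload timestamp, file type, tag (+ suffix).
    return (f"unprocessed_{utc_iso_timestamp_str}_{file_type_str}_"
            f"{base_file_name}.{extension}")


# Hypothetical example for a file tagged 'tagA' uploaded on 2020-06-01:
# _example_build_unprocessed_file_name(
#     utc_iso_timestamp_str='2020-06-01T11:30:00:000000',
#     file_type_str='raw',
#     base_file_name='tagA',
#     extension='csv')
# -> 'unprocessed_2020-06-01T11:30:00:000000_raw_tagA.csv'  (assumed layout)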
def import_raw_file_to_big_query(
        self,
        path: GcsfsFilePath,
        file_metadata: DirectIngestFileMetadata) -> None:
    """Import a raw data file at the given path to the appropriate raw data
    table in BigQuery.
    """
    if not self.region.are_raw_data_bq_imports_enabled_in_env():
        raise ValueError(
            f'Cannot import raw files for region '
            f'[{self.region.region_code}]')

    parts = filename_parts_from_path(path)
    if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
        raise ValueError(
            f'Attempting to import raw file with tag [{parts.file_tag}] '
            f'unspecified by [{self.region.region_code}] config.')

    if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
        raise ValueError(
            f'Unexpected file type [{parts.file_type}] for path '
            f'[{path.abs_path()}].')

    logging.info('Beginning BigQuery upload of raw file [%s]',
                 path.abs_path())

    temp_output_paths = self._upload_contents_to_temp_gcs_paths(
        path, file_metadata)
    self._load_contents_to_bigquery(path, temp_output_paths)

    logging.info('Completed BigQuery import of [%s]', path.abs_path())
def test_multiple_files_times_out_of_order(self):
    """Runs a test where there are no gaps but the files have been added
    (i.e. have creation times) out of order.
    """
    paths = [
        self._normalized_path_for_filename('tagA.csv', self._DAY_1_TIME_2),
        self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_1),
        self._normalized_path_for_filename('tagB.csv', self._DAY_1_TIME_3),
    ]

    for path in paths:
        self.fs.test_add_path(path)

    for i, path in enumerate(paths):
        date_str = filename_parts_from_path(path).date_str
        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertTrue(
            self.prioritizer.are_next_args_expected(next_job_args))

        are_more_jobs_expected = \
            self.prioritizer.are_more_jobs_expected_for_day(date_str)
        if i == 2:
            self.assertFalse(are_more_jobs_expected)
        else:
            self.assertTrue(are_more_jobs_expected)

        # ... job runs ...

        self.fs.mv_path_to_processed_path(path)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertFalse(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))
def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
) -> GcsfsFilePath:
    parts = filename_parts_from_path(original_file_path)

    rank_str = str(split_num + 1).zfill(5)
    updated_file_name = (
        f"{parts.stripped_file_name}_{rank_str}"
        f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
        f".{parts.extension}")

    file_type = (
        GcsfsDirectIngestFileType.INGEST_VIEW
        if self.region.is_raw_vs_ingest_file_name_detection_enabled()
        else GcsfsDirectIngestFileType.UNSPECIFIED)

    return GcsfsFilePath.from_directory_and_file_name(
        output_dir,
        to_normalized_unprocessed_file_path(updated_file_name,
                                            file_type=file_type,
                                            dt=parts.utc_upload_datetime),
    )
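# A small sketch of the split-file name built above, using literal example
# values (stripped_file_name='tagC', a line limit of 1). SPLIT_FILE_SUFFIX is
# assumed to be the literal 'file_split' implied by the suffixes asserted in
# test_process_file_that_needs_splitting earlier in this section.
split_num = 1
rank_str = str(split_num + 1).zfill(5)          # '00002'
updated_file_name = f"tagC_{rank_str}_file_split_size1.csv"
print(updated_file_name)  # tagC_00002_file_split_size1.csv
# The '00002_file_split_size1' portion is what filename_parts_from_path later
# reports as the filename_suffix for the split file.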
def import_raw_file_to_big_query(
        self,
        path: GcsfsFilePath,
        file_metadata: DirectIngestRawFileMetadata) -> None:
    """Import a raw data file at the given path to the appropriate raw data
    table in BigQuery.
    """
    parts = filename_parts_from_path(path)
    if parts.file_tag not in self.region_raw_file_config.raw_file_tags:
        raise ValueError(
            f"Attempting to import raw file with tag [{parts.file_tag}] "
            f"unspecified by [{self.region.region_code}] config.")

    if parts.file_type != GcsfsDirectIngestFileType.RAW_DATA:
        raise ValueError(
            f"Unexpected file type [{parts.file_type}] for path "
            f"[{path.abs_path()}].")

    logging.info("Beginning BigQuery upload of raw file [%s]",
                 path.abs_path())

    temp_output_paths = self._upload_contents_to_temp_gcs_paths(
        path, file_metadata)
    self._load_contents_to_bigquery(path, temp_output_paths)

    migration_queries = self.raw_table_migrations.get(parts.file_tag, [])
    logging.info(
        "Running [%s] migration queries for table [%s]",
        len(migration_queries),
        parts.file_tag,
    )
    for migration_query in migration_queries:
        query_job = self.big_query_client.run_query_async(
            query_str=migration_query)
        # Wait for the migration query to complete before running the next one
        query_job.result()

    logging.info("Completed BigQuery import of [%s]", path.abs_path())
def to_normalized_unprocessed_file_path_from_normalized_path(
        original_normalized_file_path: str,
        file_type_override: Optional[GcsfsDirectIngestFileType] = None
) -> str:
    """Moves any normalized path back to an unprocessed path with the same
    information embedded in the file name. If |file_type_override| is
    provided, we will always overwrite the original path file type with the
    override file type.
    """
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime(
        '%Y-%m-%dT%H:%M:%S:%f')

    suffix_str = \
        f'_{parts.filename_suffix}' if parts.filename_suffix else ''
    base_file_name = f'{parts.file_tag}{suffix_str}'

    path_as_unprocessed = _build_unprocessed_file_name(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension)

    return os.path.join(directory, path_as_unprocessed)
def collect_file_paths(
        data_discovery_args: DataDiscoveryArgs,
        configs: ConfigsByFileType,
        gcs_files: List[str],
) -> FilesByFileType:
    """Given a set of configs, filter the listed GCS files to only those that
    match our search filters.
    """
    collected_files = defaultdict(list)
    for found_file in gcs_files:
        try:
            path = GcsfsFilePath.from_absolute_path(found_file)
            file_parts = filename_parts_from_path(path)
        except DirectIngestError as e:
            if e.error_type == DirectIngestErrorType.INPUT_ERROR:
                continue
            logger.exception(e)
            continue

        if not (data_discovery_args.start_date
                <= file_parts.utc_upload_datetime.date()
                <= data_discovery_args.end_date):
            continue

        if file_parts.is_file_split:
            continue

        if file_parts.file_tag in configs[file_parts.file_type]:
            collected_files[file_parts.file_type].append(path)

    return collected_files
def register_ingest_view_export_file_name(
        self,
        metadata_entity: DirectIngestIngestFileMetadata,
        exported_path: GcsfsFilePath,
) -> None:
    parts = filename_parts_from_path(exported_path)
    if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(
            f"Exported path has unexpected type {parts.file_type}")

    session = SessionFactory.for_schema_base(OperationsBase)

    try:
        metadata = dao.get_file_metadata_row(
            session, GcsfsDirectIngestFileType.INGEST_VIEW,
            metadata_entity.file_id)

        if metadata.normalized_file_name:
            raise ValueError(
                f"Normalized file name already set to "
                f"[{metadata.normalized_file_name}] for file id "
                f"[{metadata.file_id}]")

        metadata.normalized_file_name = exported_path.file_name
        session.commit()
    except Exception as e:
        session.rollback()
        raise e
    finally:
        session.close()
def mark_file_as_discovered(self, path: GcsfsFilePath) -> None:
    if not path.file_name.startswith(DIRECT_INGEST_UNPROCESSED_PREFIX):
        raise ValueError("Expect only unprocessed paths in this function.")

    parts = filename_parts_from_path(path)
    session = SessionFactory.for_schema_base(OperationsBase)

    try:
        if parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
            metadata = dao.get_file_metadata_row_for_path(
                session, self.region_code, path)
            dt = datetime.datetime.utcnow()
            if not metadata.export_time:
                metadata.export_time = dt
            metadata.discovery_time = dt
        elif parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
            session.add(
                schema.DirectIngestRawFileMetadata(
                    region_code=self.region_code,
                    file_tag=parts.file_tag,
                    normalized_file_name=path.file_name,
                    discovery_time=datetime.datetime.utcnow(),
                    processed_time=None,
                    datetimes_contained_upper_bound_inclusive=parts.utc_upload_datetime,
                ))
        else:
            raise ValueError(f"Unexpected path type: {parts.file_type}")

        session.commit()
    except Exception as e:
        session.rollback()
        raise e
    finally:
        session.close()
def _can_proceed_with_ingest_for_contents(
        self,
        args: GcsfsIngestArgs,
        contents_handle: GcsfsFileContentsHandle) -> bool:
    parts = filename_parts_from_path(args.file_path)
    return self._are_contents_empty(args, contents_handle) or \
        not self._must_split_contents(parts.file_type, args.file_path)
def _should_split_file(self, path: GcsfsFilePath) -> bool:
    """Returns True if the file at the given path should be split, False
    otherwise.
    """
    parts = filename_parts_from_path(path)

    if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
            and parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW):
        raise ValueError(
            f"Should not be attempting to split files other than ingest view "
            f"files, found path with file type: {parts.file_type}")

    if parts.file_tag not in self.get_file_tag_rank_list():
        logging.info(
            "File tag [%s] for path [%s] not in rank list - not splitting.",
            parts.file_tag,
            path.abs_path(),
        )
        return False

    if (parts.is_file_split and parts.file_split_size
            and parts.file_split_size <= self.ingest_file_split_line_limit):
        logging.info(
            "File [%s] already split with size [%s].",
            path.abs_path(),
            parts.file_split_size,
        )
        return False

    return self._must_split_contents(parts.file_type, path)
def get_file_metadata_row_for_path(
        session: Session,
        region_code: str,
        path: GcsfsFilePath
) -> Union[schema.DirectIngestRawFileMetadata,
           schema.DirectIngestIngestFileMetadata]:
    """Returns metadata information for the provided path. If the file has
    not yet been registered in the appropriate metadata table, this function
    will generate a file_id to return with the metadata.
    """
    parts = filename_parts_from_path(path)

    if parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
        results = session.query(
            schema.DirectIngestIngestFileMetadata).filter_by(
                region_code=region_code,
                is_invalidated=False,
                normalized_file_name=path.file_name).all()
    elif parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
        results = session.query(schema.DirectIngestRawFileMetadata).filter_by(
            region_code=region_code,
            normalized_file_name=path.file_name).all()
    else:
        raise ValueError(f'Unexpected path type: {parts.file_type}')

    if len(results) != 1:
        raise ValueError(
            f'Unexpected number of metadata results for path '
            f'{path.abs_path()}: [{len(results)}]')

    return one(results)
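# Assuming `one` here is more_itertools.one: it returns the single element of
# an iterable and raises if there are zero or several elements, so the
# explicit len(results) != 1 check above mainly exists to surface a more
# descriptive error message first. A tiny illustration:
from more_itertools import one

assert one([42]) == 42   # exactly one element: returned as-is
# one([]) or one([1, 2]) would raise ValueError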
def _move_processed_files_to_storage_as_necessary(
        self, last_processed_date_str: str):
    next_args = self.file_prioritizer.get_next_job_args()

    should_move_last_processed_date = False
    if not next_args:
        are_more_jobs_expected = \
            self.file_prioritizer.are_more_jobs_expected_for_day(
                last_processed_date_str)
        if not are_more_jobs_expected:
            should_move_last_processed_date = True
    else:
        next_date_str = \
            filename_parts_from_path(next_args.file_path).date_str
        if next_date_str < last_processed_date_str:
            logging.info("Found a file [%s] from a date previous to our "
                         "last processed date - not moving anything to "
                         "storage.",
                         next_args.file_path)
            return

        # If there are still more to process on this day, do not move files
        # from this day.
        should_move_last_processed_date = \
            next_date_str != last_processed_date_str

    # Note: at this point, we expect RAW file type files to already have been
    # moved once they were imported to BQ.
    file_type_to_move = GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None

    self.fs.mv_processed_paths_before_date_to_storage(
        self.ingest_directory_path,
        self.storage_directory_path,
        file_type_filter=file_type_to_move,
        date_str_bound=last_processed_date_str,
        include_bound=should_move_last_processed_date)
def _split_file_if_necessary(self, path: GcsfsFilePath):
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|.
    """
    parts = filename_parts_from_path(path)

    if parts.file_tag not in self._get_file_tag_rank_list():
        logging.info("File tag [%s] for path [%s] not in rank list - "
                     "not splitting.",
                     parts.file_tag,
                     path.abs_path())
        return False

    if parts.is_file_split and \
            parts.file_split_size and \
            parts.file_split_size <= self.file_split_line_limit:
        logging.info("File [%s] already split with size [%s].",
                     path.abs_path(), parts.file_split_size)
        return False

    file_contents_handle = self._get_contents_handle_from_path(path)
    if not file_contents_handle:
        logging.info("File [%s] has no rows - not splitting.",
                     path.abs_path())
        return False

    if self._can_proceed_with_ingest_for_contents(file_contents_handle):
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    self._split_file(path, file_contents_handle)
    return True
def test_move_to_storage_with_conflict(self) -> None:
    dt = datetime.datetime.now()
    self.fully_process_file(
        dt,
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    # Try uploading a file with a duplicate name that has already been
    # moved to storage
    self.fully_process_file(
        dt,
        GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'))

    # pylint: disable=protected-access
    storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                 '',
                                                 file_type_filter=None)
    self.assertEqual(len(storage_paths), 2)

    found_first_file = False
    found_second_file = False
    for path in storage_paths:
        self.assertTrue(filename_parts_from_path(path))
        if path.abs_path().endswith('test_file.csv'):
            found_first_file = True
        if path.abs_path().endswith('test_file-(1).csv'):
            found_second_file = True

    self.assertTrue(found_first_file)
    self.assertTrue(found_second_file)
def _move_processed_files_to_storage_as_necessary(
        self, last_processed_date_str: str):
    next_args = self.file_prioritizer.get_next_job_args()

    should_move_last_processed_date = False
    if not next_args:
        are_more_jobs_expected = \
            self.file_prioritizer.are_more_jobs_expected_for_day(
                last_processed_date_str)
        if not are_more_jobs_expected:
            should_move_last_processed_date = True
    else:
        next_date_str = \
            filename_parts_from_path(next_args.file_path).date_str
        if next_date_str < last_processed_date_str:
            logging.info("Found a file [%s] from a date previous to our "
                         "last processed date - not moving anything to "
                         "storage.",
                         next_args.file_path)
            return

        # If there are still more to process on this day, do not move files
        # from this day.
        should_move_last_processed_date = \
            next_date_str != last_processed_date_str

    self.fs.mv_processed_paths_before_date_to_storage(
        self.ingest_directory_path,
        self.storage_directory_path,
        last_processed_date_str,
        include_bound=should_move_last_processed_date)
def _move_files(self, from_uri: str):
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(
        curr_gcsfs_file_path).date_str
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA))
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format)

    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
def _get_ingest_metadata(self, args: GcsfsIngestArgs) -> IngestMetadata:
    parts = filename_parts_from_path(args.file_path)
    ingest_time = datetime.strptime(cast(str, parts.filename_suffix),
                                    '%m%d%Y_%H%M%S')

    return attr.evolve(super()._get_ingest_metadata(args),
                       ingest_time=ingest_time)
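# A quick illustration of the '%m%d%Y_%H%M%S' suffix format parsed above. The
# suffix value '11252019_103000' is a made-up example; it would yield an
# ingest_time of 2019-11-25 10:30:00.
from datetime import datetime

assert datetime.strptime('11252019_103000',
                         '%m%d%Y_%H%M%S') == datetime(2019, 11, 25, 10, 30, 0)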
def _upload_contents_to_temp_gcs_paths(
        self,
        path: GcsfsFilePath,
        file_metadata: DirectIngestFileMetadata
) -> List[Tuple[GcsfsFilePath, List[str]]]:
    """Uploads the contents of the file at the provided path to one or more
    GCS files, with whitespace stripped and additional metadata columns added.
    Returns a list of tuple pairs containing the destination paths and
    corrected CSV columns for that file.
    """
    logging.info('Starting chunked upload of contents to GCS')

    parts = filename_parts_from_path(path)
    file_config = self.region_raw_file_config.raw_file_configs[
        parts.file_tag]

    columns = self._get_validated_columns(path, file_config)

    delegate = DirectIngestRawDataSplittingGcsfsCsvReaderDelegate(
        path, self.fs, file_metadata, self.temp_output_directory_path)

    self.csv_reader.streaming_read(
        path,
        delegate=delegate,
        chunk_size=self.upload_chunk_size,
        encodings_to_try=file_config.encodings_to_try(),
        index_col=False,
        header=None,
        skiprows=1,
        usecols=columns,
        names=columns,
        keep_default_na=False,
        **self._common_read_csv_kwargs(file_config))

    return delegate.output_paths_with_columns
def _read_contents_into_dataframes(
        self,
        path: GcsfsFilePath,
        contents_handle: GcsfsFileContentsHandle) -> Iterator[pd.DataFrame]:
    parts = filename_parts_from_path(path)
    file_config = self.region_raw_file_config.raw_file_configs[
        parts.file_tag]

    columns = self._get_validated_columns(file_config, contents_handle)

    try:
        for df in pd.read_csv(
                contents_handle.local_file_path,
                sep=file_config.separator,
                dtype=str,
                index_col=False,
                header=None,
                skiprows=1,
                encoding=file_config.encoding,
                quoting=(csv.QUOTE_NONE
                         if file_config.ignore_quotes else csv.QUOTE_MINIMAL),
                usecols=columns,
                names=columns,
                chunksize=self.upload_chunk_size,
                keep_default_na=False):
            yield df
    except Exception as e:
        logging.error(
            'Failed to parse DataFrame for path [%s] with config [%s]',
            path.abs_path(), file_config)
        raise e
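# The header handling above can be non-obvious: header=None plus skiprows=1
# discards the file's own header row, while names=columns re-applies the
# validated column names, so every chunk comes back with known, consistent
# columns and str dtypes. A minimal self-contained sketch (the CSV content and
# column names below are made up for illustration):
import io

import pandas as pd

csv_text = "PERSON_ID,NAME\n1,ALICE\n2,BOB\n"
columns = ['person_id', 'name']

for chunk in pd.read_csv(io.StringIO(csv_text),
                         dtype=str,
                         index_col=False,
                         header=None,
                         skiprows=1,
                         usecols=columns,
                         names=columns,
                         chunksize=1,
                         keep_default_na=False):
    print(chunk.to_dict(orient='records'))
# [{'person_id': '1', 'name': 'ALICE'}]
# [{'person_id': '2', 'name': 'BOB'}]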
def mv_processed_paths_before_date_to_storage(
        self,
        directory_path: GcsfsDirectoryPath,
        storage_directory_path: GcsfsDirectoryPath,
        file_type_filter: Optional[GcsfsDirectIngestFileType],
        date_str_bound: str,
        include_bound: bool,
) -> None:
    """Moves all files with timestamps before the provided |date_str_bound| to
    the appropriate storage location for that file. If a |file_type_filter| is
    provided, only moves files of a certain file type and throws if
    encountering a file of type UNSPECIFIED in the directory path.
    """
    processed_file_paths = self.get_processed_file_paths(
        directory_path, file_type_filter)

    for file_path in processed_file_paths:
        date_str = filename_parts_from_path(file_path).date_str
        if date_str < date_str_bound or \
                (include_bound and date_str == date_str_bound):
            logging.info(
                "Found file [%s] from [%s] which abides by provided bound "
                "[%s]. Moving to storage.",
                file_path.abs_path(),
                date_str,
                date_str_bound,
            )
            self.mv_path_to_storage(file_path, storage_directory_path)
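# The bound check above compares date strings directly. Because the date_str
# values in this code are ISO 'YYYY-MM-DD' dates (see date.fromisoformat usage
# elsewhere in this section), lexicographic string comparison orders them the
# same way the dates themselves are ordered. A tiny illustration with made-up
# dates:
date_str_bound = '2020-06-15'
assert '2020-06-14' < date_str_bound          # earlier date -> moved
assert not ('2020-07-01' < date_str_bound)    # later date -> left in place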
def handle_file(self, path: GcsfsFilePath, start_ingest: bool) -> None:
    """Called when a single new file is added to an ingest bucket (may also
    be called as a result of a rename). May be called from any worker/queue.
    """
    if self.fs.is_processed_file(path):
        logging.info("File [%s] is already processed, returning.",
                     path.abs_path())
        return

    if self.fs.is_normalized_file_path(path):
        parts = filename_parts_from_path(path)

        if (parts.is_file_split and parts.file_split_size
                and parts.file_split_size
                <= self.ingest_file_split_line_limit):
            self.kick_scheduler(just_finished_job=False)
            logging.info(
                "File [%s] is already normalized and split with correct "
                "size, kicking scheduler.",
                path.abs_path(),
            )
            return

    logging.info("Creating cloud task to schedule next job.")
    self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
        region=self.region, can_start_ingest=start_ingest)
def get_ingest_file_metadata_row_for_path(
        session: Session,
        region_code: str,
        path: GcsfsFilePath,
        ingest_database_name: str) -> schema.DirectIngestIngestFileMetadata:
    """Returns metadata information for the provided path. If the file has
    not yet been registered in the appropriate metadata table, this function
    will generate a file_id to return with the metadata.
    """
    parts = filename_parts_from_path(path)
    if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(f"Unexpected file type [{parts.file_type}]")

    results = (session.query(schema.DirectIngestIngestFileMetadata).filter_by(
        region_code=region_code.upper(),
        is_invalidated=False,
        normalized_file_name=path.file_name,
        ingest_database_name=ingest_database_name,
    ).all())

    if len(results) != 1:
        raise ValueError(
            f"Unexpected number of metadata results for path "
            f"{path.abs_path()}: [{len(results)}]")

    return one(results)
def _move_processed_files_to_storage_as_necessary(
        self, last_processed_date_str: str) -> None:
    """Moves files that have already been ingested/processed, up to and
    including the given date, into storage, if there is nothing more left to
    ingest/process, i.e. we are not expecting more files.
    """
    next_args = self.file_prioritizer.get_next_job_args()

    should_move_last_processed_date = False
    if not next_args:
        are_more_jobs_expected = (
            self.file_prioritizer.are_more_jobs_expected_for_day(
                last_processed_date_str))
        if not are_more_jobs_expected:
            should_move_last_processed_date = True
    else:
        next_date_str = filename_parts_from_path(
            next_args.file_path).date_str
        if next_date_str < last_processed_date_str:
            logging.info("Found a file [%s] from a date previous to our "
                         "last processed date - not moving anything to "
                         "storage.",
                         next_args.file_path)
            return

        # If there are still more to process on this day, do not move files
        # from this day.
        should_move_last_processed_date = \
            next_date_str != last_processed_date_str

    # Note: at this point, we expect RAW file type files to already have been
    # moved once they were imported to BQ.
    self.fs.mv_processed_paths_before_date_to_storage(
        self.ingest_bucket_path,
        self.storage_directory_path,
        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        date_str_bound=last_processed_date_str,
        include_bound=should_move_last_processed_date,
    )