def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  # Both zipped and unzipped data to be read in
  zipped_regex = self.bucket + scan_type + '/**/results.json.gz'
  unzipped_regex = self.bucket + scan_type + '/**/results.json'

  zipped_metadata = [m.metadata_list for m in gcs.match([zipped_regex])][0]
  unzipped_metadata = [
      m.metadata_list for m in gcs.match([unzipped_regex])
  ][0]
  file_metadata = zipped_metadata + unzipped_metadata

  filenames = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filename for (filename, file_size) in zip(filenames, file_sizes)
      if (_between_dates(filename, start_date, end_date) and
          _source_from_filename(filename) not in existing_sources and
          file_size != 0)
  ]
  return filtered_filenames
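# The filtering helpers called above are not shown in this snippet. A minimal
# sketch of what they might look like, assuming filepaths of the form
# 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json'
# (the parsing details here are an illustration, not the project's actual code):
import datetime
import re
from typing import Optional


def _source_from_filename(filepath: str) -> str:
  """Return the scan folder name, e.g. 'CP_Quack-echo-2020-08-22-06-08-03'."""
  return filepath.split('/')[-2]


def _between_dates(filepath: str,
                   start_date: Optional[datetime.date],
                   end_date: Optional[datetime.date]) -> bool:
  """True if the date embedded in the scan folder name is inside the range."""
  match = re.search(r'(\d{4}-\d{2}-\d{2})', _source_from_filename(filepath))
  if not match:
    return False
  file_date = datetime.date.fromisoformat(match.group(1))
  if start_date and file_date < start_date:
    return False
  if end_date and file_date > end_date:
    return False
  return True


# The next definition is a later variant of the same method, extended to
# handle Satellite scan data and non-empty-file checks.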
def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  if scan_type == satellite.SCAN_TYPE_SATELLITE:
    files_to_load = flatten_satellite.SATELLITE_FILES
  else:
    files_to_load = SCAN_FILES

  # Filepath like `gs://firehook-scans/echo/**/*`
  files_regex = f'{self.bucket}{scan_type}/**/*'
  file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

  filepaths = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filepath for (filepath, file_size) in zip(filepaths, file_sizes)
      if (_between_dates(filepath, start_date, end_date) and
          _filename_matches(filepath, files_to_load) and
          flatten_base.source_from_filename(filepath) not in existing_sources
          and file_size > EMPTY_GZIPPED_FILE_SIZE)
  ]
  return filtered_filenames
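# `_filename_matches`, `SCAN_FILES`, and `EMPTY_GZIPPED_FILE_SIZE` come from
# the surrounding module and are not shown here. A hedged sketch of plausible
# definitions (the real values and matching rules may differ):
import posixpath
from typing import List

# Hyperquack scans contain only the results file, zipped or not (assumption).
SCAN_FILES = ['results.json', 'results.json.gz']

# Even an empty gzip file carries ~20 bytes of header/trailer, so a size-only
# emptiness check needs a small nonzero threshold (assumed value).
EMPTY_GZIPPED_FILE_SIZE = 20


def _filename_matches(filepath: str, files_to_load: List[str]) -> bool:
  """True if the file's basename is one of the expected data files."""
  return posixpath.basename(filepath) in files_to_load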
import apache_beam as beam
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


def run():
  p = beam.Pipeline(options=PipelineOptions())
  gcs = GCSFileSystem(PipelineOptions())

  # Original parent archive the nested ZIPs were unpacked from:
  # gs://bulk_pdfimages_dump/bulkdata.uspto.gov/data/patent/grant/redbook/2010/I20100202.zip
  # A single known file, handy for smoke-testing the pipeline:
  # gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP
  input_pattern = [
      'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'
  ]

  # gcs.match takes a list of patterns and returns one MatchResult per
  # pattern; pop the metadata list for the single pattern above.
  # (fileio.MatchFiles with a single pattern string is an in-pipeline
  # alternative to matching eagerly at construction time.)
  result = [m.metadata_list for m in gcs.match(input_pattern)]
  metadata_list = result.pop()

  parts = (
      p
      | 'Return nested files' >> beam.Create(metadata_list)
      | 'Extract images' >> beam.ParDo(ImageExtract()))

  p.run().wait_until_finish()
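# `ImageExtract` is not defined in this snippet. A minimal sketch of a DoFn
# with a plausible shape, assuming each input element is a FileMetadata whose
# .path points at one nested ZIP; the body below only lists archive members,
# where the real DoFn presumably extracts and writes out the image files.
import io
import zipfile

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems


class ImageExtract(beam.DoFn):

  def process(self, element):
    # Read the whole ZIP into memory; acceptable for small per-patent archives.
    with FileSystems.open(element.path) as f:
      data = f.read()
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
      for name in archive.namelist():
        # Emit (zip path, member name) pairs; a real implementation would
        # write the member bytes somewhere instead of just naming them.
        yield (element.path, name)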