Code Example #1
    def _data_to_load(self,
                      gcs: GCSFileSystem,
                      scan_type: str,
                      incremental_load: bool,
                      table_name: str,
                      start_date: Optional[datetime.date] = None,
                      end_date: Optional[datetime.date] = None) -> List[str]:
        """Select the right files to read.

    Args:
      gcs: GCSFileSystem object
      scan_type: one of 'echo', 'discard', 'http', 'https'
      incremental_load: boolean. If true, only read the latest new data
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read
      end_date: date object, only files at or before this date will be read

    Returns:
      A List of filename strings. ex
       ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
        'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
    """
        if incremental_load:
            full_table_name = self._get_full_table_name(table_name)
            existing_sources = _get_existing_datasources(full_table_name)
        else:
            existing_sources = []

        # Match both gzipped and unzipped results files.
        zipped_regex = self.bucket + scan_type + '/**/results.json.gz'
        unzipped_regex = self.bucket + scan_type + '/**/results.json'

        # gcs.match returns one MatchResult per pattern;
        # unwrap the single result's metadata list for each pattern.
        zipped_metadata = [m.metadata_list
                           for m in gcs.match([zipped_regex])][0]
        unzipped_metadata = [
            m.metadata_list for m in gcs.match([unzipped_regex])
        ][0]
        file_metadata = zipped_metadata + unzipped_metadata

        filenames = [metadata.path for metadata in file_metadata]
        file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

        filtered_filenames = [
            filename for (filename, file_size) in zip(filenames, file_sizes)
            if (_between_dates(filename, start_date, end_date)
                and _source_from_filename(filename) not in existing_sources
                and file_size != 0)
        ]
        return filtered_filenames
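The final list comprehension relies on helpers that are not part of this snippet (`_between_dates`, `_source_from_filename`, `_get_existing_datasources`). As a rough illustration of the date filter, here is a minimal sketch of what `_between_dates` might look like, assuming the scan paths embed a `YYYY-MM-DD` date as in the example paths above; the actual helper in the source project may differ:

import datetime
import re
from typing import Optional


# Hypothetical sketch, not the project's actual implementation.
def _between_dates(filename: str,
                   start_date: Optional[datetime.date] = None,
                   end_date: Optional[datetime.date] = None) -> bool:
    """Return True if the date embedded in the path is within [start_date, end_date]."""
    match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if match is None:
        return False  # no recognizable date in the path
    file_date = datetime.date.fromisoformat(match.group(0))
    if start_date is not None and file_date < start_date:
        return False
    if end_date is not None and file_date > end_date:
        return False
    return True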
Code Example #2
    def _data_to_load(self,
                      gcs: GCSFileSystem,
                      scan_type: str,
                      incremental_load: bool,
                      table_name: str,
                      start_date: Optional[datetime.date] = None,
                      end_date: Optional[datetime.date] = None) -> List[str]:
        """Select the right files to read.

    Args:
      gcs: GCSFileSystem object
      scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
      incremental_load: boolean. If true, only read the latest new data
      table_name: dataset.table name like 'base.scan_echo'
      start_date: date object, only files after or at this date will be read
      end_date: date object, only files at or before this date will be read

    Returns:
      A List of filename strings. ex
       ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
        'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
    """
        if incremental_load:
            full_table_name = self._get_full_table_name(table_name)
            existing_sources = _get_existing_datasources(full_table_name)
        else:
            existing_sources = []

        if scan_type == satellite.SCAN_TYPE_SATELLITE:
            files_to_load = flatten_satellite.SATELLITE_FILES
        else:
            files_to_load = SCAN_FILES

        # Filepath like 'gs://firehook-scans/echo/**/*'
        files_regex = f'{self.bucket}{scan_type}/**/*'
        # gcs.match returns one MatchResult per pattern;
        # unwrap the single result's metadata list.
        file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

        filepaths = [metadata.path for metadata in file_metadata]
        file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

        filtered_filenames = [
            filepath for (filepath, file_size) in zip(filepaths, file_sizes)
            if (_between_dates(filepath, start_date, end_date)
                and _filename_matches(filepath, files_to_load)
                and flatten_base.source_from_filename(filepath)
                not in existing_sources
                and file_size > EMPTY_GZIPPED_FILE_SIZE)
        ]

        return filtered_filenames
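Note the change from `file_size != 0` in Example #1 to `file_size > EMPTY_GZIPPED_FILE_SIZE` here: a gzipped file with no payload is not zero bytes, since the gzip header and trailer alone take about 20 bytes. The exact value of the constant in the source project is not shown; the snippet below just demonstrates why a nonzero threshold is needed:

import gzip
import io

# An "empty" .gz file still contains a gzip header and trailer.
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
    f.write(b'')
print(len(buf.getvalue()))  # roughly 20 bytes, never 0

So the stricter comparison also skips gzip files that are technically non-empty but carry no data, which `file_size != 0` would let through.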
Code Example #3
import apache_beam as beam
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    p = beam.Pipeline(options=PipelineOptions())
    gcs = GCSFileSystem(PipelineOptions())
    # Sample paths and patterns; only input_pattern is used below.
    pattern_1 = [
        'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP']
    input_pattern = ['gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP']
    input_pattern_1 = 'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'

    parent_zip = 'gs://bulk_pdfimages_dump/bulkdata.uspto.gov/data/patent/grant/redbook/2010/I20100202.zip'

    # gcs.match returns one MatchResult per pattern; pop the single
    # result's metadata list.
    result = [m.metadata_list for m in gcs.match(input_pattern)]

    metadata_list = result.pop()

    print('satya')
    parts = (p
             # | 'Match Files' >> fileio.MatchFiles(pattern_1)
             | 'Return nested files' >> beam.Create(metadata_list)
             # | 'print Files' >> beam
             | 'Print read file' >> beam.ParDo(ImageExtract())
             # | 'one' >> beam.Map()
             )

    p.run().wait_until_finish()
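`ImageExtract` is a `beam.DoFn` that the snippet references but does not define. Below is a minimal hypothetical stand-in, assuming the DoFn opens each matched ZIP from GCS and yields its member names; the original class presumably extracts image files, so treat this only as an illustrative sketch:

import io
import zipfile

import apache_beam as beam
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


class ImageExtract(beam.DoFn):
    """Hypothetical stand-in: lists the members of each matched ZIP file."""

    def process(self, metadata):
        # metadata is a FileMetadata object produced by gcs.match().
        gcs = GCSFileSystem(PipelineOptions())
        with gcs.open(metadata.path,
                      compression_type=CompressionTypes.UNCOMPRESSED) as f:
            archive = zipfile.ZipFile(io.BytesIO(f.read()))
            for name in archive.namelist():
                yield (metadata.path, name)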