def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  # Both zipped and unzipped data to be read in
  zipped_regex = self.bucket + scan_type + '/**/results.json.gz'
  unzipped_regex = self.bucket + scan_type + '/**/results.json'

  zipped_metadata = [m.metadata_list for m in gcs.match([zipped_regex])][0]
  unzipped_metadata = [
      m.metadata_list for m in gcs.match([unzipped_regex])
  ][0]
  file_metadata = zipped_metadata + unzipped_metadata

  filenames = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filename for (filename, file_size) in zip(filenames, file_sizes)
      if (_between_dates(filename, start_date, end_date) and
          _source_from_filename(filename) not in existing_sources and
          file_size != 0)
  ]
  return filtered_filenames
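# The filtering helpers called above are not shown in this snippet. A minimal
# sketch of what they might look like, assuming filepaths of the form
# 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json'
# (the parsing details here are an illustration, not the project's actual code):
import datetime
import re
from typing import Optional


def _source_from_filename(filepath: str) -> str:
  """Return the scan folder name, e.g. 'CP_Quack-echo-2020-08-22-06-08-03'."""
  return filepath.split('/')[-2]


def _between_dates(filepath: str,
                   start_date: Optional[datetime.date],
                   end_date: Optional[datetime.date]) -> bool:
  """True if the date embedded in the scan folder name is inside the range."""
  match = re.search(r'(\d{4}-\d{2}-\d{2})', _source_from_filename(filepath))
  if not match:
    return False
  file_date = datetime.date.fromisoformat(match.group(1))
  if start_date and file_date < start_date:
    return False
  if end_date and file_date > end_date:
    return False
  return True


# The next definition is a later variant of the same method, extended to
# handle Satellite scan data and non-empty-file checks.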
def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
     ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
      'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  if scan_type == satellite.SCAN_TYPE_SATELLITE:
    files_to_load = flatten_satellite.SATELLITE_FILES
  else:
    files_to_load = SCAN_FILES

  # Filepath like `gs://firehook-scans/echo/**/*`
  files_regex = f'{self.bucket}{scan_type}/**/*'
  file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

  filepaths = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filepath for (filepath, file_size) in zip(filepaths, file_sizes)
      if (_between_dates(filepath, start_date, end_date) and
          _filename_matches(filepath, files_to_load) and
          flatten_base.source_from_filename(filepath) not in existing_sources
          and file_size > EMPTY_GZIPPED_FILE_SIZE)
  ]
  return filtered_filenames
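# `_filename_matches`, `SCAN_FILES`, and `EMPTY_GZIPPED_FILE_SIZE` come from
# the surrounding module and are not shown here. A hedged sketch of plausible
# definitions (the real values and matching rules may differ):
import posixpath
from typing import List

# Hyperquack scans contain only the results file, zipped or not (assumption).
SCAN_FILES = ['results.json', 'results.json.gz']

# Even an empty gzip file carries ~20 bytes of header/trailer, so a size-only
# emptiness check needs a small nonzero threshold (assumed value).
EMPTY_GZIPPED_FILE_SIZE = 20


def _filename_matches(filepath: str, files_to_load: List[str]) -> bool:
  """True if the file's basename is one of the expected data files."""
  return posixpath.basename(filepath) in files_to_load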
import apache_beam as beam
from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
from apache_beam.options.pipeline_options import PipelineOptions


def run():
  p = beam.Pipeline(options=PipelineOptions())
  gcs = GCSFileSystem(PipelineOptions())

  # Original parent archive the nested ZIPs were unpacked from:
  # gs://bulk_pdfimages_dump/bulkdata.uspto.gov/data/patent/grant/redbook/2010/I20100202.zip
  # A single known file, handy for smoke-testing the pipeline:
  # gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untarI20180130/DESIGN/USD0808610-20180130.ZIP
  input_pattern = [
      'gs://dataflow-buffer/parent-unpack/2018/i20180130/PxpFJwJabD-untar*/**/*.ZIP'
  ]

  # gcs.match takes a list of patterns and returns one MatchResult per
  # pattern; pop the metadata list for the single pattern above.
  # (fileio.MatchFiles with a single pattern string is an in-pipeline
  # alternative to matching eagerly at construction time.)
  result = [m.metadata_list for m in gcs.match(input_pattern)]
  metadata_list = result.pop()

  parts = (
      p
      | 'Return nested files' >> beam.Create(metadata_list)
      | 'Extract images' >> beam.ParDo(ImageExtract()))

  p.run().wait_until_finish()
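# `ImageExtract` is not defined in this snippet. A minimal sketch of a DoFn
# with a plausible shape, assuming each input element is a FileMetadata whose
# .path points at one nested ZIP; the body below only lists archive members,
# where the real DoFn presumably extracts and writes out the image files.
import io
import zipfile

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems


class ImageExtract(beam.DoFn):

  def process(self, element):
    # Read the whole ZIP into memory; acceptable for small per-patent archives.
    with FileSystems.open(element.path) as f:
      data = f.read()
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
      for name in archive.namelist():
        # Emit (zip path, member name) pairs; a real implementation would
        # write the member bytes somewhere instead of just naming them.
        yield (element.path, name)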