Code example #1
def _clean_cdp_filestore(google_creds_path: Path) -> None:
    # Connect to the GCS filesystem using the service-account credentials
    fs = GCSFileSystem(token=str(google_creds_path))

    # Open the key to get the project id
    with open(google_creds_path, "r") as open_resource:
        creds = json.load(open_resource)
        project_id = creds["project_id"]

    # Remove all files in bucket
    bucket = f"{project_id}.appspot.com"
    log.info(f"Cleaning bucket: {bucket}")
    try:
        fs.rm(f"{bucket}/*")
    # Handle empty bucket
    except FileNotFoundError:
        pass

    log.info("Filestore cleaning complete")
Code example #2
class GCSFS(Operations):
    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1

        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        try:
            # nothing to do if the directory entry already exists
            self.gcs.info(path)
        except FileNotFoundError:
            # register a fake directory entry in gcsfs's per-bucket listing cache
            self.gcs.dirs[bucket].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        # file handles returned by create()/open() index into self.cache
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
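
The class above implements fusepy's Operations interface, so it is meant to be handed to fusepy's FUSE wrapper. A minimal mount sketch, where the bucket name and mount point are placeholders and the mount point is assumed to already exist:

from fuse import FUSE

if __name__ == "__main__":
    # foreground=True keeps the process attached so the print() calls above
    # stay visible; nothreads=True avoids concurrent FUSE callbacks.
    FUSE(GCSFS(path="my-bucket"), "/mnt/gcs", nothreads=True, foreground=True)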
Code example #3
File: gcsfuse.py  Project: ryan-williams/gcsfs
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
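
This variant is mounted the same way as the previous one; the differences are the nfiles bound on the SmallChunkCacher and the larger block_size / cache_timeout passed to GCSFileSystem. A sketch with placeholder values:

from fuse import FUSE

if __name__ == "__main__":
    # Keep up to 32 files in the read cache; bucket and mount point are
    # placeholders.
    FUSE(GCSFS(path="my-bucket", nfiles=32), "/mnt/gcs",
         nothreads=True, foreground=True)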
Code example #4
class DeltaReaderAppendTest(TestCase):
    @classmethod
    def setUpClass(self):
        self.path = f"tests/{str(uuid.uuid4())}/table1"
        self.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake")
            .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
            .config("spark.sql.extensions",
                    "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog",
                    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .getOrCreate()
        )
        df = (
            self.spark.range(0, 1000)
            .withColumn("number", rand())
            .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
        )

        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode(
                "append").save(self.path)
        self.fs = GCSFileSystem(project=GCP_PROJECT_ID)

        self.fs.upload(self.path, f"{GCP_BUCKET}/{self.path}", recursive=True)
        self.table = DeltaTable(f"{GCP_BUCKET}/{self.path}",
                                file_system=self.fs)

    @classmethod
    def tearDownClass(self):
        # remove folder when we are done with the test
        self.fs.rm(f"{GCP_BUCKET}/{self.path}", recursive=True)
        shutil.rmtree(self.path)

    def test_paths(self):
        assert self.table.path == f"{GCP_BUCKET}/{self.path}"
        assert self.table.log_path == f"{GCP_BUCKET}/{self.path}/_delta_log"

    def test_versions(self):

        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):

        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.path).toPandas()

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (self.spark.read.format("delta").option(
            "versionAsOf", 5).load(self.path).toPandas())

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # Partition pruning should halve the number of rows
        assert self.table.to_table(
            filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # number is random 0-1, so we should have fewer than 12000 rows no matter what
        assert self.table.to_table(
            filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
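
The test case above leaves its imports and configuration to the enclosing module. A hedged sketch of that setup (the DeltaTable import path and the GCP_* values are assumptions, not taken from the source):

import shutil
import uuid
from unittest import TestCase

import pyarrow.dataset as ds
import pyspark
from gcsfs import GCSFileSystem
from pandas.testing import assert_frame_equal
from pyspark.sql.functions import col, rand, when

# Assumed import: the Delta reader under test exposes DeltaTable with a
# file_system argument; adjust to the actual package if it differs.
from deltalake import DeltaTable

# Placeholders -- point these at a real project and bucket before running.
GCP_PROJECT_ID = "my-gcp-project"
GCP_BUCKET = "my-test-bucket"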