import json
import logging
from pathlib import Path

from gcsfs import GCSFileSystem

log = logging.getLogger(__name__)


def _clean_cdp_filestore(google_creds_path: Path) -> None:
    # Connect to the filestore
    fs = GCSFileSystem(token=str(google_creds_path))

    # Open the credentials file to get the project id
    with open(google_creds_path, "r") as open_resource:
        creds = json.load(open_resource)
        project_id = creds["project_id"]

    # Remove all files in the bucket
    bucket = f"{project_id}.appspot.com"
    log.info(f"Cleaning bucket: {bucket}")
    try:
        fs.rm(f"{bucket}/*")
    # Handle empty bucket
    except FileNotFoundError:
        pass

    log.info("Filestore cleaning complete")
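
# Usage sketch for _clean_cdp_filestore (the key path below is hypothetical):
# point it at a GCP service-account key file and it wipes the project's
# default App Engine bucket.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    _clean_cdp_filestore(Path("keys/cdp-service-account.json"))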
import stat
from errno import EIO, ENOENT

import pandas as pd
from fuse import FuseOSError, Operations

from gcsfs import GCSFileSystem, core


def str_to_time(s):
    # Assumed helper (not in the original snippet): convert a GCS ISO
    # timestamp string to epoch seconds.
    t = pd.to_datetime(s)
    return t.to_datetime64().view('int64') / 1e9


class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs[bucket].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        # file objects are cached under their handle (counter), not the path
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
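
# A minimal mount sketch for the class above, using fusepy's FUSE driver
# (the bucket name and mountpoint are hypothetical). foreground=True keeps
# the process attached so interrupting it unmounts cleanly.
from fuse import FUSE

if __name__ == "__main__":
    FUSE(GCSFS('my-bucket'), '/mnt/gcs', nothreads=True, foreground=True)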
import logging
import os
import stat
from errno import EIO, ENOENT

from fuse import FuseOSError, Operations

from gcsfs import GCSFileSystem

# SmallChunkCacher, _tracemethod and str_to_time are assumed to be defined
# elsewhere in this module.

logger = logging.getLogger(__name__)


class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
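
# The class above relies on only two methods of SmallChunkCacher: open(fn)
# and read(fn, offset, size). A minimal stand-in consistent with that usage
# (not the real implementation) could look like this; the real cacher also
# keeps small chunks in memory to avoid repeated range requests.
class SmallChunkCacher:

    def __init__(self, gcs, nfiles=10):
        self.gcs = gcs
        self.nfiles = nfiles
        self.files = {}

    def open(self, fn):
        # keep at most nfiles file objects open, evicting one when full
        if fn not in self.files:
            if len(self.files) >= self.nfiles:
                _, old = self.files.popitem()
                old.close()
            self.files[fn] = self.gcs.open(fn, 'rb')

    def read(self, fn, offset, size):
        self.open(fn)
        f = self.files[fn]
        f.seek(offset)
        return f.read(size)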
import shutil
import uuid
from unittest import TestCase

import pyarrow.dataset as ds
import pyspark
from gcsfs import GCSFileSystem
from pandas.testing import assert_frame_equal
from pyspark.sql.functions import col, rand, when

# DeltaTable here is the delta-lake-reader implementation, which accepts a
# file_system argument.
from deltalake import DeltaTable

# GCP_PROJECT_ID and GCP_BUCKET are assumed to be test configuration
# constants defined elsewhere.


class DeltaReaderAppendTest(TestCase):

    @classmethod
    def setUpClass(cls):
        cls.path = f"tests/{str(uuid.uuid4())}/table1"
        cls.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake")
            .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0")
            .config("spark.sql.extensions",
                    "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog",
                    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            .getOrCreate()
        )
        df = (
            cls.spark.range(0, 1000)
            .withColumn("number", rand())
            .withColumn("number2", when(col("id") < 500, 0).otherwise(1))
        )
        # 12 appends -> versions 0..11, with a checkpoint at version 10
        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode(
                "append").save(cls.path)

        cls.fs = GCSFileSystem(project=GCP_PROJECT_ID)
        cls.fs.upload(cls.path, f"{GCP_BUCKET}/{cls.path}", recursive=True)

        cls.table = DeltaTable(f"{GCP_BUCKET}/{cls.path}", file_system=cls.fs)

    @classmethod
    def tearDownClass(cls):
        # remove folder when we are done with the test
        cls.fs.rm(f"{GCP_BUCKET}/{cls.path}", recursive=True)
        shutil.rmtree(cls.path)

    def test_paths(self):
        assert self.table.path == f"{GCP_BUCKET}/{self.path}"
        assert self.table.log_path == f"{GCP_BUCKET}/{self.path}/_delta_log"

    def test_versions(self):
        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):
        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.path).toPandas()

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (self.spark.read.format("delta").option(
            "versionAsOf", 5).load(self.path).toPandas())

        # compare dataframes. The index may not be in the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # partition pruning on number2 should halve the 12000 rows
        assert self.table.to_table(
            filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # number is random in [0, 1), so the filter should return fewer
        # than the full 12000 rows
        assert self.table.to_table(
            filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
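
# A minimal usage sketch of the DeltaTable API exercised by the tests above
# (the table URI is hypothetical): read the latest version to pandas, then
# time-travel to an earlier version with column and partition pruning.
fs = GCSFileSystem(project=GCP_PROJECT_ID)
table = DeltaTable(f"{GCP_BUCKET}/tests/example/table1", file_system=fs)

latest_df = table.to_pandas()
old_snapshot = table.as_version(5, inplace=False).to_table(
    columns=["id", "number"], filter=ds.field("number2") == 0)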