Code Example #1
File: gcp_storage.py  Project: FMakosza/prbx-tracking
class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.
    They will be stored under bucket_name/base_path/filename
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
    ) -> None:
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.token = token
        # base_path is stored as a format template: bucket_name/base_path/<filename>
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"

        self.file_name_cache: Set[str] = set()
        """The set of all filenames ever uploaded, checked before uploading"""
        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        await super(GcsUnstructuredProvider, self).init()
        self.file_system = GCSFileSystem(project=self.project,
                                         token=self.token,
                                         access="read_write")

    async def store_blob(self,
                         filename: str,
                         blob: bytes,
                         overwrite: bool = False) -> None:
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (filename in self.file_name_cache
                              or self.file_system.exists(target_path)):
            self.logger.info("Not saving out file %s as it already exists",
                             filename)
            return
        self.file_system.start_transaction()

        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)

        self.file_system.end_transaction()

        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass
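
A minimal usage sketch for the provider above, assuming the surrounding OpenWPM package supplies the UnstructuredStorageProvider base class; the project, bucket, and file names here are hypothetical.

import asyncio

async def demo() -> None:
    provider = GcsUnstructuredProvider(
        project="my-gcp-project",       # hypothetical project id
        bucket_name="my-crawl-bucket",  # hypothetical bucket
        base_path="screenshots",
        token=None,                     # gcsfs falls back to default credentials
    )
    await provider.init()
    await provider.store_blob("page_1.png", b"...binary data...")
    await provider.shutdown()

asyncio.run(demo())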
Code Example #2
        def load_model_from_path(path, project_name=None, key=None):

            if path[:5] == 'gs://':
                if project_name is None:
                    # Pass `key` through as the gcsfs credential token so the
                    # parameter is not silently ignored (mirrors load_npz below).
                    fs = GCSFileSystem(token=key)
                else:
                    fs = GCSFileSystem(project_name, token=key)
                file = fs.open(path)
            else:
                file = path

            return load_model(
                file,
                custom_objects={'Swish': Swish,
                                'InstanceNormalization': InstanceNormalization})
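
A short usage sketch for the loader above; the paths and project name are hypothetical, and the custom objects (Swish, InstanceNormalization) are assumed to be defined in the surrounding module.

# Local file: no GCS client is created.
model = load_model_from_path('models/generator.h5')

# gs:// path: the file is opened through GCSFileSystem first.
model = load_model_from_path('gs://my-bucket/models/generator.h5',
                             project_name='my-gcp-project')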
Code Example #3
def main(month, type_, outfile):
    spark = build_spark()
    raw_dat = spark.read.parquet('gs://spain-tweets/rehydrated/lake').where(f'month = {month}')
    dat = get_dat(spark, raw_dat)
    tweets = get_tweets(dat)

    if type_ == 'tweets':
        nodes, edges = build_tweet_graph(tweets, dat)
        G = create_graph(nodes, edges, 'id_str')

    elif type_ == 'users':
        nodes, edges = build_user_graph(tweets)
        G = create_graph(nodes, edges, 'user')

    else:
        raise TypeError(f'Unrecognized type_ parameter: {type_}')

    fs = GCSFileSystem(project='trollhunters')
    with fs.open(outfile, 'wb') as f:
        nx.write_graphml(G, f)
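
The graph written by main() can be read back through the same file-system interface; a sketch with a hypothetical output path:

import networkx as nx
from gcsfs import GCSFileSystem

fs = GCSFileSystem(project='trollhunters')
with fs.open('gs://spain-tweets/graphs/10-users.graphml', 'rb') as f:
    G = nx.read_graphml(f)
print(G.number_of_nodes(), G.number_of_edges())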
Code Example #4
File: util.py  Project: TurkeyBlaster/Sunset-GAN
def load_npz(path, project_name=None, key=None):

    if path[:5] == 'gs://':

        if project_name is None:
            fs = GCSFileSystem(token=key)
        else:
            fs = GCSFileSystem(project_name, token=key)
        file = fs.open(path)

    else:
        file = path

    print(f'Loading file {path.rsplit("/", 1)[-1]}')
    with np.load(file, allow_pickle=True) as npz:
        print(f'Available files: {npz.files}')
        X = npz[npz.files[0]]
        X = np.expand_dims(X, -1)[0]['sunset_ims']

    return X
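
A usage sketch for load_npz; the bucket, object, and credential paths are hypothetical. gcsfs accepts the path to a service-account JSON file as its token.

X = load_npz('gs://sunset-gan-data/sunsets.npz',
             project_name='my-gcp-project',
             key='service-account.json')  # hypothetical credential file
print(X.shape)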
Code Example #5
class GCSFS(Operations):
    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1

        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs['bucket'].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        # handles are cached under the integer fh returned from open/create
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure the directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
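
The class above only implements the FUSE callbacks; to expose a bucket it still has to be mounted, typically with fusepy. A minimal sketch, assuming fusepy is installed and using hypothetical bucket and mount-point names:

from fuse import FUSE

# foreground=True keeps the process attached so it can be stopped with Ctrl-C.
FUSE(GCSFS('my-bucket/data', project='my-gcp-project'),
     '/mnt/gcs', nothreads=True, foreground=True)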
Code Example #6
File: dig.py  Project: agriuseatstweets/dig
def read_schema(path):
    fs = GCSFileSystem(project='trollhunters')
    with fs.open(path, 'rb') as f:
        schema = pickle.load(f)
    return schema
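
The write side of this helper is symmetric: a schema object can be pickled straight into the bucket through the same interface. A sketch with a hypothetical object path:

import pickle
from gcsfs import GCSFileSystem

def write_schema(path, schema):
    fs = GCSFileSystem(project='trollhunters')
    with fs.open(path, 'wb') as f:
        pickle.dump(schema, f)

write_schema('gs://spain-tweets/schemas/tweets.pickle', {'id_str': 'string'})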
Code Example #7
File: gcsfuse.py  Project: ryan-williams/gcsfs
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete %s', fn)
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
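
Because this constructor accepts an existing client via gcs=, an already-authenticated GCSFileSystem can be shared with the mount instead of letting the class create its own. A sketch with hypothetical names; the token may be a service-account JSON path:

from fuse import FUSE
from gcsfs import GCSFileSystem

fs = GCSFileSystem(project='my-gcp-project', token='service-account.json',
                   block_size=30 * 2 ** 20)
FUSE(GCSFS('my-bucket/', gcs=fs, nfiles=20),
     '/mnt/gcs', nothreads=True, foreground=True)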