Code example #1
def find_tiles_for_bounding_box(min_lat, max_lat, min_lon, max_lon):
    """
    Return a list of 10x10 degree tile names covering the bounding box.
    Tile names are in the format {lat}_{lon}, where lat and lon give the upper left corner.
    Ocean tiles are removed.
    """
    fs = GCSFileSystem(cache_timeout=0)
    folder = 'gs://carbonplan-climatetrace/intermediates/ecoregions_mask/'
    available_tiles = [
        os.path.splitext(os.path.split(path)[-1])[0] for path in fs.ls(folder)
        if not path.endswith('/')
    ]

    step = 10
    lat_start = math.ceil(min_lat / step) * step
    lat_stop = math.ceil(max_lat / step) * step
    all_lat_tiles = np.arange(start=lat_start, stop=lat_stop + 1, step=step)
    if min_lat == lat_start:
        all_lat_tiles = all_lat_tiles[1:]

    lon_start = math.floor(min_lon / step) * step
    lon_stop = math.floor(max_lon / step) * step
    all_lon_tiles = np.arange(start=lon_start, stop=lon_stop + 1, step=step)
    if max_lon == lon_stop:
        all_lon_tiles = all_lon_tiles[:-1]

    out = []
    for lat in all_lat_tiles:
        for lon in all_lon_tiles:
            lat_tag, lon_tag = get_lat_lon_tags_from_bounding_box(lat, lon)
            fn = f'{lat_tag}_{lon_tag}'
            if fn in available_tiles:
                out.append(fn)

    return out
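
A minimal usage sketch for the function above; the bounding-box values are arbitrary and the result depends on which tiles exist under the ecoregions_mask folder:

# Hypothetical bounding box; only tiles present in the GCS folder are returned.
tiles = find_tiles_for_bounding_box(min_lat=-20, max_lat=-5,
                                    min_lon=-70, max_lon=-55)
print(tiles)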
Code example #2
def open_and_combine_lat_lon_data(folder, tiles=None):
    """
    Load lat/lon data stored as 10x10 degree tiles in folder.
    If tiles is None, load all available data.
    If no file is available, return None.
    """
    fs = GCSFileSystem(cache_timeout=0)
    if not tiles:
        tiles = [
            os.path.splitext(os.path.split(path)[-1])[0]
            for path in fs.ls(folder) if not path.endswith('/')
        ]

    uris = [f'{folder}{tile}.zarr' for tile in tiles]
    ds_list = []
    for uri in uris:
        if fs.exists(uri):
            da = open_zarr_file(uri)
            if da.lat[0] > da.lat[-1]:
                da = da.reindex(lat=da.lat[::-1])
            if da.lon[0] > da.lon[-1]:
                da = da.reindex(lon=da.lon[::-1])
            ds_list.append(da)

    if len(ds_list) > 0:
        ds = xr.combine_by_coords(
            ds_list, combine_attrs="drop_conflicts"
        ).chunk({'lat': 2000, 'lon': 2000})
        return ds
    # print(f'No data available at {folder} for tiles {tiles}')
    return None
Code example #3
    def validated_gcs_bucket_name(self) -> str:
        if self._validated_gcs_bucket_name is None:
            if self.gcs_bucket_name is not None:
                bucket = self.gcs_bucket_name

            else:
                # Open the key to get the project id
                with open(self.google_credentials_file, "r") as open_resource:
                    creds = json.load(open_resource)
                    project_id = creds["project_id"]

                # Fall back to the project's default App Engine bucket
                bucket = f"{project_id}.appspot.com"

            # Validate
            fs = GCSFileSystem(token=self.google_credentials_file)
            try:
                fs.ls(bucket)
                self._validated_gcs_bucket_name = bucket

            except FileNotFoundError:
                raise ValueError(
                    f"Provided or infered GCS bucket name does not exist. ('{bucket}')"
                )

        return self._validated_gcs_bucket_name
Code example #4
def __init__(self, path='.', gcs=None, **fsargs):
    if gcs is None:
        self.gcs = GCSFileSystem(**fsargs)
    else:
        self.gcs = gcs
    self.cache = {}
    self.counter = 0
    self.root = path
Code example #5
def main():

    # Make spark session
    global spark
    spark = (
        pyspark.sql.SparkSession.builder
        #.config("parquet.summary.metadata.level", "ALL")
        .config("parquet.summary.metadata.level", "NONE")
        .getOrCreate()
    )
    start_time = datetime.now()

    # Load all molecular trait sumstats
    # This has to be done separately, followed by unionByName as the hive
    # partitions differ across datasets due to different tissues
    # (bio_features) and chromosomes
    strip_path_mol = udf(lambda x: x.replace('file:', ''), StringType())
    mol_dfs = []
    mol_pattern = 'gs://genetics-portal-sumstats-b38/unfiltered/molecular_trait/'
    fs = GCSFileSystem()
    # List files; remove trailing '/' and deduplicate
    paths = list(set([s.rstrip('/') for s in fs.glob(mol_pattern)]))
    for inf in paths:
        if fs.isdir(inf):
            print("gs://" + inf)
            df = (
                spark.read.parquet("gs://" + inf)
                .withColumn('input_name', strip_path_mol(lit(inf)))
            )
            mol_dfs.append(df)

    # Take union
    sumstats = functools.reduce(
        functools.partial(pyspark.sql.DataFrame.unionByName, allowMissingColumns=True),
        mol_dfs
    )

    cols_to_keep = ['study_id', 'bio_feature', 'gene_id', 'chrom', 'pos', 'ref', 'alt', 'pval']
    
    # Calculate the number of tests and min pval per gene ----------
    min_pvals = (
        sumstats
        .select(*cols_to_keep)
        .groupby('study_id', 'bio_feature', 'gene_id')
        .agg(count(col('pval')).alias('num_tests'),
             min(col('pval')).alias('min_pval'))
        .orderBy('study_id', 'bio_feature', 'min_pval')
    )

    # Collect all data and write using pandas
    min_pvals.toPandas().to_csv(
        'gs://genetics-portal-dev-analysis/js29/molecular_trait/min_pvals_per_gene_old_2002.csv.gz',
        index=False)

    print('Time taken: {}'.format(datetime.now() - start_time))

    return 0
Code example #6
File: gcsfuse.py  Project: ryan-williams/gcsfs
def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
    if gcs is None:
        # minimum block size: still read on 5MB boundaries.
        self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                 cache_timeout=6000, **fsargs)
    else:
        self.gcs = gcs
    self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
    self.write_cache = {}
    self.counter = 0
    self.root = path
Code example #7
        def load_model_from_path(path, project_name=None, key=None):

            if path[:5] == 'gs://':
                if project_name is None:
                    fs = GCSFileSystem()
                else:
                    fs = GCSFileSystem(project_name)
                file = fs.open(path)
            else:
                file = path

            return load_model(file, custom_objects={'Swish': Swish, 'InstanceNormalization': InstanceNormalization})
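
A hedged usage sketch for the loader above; the GCS path and project name are placeholders, and a Keras-style model file with the custom Swish and InstanceNormalization layers is assumed:

# Hypothetical model path and project; adjust to your own bucket.
model = load_model_from_path('gs://my-bucket/models/generator.h5',
                             project_name='my-project')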
Code example #8
def open_glah01_data():
    fs = GCSFileSystem(cache_timeout=0)
    uris = [
        f'gs://{f}'
        for f in fs.ls('gs://carbonplan-climatetrace/intermediates/glah01/')
        if not f.endswith('/')
    ]
    ds_list = [open_zarr_file(uri) for uri in uris]
    ds = xr.concat(ds_list, dim='record_index').chunk({'record_index': 2000})
    for k in ds:
        _ = ds[k].encoding.pop('chunks', None)
    return ds
Code example #9
File: gcp_storage.py  Project: mozilla/OpenWPM
class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.
    They will be stored under bucket_name/base_path/filename
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
    ) -> None:
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.base_path = base_path
        self.token = token
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"

        self.file_name_cache: Set[str] = set()
        """The set of all filenames ever uploaded, checked before uploading"""
        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        await super(GcsUnstructuredProvider, self).init()
        self.file_system = GCSFileSystem(
            project=self.project, token=self.token, access="read_write"
        )

    async def store_blob(
        self, filename: str, blob: bytes, overwrite: bool = False
    ) -> None:
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (
            filename in self.file_name_cache or self.file_system.exists(target_path)
        ):
            self.logger.info("Not saving out file %s as it already exists", filename)
            return

        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)

        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        pass

    async def shutdown(self) -> None:
        pass
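
A minimal sketch of how this provider might be driven, assuming the OpenWPM base class needs no extra constructor arguments; the project, bucket, paths, and token file are placeholders:

import asyncio

async def upload_example():
    provider = GcsUnstructuredProvider(
        project="my-gcp-project",      # placeholder project
        bucket_name="my-bucket",       # placeholder bucket
        base_path="crawl-data",        # placeholder base path
        token="service-account.json",  # placeholder credentials file
    )
    await provider.init()
    await provider.store_blob("page.html", b"<html></html>")
    await provider.shutdown()

asyncio.run(upload_example())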
Code example #10
File: gcp_storage.py  Project: FMakosza/prbx-tracking
class GcsStructuredProvider(ArrowProvider):
    """This class allows you to upload Parquet files to GCS.
    This might not actually be the thing that we want to do
    long term but seeing as GCS is the S3 equivalent of GCP
    it is the easiest way forward.

    Inspired by the old S3Aggregator structure the GcsStructuredProvider
    will by default store into
    base_path/visits/table_name in the given bucket.

    Pass a different sub_dir to change this.
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
        sub_dir: str = "visits",
    ) -> None:
        super().__init__()
        self.project = project
        self.token = token
        self.base_path = f"{bucket_name}/{base_path}/{sub_dir}/{{table_name}}"

    def __str__(self) -> str:
        return f"GCS:{self.base_path.removesuffix('/{table_name}')}"

    async def init(self) -> None:
        await super(GcsStructuredProvider, self).init()
        self.file_system = GCSFileSystem(project=self.project,
                                         token=self.token,
                                         access="read_write")

    async def write_table(self, table_name: TableName, table: Table) -> None:
        self.file_system.start_transaction()
        pq.write_to_dataset(
            table,
            self.base_path.format(table_name=table_name),
            filesystem=self.file_system,
        )
        self.file_system.end_transaction()

    async def shutdown(self) -> None:
        pass
Code example #11
def _get_file_to_upload(
    path: str,
    fs: gcsfs.GCSFileSystem,
    url: str,
    pdf_name: str,
    always_download: bool,
    post_data: Dict,
    verify_ssl: bool,
) -> Optional[str]:
    """This function checks first whether it needs to download, and then
    returns the locally downloaded pdf"""
    # First check if the path doesn't exist at all
    path_to_download = None
    if always_download or not fs.exists(path):
        if post_data:
            response = requests.post(url, data=post_data, verify=verify_ssl)
        else:
            response = requests.get(url, verify=verify_ssl)
        if response.status_code == 200:
            path_to_download = os.path.join(tempfile.gettempdir(), pdf_name)
            with open(path_to_download, "wb") as f:
                # Need to use content since PDF needs to write raw bytes.
                f.write(response.content)
        else:
            raise ScrapeAggregateError(
                "Could not download file {}".format(pdf_name))
    return path_to_download
Code example #12
def _clean_cdp_filestore(google_creds_path: Path) -> None:
    # Connect to the GCS filestore
    fs = GCSFileSystem(token=str(google_creds_path))

    # Open the key to get the project id
    with open(google_creds_path, "r") as open_resource:
        creds = json.load(open_resource)
        project_id = creds["project_id"]

    # Remove all files in bucket
    bucket = f"{project_id}.appspot.com"
    log.info(f"Cleaning bucket: {bucket}")
    try:
        fs.rm(f"{bucket}/*")
    # Handle empty bucket
    except FileNotFoundError:
        pass

    log.info("Filestore cleaning complete")
Code example #13
File: validators.py  Project: isaacna/cdp-backend
def resource_exists(uri: Optional[str], **kwargs: str) -> bool:
    """
    Validate that the URI provided points to an existing file.

    None is a valid option.

    Parameters
    ----------
    uri: Optional[str]
        The URI to validate resource existence for.

    Returns
    -------
    status: bool
        The validation status.
    """

    if uri is None:
        return True

    if uri.startswith("gs://") or uri.startswith("https://storage.googleapis"):
        # Convert to gsutil form if necessary
        if uri.startswith("https://storage.googleapis"):
            uri = convert_gcs_json_url_to_gsutil_form(uri)

            # If uri is not convertible to gsutil form we can't confirm
            if uri == "":
                return False

        if kwargs.get("google_credentials_file"):
            fs = GCSFileSystem(
                token=str(kwargs.get("google_credentials_file", "anon")))
            return fs.exists(uri)

        # Can't check GCS resources without creds file
        else:
            try:
                anon_fs = GCSFileSystem(token="anon")
                return anon_fs.exists(uri)
            except Exception:
                return False

    # Is HTTP remote resource
    elif uri.startswith("http"):
        try:
            # Use HEAD request to check if remote resource exists
            r = requests.head(uri)

            return r.status_code == requests.codes.ok
        except requests.exceptions.SSLError:
            return False

    # Get any filesystem and try
    try:
        fs, path = url_to_fs(uri)
        return fs.exists(path)
    except Exception:
        return False
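
A few hedged example calls; the URIs and credentials path below are placeholders:

resource_exists(None)                                  # True: None is explicitly allowed
resource_exists("https://example.com/report.pdf")      # checked with an HTTP HEAD request
resource_exists("gs://my-bucket/minutes.json",
                google_credentials_file="creds.json")  # authenticated GCS existence check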
Code example #14
def main(month, type_, outfile):
    spark = build_spark()
    raw_dat = spark.read.parquet('gs://spain-tweets/rehydrated/lake').where(f'month = {month}')
    dat = get_dat(spark, raw_dat)
    tweets = get_tweets(dat)

    if type_ == 'tweets':
        nodes, edges = build_tweet_graph(tweets, dat)
        G = create_graph(nodes, edges, 'id_str')

    elif type_ == 'users':
        nodes, edges = build_user_graph(tweets)
        G = create_graph(nodes, edges, 'user')

    else:
        raise TypeError(f'Unrecognized type_ parameter: {type_}')

    fs = GCSFileSystem(project = 'trollhunters')
    with fs.open(outfile, 'wb') as f:
        nx.write_graphml(G, f)            
Code example #15
File: util.py  Project: TurkeyBlaster/Sunset-GAN
def load_npz(path, project_name=None, key=None):

    if path[:5] == 'gs://':

        if project_name is None:
            fs = GCSFileSystem(token=key)
        else:
            fs = GCSFileSystem(project_name, token=key)
        file = fs.open(path)

    else:
        file = path

    print(f'Loading file {path.rsplit("/", 1)[-1]}')
    with np.load(file, allow_pickle=True) as npz:
        print(f'Available files: {npz.files}')
        X = npz[npz.files[0]]
        X = np.expand_dims(X, -1)[0]['sunset_ims']

    return X
Code example #16
    def setUpClass(self):
        self.path = f"tests/{str(uuid.uuid4())}/table1"
        self.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake").config(
                "spark.jars.packages",
                "io.delta:delta-core_2.12:0.7.0").config(
                    "spark.sql.extensions",
                    "io.delta.sql.DeltaSparkSessionExtension").config(
                        "spark.sql.catalog.spark_catalog",
                        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
                    ).getOrCreate())
        df = (self.spark.range(0,
                               1000).withColumn("number", rand()).withColumn(
                                   "number2",
                                   when(col("id") < 500, 0).otherwise(1)))

        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode(
                "append").save(self.path)
        self.fs = GCSFileSystem(project=GCP_PROJECT_ID)

        self.fs.upload(self.path, f"{GCP_BUCKET}/{self.path}", recursive=True)
        self.table = DeltaTable(f"{GCP_BUCKET}/{self.path}",
                                file_system=self.fs)
Code example #17
def initialize_gcs_file_system(credentials_file: str) -> GCSFileSystem:
    """
    Initializes an instance of a GCSFileSystem.

    Parameters
    ----------
    credentials_file: str
        The path to the Google Service Account credentials JSON file.

    Returns
    -------
    file_system: GCSFileSystem
        An initialized GCSFileSystem.
    """
    return GCSFileSystem(token=str(credentials_file))
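
Example usage, with a placeholder credentials path and bucket name:

fs = initialize_gcs_file_system("service-account.json")  # placeholder key file
fs.ls("my-bucket")                                        # placeholder bucket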
Code example #18
class DeltaReaderAppendTest(TestCase):
    @classmethod
    def setUpClass(self):
        self.path = f"tests/{str(uuid.uuid4())}/table1"
        self.spark = (
            pyspark.sql.SparkSession.builder.appName("deltalake").config(
                "spark.jars.packages",
                "io.delta:delta-core_2.12:0.7.0").config(
                    "spark.sql.extensions",
                    "io.delta.sql.DeltaSparkSessionExtension").config(
                        "spark.sql.catalog.spark_catalog",
                        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
                    ).getOrCreate())
        df = (self.spark.range(0,
                               1000).withColumn("number", rand()).withColumn(
                                   "number2",
                                   when(col("id") < 500, 0).otherwise(1)))

        for i in range(12):
            df.write.partitionBy("number2").format("delta").mode(
                "append").save(self.path)
        self.fs = GCSFileSystem(project=GCP_PROJECT_ID)

        self.fs.upload(self.path, f"{GCP_BUCKET}/{self.path}", recursive=True)
        self.table = DeltaTable(f"{GCP_BUCKET}/{self.path}",
                                file_system=self.fs)

    @classmethod
    def tearDownClass(self):
        # remove folder when we are done with the test
        self.fs.rm(f"{GCP_BUCKET}/{self.path}", recursive=True)
        shutil.rmtree(self.path)

    def test_paths(self):
        assert self.table.path == f"{GCP_BUCKET}/{self.path}"
        assert self.table.log_path == f"{GCP_BUCKET}/{self.path}/_delta_log"

    def test_versions(self):

        assert self.table.checkpoint == 10
        assert self.table.version == 11

    def test_data(self):

        # read the parquet files using pandas
        df_pandas = self.table.to_pandas()
        # read the table using spark
        df_spark = self.spark.read.format("delta").load(self.path).toPandas()

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_version(self):
        # read the parquet files using pandas
        df_pandas = self.table.as_version(5, inplace=False).to_pandas()
        # read the table using spark
        df_spark = (self.spark.read.format("delta").option(
            "versionAsOf", 5).load(self.path).toPandas())

        # compare dataframes. The index may not be the same order, so we ignore it
        assert_frame_equal(
            df_pandas.sort_values("id").reset_index(drop=True),
            df_spark.sort_values("id").reset_index(drop=True),
        )

    def test_partitioning(self):
        # Partition pruning should half number of rows
        assert self.table.to_table(
            filter=ds.field("number2") == 0).num_rows == 6000

    def test_predicate_pushdown(self):
        # number is random 0-1, so we should have fewer than 12000 rows no matter what
        assert self.table.to_table(
            filter=ds.field("number") < 0.5).num_rows < 12000

    def test_column_pruning(self):
        t = self.table.to_table(columns=["number", "number2"])
        assert t.column_names == ["number", "number2"]
Code example #19
class GCSFS(Operations):
    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}
        self.counter = 0
        self.root = path

    def getattr(self, path, fh=None):
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1

        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            self.gcs.dirs['bucket'].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        fn = ''.join([self.root, path])
        f = self.cache[fh]  # handles are the integer counters assigned in open()/create()
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(
            fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
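
A sketch of how a class like this is typically mounted with fusepy (the package that provides Operations and FuseOSError); the bucket name and mount point are placeholders:

from fuse import FUSE

if __name__ == '__main__':
    # Mount the placeholder bucket 'my-bucket' at /mnt/gcs in the foreground.
    FUSE(GCSFS('my-bucket'), '/mnt/gcs', foreground=True, nothreads=True)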
Code example #20
File: dig.py  Project: agriuseatstweets/dig
def read_schema(path):
    fs = GCSFileSystem(project='trollhunters')
    with fs.open(path, 'rb') as f:
        schema = pickle.load(f)
    return schema
Code example #21
File: iorw.py  Project: onevirus/papermill
def _get_client(self):
    if self._client is None:
        self._client = GCSFileSystem()
    return self._client
Code example #22
File: gcp_storage.py  Project: mozilla/OpenWPM
async def init(self) -> None:
    await super(GcsUnstructuredProvider, self).init()
    self.file_system = GCSFileSystem(
        project=self.project, token=self.token, access="read_write"
    )
Code example #23
# auto-generate some GCS metrics

from gcsfs import GCSFileSystem

fs = GCSFileSystem('pangeo-181919')


# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


# get disk usage of each folder in gs://pangeo-data
with open('du-pangeo-data.csv', 'w') as f:
    f.write('directory, size, nbytes\n')
    print('directory, size, nbytes')
    for folder in fs.ls('pangeo-data'):
        nbytes = fs.du(folder)
        f.write(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}\n')
        print(f'{folder}, {sizeof_fmt(nbytes)}, {nbytes}')

# upload CSV to gs://pangeo-data
fs.put('du-pangeo-data.csv', 'pangeo-data/du-pangeo-data.csv')
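
A quick sanity check of the sizeof_fmt helper above (values chosen arbitrarily):

assert sizeof_fmt(1536) == '1.5KiB'
assert sizeof_fmt(3 * 1024 ** 3) == '3.0GiB'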
Code example #24
import os
from collections import defaultdict

import dask
import fsspec
import xarray as xr
import zarr
from dask.distributed import Client
from gcsfs import GCSFileSystem

from carbonplan_trace.v1.glas_extract import extract_GLAH01_data, extract_GLAH14_data

fs = GCSFileSystem()

skip_existing = True
chunksize = 2000

drop_keys = {
    'GLAH01': ['rec_bin', 'shot_number', 'tx_bin'],
    'GLAH14': ['n_gaussian_peaks', 'shot_number'],
}


def get_mapper(uri):
    key = os.path.splitext(os.path.split(uri)[-1])[0]
    muri = f'gs://carbonplan-scratch/glas-zarr-cache/{key}.zarr'
    mapper = fsspec.get_mapper(muri)
    return mapper


@dask.delayed
Code example #25
def read_transcripts_and_generate_grams(
        event_transcripts: EventTranscripts, n_grams: int,
        credentials_file: str) -> List[ContextualizedGram]:
    """
    Parse all documents and create a list of contextualized grams for later weighting.

    Parameters
    ----------
    event_transcripts: EventTranscripts
        The EventTranscripts object to parse all transcripts for.
    n_grams: int
        N number of terms to act as a unique entity.
    credentials_file: str
        Path to Google Service Account Credentials JSON file.

    Returns
    -------
    grams: List[ContextualizedGram]
        All grams found in all transcripts provided.
    """
    fs = GCSFileSystem(token=credentials_file)

    # Store all n_gram results
    event_n_grams: List[ContextualizedGram] = []

    # Iter over each transcript
    for transcript_db_file in event_transcripts.transcript_db_files:
        with TemporaryDirectory() as temp_dir:
            temp_dir_path = Path(temp_dir)
            local_transcript_filepath = temp_dir_path / transcript_db_file.name

            # Download transcript
            fs.get(
                rpath=transcript_db_file.uri,
                lpath=str(local_transcript_filepath),
            )

            # Init transcript
            with open(local_transcript_filepath, "r") as open_f:
                transcript = Transcript.from_json(
                    open_f.read())  # type: ignore

            # Get cleaned sentences by removing stop words
            cleaned_sentences: List[SentenceManager] = [
                SentenceManager(
                    original_details=sentence,
                    cleaned_text=string_utils.clean_text(
                        sentence.text,
                        clean_stop_words=True,
                    ),
                    n_grams=[],
                ) for sentence in transcript.sentences
            ]

            # Filter any empty sentences
            cleaned_sentences = [
                sm for sm in cleaned_sentences if len(sm.cleaned_text) > 1
            ]

            # Get all n_grams for each sentence
            for sm in cleaned_sentences:
                sm.n_grams = [*ngrams(sm.cleaned_text.split(), n_grams)]

            # Init stemmer and stem all grams
            stemmer = SnowballStemmer("english")
            for sm in cleaned_sentences:
                for n_gram in sm.n_grams:
                    # Join into a single n gram
                    unstemmed_n_gram = " ".join(n_gram)

                    # Join, lower, and stem the n gram
                    stemmed_n_gram = " ".join(
                        [stemmer.stem(term.lower()) for term in n_gram])

                    # Get context span
                    # Because ngrams function, cleaning, and split may affect the exact
                    # matchup of the term, use fuzzy diff to find closest
                    closest_term = ""
                    closest_term_score = 0.0
                    for term in sm.original_details.text.split():
                        similarity = rapidfuzz.fuzz.QRatio(term, n_gram[0])
                        if similarity > closest_term_score:
                            closest_term = term
                            closest_term_score = similarity

                    # Get surrounding terms
                    terms = sm.original_details.text.split()
                    target_term_index = terms.index(closest_term)

                    # Get left and right indices
                    left_i = 0 if target_term_index - 8 < 0 else target_term_index - 8
                    right_i = (None if target_term_index + 7 >= len(terms) - 1
                               else target_term_index + 7)
                    context_span = " ".join(terms[left_i:right_i])

                    # Append ellipsis
                    if left_i != 0:
                        context_span = f"... {context_span}"
                    if right_i is not None:
                        context_span = f"{context_span}..."

                    # Append to event list
                    event_n_grams.append(
                        ContextualizedGram(
                            event_id=event_transcripts.event_id,
                            event_datetime=event_transcripts.event_datetime,
                            unstemmed_gram=unstemmed_n_gram,
                            stemmed_gram=stemmed_n_gram,
                            context_span=context_span,
                        ))

    return event_n_grams
Code example #26
File: main.py  Project: donaldrauscher/dfs-data
import datetime
import warnings

import pytz
import requests
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from gcsfs import GCSFileSystem
from flask import Flask, request

app = Flask(__name__)
fs = GCSFileSystem(project='blog-180218')

warnings.simplefilter(action='ignore', category=FutureWarning)


def get_current_dt():
    """
    Current datetime in CST
    """
    return pytz.utc.localize(datetime.datetime.utcnow(),
                             is_dst=None).astimezone(
                                 pytz.timezone('America/Chicago'))


def rotowire_scrape(dt=None, test=False):
    """
Code example #27
File: gcsfs_factory.py  Project: dxy/pulse-data
def build(cls) -> DirectIngestGCSFileSystem:
    return DirectIngestGCSFileSystemImpl(
        GCSFileSystem(project=metadata.project_id(),
                      cache_timeout=GCSFS_NO_CACHING))
Code example #28
File: gcsfuse.py  Project: ryan-williams/gcsfs
class GCSFS(Operations):

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}
        self.counter = 0
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777

        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
            data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError
Code example #29
File: gs.py  Project: vishalbelsare/dvc
    def fs(self):
        from gcsfs import GCSFileSystem

        return GCSFileSystem(**self.fs_args)
Code example #30
    def fs(self):
        from gcsfs import GCSFileSystem

        return GCSFileSystem(**self.login_info, consistency=None)
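
Taken together, the examples above cover the common ways of constructing a GCSFileSystem; a condensed, hedged summary (project names and key paths are placeholders):

from gcsfs import GCSFileSystem

fs_default = GCSFileSystem()                             # application default credentials
fs_anon = GCSFileSystem(token='anon')                    # anonymous access to public data
fs_keyed = GCSFileSystem(token='service-account.json')   # service-account key file
fs_rw = GCSFileSystem(project='my-project', access='read_write')
fs_fresh = GCSFileSystem(project='my-project', cache_timeout=0)  # disable listing cache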