Example no. 1
    def __init__(self, cfg_file="config.yml"):
        self.cfg = get_cfg(cfg_file)
        self.method = self.cfg['backend']
        # setup cloud drive if needed
        if self.method == 'gcp':
            project = self.cfg['data_gcp']['project']
            bucket_name = self.cfg['data_gcp']['bucket']
            if self.cfg['gcp_local_auth'] == 1:  # running on local
                gs_token = self.cfg['data_gcp']['json_key']
                self.gcs_fs = gcsfs.GCSFileSystem(project=project,
                                                  token=gs_token)
                self.storage_client = storage.Client.from_service_account_json(
                    gs_token)
                self.bucket = self.storage_client.get_bucket(
                    bucket_name)  # now it will create bucket obj
            else:  # running on native gc
                self.storage_client = storage.Client()
                self.bucket = self.storage_client.get_bucket(
                    bucket_name)  # now it will create bucket obj
                self.gcs_fs = gcsfs.GCSFileSystem(project=project)

            # manual way of connecting to gcs
            # blob = bucket.blob(bucket_folder + file)
        self.dfLoc = None  # placeholder for existing data (not loaded yet)
        self.dfNew = None  # placeholder for incoming new data
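A minimal sketch of the configuration this constructor expects; the keys mirror the lookups above, while every value here is a placeholder, not taken from the original project:

cfg = {
    "backend": "gcp",                 # selects the GCS branch above
    "gcp_local_auth": 1,              # 1 = local run authenticating with a JSON key file
    "data_gcp": {
        "project": "my-gcp-project",
        "bucket": "my-bucket",
        "json_key": "/path/to/service-account.json",
    },
}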
Example no. 2
def get_gcs_root(project):
    """Get object map of a root GCS bucket"""
    fs = gcsfs.GCSFileSystem(project=project, token='cache')
    token = fs.session.credentials
    gcsfs_root = gcsfs.GCSFileSystem(project=project,
                                      token=token)
    return gcsfs_root
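A brief usage sketch, assuming get_gcs_root is importable; the project ID and bucket name are placeholders. The point of the token='cache' round trip is that the returned filesystem carries an explicit credentials object instead of relying on ambient gcloud state:

fs = get_gcs_root(project="my-gcp-project")   # hypothetical project ID
print(fs.ls("my-bucket"))                     # hypothetical bucket name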
Example no. 3
def get_fs_and_path(url: str,
                    token=None,
                    public=True) -> Tuple[fsspec.AbstractFileSystem, str]:
    if url.startswith("s3://"):
        token = token or dict()
        token = read_aws_creds(token) if isinstance(token, str) else token
        return (
            S3FileSystemReplacement(
                key=token.get("aws_access_key_id"),
                secret=token.get("aws_secret_access_key"),
                token=token.get("aws_session_token"),
                client_kwargs={
                    "endpoint_url": token.get("endpoint_url"),
                    "region_name": token.get("region"),
                },
            ),
            url[5:],
        )
    elif url.startswith("gcs://"):
        return gcsfs.GCSFileSystem(token=token), url[6:]
    elif url.find("blob.core.windows.net/") != -1:
        account_name = url.split(".")[0]
        account_name = account_name[8:] if url.startswith(
            "https://") else account_name
        return (
            AzureBlobFileSystem(
                account_name=account_name,
                account_key=token.get("account_key"),
            ),
            url[url.find("blob.core.windows.net/") + 22:],
        )
    elif (url.startswith("../") or url.startswith("./") or url.startswith("/")
          or url.startswith("~/")):
        return fsspec.filesystem("file"), url
    elif (
            # windows local file system
            re.search("^[A-Za-z]:", url)):
        return fsspec.filesystem("file"), url
    else:
        # TODO: check if url is username/dataset:version
        if url.split("/")[0] == "google":
            org_id, ds_name = url.split("/")
            token, url = HubControlClient().get_dataset_credentials(
                org_id, ds_name)
            fs = gcsfs.GCSFileSystem(token=token)
            url = url[6:]
        else:
            url, creds = _connect(url, public=public)
            fs = S3FileSystemReplacement(
                expiration=creds["expiration"],
                key=creds["access_key"],
                secret=creds["secret_key"],
                token=creds["session_token"],
                client_kwargs={
                    "endpoint_url": creds["endpoint"],
                    "region_name": creds["region"],
                },
            )
        return (fs, url)
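A short usage sketch with placeholder URLs; each branch above returns a (filesystem, stripped path) pair:

fs, path = get_fs_and_path("./local/dataset")             # local fsspec filesystem, path unchanged
fs, path = get_fs_and_path("gcs://my-bucket/my-dataset")  # gcsfs, path becomes "my-bucket/my-dataset"
fs, path = get_fs_and_path("s3://my-bucket/my-dataset",   # S3 wrapper, path becomes "my-bucket/my-dataset"
                           token={"aws_access_key_id": "...",
                                  "aws_secret_access_key": "..."})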
Example no. 4
    def get_temp_filepath(self):
        if self.backend == 'POSIX':
            self.temp_dir    = tempfile.mkdtemp()
            self.dir_store   = os.path.join(self.temp_dir,
                                            'temp-%s%s' % (next(_counter),
                                            self.suffix))
            # Saving dask objects as Zarr requires more than just a filehandle
            if not self.dask:
                self.storage_obj = self.dir_store
            else:
                self.storage_obj = zarr.create(shape=self.shape, chunks=self.chunksize,
                                               store=self.dir_store, dtype=self.dtype, 
                                               overwrite=True)
        elif self.backend == 'GCS':
            if not self.gcs_zarr:
                raise NotImplementedError("Missing config for GCP test")

            # HACK in order to give worker pods read/write to storage
            fs = gcsfs.GCSFileSystem(project=self.gcp_project_name, token='cache')
            token = fs.session.credentials
            self.gcp_project = gcsfs.GCSFileSystem(project=self.gcp_project_name,
                                                   token=token)
            self.gcsfsmap    = gcsfs.mapping.GCSMap(self.gcs_zarr,
                                                    gcs=self.gcp_project,
                                                    check=True, create=False)
            if not self.dask:
                gsutil_arg = "gs://%s" % self.gcs_zarr
                call(["gsutil", "-q", "-m", "rm", "-r", gsutil_arg])
                self.storage_obj = self.gcsfsmap
            else:
                self.storage_obj = zarr.create(shape=self.shape, chunks=self.chunksize,
                                               store=self.gcsfsmap, dtype=self.dtype,
                                               overwrite=True)

        elif self.backend == 'FUSE':
            if not self.gcs_zarr_fuse:
                raise NotImplementedError("Missing config for FUSE test")

            self.temp_dir    = tempfile.mkdtemp()
            self.dir_store   = self.temp_dir + self.gcs_zarr_fuse
            call([GCSFUSE, self.gcs_bucket, self.temp_dir])

            # Remove previous test runs
            if os.path.exists(self.dir_store):
                shutil.rmtree(self.dir_store)
            os.makedirs(self.dir_store)

            # Return the path if this isn't Dask
            # TODO: This should be a function
            if not self.dask:
                self.storage_obj = self.dir_store
            else:
                self.storage_obj = zarr.create(shape=self.shape, chunks=self.chunksize,
                                               store=self.dir_store, dtype=self.dtype,
                                               overwrite=True)
        else:
            raise NotImplementedError("Storage backend not implemented.")
Example no. 5
 def __init__(self, project=None):
     super(CloudService, self).__init__()
     self.project = project
     if project is not None:
         self.client = bigquery.Client(project=project)
         self.storage_client = storage.Client(project=project)
         self.fs = gcsfs.GCSFileSystem(project=project)
     else:
         self.client = bigquery.Client()
         self.storage_client = storage.Client()
         self.fs = gcsfs.GCSFileSystem()
Example no. 6
def create_gcsfs(bucket_id=None, token_loc=None):
    """
    Create a GCSFileSystem (Google Cloud Storage file system) for the given 'bucket_id'
    and 'token_loc' parameters. If either is not given, default configuration values are used.
    """
    if bucket_id is None: bucket_id = GLOBAL_BUCKET_ID

    if token_loc is None:
        return gcsfs.GCSFileSystem(
            bucket_id,
            token=f'{HOME_ENV}/gcloud/application_default_credentials.json')

    return gcsfs.GCSFileSystem(bucket_id, token=token_loc)
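A hedged usage sketch; GLOBAL_BUCKET_ID and HOME_ENV are module-level constants assumed to exist in the original project, and the key path is a placeholder:

fs = create_gcsfs()                                # defaults: GLOBAL_BUCKET_ID plus the ADC token file
fs = create_gcsfs(token_loc="/path/to/key.json")   # explicit service-account key
print(fs.ls(GLOBAL_BUCKET_ID))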
Example no. 7
def read_parquet(uri):
    parsed_uri = urlparse(uri)
    if parsed_uri.scheme == "file":
        return pd.read_parquet(parsed_uri.path)
    elif parsed_uri.scheme == "gs":
        fs = gcsfs.GCSFileSystem()
        files = [
            "gs://" + path
            for path in fs.glob(uri + "/part-*")
        ]
        ds = parquet.ParquetDataset(files, filesystem=fs)
        return ds.read().to_pandas()
    else:
        raise ValueError("Unsupported scheme")
Example no. 8
def multi_open(
    filename,
    mode,
    use_gcs=True,
    use_http=True,
    use_file=True,
    use_gzip=True,
    token=None,
    **kwargs,
):
    if use_gcs and (filename.startswith("gcs://")
                    or filename.startswith("gc://")):
        token = token or os.environ.get("FTM_PREDICT_GCS_TOKEN")
        logging.debug(f"Using GCSFS to open file: {filename}:{mode}")
        fs = gcsfs.GCSFileSystem(token=token)
        return fs.open(filename, mode=mode, **kwargs)
    elif use_http and (filename.startswith("http://")
                       or filename.startswith("https://")):
        if not mode.startswith("r"):
            raise ValueError("HTTP File-type only supports read modes")
        kwargs.setdefault("method", "GET")
        kwargs.setdefault("url", filename)
        kwargs["stream"] = True
        response = requests.request(**kwargs)
        return response.raw
    elif use_gzip and (filename.endswith(".gz") or filename.endswith(".gzip")):
        logging.debug(f"Using GZIP to open file: {filename}:{mode}")
        return gzip.open(filename, mode, **kwargs)
    elif use_file:
        return open(filename, mode, **kwargs)
    raise ValueError(f"Unable to open file: {filename}:{mode}")
Example no. 9
def users_converter():
    json_gcs = []

    gcs_file_system = gcsfs.GCSFileSystem(project="sirapob-bluepi-de-exam",
                                          token="cloud")
    gcs_json_path = "gs://airflow-postgres/users"
    with gcs_file_system.open(gcs_json_path) as f:
        # the object is newline-delimited JSON: one record per line
        gcs_string_data = f.read().decode('utf-8')
        for line in gcs_string_data.splitlines():
            record = json.loads(line)
            record['created_at'] = dt.datetime.fromtimestamp(
                record['created_at']) + dt.timedelta(hours=7)
            record['updated_at'] = dt.datetime.fromtimestamp(
                record['updated_at']) + dt.timedelta(hours=7)
            json_gcs.append(record)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket("airflow-postgres")
    blob = bucket.blob("users.csv")
    df = pd.DataFrame(data=json_gcs).to_csv(sep=",",
                                            header=False,
                                            index=False,
                                            quotechar='"',
                                            quoting=csv.QUOTE_ALL,
                                            encoding='utf-8')
    blob.upload_from_string(data=df)
Example no. 10
def sample_qc_zarr(input_path: str, output_path: str, remote: bool):
    """Convert sample QC csv to zarr"""
    import gcsfs
    import pandas as pd

    logger.info("Converting to Xarray")
    df = pd.read_csv(input_path, sep="\t")
    pc_vars = df.filter(regex="^genetic_principal_component").columns.tolist()
    ds = (df[[c for c in df if c not in pc_vars
              ]].rename_axis("samples",
                             axis="rows").to_xarray().drop_vars("samples"))
    pcs = (df[pc_vars].rename_axis(
        "samples", axis="rows").to_xarray().drop_vars("samples").to_array(
            dim="principal_components").T)
    ds = ds.assign(
        genotype_measurement_plate=ds.genotype_measurement_plate.astype("S"),
        genotype_measurement_well=ds.genotype_measurement_well.astype("S"),
        principal_component=pcs.drop_vars("principal_components"),
    )
    # Rechunk to enforce stricter dtypes as well as ease
    # downstream loading/processing of PC array
    ds = ds.chunk("auto")

    store = output_path
    if remote:
        gcs = gcsfs.GCSFileSystem()
        store = gcsfs.GCSMap(output_path, gcs=gcs, check=False, create=True)

    logger.info(f"Sample QC dataset:\n{ds}")
    logger.info(f"Saving zarr archive at {output_path}")
    ds.to_zarr(store, mode="w", consolidated=True)
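To read the archive back, a minimal sketch assuming the remote branch above was used; the bucket path is a placeholder:

import gcsfs
import xarray as xr

gcs = gcsfs.GCSFileSystem()
store = gcsfs.GCSMap("my-bucket/sample_qc.zarr", gcs=gcs, check=False, create=False)
ds = xr.open_zarr(store, consolidated=True)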
Example no. 11
def clean_bucket(bucket: str, name: str, project: str) -> None:
    """
    Find all the file names that do not have one of the 3 labels of age, gender and accent.

    :param bucket: name of Google Cloud bucket
    :param name:  name of file or google cloud  blob
    :param project: unique google cloud project name



    file_list = ['validated', 'train', 'test', 'dev', 'other', 'invalidated']

    for file in file_list:
        clean_bucket(bucket = config.Bucket.META_DATA, name = file, project = 'commonvoice-voice-voice-270516')
    """

    fs = gcsfs.GCSFileSystem(project=project)

    with fs.open("{}/{}.tsv".format(bucket, name)) as f:
        data = pd.read_csv(f, delimiter="\t")

    data = data[["path", "age", "gender", "accent"]]

    print("There are {} audio files in the development set".format(data.shape[0]))

    columns = ["gender", "age", "accent"]
    clean_files = []

    for column in columns:
        mp3 = data[-data[column].isna()]["path"]
        clean_files.extend(mp3)

    clean_files = set(clean_files)

    path = data["path"]
    mp3_to_remove = collections.deque()

    for mp3 in tqdm(path):
        if mp3 not in clean_files:
            mp3_to_remove.append(mp3)

    clean_files = pd.DataFrame(list(mp3_to_remove))
    clean_files.to_csv("remove-{}.csv".format(name))

    upload_blob(
        bucket_name=config.Bucket.META_DATA,
        source_file_name="remove-{}.csv".format(name),
        destination_blob_name="subject_to_removal/{}".format(name),
    )

    print(
        "{} mp3s do not have labels, leaving {} in the {} labeled mp3".format(
            len(clean_files), data.shape[0] - len(clean_files), name
        )
    )
    print(
        "Removed {}% of the data".format(
            round((len(mp3_to_remove) / data.shape[0]) * 100, 2)
        )
    )
Example no. 12
def gcs_to_dataframe(data, context):
    '''Background Cloud Function triggered by Cloud Storage.
       Reads the uploaded GCS CSV file into a pandas DataFrame.
    Args:
        data (dict): The Cloud Functions event payload.
        context (google.cloud.functions.Context): Metadata of the triggering event.
    Returns:
        pandas.DataFrame read from the CSV, or None if the file is not found;
        event details are written to Stackdriver Logging.
    References:
        https://gcsfs.readthedocs.io/en/latest/
        https://github.com/pandas-dev/pandas/pull/26221#issuecomment-487393880
    '''

    print(f"Event ID: {context.event_id}, Event type: {context.event_type}")
    print(f"Bucket: {data['bucket']}, Metageneration: {data['metageneration']}, File: {data['name']}, Created: {data['timeCreated']}, Updated: {data['updated']}")

    try:
        # if gcs file is updated with same filename, you may need to set cache_timeout=0 to avoid FileNotFoundError
        fs = gcsfs.GCSFileSystem(cache_timeout=0)
        with fs.open(f"{data['bucket']}/{data['name']}","rb") as fh:
            dataframe = pandas.read_csv(fh)
            return dataframe
    except FileNotFoundError:
        print("FileNotFoundError")
        return None
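A minimal local-test sketch; the event payload carries exactly the keys the function reads, the context stub only needs event_id and event_type, and all values are placeholders:

from types import SimpleNamespace

event = {
    "bucket": "my-bucket",
    "name": "uploads/data.csv",
    "metageneration": "1",
    "timeCreated": "2021-01-01T00:00:00Z",
    "updated": "2021-01-01T00:00:00Z",
}
ctx = SimpleNamespace(event_id="1234", event_type="google.storage.object.finalize")
df = gcs_to_dataframe(event, ctx)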
Example no. 13
    def __init__(self, path):
        if SARModel.__model is not None and SARModel.__path == path:
            self.model = SARModel.__model
            return

        # find the .sar.related & .sar.offsets files
        if path.startswith("gs:"):
            fs = gcsfs.GCSFileSystem(project='maga-bigdata')
            sar_file = fs.glob(f'{path}/*.sar')[0]
            fs.get(sar_file, 'sarplus_cache.sar')
            all_files = './sarplus_cache.sar'
        else:
            # bad hack but oh well
            raise ValueError("Please use a gcs file.")

        #     all_files = os.listdir(path)

        # def find_or_raise(extension):
        #     files = [f for f in all_files if f.endswith(extension)]
        #     log.info(f"files are {files}")
        #     if len(files) != 1:
        #         raise ValueError(
        #             "Directory '%s' must contain exactly 1 file ending in '%s'"
        #             % (path, extension)
        #         )
        #     return path + "/" + files[0]
        def find_or_raise(extension):
            log.info(f"file is {all_files}")
            return all_files

        # instantiate C++ backend
        SARModel.__model = self.model = pysarplus_cpp.SARModelCpp(
            find_or_raise(".sar"))
        SARModel.__path = path
Example no. 14
def _add_qc(samples: List[Sample], namespace: str,
            overwrite_multiqc: bool) -> Tuple[str, str]:
    """
    Populates s.qc_values for each Sample object. Returns paths to MultiQC
    html and json files.
    """
    multiqc_html_path = join(
        f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-web/qc/multiqc.html')
    multiqc_json_path = join(
        f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-analysis/qc/multiqc_data.json')
    if 'QC' in SOURCES_TO_PROCESS:
        logger.info('Running MultiQC on QC files')
        parsed_json_fpath = _run_multiqc(
            samples,
            multiqc_html_path,
            multiqc_json_path,
            tmp_bucket=f'gs://cpg-{NAGIM_PROJ_ID}-{namespace}-tmp/qc',
            namespace=namespace,
            overwrite=overwrite_multiqc,
        )
        gfs = gcsfs.GCSFileSystem()
        with gfs.open(parsed_json_fpath) as f:
            row_by_sample = json.load(f)
        for s in samples:
            if s.nagim_id in row_by_sample:
                s.qc_values = row_by_sample[s.nagim_id]

    return multiqc_html_path, multiqc_json_path
Example no. 15
def online_main(request):
    # STATIC DATA
    the_project = 'autoinsight-258217'
    with open('catboostworkshop-e91a753d9550.json', 'rb') as rfile:
        token_dic = json.load(rfile)

    bucket = 'catboost-workshop'

    #input_data = request.get_json().get('columns') # dictionary
    input_data = request['columns']  # dictionary

    df = pd.DataFrame.from_dict(input_data)
    print(df)
    # READ METADATA
    fs = gcsfs.GCSFileSystem(project=the_project, token=token_dic)

    model_path = '{0}/models/final_model_amazon.pickle'.format(bucket)

    with open('final_model_amazon.pickle', 'rb') as rfile:
        model = pickle.load(rfile)
    #model = read_model(model_path, fs)
    preds_probas = model.predict(df, prediction_type='Probability')

    print(preds_probas)
    result = {
        'probability_0': preds_probas[0][0],
        'probability_1': preds_probas[0][1]
    }
    return result
Example no. 16
def function_handler(request):
    request_json = request.get_json(silent=True)
    dataset_bucket = request_json['dataset_bucket']
    dataset_blob_name = request_json['dataset_blob_name']
    model_bucket = request_json['model_bucket']
    model_blob_name = request_json['model_blob_name']

    fs = gcsfs.GCSFileSystem(project='Serverless-faas-workbench')
    with fs.open(dataset_bucket + '/' + dataset_blob_name) as f:
        df = pd.read_csv(f)

        start = time()
        df['train'] = df['Text'].apply(cleanup)

        tfidf_vect = TfidfVectorizer(min_df=100).fit(df['train'])

        train = tfidf_vect.transform(df['train'])

        model = LogisticRegression()
        model.fit(train, df['Score'])
        latency = time() - start
        print(latency)

        model_file_path = "/tmp/" + model_blob_name
        joblib.dump(model, model_file_path)

        storage_client = storage.Client()
        m_bucket = storage_client.get_bucket(model_bucket)
        m_blob = m_bucket.blob(model_blob_name)

        upload_blob(model_bucket, m_blob, model_file_path)

        return "latency : " + str(latency)
Example no. 17
def parse_to_csv(bucket, source, filename):
    """Ingests covid sources"""
    if not bucket or not source or not filename:
        raise CovidIngestError(
            "All of source, bucket, and filename must be provided")

    all_sources = {'prison': None, 'ucla': None, 'recidiviz_manual': None}

    project_id = os.environ.get('GCP_PROJECT')
    path = os.path.join(bucket, source, filename)
    # Don't use the gcsfs cache
    fs = gcsfs.GCSFileSystem(project=project_id, cache_timeout=-1)
    logging.info("The path to download from is %s", path)
    bucket_path = os.path.join(bucket, source)
    logging.info("The files in the directory are:")
    logging.info(fs.ls(bucket_path))

    # Next we try to find the latest version of all three sources, if for
    # whatever reason a source folder is completely empty, we abort the
    # stitching process.
    for covid_source in all_sources:
        all_sources[covid_source] = _get_latest_source_file(
            fs, bucket, covid_source)

    # Once we have the latest file for each source, start stitching
    return _stitch_and_upload(fs, all_sources)
Example no. 18
def main(execution_date, **kwargs):
    # TODO: remove hard-coded project string
    fs = gcsfs.GCSFileSystem(project="cal-itp-data-infra")

    bucket = get_bucket()

    f = read_gcfs(f"schedule/{execution_date}/status.csv")
    status = pd.read_csv(f)

    success = status[lambda d: d.status == "success"]

    gtfs_files = []
    for ii, row in success.iterrows():
        agency_folder = f"{row.itp_id}_{row.url_number}"
        gtfs_url = f"{bucket}/schedule/{execution_date}/{agency_folder}/*"

        gtfs_files.append(fs.glob(gtfs_url))

    res = (success[["itp_id",
                    "url_number"]].assign(gtfs_file=gtfs_files).explode(
                        "gtfs_file").loc[lambda d: d.gtfs_file != "processed"])

    save_to_gcfs(
        res.to_csv(index=False).encode(),
        f"schedule/{execution_date}/processed/files.csv",
        use_pipe=True,
    )
Example no. 19
def list_gcs_objs(bucket_path, pattern=None, output_url=False, project=None):
    """Function to list objects in Google Cloud Storage Bucket

    args:
        bucket_path (str): Google Cloud Storage bucket name
        pattern (str | None, optional): glob pattern to search for in the bucket.
            Can search folders by adding folder names (e.g. pattern = 'subfolder/*.txt').
            If None then no search pattern is used. default = None
        output_url (bool, optional): switch to output Google Cloud Storage http urls
            instead of gcs object uris. If False, gcs uris are output. default = False
        project (str | None): Cloud project name to use when initializing the file system.
            If None then the default gcloud config is used. default = None

    returns:
        list[str]: List of objects in bucket that match pattern
    """
    fs = gcsfs.GCSFileSystem(project=project)
    if pattern is not None:
        bucket_path = (bucket_path +
                       "/" if not bucket_path.endswith("/") else bucket_path)
        blobs = fs.glob(f"{bucket_path}{pattern}")
    else:
        blobs = fs.ls(bucket_path)

    base = "https://storage.cloud.google.com/{0}" if output_url else "gs://{0}"

    return [base.format(blob) for blob in blobs]
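Usage sketch with placeholder bucket and folder names:

all_objs = list_gcs_objs("my-bucket")                            # gs:// object URIs
csv_urls = list_gcs_objs("my-bucket", pattern="subfolder/*.csv",
                         output_url=True)                        # https:// URLs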
Example no. 20
 def __enter__(self):
     gcs_filesystem = gcsfs.GCSFileSystem(project=self.project_id)
     h1 = gcs_filesystem.open(self.gcs_full_path, 'wb')
     h = GzipFile(fileobj=h1, mode='wb')
     self.set_file_handle(h)
     self.add_file_to_registry()
     return self
Example no. 21
 def open_gcs_url(config, logger, storage, url):
     reader_impl = SourceFile.extract_reader_impl(config)
     use_gcs_service_account = "service_account_json" in config["provider"] and storage == "gs://"
     file_to_close = None
     if reader_impl == "gcsfs":
         if use_gcs_service_account:
             try:
                 token_dict = json.loads(config["provider"]["service_account_json"])
             except json.decoder.JSONDecodeError as err:
                 logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                 raise err
         else:
             token_dict = "anon"
         fs = gcsfs.GCSFileSystem(token=token_dict)
         file_to_close = fs.open(f"gs://{url}")
         result = file_to_close
     else:
         if use_gcs_service_account:
             try:
                 credentials = json.dumps(json.loads(config["provider"]["service_account_json"]))
                 tmp_service_account = tempfile.NamedTemporaryFile(delete=False)
                 with open(tmp_service_account.name, "w") as f:
                     f.write(credentials)
                 tmp_service_account.close()
                 client = Client.from_service_account_json(tmp_service_account.name)
                 result = open(f"gs://{url}", transport_params=dict(client=client))
                 os.remove(tmp_service_account.name)
             except json.decoder.JSONDecodeError as err:
                 logger.error(f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}")
                 raise err
         else:
             client = Client.create_anonymous_client()
             result = open(f"{storage}{url}", transport_params=dict(client=client))
     return result, file_to_close
Example no. 22
def save_da_to_zarr(da,
                    zarr_bucket,
                    dim_order=['time', 'x', 'y', 'variable'],
                    zarr_mode='a'):
    da = da.transpose(*dim_order)
    da['time'] = get_time_as_unix(da)

    _, y_size, x_size, _ = da.shape
    out_store = gcsfs.GCSMap(root=zarr_bucket, gcs=gcsfs.GCSFileSystem())

    chunks = (36, y_size, x_size, 1)

    ds = xr.Dataset({'stacked_eumetsat_data': da.chunk(chunks)})

    zarr_mode_to_extra_kwargs = {
        'a': {
            'append_dim': 'time'
        },
        'w': {
            'encoding': {
                'stacked_eumetsat_data': {
                    'compressor': numcodecs.Blosc(cname='zstd', clevel=5),
                    'chunks': chunks
                }
            }
        }
    }

    assert zarr_mode in ['a', 'w'], '`zarr_mode` must be one of: `a`, `w`'
    extra_kwargs = zarr_mode_to_extra_kwargs[zarr_mode]

    ds.to_zarr(out_store, mode=zarr_mode, consolidated=True, **extra_kwargs)
    print('Saved file to zarr bucket')
    return ds
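A matching read-back sketch, assuming the archive was written with consolidated metadata as above; the bucket path is a placeholder:

import gcsfs
import xarray as xr

store = gcsfs.GCSMap(root="my-zarr-bucket/eumetsat.zarr", gcs=gcsfs.GCSFileSystem())
ds = xr.open_zarr(store, consolidated=True)
print(ds["stacked_eumetsat_data"])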
Example no. 23
    def _output_results(self,
                        results: List[str],
                        project: str,
                        email: str,
                        file: str = None,
                        gcs_stored: bool = False) -> None:
        """Write the process results to a file.

        Args:
            results (List[str]): the results.
            project (str): project id
            email (str): OAuth email
            file (str, optional): file to process. Defaults to None.
            gcs_stored (bool, optional): write to GCS? Defaults to False.
        """
        def _send():
            for result in results:
                print(result, file=outfile)

        output_name = f'{file}.results'
        if gcs_stored:
            fs = gcsfs.GCSFileSystem(project=project)
            with fs.open(f'{self.bucket}/{output_name}', 'w') as outfile:
                _send()

        else:
            with open(output_name, 'w') as outfile:
                _send()
Example no. 24
    def __init__(
        self,
        *,
        region: Region,
        fs: DirectIngestGCSFileSystem,
        ingest_directory_path: GcsfsDirectoryPath,
        temp_output_directory_path: GcsfsDirectoryPath,
        big_query_client: BigQueryClient,
        region_raw_file_config: Optional[
            DirectIngestRegionRawFileConfig] = None,
        upload_chunk_size: int = _DEFAULT_BQ_UPLOAD_CHUNK_SIZE,
    ):

        self.region = region
        self.fs = fs
        self.ingest_directory_path = ingest_directory_path
        self.temp_output_directory_path = temp_output_directory_path
        self.big_query_client = big_query_client
        self.region_raw_file_config = (
            region_raw_file_config
            if region_raw_file_config else DirectIngestRegionRawFileConfig(
                region_code=self.region.region_code,
                region_module=self.region.region_module,
            ))
        self.upload_chunk_size = upload_chunk_size
        self.csv_reader = GcsfsCsvReader(
            gcsfs.GCSFileSystem(project=metadata.project_id(),
                                cache_timeout=GCSFS_NO_CACHING))
        self.raw_table_migrations = DirectIngestRawTableMigrationCollector(
            region_code=self.region.region_code,
            regions_module_override=self.region.region_module,
        ).collect_raw_table_migration_queries()
Example no. 25
def get_data_nhs_region(date_today):
    'get the prevalence data for the LADs within a specific window'
    #get most recent uploaded map
    fs = gcsfs.GCSFileSystem()
    final = date_today
    month = "0" + str(final.month) if final.month < 10 else str(final.month)
    day = final.day
    year = final.year
    start_date = datetime(2020, 6, 12)
    end_date = datetime.strptime(f"{year}{month}{day}", "%Y%m%d")
    #declare the different maps
    maps = []
    end_date_str = datetime.strftime(end_date, '%Y%m%d')
    for day in pd.date_range(start_date, end_date, freq="24H"):
        #get the right format
        date_file = str(day).split(" ")[0].replace("-", "")
        with fs.open(
                os.path.join(
                    f'covid-internal-data/covid-predictions/extrapolations/prevalence_history_{end_date_str}/corrected_prevalence_{date_file}.csv'
                )) as fileptr:
            #read the file
            file_prev = pd.read_csv(fileptr).groupby('nhser19nm')[[
                'respondent_count', 'predicted_covid_positive_count',
                'population', 'corrected_covid_positive']].sum().reset_index()
            #create the date
            file_prev['day_updated_at'] = str(day).split(" ")[0]
            maps.append(file_prev)
    return pd.concat(maps)
Example no. 26
def load_trusted(message):

    # Normalize json and rename columns
    data = flatten(message)
    df = pd.DataFrame(data, index=[0])

    df_result = df.rename(
        columns={
            'content_column_1': 'column_1',
            'content_column_2': 'column_2',
            'content_column_3': 'column_3'
        })

    # Load data to a trusted folder
    tz = pytz.timezone('America/Sao_Paulo')
    now = datetime.now()
    aware = tz.localize(now, is_dst=None)

    client = storage.Client()
    bucket = client.get_bucket('bexs_trusted_data')

    dt_processamento = 'dt=' + aware.strftime("%Y-%m-%d")
    partition_dir = 'test/test/profile/' + dt_processamento + '/'

    file_name = 'profile-' + aware.strftime("%Y-%m-%d_%H:%M:%S") + '.parquet'

    gcs = gcsfs.GCSFileSystem(project='example_staging', token=None)

    df_result.to_parquet('gs://bucket_name/' + partition_dir + file_name,
                         compression='SNAPPY')
Example no. 27
def delete_extra_file(name: chr, bucket: str) -> None:
    """
    Load the list of files to delete and remove those files, by name, from the raw folders. Those
    files are removed because they are not labeled with Gender, Age, or Country of Origin.

    The files in the delete bucket are saved in one of 6 folders: 'validated', 'train', 'test', 'dev', 'other', 'invalidated'.
    They represent the original folders of the commonvoice-voice-voice dataset.

    :param name: name of the file list in the delete folder
    :param bucket: name of the bucket to delete files from

    Example:


    file_list = ['validated', 'train', 'test', 'dev', 'other', 'invalidated']

    for file in file_list:
        delete_extra_file(name = file, bucket = config.Bucket.RAW_DATA)

    """
    storage_client = storage.Client()
    project = storage_client.project

    fs = gcsfs.GCSFileSystem(project=project)

    with fs.open("{}/delete/{}".format(config.Bucket.META_DATA, name)) as f:
        data = pd.read_csv(f)

    mp3_to_remove = data.iloc[:, 1].to_list()
    bucket = storage_client.bucket(bucket)
    delete_mp3_from_bucket(file_list=mp3_to_remove, bucket=bucket)
Example no. 28
def get_service_configs(service=None, project_name=None):
    """Utility function to set configurations for the service."""
    print("get_service_configs()")
    if service == "clearbit":
        try:
            # set configs from the env vars of the machine
            PROJECT_ID = os.environ["GCP_PROJECT_ID"]
            LOCATION_ID = os.environ["GCP_LOCATION_ID"]
            KEYRING_ID = os.environ["GCP_KEYRING_ID"]
            CRYPTO_KEY_ID = os.environ["GCP_KEY_ID"]
            CIPHERTEXT_BLOB = os.environ["GCP_CIPHERTEXT_BLOB"]
            BUCKET_NAME = os.environ["GCS_BUCKET"]

            # DEPRECATED
            # set clearbit api versions
            # clearbit.Person.set_version("2018-06-06")
            # clearbit.Company.set_version("2017-09-12")
            # clearbit.Reveal.set_version("2018-03-28")
            # # not in use
            # clearbit.Watchlist.set_version("2015-11-13")
            # # not in use
            # clearbit.Prospector.set_version("2016-10-04")

            fs = gcsfs.GCSFileSystem(
                project=project_name,
                access="full_control",
                token="cloud",
                # consistency="md5",
                cache_timeout=None,
                secure_serialize=True,
                check_connection=True)

            fs.retries = 7
            fs.connect(method="cloud")

            cipher_string = fs.cat(BUCKET_NAME + "/" + CIPHERTEXT_BLOB)

            # DEPRECATED
            # download the file as a string in-memory
            # st_client = storage.Client()
            # bucket = st_client.get_bucket(BUCKET_NAME)
            # cipher_blob = bucket.blob(CIPHERTEXT_BLOB)
            # cipher_string = cipher_blob.download_as_string()

            # decrypt the kms key stored in gcs and set the key attr
            clearbit.key = decrypt_with_kms(project_id=PROJECT_ID,
                                            location_id=LOCATION_ID,
                                            key_ring_id=KEYRING_ID,
                                            crypto_key_id=CRYPTO_KEY_ID,
                                            ciphertext_string=cipher_string)
            print("configs_set: True")
        # catch-all
        except Exception as e:
            print("config_set: False")
            error = {"error_message": e, "status_code": "Unknown"}
            print(error)
    # flags for services
    if service == "crawler":
        # NOT IMPLEMENTED/NEEDED
        pass
Example no. 29
def rechunk_dataset(
    ds: Dataset,
    output: str,
    contig: Contig,
    fn: Callable,
    chunks: Tuple[int, int],
    max_mem: str,
    progress_update_seconds: int = 60,
    remote: bool = True,
    **kwargs,
) -> Dataset:
    logger.info(f"Rechunking dataset for contig {contig} "
                f"to {output} (chunks = {chunks}):\n{ds}")

    if remote:
        gcs = gcsfs.GCSFileSystem()
        output = gcsfs.GCSMap(output, gcs=gcs, check=False, create=False)

    # Save to local zarr store with desired sample chunking
    with ProgressBar(dt=progress_update_seconds):
        res = fn(
            ds,
            output=output,
            chunk_length=chunks[0],
            chunk_width=chunks[1],
            max_mem=max_mem,
            **kwargs,
        )

    logger.info(f"Rechunked dataset:\n{res}")
    return res
Example no. 30
def read(rem_result, json_serializable=True):
    # compute studio results have public read access.
    fs = gcsfs.GCSFileSystem(token="anon")
    s = time.time()
    RemoteResult().load(rem_result)
    read = {"renderable": [], "downloadable": []}
    for category in rem_result:
        with fs.open(f"{BUCKET}/{rem_result[category]['ziplocation']}", "rb") as f:
            res = f.read()

        buff = io.BytesIO(res)
        zipfileobj = zipfile.ZipFile(buff)

        for rem_output in rem_result[category]["outputs"]:
            ser = get_serializer(rem_output["media_type"])
            rem_data = ser.deserialize(
                zipfileobj.read(rem_output["filename"]), json_serializable
            )
            read[category].append(
                {
                    "id": rem_output.get("id", None),
                    "title": rem_output["title"],
                    "media_type": rem_output["media_type"],
                    "data": rem_data,
                }
            )
    f = time.time()
    print(f"Read finished in {f-s}s")
    return read