Example 1
def access_precomputed(
    store_path: str,
    key: str,
    mode: str,
    array_type=None,
    dtype=None,
    num_channels=None,
    shape=None,
    resolution=None,
    encoding=None,
    chunks=None,
    jpeg_quality=None,
    voxel_offset=None,
    scale_index=None,
) -> TensorStoreArray:
    driver = "neuroglancer_precomputed"

    kvstore_driver, _store_path = fsspec.core.split_protocol(store_path)
    if kvstore_driver is None:
        kvstore_driver = "file"
        kvstore_path = "/"
        # remove the leading slash after making the absolute path
        _store_path = os.path.abspath(_store_path)[1:]
    else:
        kvstore_path = _store_path.split(os.path.sep)[0]

    if kvstore_driver not in KVSTORE_DRIVERS:
        raise ValueError(
            f"File system protocol {kvstore_driver} is not supported by tensorstore."
        )

    info_path = os.path.join(store_path, "info")

    if mode == 'r':
        with fsspec.open(info_path) as fh:
            json_data = json.loads(fh.read())
            precomputed_metadata = parse_info(json_data)
            scale_matches = [
                scale.key == key for scale in precomputed_metadata.scales
            ]
            if not any(scale_matches):
                raise ValueError(
                    f'Could not find key: {key} in info file at {info_path}')
            else:
                scale_index = scale_matches.index(True)
                scale_meta = precomputed_metadata.scales[scale_index]
    else:
        scale_meta = ScaleMetadata(
            size=shape,
            resolution=resolution,
            encoding=encoding,
            chunk_size=chunks,
            key=key,
            voxel_offset=voxel_offset,
            jpeg_quality=jpeg_quality,
        )
        precomputed_metadata = PrecomputedMetadata(type=array_type,
                                                   data_type=dtype,
                                                   num_channels=num_channels,
                                                   scales=[scale_meta])

    if mode == "r":
        read = True
        # tensorstore raises if these are explicitly set to False when opening read-only, so leave them as None
        write = None
        create = None
        delete_existing = None
    elif mode == 'a':
        read = True
        write = True
        create = True
        delete_existing = False
    elif mode == "rw":
        read = True
        write = True
        create = True
        delete_existing = True
    elif mode == "w":
        read = False
        write = True
        create = True
        delete_existing = True
    elif mode == "w-":
        read = False
        write = True
        create = True
        delete_existing = False
    else:
        raise ValueError('Mode must be "r", "rw", "a", "w", or "w-"')

    tsa = TensorStoreArray(driver=driver,
                           path=_store_path,
                           kvstore_path=kvstore_path,
                           kvstore_driver=kvstore_driver,
                           encoding=scale_meta.encoding,
                           scale_index=scale_index,
                           key=scale_meta.key,
                           num_channels=precomputed_metadata.num_channels,
                           volume_type=precomputed_metadata.type,
                           dtype=precomputed_metadata.data_type,
                           resolution=scale_meta.resolution,
                           size=scale_meta.size,
                           chunk_size=scale_meta.chunk_size,
                           jpeg_quality=jpeg_quality)
    return tsa.open(read=read,
                    write=write,
                    create=create,
                    delete_existing=delete_existing).result()
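The protocol handling above relies on fsspec.core.split_protocol, which returns a (protocol, path) pair and gives None as the protocol for plain local paths, which is what triggers the "file" driver fallback. A minimal sketch of that behaviour (the bucket and directory names are made up for illustration):

import fsspec.core

# Remote URL: the protocol string becomes the tensorstore kvstore driver.
protocol, path = fsspec.core.split_protocol("s3://my-bucket/precomputed/image")
print(protocol, path)   # -> s3 my-bucket/precomputed/image

# Local path: no protocol, so the function above falls back to the "file" driver.
protocol, path = fsspec.core.split_protocol("/data/precomputed/image")
print(protocol, path)   # -> None /data/precomputed/image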
Example 2
def input_opener(fname, **kwargs):
    logger.info(f"Opening input '{fname}'")
    with fsspec.open(fname, **kwargs) as f:
        yield f
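The yield inside the with block suggests this function is used as a context manager; presumably it carries a contextlib.contextmanager decorator in the original module (an assumption here). A minimal usage sketch under that assumption, with an illustrative file name:

import contextlib

open_input = contextlib.contextmanager(input_opener)  # assumed missing decorator

with open_input("example.txt", mode="rt") as f:
    first_line = f.readline()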
Example 3
 def read_text_from_href(self, href: str, *args: Any, **kwargs: Any) -> str:
     with fsspec.open(href, "r") as f:
         return f.read()
Example 4
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): Vineyard IPC socket path
        path (str): External storage path to read from
        storage_options (dict): Configuration of the external storage
        read_options (dict): Additional options that control the read behavior
        proc_num (int): Total number of processes
        proc_index (int): Index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status("error", f"Some serialization file cannot be found. Expected: {meta_file} and {blob_file}")
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))
        # Read the bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)
        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4
        try:
            of = fsspec.open(path, mode="rb", **storage_options)
        except Exception as e:
            report_status("error", str(e))
            raise
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)

            while begin < end:
                buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()

            writer.finish()
Example 5
 def from_json(cls, fname: Union[str, Path], open_kwargs: dict = {}):
     with fsspec.open(str(fname), mode='rt', **open_kwargs) as fh:
         jblob = json.loads(fh.read())
     return cls(**jblob)
Example 6
def save_speaker_mapping(out_path, speaker_mapping):
    """Saves speaker mapping if not yet present."""
    if out_path is not None:
        speakers_json_path = _set_file_path(out_path)
        with fsspec.open(speakers_json_path, "w") as f:
            json.dump(speaker_mapping, f, indent=4)
Example 7
 def _save_json(json_file_path: str, data: dict) -> None:
     with fsspec.open(json_file_path, "w") as f:
         json.dump(data, f, indent=4)
Example 8
    def inspect(self, dataset, columns_dict, output_file):
        """
        Parameters
        -----------
        dataset: str, list of str, or <dask.dataframe|cudf|pd>.DataFrame
            Dataset path (or list of paths), or a DataFrame. If string,
            should specify a specific file or directory path. If this is a
            directory path, the directory structure must be flat (nested
            directories are not yet supported).
        dataset_format: str
            Dataset format (i.e parquet or csv)
        columns_dict: dict
            Dictionary indicating the different columns type
        output_file: str
            Filename to write the output statistics
        """

        # Get dataset columns
        cats = columns_dict["cats"]
        conts = columns_dict["conts"]
        labels = columns_dict["labels"]

        # Create Dataset, Workflow, and get Stats
        stats = DataStats()
        features = ColumnSelector(cats + conts + labels) >> stats
        workflow = Workflow(features, client=self.client)
        workflow.fit(dataset)

        # get statistics from the datastats op
        output = stats.output

        # Dictionary to store collected information
        data = {}
        # Store num_rows
        data["num_rows"] = dataset.num_rows
        # Store cols
        for col_type in ["conts", "cats", "labels"]:
            data[col_type] = {}
            for col in columns_dict[col_type]:
                data[col_type][col] = {}
                data[col_type][col]["dtype"] = output[col]["dtype"]

                if col_type != "conts":
                    data[col_type][col]["cardinality"] = output[col][
                        "cardinality"]

                if col_type == "cats":
                    data[col_type][col]["min_entry_size"] = output[col]["min"]
                    data[col_type][col]["max_entry_size"] = output[col]["max"]
                    data[col_type][col]["avg_entry_size"] = output[col]["mean"]
                elif col_type == "conts":
                    data[col_type][col]["min_val"] = output[col]["min"]
                    data[col_type][col]["max_val"] = output[col]["max"]
                    data[col_type][col]["mean"] = output[col]["mean"]
                    data[col_type][col]["std"] = output[col]["std"]

                data[col_type][col]["per_nan"] = output[col]["per_nan"]

        # Write json file
        with fsspec.open(output_file, "w") as outfile:
            json.dump(data, outfile, cls=NpEncoder)
Example 9
def test_automkdir_readonly(tmpdir):
    dir = os.path.join(str(tmpdir), "d")
    with pytest.raises(FileNotFoundError):
        of = fsspec.open(os.path.join(dir, "dfile"), "r")
        with of:
            pass
Example 10
def open_file(url, *args, **kwargs):
    of = fsspec.open(url, *args, **kwargs)
    with of as f:
        yield f
Example 11
N_EXAMPLES = 2

if __name__ == "__main__":

    for i, config in enumerate(Oscar.BUILDER_CONFIGS):
        print(f"Loading config '{config.name}' ({i + 1}/{len(Oscar.BUILDER_CONFIGS)})")

        # Get data url
        checksum_filename = _BASE_CHECKSUM_FILE_NAME.format(language=config.language)
        checksum_url = config.base_data_url + checksum_filename
        checksum_file_content = requests.get(checksum_url).text.splitlines()
        data_filename = checksum_file_content[0].split("\t")[0]
        data_url = config.base_data_url + data_filename

        # Get a few examples
        with fs.open(data_url, "rt", compression="gzip") as f:
            current_examples = 0
            dummy_content = []
            for line in f:
                dummy_content.append(line)
                current_examples += len(line.strip()) == 0
                if current_examples == N_EXAMPLES:
                    break
            dummy_content = "".join(dummy_content).rstrip()

        # Write dummy files
        dummy_data_dir = Path(__file__).resolve().parent / "dummy" / config.name / str(config.version) / "dummy_data"
        dummy_data_dir.mkdir(parents=True, exist_ok=True)
        (dummy_data_dir / checksum_filename).open("w").write(data_filename + "\t insert_hash_here")
        with fs.open(str(dummy_data_dir / data_filename), "wt", compression="gzip") as f:
            f.write(dummy_content)
Example 12
def to_image(
    mols: Union[List[Chem.rdchem.Mol], Chem.rdchem.Mol],
    legends: Union[List[Union[str, None]], str, None] = None,
    n_cols: int = 4,
    use_svg: bool = False,
    mol_size: Union[Tuple[int, int], int] = (200, 200),
    highlight_atom: List[List[int]] = None,
    highlight_bond: List[List[int]] = None,
    outfile: str = None,
    max_mols: int = 32,
    copy: bool = False,
    indices: bool = False,
):
    """Generate an image out of a molecule or a list of molecule.

    Args:
        mols: one or a list of molecules.
        legends: a string or a list of string as legend for every molecules.
        n_cols: number of molecules per column.
        use_svg: whether to ouput an SVG (or a PNG).
        mol_size: a int or a tuple of int defining the size per molecule.
        highlight_atom: atom to highlight.
        highlight_bond: bonds to highlight.
        outfile: path where to save the image (local or remote path).
        max_mols: the maximum number of molecules to display.
        copy: whether to copy the molecules or not.
        indices: Whether to draw the atom indices.
    """

    if isinstance(mol_size, int):
        mol_size = (mol_size, mol_size)

    if isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    if isinstance(legends, str):
        legends = [legends]

    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    if max_mols is not None:
        mols = mols[:max_mols]

        if legends is not None:
            legends = legends[:max_mols]

    if indices is True:
        for mol in mols:
            dm.atom_indices_to_mol(mol)

    _highlight_atom = highlight_atom
    if highlight_atom is not None and isinstance(highlight_atom[0], int):
        _highlight_atom = [highlight_atom]

    _highlight_bond = highlight_bond
    if highlight_bond is not None and isinstance(highlight_bond[0], int):
        _highlight_bond = [highlight_bond]

    # Don't make the grid wider than the number of molecules
    if len(mols) < n_cols:
        n_cols = len(mols)

    image = Draw.MolsToGridImage(
        mols,
        legends=legends,
        molsPerRow=n_cols,
        useSVG=use_svg,
        subImgSize=mol_size,
        highlightAtomLists=_highlight_atom,
        highlightBondLists=_highlight_bond,
    )

    if outfile is not None:
        with fsspec.open(outfile, "wb") as f:
            if use_svg:
                if isinstance(image, str):
                    # in a terminal process
                    f.write(image.encode())
                else:
                    # in a jupyter kernel process
                    f.write(image.data.encode())  # type: ignore
            else:
                if isinstance(image,
                              PIL.PngImagePlugin.PngImageFile):  # type: ignore
                    # in a terminal process
                    image.save(f)
                else:
                    # in a jupyter kernel process
                    f.write(image.data)  # type: ignore

    return image
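A short usage sketch for the grid helper above, assuming RDKit is available; the SMILES strings, legends, and output filename are purely illustrative:

from rdkit import Chem

mols = [Chem.MolFromSmiles(smi) for smi in ["CCO", "c1ccccc1", "CC(=O)O"]]
img = to_image(
    mols,
    legends=["ethanol", "benzene", "acetic acid"],
    use_svg=False,
    outfile="grid.png",  # any fsspec-compatible path would work here
)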
Example 13
def makedirs(url, exist_ok=False):
    fs, path = get_fs_and_path(url)
    fs.makedirs(path, exist_ok=exist_ok)
    if not path_exists(url):
        with fsspec.open(url, mode="wb"):
            pass
Example 14
def read_file(path: str, **kwargs):
    """Support fetching files from arbitrary filesystems
    """
    with fsspec.open(path, **kwargs) as f:
        content = f.read()
    return content
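A brief usage sketch: the same helper works for local files and, given the matching fsspec backend and credentials, for remote URLs. The paths below are illustrative:

# Local file, text mode.
notes = read_file("notes.txt", mode="rt")

# Remote object, binary mode; requires the s3fs package and valid credentials.
blob = read_file("s3://my-bucket/data.bin", mode="rb")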
Example 15
 def _create_checkpoint(checkpoint_stream, global_step: int,
                        filename: str):
     with tune.checkpoint_dir(step=global_step) as checkpoint_dir:
         file_path = os.path.join(checkpoint_dir, filename)
         with fsspec.open(file_path, "wb") as f:
             f.write(checkpoint_stream)
Example 16
def upload(entry, parquet_dir, url):
    of = fsspec.open(url)
    if of.fs.exists(url):
        of.fs.delete(url, recursive=True)
    entry.fs.upload(parquet_dir, url, recursive=True)
    return True
Example 17
                ],
                'pressure_level': [
                    '250', '500', '700',
                    '850', '925', '1000',
                ],
                'year': date.year,
                'month': date.month,
                'day': date.day,
                'time': [
                    '00:00', '12:00',
                ],
                'area'          : [75.,185, 15., 320.] #N,W,S,E
        }
        r1 = c1.retrieve(name, request, None)

        with fsspec.open(r1.location) as f1:
            iso_vars = xr.open_dataset(f1, engine='scipy')
        #    print(iso_vars)

        print('getting single-level data')
        
        ## also need MSLP & PW
        ## get the data from ECMWF API
        c2 = cdsapi.Client()
        name = 'reanalysis-era5-single-levels'
        request = {
                'product_type': 'reanalysis',
                'format': 'netcdf',
                'variable': ['mean_sea_level_pressure','total_column_water_vapour'],
                'year': date.year,
                'month': date.month,
Example 18
def _fsspec_safe_open(fname, **kwargs):
    # workaround for inconsistent behavior of fsspec.open
    # https://github.com/intake/filesystem_spec/issues/579
    with fsspec.open(fname, **kwargs) as fp:
        with fp as fp2:
            yield fp2
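As in Example 2, the yield implies this generator is wrapped with contextlib.contextmanager at its definition site; the nested with is the workaround for the OpenFile behaviour discussed in the linked issue. A minimal usage sketch under that assumption, with an illustrative path:

import contextlib

safe_open = contextlib.contextmanager(_fsspec_safe_open)

with safe_open("example.json", mode="rt") as f:
    payload = f.read()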
Example 19
 def _load_json(json_file_path: str) -> Dict:
     with fsspec.open(json_file_path, "r") as f:
         return json.load(f)
Example 20
def _get_url_size(fname):
    with fsspec.open(fname, mode="rb") as of:
        size = of.size
    return size
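The size attribute of an fsspec file object is the object's length in bytes, so this helper typically answers "how big is this file?" without reading the whole contents (for HTTP the size comes from a header request). An illustrative call, reusing the URL from Example 21:

n_bytes = _get_url_size("https://www.fnordware.com/j2k/relax.jp2")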
Example 21
#  tests for pillow-8.1.2-py38ha0e1e83_1 (this is a generated file);
print('===== testing package: pillow-8.1.2-py38ha0e1e83_1 =====')
print('running run_test.py')
#  --- run_test.py (begin) ---
import fsspec

from PIL import Image

# Test JPEG2k
with fsspec.open("https://www.fnordware.com/j2k/relax.jp2") as f:
    image = Image.open(f)
    image.load()

#  --- run_test.py (end) ---

print('===== pillow-8.1.2-py38ha0e1e83_1 OK =====')
print("import: 'PIL'")
import PIL

print("import: 'PIL.Image'")
import PIL.Image

print("import: 'PIL.ImageCms'")
import PIL.ImageCms
Example 22
    def __init__(self, filename: str, hdf5group: str = None, hdf5file_mode: str = 'r',
                 store: Union[MutableMapping, str, Path] = None, store_path: str = None,
                 store_mode: str = 'a', LRU: bool = False, LRU_max_size: int = 2**30,
                 max_chunksize=2*2**20):

        """
        Args:
            filename:                    str or File-like object, file name string or File-like object to be read by zarr
            hdf5group:                   str, hdf5 group in hdf5 file to be read by zarr
                                         along with its children. default is the root group.
            hdf5file_mode:               str, subset of h5py file access modes, filename must exist
                                         'r'          readonly, default 'r'
                                         'r+'         read and write
            store:                       collections.abc.MutableMapping or str, zarr store.
                                         if string path is passed, zarr.DirectoryStore
                                         is created at the given path, if None, zarr.MemoryStore is used
            store_mode:                  store data access mode, default 'a'
                                         'r'          readonly, compatible zarr hierarchy should
                                                      already exist in the passed store
                                         'r+'         read and write, return error if file does not exist,
                                                      for updating zarr hierarchy
                                         'w'          create store, remove data if it exists
                                         'w-' or 'x'  create store, fail if exists
                                         'a'          read and write, create if it does not exist (default)
            store_path:                  string, path in zarr store
            LRU:                         bool, if store is not already zarr.LRUStoreCache, add
                                         a zarr.LRUStoreCache store layer on top of currently used store
            LRU_max_size:                int, maximum zarr.LRUStoreCache cache size, only used
                                         if store is zarr.LRUStoreCache, or LRU argument is True
            max_chunksize:               maximum chunk size to use when creating zarr hierarchy, this is useful if
                                         only a small slice of data needs to be read
        """
        # Verify arguments
        if hdf5file_mode not in ('r', 'r+'):
            raise ValueError("hdf5file_mode must be 'r' or 'r+'")
        self.hdf5file_mode = hdf5file_mode

        # Verify arguments
        if not isinstance(LRU, bool):
            raise TypeError(f"Expected bool for LRU, received {type(LRU)}")
        self.LRU = LRU
        if not isinstance(LRU_max_size, int):
            raise TypeError(f"Expected int for LRU_max_size, received {type(LRU_max_size)}")
        self.LRU_max_size = LRU_max_size
        if not isinstance(max_chunksize, int):
            raise TypeError(f"Expected int for max_chunksize, received {type(max_chunksize)}")
        self.max_chunksize = max_chunksize

        # store, store_path, and store_mode are passed through to zarr
        self.store_path = store_path
        self.store_mode = store_mode
        if store is not None and LRU is True and not isinstance(store, zarr.LRUStoreCache):
            self.store = zarr.LRUStoreCache(store, max_size=self.LRU_max_size)
        else:
            self.store = store

        # create dictionary mapping hdf5 filter numbers to compatible zarr codec
        self._hdf5_regfilters_subset = {}
        self._fill_regfilters()

        # dictionary to hold addresses of hdf5 objects in file
        self._address_dict = {}

        # create zarr format hierarchy for datasets and attributes compatible with hdf5 file,
        # dataset contents are not copied, unless it contains variable-length strings

        self.zgroup = zarr.open_group(self.store, mode=self.store_mode, path=self.store_path)
        if self.store is None:
            self.store = self.zgroup.store

        # FileChunkStore requires uri
        if isinstance(filename, str):
            self.uri = filename
        else:
            try:
                self.uri = getattr(filename, 'path', None)
                if self.uri is None:
                    self.uri = filename.name
            except Exception:
                self.uri = ''

        # Access hdf5 file and create zarr hierarchy
        if hdf5group is not None and not isinstance(hdf5group, str):
            raise TypeError(f"Expected str for hdf5group, recieved {type(hdf5group)}")
        self.hdf5group = hdf5group
        self.filename = filename
        if self.store_mode != 'r':
            self.file = h5py.File(self.filename, mode=self.hdf5file_mode)
            self.group = self.file[self.hdf5group] if self.hdf5group is not None else self.file
            self.create_zarr_hierarchy(self.group, self.zgroup)
            self.file.close()
        if isinstance(self.filename, str):
            self.chunkstore_file = fsspec.open(self.filename, mode='rb')
            self.chunk_store = FileChunkStore(self.store, chunk_source=self.chunkstore_file.open())
        else:
            self.chunk_store = FileChunkStore(self.store, chunk_source=self.filename)
        if LRU is True and not isinstance(self.chunk_store, zarr.LRUStoreCache):
            self.chunk_store = zarr.LRUStoreCache(self.chunk_store, max_size=self.LRU_max_size)

        # open zarr group
        store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
        self.zgroup = zarr.open_group(self.store, mode=store_mode_cons, path=self.store_path, chunk_store=self.chunk_store)
Example 23
def open_gzip(path: PathType, storage_options: Optional[Dict[str, str]]) -> IO[Any]:
    url = str(path)
    storage_options = storage_options or {}
    openfile: IO[Any] = fsspec.open(url, compression="gzip", **storage_options)
    return openfile
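Note that fsspec.open returns an OpenFile, not an already-opened file object, so the caller of open_gzip is expected to enter it as a context manager (or call .open() on it). A minimal sketch of that intended usage, with an illustrative path:

with open_gzip("data/table.tsv.gz", None) as f:
    raw = f.read()  # decompressed bytes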
Example 24
def generate_wb_timeseries(shapes, config_dict):
    """
    This is where the code processing is actually done. This code takes in a
    polygon and a config dict which contains: the shapefile's crs, the output
    directory, id_field, time_span, and include_uncertainty, which says whether
    to include all data as well as an invalid pixel count that can be used for
    measuring uncertainty. It performs a polygon drill into the wofs_albers
    product. The resulting xarray, which contains the water classified pixels
    for that polygon over every available timestep, is used to calculate the
    percentage of the water body that is wet at each time step. The outputs
    are written to a csv file named using the polygon UID, which is a geohash
    of the polygon's centre coords.

    Inputs:
    shapes - polygon to be interrogated
    config_dict - many config settings including crs, id_field, time_span,
                  shapefile

    Outputs:
    Nothing is returned from the function, but a csv file is written out to
        disk
    """
    output_dir = config_dict['output_dir']
    crs = config_dict['crs']
    id_field = config_dict['id_field']
    time_span = config_dict['time_span']
    include_uncertainty = config_dict['include_uncertainty']
    wofls = config_dict['wofls']
    assert wofls

    # Some query parameters will be different for different WOfL products.
    output_res = get_resolution(wofls)
    dataset_maturity = get_dataset_maturity(wofls)

    if include_uncertainty:
        unknown_percent_threshold = 100
    else:
        unknown_percent_threshold = 10

    with Datacube(app='Polygon drill') as dc:
        first_geometry = shapes['geometry']

        str_poly_name = shapes['properties'][id_field]

        try:
            fpath = os.path.join(output_dir,
                                 f'{str_poly_name[0:4]}/{str_poly_name}.csv')
        except TypeError:
            str_poly_name = str(int(str_poly_name)).zfill(6)
            fpath = os.path.join(output_dir,
                                 f'{str_poly_name[0:4]}/{str_poly_name}.csv')
        geom = geometry.Geometry(first_geometry, crs=crs)
        current_year = datetime.now().year

        if time_span == 'ALL':
            if shapely_geom.shape(first_geometry).envelope.area > 2000000:
                years = range(1986, current_year + 1, 5)
                time_periods = [(str(year), str(year + 4)) for year in years]
            else:
                time_periods = [('1986', str(current_year))]
        elif time_span == 'APPEND':
            start_date = get_last_date(fpath)
            if start_date is None:
                logger.debug(f'There is no csv for {str_poly_name}')
                return 1
            time_periods = [(start_date, str(current_year))]
        elif time_span == 'CUSTOM':
            time_periods = [(config_dict['start_dt'], config_dict['end_date'])]

        valid_capacity_pc = []
        valid_capacity_ct = []
        invalid_capacity_ct = []
        date_list = []
        for time in time_periods:
            wb_capacity_pc = []
            wb_capacity_ct = []
            wb_invalid_ct = []
            dry_observed = []
            invalid_observations = []

            # Set up the query, and load in all of the WOFS layers
            query = {
                'geopolygon': geom,
                'time': time,
                'output_crs': crs,
                'resolution': output_res,
                'resampling': 'nearest'
            }
            if dataset_maturity:
                query['dataset_maturity'] = dataset_maturity
            logger.debug('Query: {}'.format(
                {k: v
                 for k, v in query.items() if k != 'geopolygon'}))
            wofl = dc.load(product=wofls,
                           group_by='solar_day',
                           fuse_func=wofls_fuser,
                           **query)

            if len(wofl.attrs) == 0:
                logger.debug(
                    f'There is no new data for {str_poly_name} in {time}')
                # TODO(MatthewJA): Confirm (with Ness?) that changing this
                # return to a continue doesn't break things.
                continue
            # Make a mask based on the polygon (to remove extra data
            # outside of the polygon)
            mask = rasterio.features.geometry_mask(
                [geom.to_crs(wofl.geobox.crs)],
                out_shape=wofl.geobox.shape,
                transform=wofl.geobox.affine,
                all_touched=False,
                invert=True)
            # mask the data to the shape of the polygon
            # the geometry width and height must both be larger than one pixel
            # to mask.
            if (geom.boundingbox.width > 25.3
                    and geom.boundingbox.height > 25.3):
                wofl_masked = wofl.water.where(mask)
            else:
                wofl_masked = wofl.water

            # Work out how full the waterbody is at every time step
            for ix, times in enumerate(wofl.time):

                # Grab the data for our timestep
                all_the_bit_flags = wofl_masked.isel(time=ix)

                # Find all the wet/dry pixels for that timestep
                lsa_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 136).count().item()
                lsa_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 8).count().item()
                sea_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 132).count().item()
                sea_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 4).count().item()
                sea_lsa_wet = all_the_bit_flags.where(
                    all_the_bit_flags == 140).count().item()
                sea_lsa_dry = all_the_bit_flags.where(
                    all_the_bit_flags == 12).count().item()
                wet_pixels = (all_the_bit_flags.where(
                    all_the_bit_flags == 128).count().item() + lsa_wet +
                              sea_wet + sea_lsa_wet)
                dry_pixels = (all_the_bit_flags.where(
                    all_the_bit_flags == 0).count().item() + lsa_dry +
                              sea_dry + sea_lsa_dry)

                # Count the number of masked observations
                masked_all = all_the_bit_flags.count().item()
                # Turn our counts into percents
                try:
                    water_percent = round((wet_pixels / masked_all * 100), 1)
                    dry_percent = round((dry_pixels / masked_all * 100), 1)
                    missing_pixels = masked_all - (wet_pixels + dry_pixels)
                    unknown_percent = missing_pixels / masked_all * 100

                except ZeroDivisionError:
                    water_percent = 0.0
                    dry_percent = 0.0
                    unknown_percent = 100.0
                    missing_pixels = masked_all
                    logger.debug(f'{str_poly_name} has divide by zero error')

                # Append the percentages to a list for each timestep
                # Filter out timesteps with < 90% valid observations. Add
                # empty values for timesteps with < 90% valid. if you set
                # 'UNCERTAINTY = True' in your config file then you will
                # only filter out timesteps with 100% invalid pixels.
                # You will also record the number invalid pixels per timestep.

                if unknown_percent < unknown_percent_threshold:
                    wb_capacity_pc.append(water_percent)
                    invalid_observations.append(unknown_percent)
                    wb_invalid_ct.append(missing_pixels)
                    dry_observed.append(dry_percent)
                    wb_capacity_ct.append(wet_pixels)
                else:
                    wb_capacity_pc.append('')
                    invalid_observations.append('')
                    wb_invalid_ct.append('')
                    dry_observed.append('')
                    wb_capacity_ct.append('')

            valid_obs = wofl.time.dropna(dim='time')
            valid_obs = valid_obs.to_dataframe()
            if 'spatial_ref' in valid_obs.columns:
                valid_obs = valid_obs.drop(columns=['spatial_ref'])
            valid_capacity_pc += wb_capacity_pc
            valid_capacity_ct += wb_capacity_ct
            invalid_capacity_ct += wb_invalid_ct
            date_list += valid_obs.to_csv(
                None,
                header=False,
                index=False,
                date_format="%Y-%m-%dT%H:%M:%SZ").split('\n')
            date_list.pop()

        if date_list:
            if include_uncertainty:
                rows = zip(date_list, valid_capacity_pc, valid_capacity_ct,
                           invalid_capacity_ct)
            else:
                rows = zip(date_list, valid_capacity_pc, valid_capacity_ct)
            os.makedirs(os.path.dirname(fpath), exist_ok=True)
            if time_span == 'APPEND':
                of = fsspec.open(fpath, 'a')
                with of as f:
                    writer = csv.writer(f)
                    for row in rows:
                        writer.writerow(row)
            else:
                of = fsspec.open(fpath, 'w')
                with of as f:
                    writer = csv.writer(f)
                    headings = [
                        'Observation Date', 'Wet pixel percentage',
                        'Wet pixel count (n = {0})'.format(masked_all)
                    ]
                    if include_uncertainty:
                        headings.append('Invalid pixel count')
                    writer.writerow(headings)
                    for row in rows:
                        writer.writerow(row)
        else:
            logger.info(f'{str_poly_name} has no new good valid data')
        return True
Example 25
 def to_json(self, fname: Union[str, Path], open_kwargs: dict = {}) -> int:
     jblob = json.dumps(asdict(self))
     with fsspec.open(str(fname), mode='wt', **open_kwargs) as fh:
         result = fh.write(jblob)
     return result
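Examples 5 and 25 pair up as a JSON round trip. A self-contained sketch using a hypothetical dataclass that defines both methods along the lines of those examples; the class name and fields are made up:

from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Union
import json

import fsspec


@dataclass
class StoreSpec:  # hypothetical example class
    name: str
    chunks: int

    def to_json(self, fname: Union[str, Path], open_kwargs: dict = {}) -> int:
        jblob = json.dumps(asdict(self))
        with fsspec.open(str(fname), mode='wt', **open_kwargs) as fh:
            return fh.write(jblob)

    @classmethod
    def from_json(cls, fname: Union[str, Path], open_kwargs: dict = {}):
        with fsspec.open(str(fname), mode='rt', **open_kwargs) as fh:
            jblob = json.loads(fh.read())
        return cls(**jblob)


spec = StoreSpec(name="demo", chunks=64)
spec.to_json("spec.json")
assert StoreSpec.from_json("spec.json") == spec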
Example 26
def download(url, csv_path):
    of = fsspec.open(url)
    of.fs.download(url, csv_path)
    return csv_path
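Here fsspec.open(url) is used only to obtain the matching filesystem via its .fs attribute; fs.download then copies the remote object to a local path. An illustrative call (the URL and target path are made up):

local_csv = download("https://example.com/exports/data.csv", "/tmp/data.csv")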
Example 27
def save_csv(suffix: str, contents: bytes) -> str:
    fd, tmpfile = tempfile.mkstemp(prefix="csv_sniffer", suffix=suffix)
    os.close(fd)
    with fsspec.open(tmpfile, mode="wb", compression="infer") as out:
        out.write(contents)
    return tmpfile
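With compression="infer", fsspec picks the codec from the file suffix, so a suffix such as ".csv.gz" gets written gzip-compressed while ".csv" is written as-is. Two illustrative calls:

gz_path = save_csv(".csv.gz", b"a,b\n1,2\n")     # stored gzip-compressed
plain_path = save_csv(".csv", b"a,b\n1,2\n")     # stored uncompressed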
Example 28
def bus_peak_frequencies(
    gtfs_path: str,
    test_date: typing.Optional[datetime.date] = None,
    am_peak: typing.Optional[typing.Tuple[int, int]] = None,
    pm_peak: typing.Optional[typing.Tuple[int, int]] = None,
) -> geopandas.GeoDataFrame:
    """
    Compute AM and PM Peak frequencies for all the lines in a GTFS Feed.

    Parameters
    ==========
    gtfs_path: str
        The path (or URL) to a GTFS feed.
    test_date: datetime.date
        The test date for which to compute frequencies. Defaults to February
        18th, 2020, an unremarkable February weekday.
    am_peak: tuple of integers
        The two hours (out of 24) demarcating the AM peak period.
    pm_peak: tuple of integers
        The two hours (out of 24) demarcating the PM peak period.
    """

    # Set default values
    test_date = test_date or TEST_DATE
    am_peak = am_peak or (6, 9)
    pm_peak = pm_peak or (15, 19)

    am_duration = am_peak[1] - am_peak[0]
    pm_duration = pm_peak[1] - pm_peak[0]

    assert am_duration > 0
    assert pm_duration > 0

    # Download and read the GTFS feed
    with fsspec.open(gtfs_path, "rb") as infile:
        data = infile.read()
    with open(GTFS_FILE, "wb") as outfile:
        outfile.write(data)
    service_by_date = partridge.read_service_ids_by_date(GTFS_FILE)
    feed = partridge.load_geo_feed(GTFS_FILE)

    # Get the service for the test date
    try:
        test_service = next(v for k, v in service_by_date.items()
                            if k == test_date)
    except StopIteration:
        raise ValueError(f"Could not find service for {test_date}")

    test_trips = feed.trips[feed.trips.service_id.isin(test_service)]
    test_stops = feed.stop_times[feed.stop_times.trip_id.isin(
        test_trips.trip_id)]

    # Get the departure, arrival, and mean time for each trip
    trip_timings = test_stops.groupby(test_stops.trip_id).agg({
        "departure_time":
        min,
        "arrival_time":
        max
    })
    trip_timings = trip_timings.assign(
        mean_time=trip_timings.departure_time +
        (trip_timings.arrival_time - trip_timings.departure_time) / 2.0)

    # Find all of the trips that fall within the AM and PM peak times.
    am_peak_trips = trip_timings[
        (trip_timings.mean_time > am_peak[0] * 60 * 60)
        & (trip_timings.mean_time < am_peak[1] * 60 * 60)]
    pm_peak_trips = trip_timings[
        (trip_timings.mean_time > pm_peak[0] * 60 * 60)
        & (trip_timings.mean_time < pm_peak[1] * 60 * 60)]
    am_peak_trips = test_trips.merge(
        am_peak_trips,
        left_on=test_trips.trip_id,
        right_index=True,
    )
    pm_peak_trips = test_trips.merge(
        pm_peak_trips,
        left_on=test_trips.trip_id,
        right_index=True,
    )

    # Compute the peak frequency
    am_peak_frequency = (am_peak_trips.groupby(
        [am_peak_trips.route_id,
         am_peak_trips.direction_id]).size().to_frame("am_peak_trips"))
    am_peak_frequency = am_peak_frequency.assign(
        am_peak_frequency=am_duration * 60 / am_peak_frequency.am_peak_trips)
    pm_peak_frequency = (pm_peak_trips.groupby(
        [pm_peak_trips.route_id,
         pm_peak_trips.direction_id]).size().to_frame("pm_peak_trips"))
    pm_peak_frequency = pm_peak_frequency.assign(
        pm_peak_frequency=pm_duration * 60 / pm_peak_frequency.pm_peak_trips)
    peak_frequency = pandas.concat([am_peak_frequency, pm_peak_frequency],
                                   axis=1,
                                   sort=False)

    # Add the route short name for easier legibility.
    peak_frequency = peak_frequency.join(
        feed.routes[["route_id", "route_short_name"]].set_index("route_id"),
        how="left",
        on="route_id",
    )

    # Grab the most popular shape as the official one.
    route_shapes = (test_trips.groupby("route_id").agg({
        "shape_id":
        lambda s: s.value_counts().index[0]
    }).reset_index().merge(
        feed.shapes, how="left",
        on="shape_id").set_index("route_id").drop(columns=["shape_id"]))

    peak_frequency = peak_frequency.merge(
        route_shapes, how="left", right_index=True,
        left_index=True).assign(agency=feed.agency.agency_name.iloc[0])

    gdf = geopandas.GeoDataFrame(peak_frequency, geometry="geometry")
    gdf.crs = f"EPSG:{WGS84}"
    return gdf
Example 29
 def write_text_from_href(self, href: str, txt: str, *args: Any,
                          **kwargs: Any) -> None:
     with fsspec.open(href, "w") as destination:
         return destination.write(txt)
Example 30
def open_file(urlpath, mode="rb", compression=None):
    return fsspec.open(urlpath,
                       mode=mode,
                       compression=compression,
                       **fsspec_kwargs)
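This last helper closes over an fsspec_kwargs mapping defined elsewhere in its original module. A minimal sketch of how it might be bound, with placeholder options; the caller then enters the returned OpenFile:

import fsspec

fsspec_kwargs = {}  # placeholder: backend-specific storage options (credentials, etc.) would go here

def open_file(urlpath, mode="rb", compression=None):
    return fsspec.open(urlpath,
                       mode=mode,
                       compression=compression,
                       **fsspec_kwargs)

with open_file("data/table.csv.gz", compression="gzip") as f:
    payload = f.read()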