def _quick_download_lowres_misc_datasets():
    """
    Retrieves low resolution and miscellaneous datasets quickly using Quilt
    instead of downloading from the original source.
    """
    with open(os.devnull, "w") as null:
        print("Downloading neural network model input datasets ...", end=" ")
        _stdout = sys.stdout
        _stderr = sys.stderr
        sys.stdout = sys.stderr = null

        for geotiff in [
            "lowres/bedmap2_bed",
            "misc/REMA_100m_dem",
            "misc/REMA_200m_dem_filled",
            "misc/MEaSUREs_IceFlowSpeed_450m",
            "misc/Arthern_accumulation_bedmap2_grid1",
        ]:
            if not os.path.exists(path=f"{geotiff}.tif"):
                # Download packages first
                quilt.install(package=f"weiji14/deepbedmap/{geotiff}", force=True)
                # Export the files to the right pathname
                quilt.export(package=f"weiji14/deepbedmap/{geotiff}", force=True)
                # Add .tif extension to filename
                os.rename(src=geotiff, dst=f"{geotiff}.tif")

        sys.stderr = _stderr
        sys.stdout = _stdout
        print("done!")
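# --- Usage sketch (not from the original source): assumes the module-level
# `os`, `sys`, and `quilt` imports that this fragment omits ---
import os
import sys
import quilt

_quick_download_lowres_misc_datasets()
# Each GeoTIFF should now exist locally under its package path, e.g.:
assert os.path.exists("lowres/bedmap2_bed.tif")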
def df_to_quilt(df, path):
    parts = path.split('/')
    assert len(parts) > 2

    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

    object_encoding = {}
    df = df.copy()
    # .items() rather than the deprecated .iteritems(); works on old and
    # new pandas alike
    for col, dtype in df.dtypes.items():
        if dtype.name in ('Int8', 'Int32'):
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name, df, compression='snappy',
                          object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
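# --- Usage sketch (not from the original source): the package path below is
# hypothetical, and the module-level imports this fragment omits are added
# here; quilt.push() additionally requires being logged in ---
import tempfile

import fastparquet
import pandas as pd
import quilt

df = pd.DataFrame({'a': [1, 2, 3]})
# Writes df as snappy-compressed Parquet, then builds and pushes the
# sub-package 'mydata' under the root package 'someuser/somepkg'.
df_to_quilt(df, 'someuser/somepkg/mydata')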
def _load_from_quilt(package_path):
    user, root_pkg, *sub_paths = package_path.split('/')
    pkg_store, root_node = store.PackageStore.find_package(
        None, user, root_pkg)
    if root_node is None:
        quilt.install(package_path, force=True)
        pkg_store, root_node = store.PackageStore.find_package(
            None, user, root_pkg)

    node = root_node
    while len(sub_paths):
        name = sub_paths.pop(0)
        for child_name, child_node in node.children.items():
            if child_name != name:
                continue
            try:
                node = _from_core_node(pkg_store, child_node)
            except store.StoreException:
                quilt.install(package_path, force=True)
                node = _from_core_node(pkg_store, child_node)
            break
        else:
            raise Exception('Dataset %s not found' % package_path)

    return node
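# --- Usage sketch (hypothetical package path; `store`, `quilt` and
# `_from_core_node` are quilt internals imported elsewhere in this module) ---
node = _load_from_quilt('someuser/somepkg/sub/dataset')
df = node()  # a Quilt data node returns its DataFrame when called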
def upload_to_quilt(spark, schemas_dic):
    """
    Uploads data to Quilt, appending it to the already existing data.
    :param spark: Spark Session
    :param schemas_dic: dictionary of Spark schemas for events, mentions and news
    :return: None
    """
    # remove old data and get new one
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)

    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt,
                                                 schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(
        mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt,
                                               schema=schemas_dic['news'])

    # mentions data - new data
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions",
                                              schemas_dic['mentions'],
                                              cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events",
                                            schemas_dic['events2'],
                                            cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'],
                                          cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
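# --- Usage sketch (not from the original source): `schemas_dic` must map
# 'events2', 'mentions' and 'news' to their Spark schemas as used above;
# `tools` and `cmd_opts` come from the surrounding module ---
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("gdelt-quilt-append").getOrCreate()
upload_to_quilt(spark, schemas_dic)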
def __init__(self, package_name, sub_path, timestamp=None):
    if '/' not in package_name:
        package_name = '/'.join([settings.QUILT_USER, package_name])
    self.package_name = package_name
    try:
        quilt.install(self.package_name, force=True)
    except HTTPResponseException:
        pass
    self.sub_path = sub_path
    self.timestamp = timestamp
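# --- Usage sketch: the enclosing class is not shown in this fragment, so
# QuiltDataset below is a hypothetical name for it ---
ds = QuiltDataset('myuser/mypkg', sub_path='tables/main')
# A bare package name gets prefixed with settings.QUILT_USER:
ds2 = QuiltDataset('mypkg', sub_path='tables/main')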
def load_datasets(packages, include_units=False):
    if not isinstance(packages, (list, tuple)):
        packages = [packages]

    datasets = []
    for package_path in packages:
        user, root_pkg, *sub_paths = package_path.split('/')
        pkg_store, root_node = store.PackageStore.find_package(
            None, user, root_pkg)
        if root_node is None:
            # Quilt seems to have a bug that loading a sub path as fragments
            # will corrupt the local db. Load the full data set always for now.
            # quilt.install(package_path, force=True)
            quilt.install(user + "/" + root_pkg)
            pkg_store, root_node = store.PackageStore.find_package(
                None, user, root_pkg)

        node = root_node
        while len(sub_paths):
            name = sub_paths.pop(0)
            for child_name, child_node in node.children.items():
                if child_name != name:
                    continue
                try:
                    node = _from_core_node(pkg_store, child_node)
                except store.StoreException:
                    quilt.install(package_path, force=True)
                    node = _from_core_node(pkg_store, child_node)
                break
            else:
                raise Exception('Dataset %s not found' % package_path)

        try:
            df = node()
        except store.StoreException:
            _materialize(node)
            df = node()

        if include_units:
            for col_name in df.columns:
                unit = node._meta.get('%s_unit' % col_name, None)
                if not unit:
                    continue
                df[col_name] = df[col_name].astype('pint[%s]' % unit)

        datasets.append(df)

    if len(datasets) == 1:
        return datasets[0]
    return datasets
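# --- Usage sketch (hypothetical package paths) ---
# A single path returns one DataFrame; a list of paths returns a list.
df = load_datasets('someuser/somepkg/my_table')
dfs = load_datasets(['someuser/somepkg/a', 'someuser/somepkg/b'],
                    include_units=True)  # pint dtypes where '<col>_unit' metadata exists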
def FromQuilt(
    package: str = DEFAULT_QUILT_PKG,
    hash: str = None,
    version: str = DEFAULT_QUILT_VERSION,
    tag: str = None,
    force: bool = True,
) -> object:
    """Create a GroupsData object from quilt."""
    quilt.install(
        package=package,
        version=version,
        force=force,
        tag=tag,
        hash=hash,
    )
    # Note: loads DEFAULT_QUILT_PKG, not necessarily the `package` that was
    # just installed.
    cc_pkg = quilt.load(DEFAULT_QUILT_PKG)
    return GroupsData.FromDataFrame(cc_pkg.data.group_definitions())
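# --- Usage sketch (DEFAULT_QUILT_PKG and DEFAULT_QUILT_VERSION are module
# constants not shown in this fragment; the hash below is illustrative) ---
groups = FromQuilt()                 # install and load the default package
pinned = FromQuilt(hash='deadbeef')  # pin to a specific package hash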
def get_pkg(user: str, package: str, hash_key=None, force=True) -> pd.DataFrame:
    r"""
    Install and load a Quilt package.

    Parameters
    ----------
    user
        Quilt user name that owns the package.
    package
        Package name.
    hash_key
        Optional package hash pinning a specific version.
    force
        Overwrite any local copy without prompting.

    Returns
    -------
    The loaded package.
    """
    pkg_path = f'{user}/{package}'
    quilt.install(pkg_path, hash=hash_key, force=force)
    return quilt.load(pkg_path)
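# --- Usage sketch (hypothetical user/package names) ---
pkg = get_pkg('someuser', 'somepkg')
# quilt.load() returns a package node rather than a plain DataFrame, so the
# annotation above is approximate; leaves yield DataFrames when called,
# e.g. the hypothetical pkg.some_table().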
def get_deepbedmap_model_inputs(
    window_bound: rasterio.coords.BoundingBox,
    padding: int = 1000,
    use_whole_rema: bool = False,
) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """
    Outputs one large tile for each of:
    BEDMAP2, REMA, MEASURES Ice Flow Velocity and Antarctic Snow Accumulation
    according to a given window_bound in the form of (xmin, ymin, xmax, ymax).
    """
    data_prep = _load_ipynb_modules("data_prep.ipynb")

    if window_bound == rasterio.coords.BoundingBox(
        left=-1_594_000.0, bottom=-166_500.0, right=-1_575_000.0, top=-95_500.0
    ):
        # Quickly pull from cached quilt storage if using (hardcoded) test region
        quilt.install(package="weiji14/deepbedmap/model/test", force=True)
        pkg = quilt.load(pkginfo="weiji14/deepbedmap/model/test")
        X_tile = pkg.X_tile()
        W1_tile = pkg.W1_tile()
        W2_tile = pkg.W2_tile()
        W3_tile = pkg.W3_tile()
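# --- Usage sketch: the hardcoded test region above triggers the cached Quilt
# path; the rest of the function is not shown in this fragment, and running
# it assumes the deepbedmap repo layout (data_prep.ipynb on disk) ---
import rasterio.coords

test_bound = rasterio.coords.BoundingBox(
    left=-1_594_000.0, bottom=-166_500.0, right=-1_575_000.0, top=-95_500.0
)
X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs(test_bound)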
import matplotlib.pyplot as plt
import pandas as pd
import quilt
import sys
import os
from warnings import warn

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from util import adjust_inflation, convert_gdf

try:
    from quilt.data.spatialucr import census
except ImportError:
    warn("Fetching data. This should only happen once")
    quilt.install("spatialucr/census")
    quilt.install("spatialucr/census_cartographic")
    from quilt.data.spatialucr import census
try:
    from quilt.data.geosnap_data import data_store
except ImportError:
    quilt.build("geosnap_data/data_store")
    from quilt.data.geosnap_data import data_store


class Bunch(dict):
    """A dict with attribute-access."""

    def __getattr__(self, key):
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError(key)
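# --- Usage sketch for Bunch (not from the original source) ---
b = Bunch(tracts='...', states='...')  # hypothetical keys
assert b.tracts is b['tracts']         # attribute access mirrors item access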
# This notebook estimates the greenhouse gases emitted by passenger cars in
# Helsinki. The main source data for the model is the
# [LIPASTO](http://lipasto.vtt.fi/en/index.htm) calculation system developed by
# [VTT Technical Research Centre of Finland Ltd.](http://www.vttresearch.com/)
#
# Click Run -> Run All Cells to run the calculations.

# %%
import math
import re

import pandas as pd
import numpy as np
import scipy

try:
    from quilt.data.jyrjola import lipasto
except ImportError:
    import quilt
    quilt.install('jyrjola/lipasto')
    from quilt.data.jyrjola import lipasto

import plotly
import plotly.graph_objs as go
import cufflinks as cf
import aplans_graphs

plotly.offline.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)

# %% [markdown]
# First we load the municipality-specific data from LIPASTO. We are mostly
# interested in the total mileage in Helsinki specified by the road type
# (_highways_ or _urban driving_). The mileage column below is in million
# kilometres (_Mkm_) and the gases are in metric tonnes (_t_).

# %%
muni = lipasto.emissions_by_municipality().set_index(
def install_data():
    quilt.install("gudbrandtandberg/chesspieces", force=True)
# %%
INPUT_DATASETS = ['jyrjola/ymparistotilastot']

import math
import re
import pandas as pd
import numpy as np
import importlib

for dataset in INPUT_DATASETS:
    mod_path = dataset.replace('/', '.')
    try:
        mod = importlib.import_module('quilt.data.%s' % mod_path)
    except ImportError:
        import quilt
        quilt.install(dataset)

from quilt.data.jyrjola import ymparistotilastot
from utils import dict_merge
import aplans_graphs

import plotly
import plotly.graph_objs as go
import cufflinks as cf

plotly.offline.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)

# %%
df = ymparistotilastot.l34_polttoaine_tavoitteet().copy()
display(df.set_index(['Vuosi']))
def install_data():
    # force to avoid y/n prompt; does not re-download
    PKG = 'akarve/BSDS300'
    quilt.install(PKG, force=True)
def install_data():
    quilt.install("gudbrandtandberg/chessboard_segmentation")