def test_default_package_get_local(self):
    foodir = pathlib.Path("foo_dir")
    bazdir = pathlib.Path("baz_dir")
    foodir.mkdir(parents=True, exist_ok=True)
    bazdir.mkdir(parents=True, exist_ok=True)
    with open('bar', 'w') as fd:
        fd.write(fd.name)
    with open('foo', 'w') as fd:
        fd.write(fd.name)
    with open(bazdir / 'baz', 'w') as fd:
        fd.write(fd.name)
    with open(foodir / 'bar', 'w') as fd:
        fd.write(fd.name)

    currdir = pathlib.Path('.').resolve().as_uri() + '/'

    # consistent local case
    pkg = quilt3.Package().set_dir("/", "./")
    assert pkg.get() == currdir

    # package with one inconsistent path, leading case
    pkg = quilt3.Package().set_dir("/", "./")
    pkg.set('badpath', 'bar')
    with pytest.raises(QuiltException):
        pkg.get()

    # package with one inconsistent path, trailing case
    pkg = quilt3.Package().set_dir("/", "./")
    # prefix with 'z_' to ensure that this entry is last in sorted order
    pkg.set('z_badpath', 'bar')
    with pytest.raises(QuiltException):
        pkg.get()

    # package with inconsistent schemes
    with patch('quilt3.packages.get_size_and_meta', return_value=(0, dict(), '0')):
        pkg = quilt3.Package().set_dir("/", "./")
        pkg.set("bar", "s3://test-bucket/bar")
        with pytest.raises(QuiltException):
            pkg.get()

    # package with inconsistent root directories
    with open('foo_dir/foo', 'w') as fd:
        fd.write(fd.name)
    pkg = quilt3.Package().set_dir("/", "./")
    pkg.set('foo', 'foo_dir/foo')
    with pytest.raises(QuiltException):
        pkg.get()
def aggregate_and_push(
    pkg_map,
    source_S3_url="s3://allencell-internal-quilt",
    dest_S3_url="s3://allencell",
    dest_pkg_name="aics/integrated_transcriptomics_structural_organization_hipsc_cm",
    message="Public data set",
    public=False,
):
    internal = boto3.session.Session(profile_name="default")  # noqa: F841

    # real data
    q = quilt3.Package()
    q.set("README.md", "../../data/README.md")
    q.set(
        "resources/Website_schematic_data_flow_20200310_v2.png",
        "../../data/resources/Website_schematic_data_flow_20200310_v2.png",
    )
    for (low_level_pkg_str, new_subdir) in pkg_map.items():
        p = quilt3.Package.browse(low_level_pkg_str, source_S3_url)
        for (logical_key, physical_key) in p.walk():
            q.set(f"{new_subdir}/{logical_key}", physical_key)

    git_commit_hash = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    label = f"{message}. git commit hash of fish_morphology_code = {git_commit_hash}."

    # set profile to public bucket access if pushing public
    if public:
        external = boto3.session.Session(profile_name="allencell")  # noqa: F841
        # external = boto3.session.Session(profile_name="public")  # noqa: F841

    q.push(dest_pkg_name, dest_S3_url, message=label)
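A minimal invocation sketch for the aggregator above; the low-level package names and subdirectory labels are placeholders, not real Allen Cell packages.

# Hypothetical example: map two existing low-level packages into subdirectories
# of the aggregated package, then push with the default message (placeholder names).
pkg_map = {
    "aics/example_low_level_package_a": "raw_images",
    "aics/example_low_level_package_b": "segmentations",
}
aggregate_and_push(pkg_map, public=False)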
def test_fetch(self):
    """ Verify fetching a package entry. """
    pkg = (
        Package()
        .set('foo', DATA_DIR / 'foo.txt', {'user_meta': 'blah'})
        .set('bar', DATA_DIR / 'foo.txt', {'user_meta': 'blah'})
    )
    pkg['foo'].meta['target'] = 'unicode'
    pkg['bar'].meta['target'] = 'unicode'

    with open(DATA_DIR / 'foo.txt') as fd:
        assert fd.read().replace('\n', '') == '123'

    # Copy foo.txt to bar.txt
    pkg['foo'].fetch('data/bar.txt')

    with open('data/bar.txt') as fd:
        assert fd.read().replace('\n', '') == '123'

    # Raise an error if you copy to yourself.
    with pytest.raises(shutil.SameFileError):
        pkg.set('foo', DATA_DIR / 'foo.txt')['foo'].fetch(DATA_DIR / 'foo.txt')

    # The key gets re-rooted correctly.
    pkg = quilt3.Package().set('foo', DATA_DIR / 'foo.txt')
    new_pkg_entry = pkg['foo'].fetch('bar.txt')
    out_abs_path = pathlib.Path("bar.txt").resolve().as_uri()
    assert new_pkg_entry.physical_keys[0] == out_abs_path
def test_push_with_meta_data(
    meta_arg, meta_data, expected_set_dir_count, expected_push_count,
    expected_meta, expected_stderr, capsys
):
    name = 'test/name'
    pkg = quilt3.Package()

    with tempfile.TemporaryDirectory() as tmp_dir:
        (Path(tmp_dir) / 'foo').touch()
        (Path(tmp_dir) / 'bar').mkdir()
        (Path(tmp_dir) / 'bar' / 'baz').touch()

        with mock.patch('quilt3.Package.__new__', return_value=pkg) as mocked_package_class, \
             mock.patch.object(pkg, 'set_dir', wraps=pkg.set_dir) as mocked_set_dir, \
             mock.patch.object(pkg, 'push') as mocked_push:
            # '--registry' defaults to the configured remote registry, hence optional.
            if meta_arg:
                main.main(('push', '--dir', tmp_dir, name, meta_arg, meta_data))
            else:
                main.main(('push', '--dir', tmp_dir, name))

            mocked_package_class.assert_called_once_with(quilt3.Package)
            assert mocked_set_dir.call_count == expected_set_dir_count
            assert mocked_push.call_count == expected_push_count
            assert pkg.meta == expected_meta

            # check for expected stderr exception message
            captured = capsys.readouterr()
            assert expected_stderr in captured.err
def manuscript_plots_dataset(
    test=False,
    col_name_map={},
    dataset_name="manuscript_plots",
    package_owner="rorydm",
    readme_path="README.md",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = collate_plot_dataset()
    df_rna = make_small_rna_df()

    # subsample df for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    p = quilt3.Package()
    p = p.set("README.md", readme_path)
    p = p.set("data.csv", df)
    p = p.set("data_rnaseq.csv", df_rna)

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )

    # upload to quilt
    p.push(
        f"{package_owner}/{dataset_name}",
        s3_bucket,
        message=f"git commit hash of fish_morphology_code = {label}",
    )
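A possible call pattern for the dataset builder above, assuming it is run from a directory containing README.md; the alternate owner and bucket values are placeholders.

# Publish the small subsampled test variant to the default internal bucket.
manuscript_plots_dataset(test=True)

# Publish the full dataset under a different owner and bucket (placeholder values).
manuscript_plots_dataset(package_owner="example_user", s3_bucket="s3://example-bucket")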
def get_pkg(src_registry, data):
    p = quilt3.Package()
    for entry in data['entries']:
        set_entry = p.set_dir if entry['is_dir'] else p.set
        set_entry(entry['logical_key'], str(src_registry.base.join(entry['path'])))
    return p
def get_pkg(src_registry, data):
    p = quilt3.Package()
    for entry in data['entries']:
        set_entry = p.set_dir if entry['is_dir'] else p.set
        set_entry(entry['logical_key'], str(src_registry.base.join(entry['path'])))
    calculate_pkg_hashes(user_boto_session, p)
    return p
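Both get_pkg variants above expect data['entries'] to describe each package member. A sketch of that payload, with placeholder paths, might look like the following (only the keys the loop actually reads are shown).

# Illustrative payload only: keys mirror what the loop above reads
# ('logical_key', 'path', 'is_dir'); the paths are placeholders.
data = {
    "entries": [
        {"logical_key": "images/", "path": "images/", "is_dir": True},
        {"logical_key": "metadata.csv", "path": "metadata.csv", "is_dir": False},
    ],
}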
def init(self):
    """ Init a new data package

    Should be used only when creating a new dataset or data form.

    Raises
    ------
    OSError
        when a package with this name already exists.
    """
    os.makedirs(self.local_data_dir, exist_ok=True)
    self.package = quilt3.Package()
    self.package.set_dir('/', self.local_package_root)
def generate_new_package_version(package_name, registry_name, push_dest):
    # named_packages is timestamp based - ensure that we get a fresh timestamp
    time.sleep(1)
    fn_info = FunctionReporter(
        f"Creating package '{package_name}' in registry {registry_name}"
    )
    try:
        pkg = quilt3.Package()
        pkg.set(
            f"test-{uuid.uuid4()}",
            pd.DataFrame(list(range(random.randint(0, 100_000)))),
        )
        pkg.push(package_name, registry=registry_name, dest=push_dest)
        fn_info.succeeded(output=None)
    except Exception as ex:
        fn_info.failed(ex)
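A sketch of how the helper above might be driven to produce several fresh versions of the same package; the package handle, registry, and destination URIs are placeholders.

# Placeholder names/URIs; each call sleeps briefly so every push gets a new timestamp.
for _ in range(3):
    generate_new_package_version(
        package_name="examples/synthetic",
        registry_name="s3://example-registry",
        push_dest="s3://example-registry/synthetic/",
    )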
def upload_package(self, message=None):
    """ Build and upload package from local directory, ignoring all files listed in .quiltignore

    Args:
        message (:obj:`str`): commit message
    """
    # build package, ignoring all files in .quiltignore
    package = quilt3.Package()
    package.set_dir('/', self.path)

    # upload package
    package.push(self.get_full_package_id(), message=message)
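A hedged usage sketch: assuming `manager` is an instance of the class that defines upload_package (i.e. it carries `path` and `get_full_package_id()`), a call might look like this.

# Hypothetical instance of the owning class; only the method defined above is used.
manager.upload_package(message="Rebuild package from local directory")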
def test_import(self):
    with patch('quilt3.Package.browse') as browse_mock, \
         patch('quilt3.imports._list_packages') as list_packages_mock:
        browse_mock.return_value = quilt3.Package()
        list_packages_mock.return_value = ['foo/bar', 'foo/baz']

        from quilt3.data.foo import bar
        assert isinstance(bar, Package)

        browse_mock.assert_has_calls(
            [call('foo/baz', registry=ANY), call('foo/bar', registry=ANY)],
            any_order=True
        )

        from quilt3.data import foo
        assert hasattr(foo, 'bar') and hasattr(foo, 'baz')
def test_push(self):
    name = 'test/name'
    pkg = quilt3.Package()

    with tempfile.TemporaryDirectory() as tmp_dir:
        (Path(tmp_dir) / 'foo').touch()
        (Path(tmp_dir) / 'bar').mkdir()
        (Path(tmp_dir) / 'bar' / 'baz').touch()

        with mock.patch('quilt3.Package.__new__', return_value=pkg) as mocked_package_class, \
             mock.patch.object(pkg, 'set_dir', wraps=pkg.set_dir) as mocked_set_dir, \
             mock.patch.object(pkg, 'push') as mocked_push:
            main.main(('push', '--dir', tmp_dir, name))

            mocked_package_class.assert_called_once_with(quilt3.Package)
            mocked_set_dir.assert_called_once_with('.', tmp_dir, meta=None)
            mocked_push.assert_called_once_with(name, registry=None, dest=None, message=None)
def distribute(
    self,
    push_uri: Optional[str] = None,
    message: Optional[str] = None,
    attach_associates: bool = True,
) -> quilt3.Package:
    """
    Push a package to a specific S3 bucket. If no bucket is provided, the un-built,
    un-pushed package is returned. You can push a dataset with the same name to the
    same bucket multiple times; instead of overriding a prior dataset, Quilt simply
    creates a new dataset version. Please refer to the Quilt documentation for more
    details: https://docs.quiltdata.com

    :param push_uri: The S3 bucket uri to push to. Example: "s3://quilt-jacksonb"
    :param message: An optional message to attach to that version of the dataset.
    :param attach_associates: Boolean option to attach associates as metadata to each
        file. Associates are used to retain quick navigation between related files.
    :return: The built and optionally pushed quilt3.Package.
    """
    # Confirm name matches approved pattern
    # We previously checked during init, but the name could have been changed
    name = self.return_or_raise_approved_name(self.name)

    # Create empty package
    pkg = quilt3.Package()

    # Write any extra files to tempdir to send to the build
    with tempfile.TemporaryDirectory() as tmpdir:
        # Set all referenced files
        text = self.readme.text
        for rf in self.readme.referenced_files:
            replaced = f"referenced_files/{rf.resolved.name}"
            text = text.replace(rf.target, replaced)
            pkg.set(replaced, str(rf.resolved))

        # Write the updated readme to temp
        readme_pk = Path(tmpdir, "README.md")
        with open(readme_pk, "w") as readme_write:
            readme_write.write(text)

        # Set the readme
        pkg.set("README.md", readme_pk)

        # Validate the dataset
        v_ds = validate(self.data)

        # Set package contents
        if len(self.path_columns) > 0:
            fp_cols = self.path_columns
        else:
            fp_cols = v_ds.schema.df.index[
                v_ds.schema.df["dtype"].str.contains("Path")
            ].tolist()

        # Create associate mappings: List[Dict[str, str]]
        # This list is in index order. Meaning that as the column values are descended
        # we can simply add a new associate to the already existing associate map at
        # that list index.
        associates = []

        # Create metadata reduction map
        # This will be used to clean up and standardize the metadata access after
        # object construction: metadata column name mapped to a boolean for whether
        # its values should be reduced. This is used during the "clean up the package
        # metadata" step. If we have multiple files each with the same keys for the
        # metadata, but for one reason or another one packaged file's value for a
        # certain key is a list while another's is a single string, this leads to a
        # confusing mixed return value API for the same _type_ of object. Example:
        # fov/
        #   obj1/
        #       {example_key: "hello"}
        #   obj2/
        #       {example_key: ["hello", "world"]}
        # Commonly this happens when a manifest has rows of unique instances of a
        # child object but retains a reference to a parent object, example: rows of
        # information about unique cells that were all generated using the same
        # algorithm, whose information is stored in a column, for each cell
        # information row. This could result in some files (which only have one cell)
        # being a single string while other files (which have more than one cell) are
        # a list of the same string over and over again.
        # "Why spend all this time to reduce / collapse the metadata anyway?" Besides
        # making it so that users won't have to call `obj2.meta["example_key"][0]`
        # every time they want the value, and besides the fact that it standardizes
        # the metadata api, the biggest reason is that S3 objects can only have 2KB of
        # metadata; without this reduction / collapse step, manifests are more likely
        # to hit that limit and cause a package distribution error.
        metadata_reduction_map = {
            index_col: True for index_col in self.metadata_columns
        }

        # Set all files
        with tqdm(total=len(fp_cols) * len(v_ds.data), desc="Constructing package") as pbar:
            for col in fp_cols:
                # Check display name for col
                if col in self.column_names_map:
                    col_label = self.column_names_map[col]
                else:
                    col_label = col

                # Update values to the logical key as they are set
                for i, val in enumerate(v_ds.data[col].values):
                    # Fully resolve the path
                    physical_key = Path(val).expanduser().resolve()

                    # Just using val.name could result in files that shouldn't be
                    # grouped being grouped. Example column:
                    # SourceReadpath
                    #   a/0.tiff
                    #   a/1.tiff
                    #   b/0.tiff
                    #   b/1.tiff
                    # Even though there are four files, this would result in both
                    # a/0.tiff and b/0.tiff, and, a/1.tiff and b/1.tiff being grouped
                    # together. To solve this we can prepend the first couple of
                    # characters from a hash of the fully resolved path to the
                    # logical key.
                    unique_file_name = file_utils.create_unique_logical_key(physical_key)
                    logical_key = f"{col_label}/{unique_file_name}"

                    if physical_key.is_file():
                        v_ds.data[col].values[i] = logical_key

                        # Create metadata dictionary to attach to object
                        meta = {}
                        for meta_col in self.metadata_columns:
                            # Short reference to current metadata value
                            v = v_ds.data[meta_col].values[i]

                            # Enforce simple JSON serializable type
                            # First check if value is a numpy value
                            # It likely is because pandas relies on numpy
                            # All numpy types have the "dtype" attribute and can be
                            # cast to a python type by using the `item` function,
                            # details here:
                            # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.item.html
                            if hasattr(v, "dtype"):
                                v = v.item()
                            if isinstance(v, JSONSerializableTypes):
                                meta[meta_col] = [v]
                            else:
                                raise TypeError(
                                    f"Non-simple-JSON-serializable type found in column: '{meta_col}', "
                                    f"at index: {i}: ({type(v)} '{v}').\n\n "
                                    f"At this time only the following types are allowed in metadata: "
                                    f"{JSONSerializableTypes}"
                                )

                        # Check if object already exists
                        if logical_key in pkg:
                            # Join the two meta dictionaries
                            joined_meta = {}
                            for meta_col, curr_v in pkg[logical_key].meta.items():
                                # Join the values for the current iteration of the metadata
                                joined_values = [*curr_v, *meta[meta_col]]

                                # Only check whether the metadata at this index can be
                                # reduced if that is still being decided, i.e. the
                                # boolean in the metadata reduction map is True (this
                                # index can still be reduced or collapsed).
                                # The other reason to make this check is so that we
                                # don't override an earlier False reduction value. In
                                # the case where early on we encounter an instance of
                                # the metadata that should not be reduced but later on
                                # we say it can be, this check prevents that, since we
                                # want all metadata access across the dataset to be
                                # uniform.
                                if metadata_reduction_map[meta_col]:
                                    # Update the metadata reduction map
                                    # For the current column being checked, as long as
                                    # it is still being determined that the column can
                                    # be reduced (aka we have entered this if block),
                                    # check if we can still reduce the metadata after
                                    # the recent addition.
                                    # "We can reduce the metadata if the count of the
                                    # first value (or any value) is the same as the
                                    # length of the entire list of values."
                                    # This runs quickly for small lists as seen here:
                                    # https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
                                    metadata_reduction_map[meta_col] = (
                                        joined_values.count(joined_values[0])
                                        == len(joined_values)
                                    )

                                # Attach the joined values to the joined metadata
                                joined_meta[meta_col] = joined_values

                            # Update meta
                            pkg[logical_key].set_meta(joined_meta)

                        # Object didn't already exist, simply set it
                        else:
                            pkg.set(logical_key, physical_key, meta)

                        # Update associates
                        try:
                            associates[i][col_label] = logical_key
                        except IndexError:
                            associates.append({col_label: logical_key})
                    else:
                        v_ds.data[col].values[i] = logical_key
                        pkg.set_dir(logical_key, physical_key)

                    # Update progress bar
                    pbar.update()

        # Clean up package metadata
        pkg = self._recursive_clean(pkg, metadata_reduction_map)

        # Attach associates if desired
        if attach_associates:
            for i, associate_mapping in tqdm(
                enumerate(associates), desc="Creating associate metadata blocks"
            ):
                for col, lk in associate_mapping.items():
                    # Having dictionary expansion in this order means that associates
                    # will override a prior existing `associates` key; this is assumed
                    # safe because attach_associates was set to True.
                    pkg[lk].set_meta({**pkg[lk].meta, **{"associates": associate_mapping}})

        # Store validated dataset in the temp dir with paths replaced
        meta_path = Path(tmpdir, "metadata.csv")
        v_ds.data.to_csv(meta_path, index=False)
        pkg.set("metadata.csv", meta_path)

        # Set logical keys for all extra files
        for lk_parent, files_list in self.extra_files.items():
            for f in files_list:
                pkg.set(f"{lk_parent}/{f.name}", f)

        # Optionally push
        if push_uri:
            pkg = pkg.push(
                f"{self.package_owner}/{name}",
                registry=push_uri,
                message=message,
            )

        return pkg
from analyze import cluster as _cluster, cluster_spatial as _cluster_spatial
from harmonize import harmonize as _harmonize
from .util import adjust_inflation, convert_gdf

appname = "geosnap"
appauthor = "geosnap"
data_dir = user_data_dir(appname, appauthor)
if not os.path.exists(data_dir):
    pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)

# look for local storage and create if missing
try:
    from quilt3.data.geosnap_data import storage
except ImportError:
    storage = quilt3.Package()

try:
    # if any of these aren't found, stream them instead
    from quilt3.data.census import tracts_cartographic, administrative
except ImportError:
    warn(
        "Unable to locate local census data. Streaming instead.\n"
        "If you plan to use census data repeatedly you can store it locally "
        "with the data.store_census function for better performance"
    )
    try:
        tracts_cartographic = quilt3.Package.browse(
def create_package(request):
    json_iterator = map(json.JSONDecoder().decode, (line.decode() for line in request.stream))
    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA), json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise ApiException(
                    HTTPStatus.BAD_REQUEST,
                    f"{entry['physical_key']} is not a valid s3 URL."
                )
            if physical_key.is_local():
                raise ApiException(
                    HTTPStatus.BAD_REQUEST,
                    f"{str(physical_key)} is not in S3."
                )
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        obj_size,
                        {'type': 'SHA256', 'value': hash_},
                        meta,
                    )
                )
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total size of new S3 files is {size_to_hash}, "
                        f"but max supported size is {PKG_FROM_FOLDER_MAX_PKG_SIZE}"
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has new S3 {files_to_hash} files, "
                        f"but max supported number is {PKG_FROM_FOLDER_MAX_FILES}"
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )
    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)

    return make_json_response(200, {
        'top_hash': top_hash,
    })
def create_package(req_file):
    json_iterator = map(json.JSONDecoder().decode, (line.decode() for line in req_file))
    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA), json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise PkgpushException(
                    "InvalidS3PhysicalKey",
                    {"physical_key": entry['physical_key']},
                )
            if physical_key.is_local():
                raise PkgpushException(
                    "InvalidLocalPhysicalKey",
                    {"physical_key": str(physical_key)},
                )
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        None if obj_size is None else int(obj_size),
                        {'type': 'SHA256', 'value': hash_},
                        meta,
                    )
                )
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise PkgpushException(
                        "PackageTooLargeToHash",
                        {"size": size_to_hash, "max_size": PKG_FROM_FOLDER_MAX_PKG_SIZE},
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise PkgpushException(
                        "TooManyFilesToHash",
                        {"num_files": files_to_hash, "max_files": PKG_FROM_FOLDER_MAX_FILES},
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )
    except quilt3.util.QuiltException as qe:
        raise PkgpushException.from_quilt_exception(qe)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise PkgpushException.from_boto_error(boto_error)

    # XXX: return mtime?
    return {'top_hash': top_hash}
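Both create_package handlers above consume newline-delimited JSON: one package-level header object, then one object per entry. An illustrative request body, with placeholder handle, bucket, keys, and hash, could look like this.

# Illustrative newline-delimited JSON body; field names mirror what the handlers read,
# while the S3 URLs, handle, size, and hash value are placeholders.
request_lines = [
    b'{"name": "examples/pkg", "registry": "s3://example-registry", "message": "init"}',
    b'{"logical_key": "data.csv", "physical_key": "s3://example-bucket/data.csv", "size": 1024, "hash": "abc123..."}',
    b'{"logical_key": "raw/image.tiff", "physical_key": "s3://example-bucket/raw/image.tiff"}',
]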
def push(self, bucket: Optional[str] = None):
    """
    Push the most recently generated data.

    Parameters
    ----------
    bucket: Optional[str]
        Push data to a specific bucket different from the bucket defined by your
        workflow_config.json or the defaulted bucket.

    Notes
    -----
    If your git status isn't clean, or you haven't committed and pushed to origin,
    any attempt to push data will be rejected.
    """
    # Check if manifest is None
    if self.manifest is None:
        raise exceptions.PackagingError("No manifest found to construct package with.")

    # Resolve None bucket
    if bucket is None:
        bucket = self._storage_bucket

    # Get current git branch
    current_branch = self._get_current_git_branch()

    # Normalize branch name
    # This is to stop quilt from making extra directories from names like:
    # feature/some-feature
    current_branch = current_branch.replace("/", ".")

    # Resolve push target
    quilt_loc = f"{self._quilt_package_owner}/{self._quilt_package_name}"
    push_target = f"{quilt_loc}/{current_branch}/{self.step_name}"

    # Check git status is clean
    self._check_git_status_is_clean(push_target)

    # Construct the package
    step_pkg, relative_manifest = quilt_utils.create_package(
        manifest=self.manifest,
        step_pkg_root=self.step_local_staging_dir,
        filepath_columns=self.filepath_columns,
        metadata_columns=self.metadata_columns,
    )

    # Add the relative manifest and generated README to the package
    with TemporaryDirectory() as tempdir:
        # Store the relative manifest in a temporary directory
        m_path = Path(tempdir) / "manifest.parquet"
        relative_manifest.to_parquet(m_path)
        step_pkg.set("manifest.parquet", m_path)

        # Add the params files to the package
        for param_file in ["run_parameters.json", "init_parameters.json"]:
            param_file_path = self.step_local_staging_dir / param_file
            step_pkg.set(param_file, param_file_path)

        # Generate README
        readme_path = Path(tempdir) / "README.md"
        with open(readme_path, "w") as write_readme:
            write_readme.write(
                constants.README_TEMPLATE.render(
                    quilt_package_name=self._quilt_package_name,
                    source_url=self._get_git_origin_url(),
                    branch_name=self._get_current_git_branch(),
                    commit_hash=self._get_current_git_commit_hash(),
                    creator=getpass.getuser(),
                )
            )
        step_pkg.set("README.md", readme_path)

        # Browse top level project package and add / overwrite to it in step dir
        try:
            project_pkg = quilt3.Package.browse(quilt_loc, self._storage_bucket)
        except botocore.errorfactory.ClientError:
            log.info(
                f"Could not find existing package: {quilt_loc} "
                f"in bucket: {self._storage_bucket}. "
                f"Creating a new package."
            )
            project_pkg = quilt3.Package()

        # Regardless of whether we found a prior version of the package or are
        # starting from a new package, we "merge" them together to place this step's
        # data in the correct location.

        # Remove the current step if it exists in the previous project package
        if current_branch in project_pkg.keys():
            if self.step_name in project_pkg[current_branch].keys():
                project_pkg = project_pkg.delete(f"{current_branch}/{self.step_name}")

        # Merge packages
        for (logical_key, pkg_entry) in step_pkg.walk():
            project_pkg.set(f"{current_branch}/{self.step_name}/{logical_key}", pkg_entry)

        # Push the data
        project_pkg.push(
            quilt_loc,
            registry=self._storage_bucket,
            message=self._create_data_commit_message(),
        )