Example #1
    def test_default_package_get_local(self):
        foodir = pathlib.Path("foo_dir")
        bazdir = pathlib.Path("baz_dir")
        foodir.mkdir(parents=True, exist_ok=True)
        bazdir.mkdir(parents=True, exist_ok=True)
        with open('bar', 'w') as fd:
            fd.write(fd.name)
        with open('foo', 'w') as fd:
            fd.write(fd.name)
        with open(bazdir / 'baz', 'w') as fd:
            fd.write(fd.name)
        with open(foodir / 'bar', 'w') as fd:
            fd.write(fd.name)

        currdir = pathlib.Path('.').resolve().as_uri() + '/'

        # consistent local case
        pkg = quilt3.Package().set_dir("/", "./")
        assert pkg.get() == currdir

        # package with one inconsistent path, leading case
        pkg = quilt3.Package().set_dir("/", "./")
        pkg.set('badpath', 'bar')
        with pytest.raises(QuiltException):
            pkg.get()

        # package with one inconsistent path, trailing case
        pkg = quilt3.Package().set_dir("/", "./")
        # prefix with 'z_' to ensure that this entry is last in sorted order
        pkg.set('z_badpath', 'bar')
        with pytest.raises(QuiltException):
            pkg.get()

        # package with inconsistent schemes
        with patch('quilt3.packages.get_size_and_meta',
                   return_value=(0, dict(), '0')):
            pkg = quilt3.Package().set_dir("/", "./")
            pkg.set("bar", "s3://test-bucket/bar")
            with pytest.raises(QuiltException):
                pkg.get()

        # package with inconsistent root directories
        with open('foo_dir/foo', 'w') as fd:
            fd.write(fd.name)
        pkg = quilt3.Package().set_dir("/", "./")
        pkg.set('foo', 'foo_dir/foo')
        with pytest.raises(QuiltException):
            pkg.get()
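A minimal stand-alone sketch of the consistent case exercised above, reusing only the calls from the test; it assumes a quilt3 version where the argument-less Package.get() shown in the test is available, and a writable working directory:

import pathlib
import quilt3

pathlib.Path("example.txt").write_text("example")    # placeholder file in cwd
pkg = quilt3.Package().set_dir("/", "./")             # every entry rooted at cwd
assert pkg.get() == pathlib.Path(".").resolve().as_uri() + "/"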
Example #2
def aggregate_and_push(
    pkg_map,
    source_S3_url="s3://allencell-internal-quilt",
    dest_S3_url="s3://allencell",
    dest_pkg_name="aics/integrated_transcriptomics_structural_organization_hipsc_cm",
    message="Public data set",
    public=False,
):

    internal = boto3.session.Session(profile_name="default")  # noqa: F841

    # real data
    q = quilt3.Package()
    q.set("README.md", "../../data/README.md")
    q.set(
        "resources/Website_schematic_data_flow_20200310_v2.png",
        "../../data/resources/Website_schematic_data_flow_20200310_v2.png",
    )

    for (low_level_pkg_str, new_subdir) in pkg_map.items():
        p = quilt3.Package.browse(low_level_pkg_str, source_S3_url)
        for (logical_key, physical_key) in p.walk():
            q.set(f"{new_subdir}/{logical_key}", physical_key)

    git_commit_hash = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    label = f"{message}. git commit hash of fish_morphology_code = {git_commit_hash}."

    # set profile to public bucket access if pushing public
    if public:
        external = boto3.session.Session(
            profile_name="allencell")  # noqa: F841
    #         external = boto3.session.Session(profile_name="public")  # noqa: F841

    q.push(dest_pkg_name, dest_S3_url, message=label)
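A hypothetical call to aggregate_and_push(); the low-level package names and destination sub-directories below are placeholders, not real datasets:

pkg_map = {
    "aics/low_level_package_a": "raw_images",       # placeholder package names
    "aics/low_level_package_b": "segmentations",
}
aggregate_and_push(pkg_map, message="Public data set", public=False)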
Example #3
    def test_fetch(self):
        """ Verify fetching a package entry. """
        pkg = (Package()
               .set('foo', DATA_DIR / 'foo.txt', {'user_meta': 'blah'})
               .set('bar', DATA_DIR / 'foo.txt', {'user_meta': 'blah'}))
        pkg['foo'].meta['target'] = 'unicode'
        pkg['bar'].meta['target'] = 'unicode'

        with open(DATA_DIR / 'foo.txt') as fd:
            assert fd.read().replace('\n', '') == '123'
        # Copy foo.txt to bar.txt
        pkg['foo'].fetch('data/bar.txt')
        with open('data/bar.txt') as fd:
            assert fd.read().replace('\n', '') == '123'

        # Raise an error if you copy to yourself.
        with pytest.raises(shutil.SameFileError):
            pkg.set('foo',
                    DATA_DIR / 'foo.txt')['foo'].fetch(DATA_DIR / 'foo.txt')

        # The key gets re-rooted correctly.
        pkg = quilt3.Package().set('foo', DATA_DIR / 'foo.txt')
        new_pkg_entry = pkg['foo'].fetch('bar.txt')
        out_abs_path = pathlib.Path("bar.txt").resolve().as_uri()
        assert new_pkg_entry.physical_keys[0] == out_abs_path
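A minimal sketch of the fetch() behaviour exercised above, assuming data/foo.txt exists locally; fetch() copies the entry's bytes to the destination and returns a new PackageEntry rooted at that local path:

import quilt3

pkg = quilt3.Package().set("foo", "data/foo.txt")   # placeholder source file
new_entry = pkg["foo"].fetch("foo_copy.txt")        # copies bytes to ./foo_copy.txt
print(new_entry.physical_keys[0])                   # file:// URI of the copy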
Example #4
def test_push_with_meta_data(meta_arg, meta_data, expected_set_dir_count,
                             expected_push_count, expected_meta,
                             expected_stderr, capsys):
    name = 'test/name'
    pkg = quilt3.Package()

    with tempfile.TemporaryDirectory() as tmp_dir:
        (Path(tmp_dir) / 'foo').touch()
        (Path(tmp_dir) / 'bar').mkdir()
        (Path(tmp_dir) / 'bar' / 'baz').touch()

        with mock.patch('quilt3.Package.__new__', return_value=pkg) as mocked_package_class,\
             mock.patch.object(pkg, 'set_dir', wraps=pkg.set_dir) as mocked_set_dir, \
             mock.patch.object(pkg, 'push') as mocked_push:

            # '--registry' defaults to the configured remote registry, so it is optional here.
            if meta_arg:
                main.main(
                    ('push', '--dir', tmp_dir, name, meta_arg, meta_data))
            else:
                main.main(('push', '--dir', tmp_dir, name))
            mocked_package_class.assert_called_once_with(quilt3.Package)
            assert mocked_set_dir.call_count == expected_set_dir_count
            assert mocked_push.call_count == expected_push_count
            assert pkg.meta == expected_meta
            # check for expected stderr exception message
            captured = capsys.readouterr()
            assert expected_stderr in captured.err
Example #5
def manuscript_plots_dataset(
    test=False,
    col_name_map={},
    dataset_name="manuscript_plots",
    package_owner="rorydm",
    readme_path="README.md",
    s3_bucket="s3://allencell-internal-quilt",
):

    df = collate_plot_dataset()
    df_rna = make_small_rna_df()

    # subsample df for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    p = quilt3.Package()
    p = p.set("README.md", readme_path)
    p = p.set("data.csv", df)
    p = p.set("data_rnaseq.csv", df_rna)

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )

    # upload to quilt
    p.push(
        f"{package_owner}/{dataset_name}",
        s3_bucket,
        message=f"git commit hash of fish_morphology_code = {label}",
    )
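Note that manuscript_plots_dataset above passes a pandas DataFrame directly to set() rather than a file path. A minimal sketch of that pattern, assuming quilt3's object-serialization support, where the format is inferred from the logical key's extension:

import pandas as pd
import quilt3

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
p = quilt3.Package().set("data.csv", df)   # quilt3 serializes the DataFrame for "data.csv"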
Example #6
File: index.py Project: zkan/quilt
def get_pkg(src_registry, data):
    p = quilt3.Package()
    for entry in data['entries']:
        set_entry = p.set_dir if entry['is_dir'] else p.set
        set_entry(entry['logical_key'],
                  str(src_registry.base.join(entry['path'])))
    return p
Example #7
def get_pkg(src_registry, data):
    p = quilt3.Package()
    for entry in data['entries']:
        set_entry = p.set_dir if entry['is_dir'] else p.set
        set_entry(entry['logical_key'],
                  str(src_registry.base.join(entry['path'])))
    calculate_pkg_hashes(user_boto_session, p)
    return p
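Both get_pkg variants above consume the same request shape. A hypothetical data payload, with field names taken from the keys the function reads; the values are placeholders:

data = {
    "entries": [
        {"logical_key": "images/", "path": "raw/images/", "is_dir": True},
        {"logical_key": "README.md", "path": "docs/README.md", "is_dir": False},
    ],
}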
Example #8
    def init(self):
        """ Init a new data package

        Should be used only when creating a new dataset or data form.

        Raises
        ------
        OSError
            when a package with this name already exists.

        """
        os.makedirs(self.local_data_dir, exist_ok=True)
        self.package = quilt3.Package()
        self.package.set_dir('/', self.local_package_root)
Example #9
def generate_new_package_version(package_name, registry_name, push_dest):
    # named_packages is timestamp based - ensure that we get a fresh timestamp
    time.sleep(1)
    fn_info = FunctionReporter(
        f"Creating package '{package_name}' in registry {registry_name}")
    try:
        pkg = quilt3.Package()
        pkg.set(f"test-{uuid.uuid4()}",
                pd.DataFrame(list(range(random.randint(0, 100_000)))))
        pkg.push(package_name, registry=registry_name, dest=push_dest)
        fn_info.succeeded(output=None)
    except Exception as ex:
        fn_info.failed(ex)
Example #10
    def upload_package(self, message=None):
        """ Build and upload package from local directory,
        ignoring all files listed in .quiltignore

        Args:
            message (:obj:`str`): commit message
        """

        # build package, ignoring all files in .quiltignore
        package = quilt3.Package()
        package.set_dir('/', self.path)

        # upload package
        package.push(self.get_full_package_id(), message=message)
Example #11
    def test_import(self):
        with patch('quilt3.Package.browse') as browse_mock, \
            patch('quilt3.imports._list_packages') as list_packages_mock:
            browse_mock.return_value = quilt3.Package()
            list_packages_mock.return_value = ['foo/bar', 'foo/baz']

            from quilt3.data.foo import bar
            assert isinstance(bar, Package)
            browse_mock.assert_has_calls(
                [call('foo/baz', registry=ANY), call('foo/bar', registry=ANY)], any_order=True
            )

            from quilt3.data import foo
            assert hasattr(foo, 'bar') and hasattr(foo, 'baz')
Example #12
    def test_push(self):
        name = 'test/name'
        pkg = quilt3.Package()

        with tempfile.TemporaryDirectory() as tmp_dir:
            (Path(tmp_dir) / 'foo').touch()
            (Path(tmp_dir) / 'bar').mkdir()
            (Path(tmp_dir) / 'bar' / 'baz').touch()

            with mock.patch('quilt3.Package.__new__', return_value=pkg) as mocked_package_class, \
                 mock.patch.object(pkg, 'set_dir', wraps=pkg.set_dir) as mocked_set_dir, \
                 mock.patch.object(pkg, 'push') as mocked_push:
                main.main(('push', '--dir', tmp_dir, name))

                mocked_package_class.assert_called_once_with(quilt3.Package)
                mocked_set_dir.assert_called_once_with('.', tmp_dir, meta=None)
                mocked_push.assert_called_once_with(name,
                                                    registry=None,
                                                    dest=None,
                                                    message=None)
Example #13
    def distribute(
        self,
        push_uri: Optional[str] = None,
        message: Optional[str] = None,
        attach_associates: bool = True,
    ) -> quilt3.Package:
        """
        Push a package to a specific S3 bucket. If no bucket is provided, the un-built, un-pushed package is returned.
        You can push a dataset with the same name to the same bucket multiple times; instead of overriding a prior
        dataset, Quilt simply creates a new dataset version. Please refer to the Quilt documentation for more
        details: https://docs.quiltdata.com

        :param push_uri: The S3 bucket uri to push to. Example: "s3://quilt-jacksonb"
        :param message: An optional message to attach to that version of the dataset.
        :param attach_associates: Boolean option to attach associates as metadata to each file. Associates are used
            to retain quick navigation between related files.
        :return: The built and optionally pushed quilt3.Package.
        """
        # Confirm name matches approved pattern
        # We previously checked during init, but the name could have been changed
        name = self.return_or_raise_approved_name(self.name)

        # Create empty package
        pkg = quilt3.Package()

        # Write any extra files to tempdir to send to the build
        with tempfile.TemporaryDirectory() as tmpdir:
            # Set all referenced files
            text = self.readme.text
            for rf in self.readme.referenced_files:
                replaced = f"referenced_files/{rf.resolved.name}"
                text = text.replace(rf.target, replaced)
                pkg.set(replaced, str(rf.resolved))

            # Write the updated readme to temp
            readme_pk = Path(tmpdir, "README.md")
            with open(readme_pk, "w") as readme_write:
                readme_write.write(text)

            # Set the readme
            pkg.set("README.md", readme_pk)

            # Validate the dataset
            v_ds = validate(self.data)

            # Set package contents
            if len(self.path_columns) > 0:
                fp_cols = self.path_columns
            else:
                fp_cols = v_ds.schema.df.index[
                    v_ds.schema.df["dtype"].str.contains("Path")].tolist()

            # Create associate mappings: List[Dict[str, str]]
            # This list is in index order, meaning that as we walk down the column values we can simply add a
            # new associate to the already existing associate map at that list index.
            associates = []

            # Create metadata reduction map
            # This will be used to clean up and standardize the metadata access after object construction
            # Metadata column name to boolean value for should or should not reduce metadata values
            # This will be used during the "clean up the package metadata step"
            # If we have multiple files each with the same keys for the metadata, but for one reason or another, one
            # packaged file's value for a certain key is a list while another's is a single string, this leads to a
            # confusing mixed return value API for the same _type_ of object. Example:
            # fov/
            #   obj1/
            #      {example_key: "hello"}
            #   obj2/
            #      {example_key: ["hello", "world"]}
            # Commonly this happens when a manifest has rows of unique instances of a child object but retains a
            # reference to a parent object. For example: rows of information about unique cells that were all generated
            # using the same algorithm, whose information is stored in a column on each cell's row.
            # This could result in some files (which only have one cell) being a single string while other files
            # (which have more than one cell) being a list of the same string over and over again.
            # "Why spend all this time to reduce/ collapse the metadata anyway?", besides making it so that users won't
            # have to call `obj2.meta["example_key"][0]` every time they want the value, and besides the fact that it
            # standardizes the metadata api, the biggest reason is that S3 objects can only have 2KB of metadata,
            # without this reduction/ collapse step, manifests are more likely to hit that limit and cause a package
            # distribution error.
            metadata_reduction_map = {
                index_col: True
                for index_col in self.metadata_columns
            }

            # Set all files
            with tqdm(total=len(fp_cols) * len(v_ds.data),
                      desc="Constructing package") as pbar:
                for col in fp_cols:
                    # Check display name for col
                    if col in self.column_names_map:
                        col_label = self.column_names_map[col]
                    else:
                        col_label = col

                    # Update values to the logical key as they are set
                    for i, val in enumerate(v_ds.data[col].values):
                        # Fully resolve the path
                        physical_key = Path(val).expanduser().resolve()

                        # Just using val.name could result in files that shouldn't be grouped being grouped
                        # Example column:
                        # SourceReadpath
                        # a/0.tiff
                        # a/1.tiff
                        # b/0.tiff
                        # b/1.tiff
                        # Even though there are four files, this would result in a/0.tiff and b/0.tiff, and
                        # a/1.tiff and b/1.tiff, being grouped together. To solve this we can prepend the first
                        # couple of characters from a hash of the fully resolved path to the logical key.
                        unique_file_name = file_utils.create_unique_logical_key(
                            physical_key)
                        logical_key = f"{col_label}/{unique_file_name}"
                        if physical_key.is_file():
                            v_ds.data[col].values[i] = logical_key

                            # Create metadata dictionary to attach to object
                            meta = {}
                            for meta_col in self.metadata_columns:
                                # Short reference to current metadata value
                                v = v_ds.data[meta_col].values[i]

                                # Enforce simple JSON serializable type
                                # First check if value is a numpy value
                                # It likely is because pandas relies on numpy
                                # All numpy types have the "dtype" attribute and can be cast to python type by using
                                # the `item` function, details here:
                                # https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.item.html
                                if hasattr(v, "dtype"):
                                    v = v.item()
                                if isinstance(v, JSONSerializableTypes):
                                    meta[meta_col] = [v]
                                else:
                                    raise TypeError(
                                        f"Non-simple-JSON-serializable type found in column: '{meta_col}', "
                                        f"at index: {i}: ({type(v)} '{v}').\n\n "
                                        f"At this time only the following types are allowing in metadata: "
                                        f"{JSONSerializableTypes}")

                            # Check if object already exists
                            if logical_key in pkg:
                                # Join the two meta dictionaries
                                joined_meta = {}
                                for meta_col, curr_v in pkg[
                                        logical_key].meta.items():
                                    # Join the values for the current iteration of the metadata
                                    joined_values = [*curr_v, *meta[meta_col]]

                                    # Only check whether the metadata at this index can be reduced if that is still
                                    # being decided. We know the metadata value at this index is still being decided if:
                                    # the boolean value in the metadata reduction map is True, as in, this index can be
                                    # reduced or collapsed.
                                    # The other reason to make this check is so that we don't override an earlier False
                                    # reduction value. In the case where early on we encounter an instance of the
                                    # metadata that should not be reduced but then later on we say it can be, this check
                                    # prevents that. As we want all metadata access across the dataset to be uniform.
                                    if metadata_reduction_map[meta_col]:
                                        # Update the metadata reduction map
                                        # For the current column being checked, as long as it is still being
                                        # determined that the column can be reduced (aka we have entered this if block)
                                        # check if we can still reduce the metadata after the recent addition.
                                        # "We can reduce the metadata if the count of the first value (or any value) is
                                        # the same as the length of the entire list of values"
                                        # This runs quickly for small lists as seen here:
                                        # https://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
                                        metadata_reduction_map[meta_col] = (
                                            joined_values.count(
                                                joined_values[0]) == len(
                                                    joined_values))

                                    # Attach the joined values to the joined metadata
                                    joined_meta[meta_col] = joined_values

                                # Update meta
                                pkg[logical_key].set_meta(joined_meta)

                            # Object didn't already exist, simply set it
                            else:
                                pkg.set(logical_key, physical_key, meta)

                            # Update associates
                            try:
                                associates[i][col_label] = logical_key
                            except IndexError:
                                associates.append({col_label: logical_key})
                        else:
                            v_ds.data[col].values[i] = logical_key
                            pkg.set_dir(logical_key, physical_key)

                        # Update progress bar
                        pbar.update()

            # Clean up package metadata
            pkg = self._recursive_clean(pkg, metadata_reduction_map)

            # Attach associates if desired
            if attach_associates:
                for i, associate_mapping in tqdm(
                        enumerate(associates),
                        desc="Creating associate metadata blocks"):
                    for col, lk in associate_mapping.items():
                        # Having dictionary expansion in this order means that associates will override a prior
                        # existing `associates` key; this is assumed safe because attach_associates was set to True.
                        pkg[lk].set_meta({
                            **pkg[lk].meta,
                            **{
                                "associates": associate_mapping
                            }
                        })

            # Store validated dataset in the temp dir with paths replaced
            meta_path = Path(tmpdir, "metadata.csv")
            v_ds.data.to_csv(meta_path, index=False)
            pkg.set("metadata.csv", meta_path)

            # Set logical keys for all extra files
            for lk_parent, files_list in self.extra_files.items():
                for f in files_list:
                    pkg.set(f"{lk_parent}/{f.name}", f)

            # Optionally push
            if push_uri:
                pkg = pkg.push(f"{self.package_owner}/{name}",
                               registry=push_uri,
                               message=message)

        return pkg
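The metadata-reduction logic described in the comments above boils down to one check, repeated as values accumulate for a key. A stand-alone sketch of that check:

def can_reduce(values):
    # A metadata key stays collapsible only while every accumulated value is
    # identical; count() == len() is a cheap all-identical test for short lists.
    return values.count(values[0]) == len(values)

assert can_reduce(["hello", "hello"])        # safe to collapse to "hello"
assert not can_reduce(["hello", "world"])    # must remain a list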
Example #14
from .analyze import cluster as _cluster, cluster_spatial as _cluster_spatial
from .harmonize import harmonize as _harmonize
from .util import adjust_inflation, convert_gdf

appname = "geosnap"
appauthor = "geosnap"
data_dir = user_data_dir(appname, appauthor)
if not os.path.exists(data_dir):
    pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)

# look for local storage and create if missing
try:
    from quilt3.data.geosnap_data import storage
except ImportError:
    storage = quilt3.Package()

try:  # if any of these aren't found, stream them instead
    from quilt3.data.census import tracts_cartographic, administrative
except ImportError:
    warn("Unable to locate local census data. Streaming instead.\n"
         "If you plan to use census data repeatedly you can store it locally"
         "with the data.store_census function for better performance")
    try:
        tracts_cartographic = quilt3.Package.browse(
Example #15
def create_package(request):
    json_iterator = map(json.JSONDecoder().decode,
                        (line.decode() for line in request.stream))

    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA),
                         json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise ApiException(
                    HTTPStatus.BAD_REQUEST,
                    f"{entry['physical_key']} is not a valid s3 URL.")
            if physical_key.is_local():
                raise ApiException(HTTPStatus.BAD_REQUEST,
                                   f"{str(physical_key)} is not in S3.")
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        obj_size,
                        {
                            'type': 'SHA256',
                            'value': hash_
                        },
                        meta,
                    ))
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Total size of new S3 files is {size_to_hash}, "
                        f"but max supported size is {PKG_FROM_FOLDER_MAX_PKG_SIZE}"
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise ApiException(
                        HTTPStatus.BAD_REQUEST,
                        f"Package has new S3 {files_to_hash} files, "
                        f"but max supported number is {PKG_FROM_FOLDER_MAX_FILES}"
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )

    except quilt3.util.QuiltException as qe:
        raise ApiException(HTTPStatus.BAD_REQUEST, qe.message)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise ApiException.from_botocore_error(boto_error)

    return make_json_response(200, {
        'top_hash': top_hash,
    })
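For reference, a hypothetical request body for create_package() above: newline-delimited JSON with the package description on the first line and one entry per following line. The bucket and keys are placeholders; the field names mirror those the handler reads:

import json

lines = [
    {"name": "user/pkg", "registry": "s3://example-registry",
     "message": "initial revision", "meta": {"project": "demo"}},
    {"logical_key": "data/file.csv",
     "physical_key": "s3://example-bucket/data/file.csv"},
]
body = b"\n".join(json.dumps(line).encode() for line in lines)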
Example #16
def create_package(req_file):
    json_iterator = map(json.JSONDecoder().decode,
                        (line.decode() for line in req_file))

    data = next(json_iterator)
    get_schema_validator(PACKAGE_CREATE_SCHEMA)(data)
    handle = data['name']
    registry = data['registry']

    try:
        package_registry = get_registry(registry)

        meta = data.get('meta')
        message = data.get('message')
        quilt3.util.validate_package_name(handle)
        pkg = quilt3.Package()
        if meta is not None:
            pkg.set_meta(meta)

        size_to_hash = 0
        files_to_hash = 0
        for entry in map(get_schema_validator(PACKAGE_CREATE_ENTRY_SCHEMA),
                         json_iterator):
            try:
                physical_key = PhysicalKey.from_url(entry['physical_key'])
            except ValueError:
                raise PkgpushException(
                    "InvalidS3PhysicalKey",
                    {"physical_key": entry['physical_key']},
                )
            if physical_key.is_local():
                raise PkgpushException(
                    "InvalidLocalPhysicalKey",
                    {"physical_key": str(physical_key)},
                )
            logical_key = entry['logical_key']

            hash_ = entry.get('hash')
            obj_size = entry.get('size')
            meta = entry.get('meta')

            if hash_ and obj_size is not None:
                pkg.set(
                    logical_key,
                    quilt3.packages.PackageEntry(
                        physical_key,
                        None if obj_size is None else int(obj_size),
                        {
                            'type': 'SHA256',
                            'value': hash_
                        },
                        meta,
                    ))
            else:
                pkg.set(logical_key, str(physical_key), meta)

                size_to_hash += pkg[logical_key].size
                if size_to_hash > PKG_FROM_FOLDER_MAX_PKG_SIZE:
                    raise PkgpushException(
                        "PackageTooLargeToHash",
                        {
                            "size": size_to_hash,
                            "max_size": PKG_FROM_FOLDER_MAX_PKG_SIZE
                        },
                    )

                files_to_hash += 1
                if files_to_hash > PKG_FROM_FOLDER_MAX_FILES:
                    raise PkgpushException(
                        "TooManyFilesToHash",
                        {
                            "num_files": files_to_hash,
                            "max_files": PKG_FROM_FOLDER_MAX_FILES
                        },
                    )

        pkg._validate_with_workflow(
            registry=package_registry,
            workflow=data.get('workflow', ...),
            name=handle,
            message=message,
        )

    except quilt3.util.QuiltException as qe:
        raise PkgpushException.from_quilt_exception(qe)

    calculate_pkg_hashes(user_boto_session, pkg)
    try:
        top_hash = pkg._build(
            name=handle,
            registry=registry,
            message=message,
        )
    except ClientError as boto_error:
        raise PkgpushException.from_boto_error(boto_error)

    # XXX: return mtime?
    return {'top_hash': top_hash}
Example #17
    def push(self, bucket: Optional[str] = None):
        """
        Push the most recently generated data.

        Parameters
        ----------
        bucket: Optional[str]
            Push data to a specific bucket different from the bucket defined
            by your workflow_config.json or the defaulted bucket.

        Notes
        -----
        If your git status isn't clean, or you haven't committed and pushed to
        origin, any attempt to push data will be rejected.
        """
        # Check if manifest is None
        if self.manifest is None:
            raise exceptions.PackagingError(
                "No manifest found to construct package with.")

        # Resolve None bucket
        if bucket is None:
            bucket = self._storage_bucket

        # Get current git branch
        current_branch = self._get_current_git_branch()

        # Normalize branch name
        # This is to stop quilt from making extra directories from names like:
        # feature/some-feature
        current_branch = current_branch.replace("/", ".")

        # Resolve push target
        quilt_loc = f"{self._quilt_package_owner}/{self._quilt_package_name}"
        push_target = f"{quilt_loc}/{current_branch}/{self.step_name}"

        # Check git status is clean
        self._check_git_status_is_clean(push_target)

        # Construct the package
        step_pkg, relative_manifest = quilt_utils.create_package(
            manifest=self.manifest,
            step_pkg_root=self.step_local_staging_dir,
            filepath_columns=self.filepath_columns,
            metadata_columns=self.metadata_columns,
        )

        # Add the relative manifest and generated README to the package
        with TemporaryDirectory() as tempdir:
            # Store the relative manifest in a temporary directory
            m_path = Path(tempdir) / "manifest.parquet"
            relative_manifest.to_parquet(m_path)
            step_pkg.set("manifest.parquet", m_path)

            # Add the params files to the package
            for param_file in ["run_parameters.json", "init_parameters.json"]:
                param_file_path = self.step_local_staging_dir / param_file
                step_pkg.set(param_file, param_file_path)

            # Generate README
            readme_path = Path(tempdir) / "README.md"
            with open(readme_path, "w") as write_readme:
                write_readme.write(
                    constants.README_TEMPLATE.render(
                        quilt_package_name=self._quilt_package_name,
                        source_url=self._get_git_origin_url(),
                        branch_name=self._get_current_git_branch(),
                        commit_hash=self._get_current_git_commit_hash(),
                        creator=getpass.getuser(),
                    ))
            step_pkg.set("README.md", readme_path)

            # Browse top level project package and add / overwrite to it in step dir
            try:
                project_pkg = quilt3.Package.browse(quilt_loc,
                                                    self._storage_bucket)
            except botocore.errorfactory.ClientError:
                log.info(f"Could not find existing package: {quilt_loc} "
                         f"in bucket: {self._storage_bucket}. "
                         f"Creating a new package.")
                project_pkg = quilt3.Package()

            # Regardless of whether we found a prior version of the package or are starting from
            # a new package, we "merge" them together to place this step's data in the
            # correct location.

            # Remove the current step if it exists in the previous project package
            if current_branch in project_pkg.keys():
                if self.step_name in project_pkg[current_branch].keys():
                    project_pkg = project_pkg.delete(
                        f"{current_branch}/{self.step_name}")

            # Merge packages
            for (logical_key, pkg_entry) in step_pkg.walk():
                project_pkg.set(
                    f"{current_branch}/{self.step_name}/{logical_key}",
                    pkg_entry)

            # Push the data
            project_pkg.push(
                quilt_loc,
                registry=self._storage_bucket,
                message=self._create_data_commit_message(),
            )
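The "merge" step above is just walk() plus set() under a prefix. A minimal stand-alone sketch with placeholder branch/step names, and a placeholder file so set() can resolve it:

import pathlib
import quilt3

pathlib.Path("manifest.parquet").touch()                        # placeholder file
step_pkg = quilt3.Package().set("manifest.parquet", "manifest.parquet")
project_pkg = quilt3.Package()
for logical_key, pkg_entry in step_pkg.walk():
    project_pkg.set(f"main/mystep/{logical_key}", pkg_entry)    # prefix = branch/step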