def with_dataset(self, name=None): """Yield an editable metadata object for a dataset.""" from renku.models._locals import with_reference from renku.models.refs import LinkReference dataset = self.load_dataset(name=name) if dataset is None: identifier = str(uuid.uuid4()) path = (self.renku_datasets_path / identifier / self.METADATA) path.parent.mkdir(parents=True, exist_ok=True) with with_reference(path): dataset = Dataset( identifier=identifier, name=name, client=self ) if name: LinkReference.create(client=self, name='datasets/' + name).set_reference(path) dataset_path = self.path / self.datadir / dataset.name dataset_path.mkdir(parents=True, exist_ok=True) yield dataset # TODO # if path is None: # path = dataset_path / self.METADATA # if path.exists(): # raise ValueError('Dataset already exists') dataset.to_yaml()
def datasets(ctx, client): """Migrate dataset metadata.""" from renku.models._jsonld import asjsonld from renku.models.datasets import Dataset from renku.models.refs import LinkReference from ._checks.location_datasets import _dataset_metadata_pre_0_3_4 for old_path in _dataset_metadata_pre_0_3_4(client): dataset = Dataset.from_yaml(old_path, client=client) name = str(old_path.parent.relative_to(client.path / 'data')) new_path = (client.renku_datasets_path / dataset.uid / client.METADATA) new_path.parent.mkdir(parents=True, exist_ok=True) dataset = dataset.rename_files(lambda key: os.path.relpath( str(old_path.parent / key), start=str(new_path.parent))) with new_path.open('w') as fp: yaml.dump(asjsonld(dataset), fp, default_flow_style=False) old_path.unlink() LinkReference.create(client=client, name='datasets/' + name).set_reference(new_path)
def as_dataset(self, client): """Deserialize `ZenodoRecordSerializer` to `Dataset`.""" files = self.get_files() metadata = self.get_jsonld() dataset = Dataset.from_jsonld(metadata, client=client) serialized_files = [] for file_ in files: remote_ = file_.remote_url dataset_file = DatasetFile( url=remote_, id=file_.id, checksum=file_.checksum, filename=file_.filename, filesize=file_.filesize, filetype=file_.type, dataset=dataset.name, path='', ) serialized_files.append(dataset_file) dataset.files = serialized_files if isinstance(dataset.url, dict) and '_id' in dataset.url: dataset.url = urllib.parse.urlparse(dataset.url.pop('_id')) dataset.url = dataset.url.geturl() return dataset
def datasets(self): """Return mapping from path to dataset.""" result = {} for path in self.renku_datasets_path.rglob(self.METADATA): with path.open('r') as fp: result[path] = Dataset.from_jsonld(yaml.load(fp)) return result
def with_dataset(self, name=None): """Yield an editable metadata object for a dataset.""" from renku.models.refs import LinkReference with self.lock: path = None dataset = None if name: path = self.renku_datasets_path / name / self.METADATA if not path.exists(): path = LinkReference(client=self, name='datasets/' + name).reference if path.exists(): with path.open('r') as f: source = yaml.load(f) or {} dataset = Dataset.from_jsonld(source) if dataset is None: source = {} dataset = Dataset(name=name) path = (self.renku_datasets_path / dataset.identifier.hex / self.METADATA) path.parent.mkdir(parents=True, exist_ok=True) if name: LinkReference.create(client=self, name='datasets/' + name).set_reference(path) dataset_path = self.path / self.datadir / dataset.name dataset_path.mkdir(parents=True, exist_ok=True) yield dataset source.update(**asjsonld(dataset)) # TODO # if path is None: # path = dataset_path / self.METADATA # if path.exists(): # raise ValueError('Dataset already exists') with path.open('w') as f: yaml.dump(source, f, default_flow_style=False)
def with_dataset(self, name=None): """Yield an editable metadata object for a dataset.""" with self.lock: from renku.models._jsonld import asjsonld from renku.models.datasets import Dataset path = None dataset = None dataset_path = self.path / self.datadir / name if name: path = dataset_path / self.METADATA if path.exists(): with open(path, 'r') as f: source = yaml.load(f) or {} dataset = Dataset.from_jsonld(source) if dataset is None: source = {} dataset = Dataset(name=name) try: dataset_path.mkdir(parents=True, exist_ok=True) except FileExistsError: raise FileExistsError('This dataset already exists.') yield dataset source.update( **asjsonld( dataset, filter=lambda attr, _: attr.name != 'datadir', ) ) # TODO # if path is None: # path = dataset_path / self.METADATA # if path.exists(): # raise ValueError('Dataset already exists') with open(path, 'w') as f: yaml.dump(source, f, default_flow_style=False)
def edit(client, id): """Edit dataset metadata.""" dataset_ = client.load_dataset(id) if dataset_: metadata_edited = editor.edit(contents=bytes( yaml.safe_dump(dataset_.editable), encoding='utf-8')) edited = yaml.safe_load(metadata_edited) updated_ = Dataset(client=client, **edited) dataset_.update_metadata(updated_) dataset_.to_yaml()
def dataset_serialization(client, dataset, data_file):
    """Test deserializing a dataset object."""
    with open(dataset.path / 'metadata.yml', 'r') as f:
        source = yaml.load(f)

    d = Dataset.from_jsonld(source)
    assert d.path == dataset.path

    d_dict = d.to_dict()

    assert all([key in d_dict for key in ('name', 'identifier', 'files')])
    assert not len(d_dict['files'].values())

    client.add_data_to_dataset(d, str(data_file))
    d_dict = d.to_dict()

    assert len(d_dict['files'].values())

def datasets_from_commit(self, commit=None):
    """Return datasets defined in a commit."""
    commit = commit or self.repo.head.commit

    try:
        datasets = commit.tree / self.renku_home / self.DATASETS
    except KeyError:
        return

    for tree in datasets:
        try:
            blob = tree / self.METADATA
        except KeyError:
            continue
        dataset = Dataset.from_yaml(self.path / Path(blob.path), client=self)
        dataset.commit = commit
        yield dataset

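# Usage sketch for datasets_from_commit() above. Assumptions: `client.repo` is
# the GitPython Repo used by the method, and the dataset metadata lives under
# .renku/datasets as the tree lookup above suggests.
for commit in client.repo.iter_commits(paths='.renku/datasets'):
    for dataset in client.datasets_from_commit(commit):
        print(commit.hexsha[:7], dataset.name)
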
def as_dataset(self, client): """Deserialize `DataverseRecordSerializer` to `Dataset`.""" files = self.get_files() dataset = Dataset.from_jsonld(self._json, client=client) serialized_files = [] for file_ in files: remote_ = file_.remote_url dataset_file = DatasetFile( url=remote_, id=file_._id if file_._id else file_.name, filename=file_.name, filesize=file_.content_size, filetype=file_.file_format, dataset=dataset.name, path='', ) serialized_files.append(dataset_file) dataset.files = serialized_files return dataset
def get_dataset(self, path, commit=None):
    """Return a dataset from a given path."""
    if not path.is_absolute():
        path = self.path / path
    return Dataset.from_yaml(path, client=self, commit=commit)

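# Usage sketch for get_dataset() above (the identifier segment is a
# placeholder; relative paths are resolved against client.path as shown in
# the method, and the metadata.yml filename matches the test above).
from pathlib import Path

metadata = Path('.renku/datasets') / '<identifier>' / 'metadata.yml'
dataset = client.get_dataset(metadata)
print(dataset.name, len(dataset.files))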