def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project.

    :param client: The renku client used to access the local repository.
    :param uri: URI or DOI of the dataset to import.
    :param short_name: Optional local name for the imported dataset; a
        default is generated for non-git providers when empty.
    :param extract: Whether to extract downloaded archives.
    :param with_prompt: Show a file listing and ask for confirmation.
    :param yes: Skip the confirmation prompt even when ``with_prompt`` is set.
    :param commit_message: Unused here; accepted for interface compatibility.
    :param progress: Optional progress callback forwarded to the download.
    :raises ParameterError: If the URI cannot be resolved, metadata cannot
        be fetched, or the dataset has no files.
    """
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

            # Sizes are reported in MB by the provider; convert the sum to
            # bytes once at the end.  NOTE(review): total_size stays 0 when
            # the prompt is skipped — confirm downstream treats 0 as unknown.
            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb
            total_size *= 2**20
    except KeyError as e:
        raise ParameterError((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))
    except LookupError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Reason: {1}'.format(uri, str(e)))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    # Record the import origin without leaking embedded credentials.
    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin(
                    'https://doi.org', dataset.identifier
                )
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Sanitize the version into a safe tag name.  The dash must be
            # last in the character class so it is a literal; the original
            # '.-_' was a range (0x2E-0x5F) that wrongly spared characters
            # such as '/', ':' and '@' from replacement.
            tag_name = re.sub('[^a-zA-Z0-9._-]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        # Git-based providers keep the remote dataset's own short name
        # unless the caller supplied one.
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
def test_dataset_provider_resolution_dataverse(doi_responses, uri):
    """Verify a Dataverse URI is resolved to a ``DataverseProvider``."""
    resolved_provider, _warning = ProviderFactory.from_uri(uri)
    assert type(resolved_provider) is DataverseProvider
def export_dataset(
    client,
    short_name,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
    dataverse_server_url=None,
    dataverse_name=None,
):
    """Export data to 3rd party provider.

    ``provider`` arrives as a provider id string and is replaced below by
    the resolved provider object.  ``handle_access_token_fn`` and
    ``handle_tag_selection_fn`` are UI callbacks used when no stored token
    exists or when the user must choose among dataset tags.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
        ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    # Config key under which the provider's access token is cached.
    config_key_secret = 'access_token'
    provider_id = provider.lower()

    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise DatasetNotFound(name=short_name)

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ParameterError('Unknown provider.')

    # Provider-specific settings (only meaningful for dataverse here).
    provider.set_parameters(
        client,
        dataverse_server_url=dataverse_server_url,
        dataverse_name=dataverse_name
    )

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        # Explicit tag requested: it must exist on the dataset.
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)
        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))
        selected_commit = selected_tag.commit
    elif dataset_.tags and len(dataset_.tags) > 0 and handle_tag_selection_fn:
        # No tag given: let the caller-provided UI pick one (optional).
        tag_result = handle_tag_selection_fn(dataset_.tags)
        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

    # If the tag is created automatically for imported datasets, it
    # does not have the dataset yet and we need to use the next commit
    with client.with_commit(selected_commit):
        test_ds = client.load_dataset(short_name)
        if not test_ds:
            # Walk the dataset's commits (newest first) and move one commit
            # forward from the selected one.
            # NOTE(review): ``selected_commit`` may still be a commit object
            # from ``repo.head.commit`` while ``commit.hexsha`` is a string —
            # confirm tags store hexsha strings so this comparison can match.
            commits = client.dataset_commits(dataset_)
            next_commit = selected_commit
            for commit in commits:
                if commit.hexsha == selected_commit:
                    selected_commit = next_commit.hexsha
                    break
                next_commit = commit

    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(short_name)
        if not dataset_:
            raise DatasetNotFound(name=short_name)

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            # No cached token: ask the UI callback, validate, then persist
            # globally and hand it to the exporter.
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if access_token is None or len(access_token) == 0:
                raise InvalidAccessToken()

            client.set_value(
                provider_id, config_key_secret, access_token, global_only=True
            )
            exporter.set_access_token(access_token)

        try:
            destination = exporter.export(publish=publish, tag=selected_tag)
        except errors.AuthenticationError:
            # Stored token was rejected; drop it so the next run re-prompts.
            client.remove_value(
                provider_id, config_key_secret, global_only=True
            )
            raise

    result = 'Exported to: {0}'.format(destination)
    return result
def test_dataset_provider_resolution_zenodo(doi_responses, uri):
    """Ensure a Zenodo URI is resolved to a ``ZenodoProvider``."""
    resolution = ProviderFactory.from_uri(uri)
    assert type(resolution[0]) is ZenodoProvider
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project.

    :param client: The renku client used to access the local repository.
    :param uri: URI or DOI of the dataset to import.
    :param name: Optional local dataset name; a default is generated for
        non-git providers when empty.
    :param extract: Whether to extract downloaded archives.
    :param with_prompt: Show a file listing and ask for confirmation.
    :param yes: Skip the confirmation prompt even when ``with_prompt`` is set.
    :param commit_message: Unused here; accepted for interface compatibility.
    :param progress: Optional progress callback forwarded to the download.
    :raises ParameterError: If the URI cannot be resolved, metadata cannot
        be fetched, or the dataset has no files.
    :raises OperationError: If a git-based dataset lacks a data directory.
    """
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)

    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

            # Sizes are reported in MB; convert the sum to bytes once.
            # NOTE(review): total_size stays 0 when the prompt is skipped —
            # confirm downstream treats 0 as unknown.
            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb
            total_size *= 2**20
    except KeyError as e:
        raise ParameterError(
            ("Could not process {0}.\n"
             "Unable to fetch metadata due to {1}".format(uri, e)))
    except LookupError as e:
        raise ParameterError(("Could not process {0}.\n"
                              "Reason: {1}".format(uri, str(e))))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    # Record the import origin without leaking embedded credentials.
    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Sanitize the version into a safe tag name.  The dash must be
            # last in the character class so it is a literal; the original
            # '.-_' was a range (0x2E-0x5F) that wrongly spared characters
            # such as '/', ':' and '@' from replacement.
            tag_name = re.sub("[^a-zA-Z0-9._-]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        # Pull everything under the dataset's data directory, plus any
        # files tracked outside of it.
        sources = [f"{dataset.data_dir}/**"]

        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:  # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider.

    :param client: The renku client used to access the local repository.
    :param uri: URI or DOI of the dataset to import.
    :param short_name: Optional local dataset name; a default is generated
        when empty.
    :param extract: Whether to extract downloaded archives.
    :param with_prompt: Show a file listing and ask for confirmation.
    :param commit_message: Unused here; accepted for interface compatibility.
    :param progress: Optional progress callback forwarded to the download.
    :raises ParameterError: If the URI cannot be processed or metadata
        cannot be fetched.
    """
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset = record.as_dataset(client)
        files = dataset.files

        if with_prompt:
            click.echo(
                tabulate(files, headers=OrderedDict((
                    ('checksum', None),
                    ('filename', 'name'),
                    ('size_in_mb', 'size (mb)'),
                    ('filetype', 'type'),
                ))))

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')) + text_prompt

            click.confirm(text_prompt, abort=True)
    except KeyError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Unable to fetch metadata due to {1}'.format(uri, e)))
    except LookupError:
        raise ParameterError(('Could not process {0}.\n'
                              'URI not found.'.format(uri)))

    if files:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        # Strip embedded credentials before persisting the origin URL.
        dataset.url = remove_credentials(dataset.url)

        add_to_dataset(
            client,
            urls=[f.url for f in files],
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            progress=progress,
        )

        if dataset.version:
            # Sanitize the version into a safe tag name.  The dash must be
            # last in the character class so it is a literal; the original
            # '.-_' was a range (0x2E-0x5F) that wrongly spared characters
            # such as '/', ':' and '@' from replacement.
            tag_name = re.sub('[^a-zA-Z0-9._-]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version))
def export_dataset( client, id, provider, publish, tag, handle_access_token_fn=None, handle_tag_selection_fn=None, commit_message=None, ): """Export data to 3rd party provider. :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``, ``DatasetNotFound`` """ # TODO: all these callbacks are ugly, improve in #737 config_key_secret = 'access_token' provider_id = provider dataset_ = client.load_dataset(id) if not dataset_: raise DatasetNotFound() try: provider = ProviderFactory.from_id(provider_id) except KeyError: raise ValueError('Unknown provider.') selected_tag = None selected_commit = client.repo.head.commit if tag: selected_tag = next((t for t in dataset_.tags if t.name == tag), None) if not selected_tag: raise ValueError('Tag {} not found'.format(tag)) selected_commit = selected_tag.commit elif dataset_.tags and len(dataset_.tags) > 0 and handle_tag_selection_fn: tag_result = handle_tag_selection_fn(dataset_.tags) if tag_result: selected_tag = tag_result selected_commit = tag_result.commit with client.with_commit(selected_commit): dataset_ = client.load_dataset(id) if not dataset_: raise DatasetNotFound() access_token = client.get_value(provider_id, config_key_secret) exporter = provider.get_exporter(dataset_, access_token=access_token) if access_token is None: if handle_access_token_fn: access_token = handle_access_token_fn(exporter) if access_token is None or len(access_token) == 0: raise InvalidAccessToken() client.set_value(provider_id, config_key_secret, access_token, global_only=True) exporter.set_access_token(access_token) try: destination = exporter.export(publish, selected_tag) except HTTPError as e: if 'unauthorized' in str(e): client.remove_value(provider_id, config_key_secret, global_only=True) raise result = 'Exported to: {0}'.format(destination) return result
def export_dataset(
    client,
    short_name,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
    dataverse_server_url=None,
    dataverse_name=None,
):
    """Export data to 3rd party provider.

    ``provider`` arrives as a provider id string and is replaced below by
    the resolved provider object.  ``dataverse_server_url`` and
    ``dataverse_name`` are only consulted when exporting to dataverse.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
        ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    # Config key under which the provider's access token is cached.
    config_key_secret = 'access_token'
    provider_id = provider.lower()

    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise DatasetNotFound()

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ValueError('Unknown provider.')

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        # Explicit tag requested: it must exist on the dataset.
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)
        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))
        selected_commit = selected_tag.commit
    elif dataset_.tags and len(dataset_.tags) > 0 and handle_tag_selection_fn:
        # No tag given: let the caller-provided UI pick one (optional).
        tag_result = handle_tag_selection_fn(dataset_.tags)
        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

    # Export the dataset as it existed at the selected commit.
    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(short_name)
        if not dataset_:
            raise DatasetNotFound()

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            # No cached token: ask the UI callback, validate, then persist
            # globally and hand it to the exporter.
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if access_token is None or len(access_token) == 0:
                raise InvalidAccessToken()

            client.set_value(
                provider_id, config_key_secret, access_token,
                global_only=True
            )
            exporter.set_access_token(access_token)

        if provider_id == 'dataverse':
            # Dataverse needs a target dataverse name and server URL; the
            # URL is cached in config so later exports can omit it.
            if not dataverse_name:
                raise errors.ParameterError('Dataverse name is required.')

            CONFIG_BASE_URL = 'server_url'

            if not dataverse_server_url:
                dataverse_server_url = client.get_value(
                    provider_id, CONFIG_BASE_URL
                )
            else:
                client.set_value(
                    provider_id, CONFIG_BASE_URL, dataverse_server_url,
                    global_only=True
                )

        try:
            # NOTE(review): server_url/dataverse_name are passed for every
            # provider — confirm non-dataverse exporters accept (and
            # ignore) these keyword arguments.
            destination = exporter.export(
                publish=publish,
                tag=selected_tag,
                server_url=dataverse_server_url,
                dataverse_name=dataverse_name
            )
        except errors.AuthenticationError:
            # Stored token was rejected; drop it so the next run re-prompts.
            client.remove_value(
                provider_id, config_key_secret, global_only=True
            )
            raise

    result = 'Exported to: {0}'.format(destination)
    return result