Example #1
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        )

    except LookupError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Reason: {1}'.format(uri, str(e))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            dataset.same_as = Url(
                url_str=urllib.parse.urljoin(
                    'https://doi.org', dataset.identifier
                )
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub('[^a-zA-Z0-9._-]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
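
The ``zip(*pairs)`` call above transposes the list of (url, filename) pairs into the parallel ``urls``/``names`` sequences that ``_add_to_dataset`` expects. A minimal self-contained sketch of the idiom, with ``FileStub`` as a hypothetical stand-in for the provider's file records:

from collections import namedtuple

# Hypothetical stand-in for the file records a provider returns.
FileStub = namedtuple('FileStub', ['url', 'filename'])

files = [
    FileStub('https://example.org/a.csv', 'a.csv'),
    FileStub('https://example.org/b.csv', 'b.csv'),
]

# zip(*pairs) transposes a list of 2-tuples into two parallel tuples.
urls, names = zip(*[(f.url, f.filename) for f in files])

assert urls == ('https://example.org/a.csv', 'https://example.org/b.csv')
assert names == ('a.csv', 'b.csv')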
Example #2
def test_dataset_provider_resolution_dataverse(doi_responses, uri):
    """Check that dataverse URIs resolve to ``DataverseProvider``."""
    provider, _ = ProviderFactory.from_uri(uri)
    assert type(provider) is DataverseProvider
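
The ``uri`` argument is a fixture whose definition is not part of this listing; a hedged sketch of how it might be parametrized so the test runs once per URI (the DOI and URL below are purely illustrative):

import pytest

@pytest.fixture(params=[
    'doi:10.7910/DVN/EXAMPLE',  # hypothetical Dataverse DOI
    'https://dataverse.example.org/dataset.xhtml'
    '?persistentId=doi:10.7910/DVN/EXAMPLE',
])
def uri(request):
    """Yield one Dataverse-style URI per parametrized run."""
    return request.param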
Example #3
def export_dataset(
    client,
    short_name,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
    dataverse_server_url=None,
    dataverse_name=None,
):
    """Export data to 3rd party provider.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
             ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    config_key_secret = 'access_token'
    provider_id = provider.lower()

    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise DatasetNotFound(name=short_name)

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ParameterError('Unknown provider.')

    provider.set_parameters(
        client,
        dataverse_server_url=dataverse_server_url,
        dataverse_name=dataverse_name
    )

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)

        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))

        selected_commit = selected_tag.commit
    elif dataset_.tags and handle_tag_selection_fn:
        tag_result = handle_tag_selection_fn(dataset_.tags)

        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

            # Tags created automatically for imported datasets may point at a
            # commit that precedes the dataset metadata; use the next commit.
            with client.with_commit(selected_commit):
                test_ds = client.load_dataset(short_name)
            if not test_ds:
                commits = client.dataset_commits(dataset_)
                next_commit = selected_commit
                for commit in commits:
                    if commit.hexsha == selected_commit:
                        selected_commit = next_commit.hexsha
                        break
                    next_commit = commit

    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(short_name)
        if not dataset_:
            raise DatasetNotFound(name=short_name)

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if not access_token:
                raise InvalidAccessToken()

            client.set_value(
                provider_id, config_key_secret, access_token, global_only=True
            )
            exporter.set_access_token(access_token)

        try:
            destination = exporter.export(publish=publish, tag=selected_tag)
        except errors.AuthenticationError:
            client.remove_value(
                provider_id, config_key_secret, global_only=True
            )
            raise

    result = 'Exported to: {0}'.format(destination)
    return result
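
The two callbacks keep UI concerns out of the export logic. A hedged sketch of what a CLI caller might pass; the ``click`` prompts and the ``exporter.name`` attribute are assumptions, not part of this listing:

import click

def handle_access_token_fn(exporter):
    """Prompt for a provider access token (hypothetical CLI callback)."""
    # ``exporter.name`` is assumed; any human-readable label would do.
    return click.prompt(
        'Access token for {0} not found.\nToken'.format(exporter.name),
        hide_input=True,
    )

def handle_tag_selection_fn(tags):
    """Let the user pick one of the dataset's tags (hypothetical callback)."""
    options = '\n'.join(
        '{0}: {1}'.format(i, t.name) for i, t in enumerate(tags, 1)
    )
    selection = click.prompt(
        'Which tag would you like to export?\n' + options, type=int
    )
    return tags[selection - 1]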
Example #4
def test_dataset_provider_resolution_zenodo(doi_responses, uri):
    """Check that zenodo uris resolve to ZenodoProvider."""
    provider, _ = ProviderFactory.from_uri(uri)
    assert type(provider) is ZenodoProvider
Example #5
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project."""
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            total_size *= 2**20

    except KeyError as e:
        raise ParameterError(
            "Could not process {0}.\n"
            "Unable to fetch metadata due to {1}".format(uri, e))

    except LookupError as e:
        raise ParameterError(
            "Could not process {0}.\n"
            "Reason: {1}".format(uri, str(e)))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            tag_name = re.sub("[^a-zA-Z0-9.-_]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        sources = [f"{dataset.data_dir}/**"]
        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:  # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )
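
The loop above uses ``Path.relative_to`` as a membership test: it raises ``ValueError`` for paths outside the data directory, and only those paths are added as extra sources. A self-contained sketch with illustrative paths:

from pathlib import Path

data_dir = 'data/my-dataset'
file_paths = ['data/my-dataset/raw.csv', 'notebooks/analysis.ipynb']

sources = ['{0}/**'.format(data_dir)]  # everything under the data directory
for path in file_paths:
    try:
        Path(path).relative_to(data_dir)
    except ValueError:
        # The path lies outside data_dir, so list it explicitly.
        sources.append(path)

assert sources == ['data/my-dataset/**', 'notebooks/analysis.ipynb']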
Example #6
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider."""
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri)
        dataset = record.as_dataset(client)
        files = dataset.files

        if with_prompt:
            click.echo(
                tabulate(files,
                         headers=OrderedDict((
                             ('checksum', None),
                             ('filename', 'name'),
                             ('size_in_mb', 'size (mb)'),
                             ('filetype', 'type'),
                         ))))

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')) + text_prompt

            click.confirm(text_prompt, abort=True)

    except KeyError as e:
        raise ParameterError(
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e))

    except LookupError:
        raise ParameterError(
            'Could not process {0}.\n'
            'URI not found.'.format(uri))

    if files:
        if not short_name:
            short_name = generate_default_short_name(dataset.name,
                                                     dataset.version)

        dataset.url = remove_credentials(dataset.url)

        add_to_dataset(
            client,
            urls=[f.url for f in files],
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            progress=progress,
        )

        if dataset.version:
        tag_name = re.sub('[^a-zA-Z0-9._-]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version))
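
The tag-name character class is easy to get wrong (and is corrected above to put the dash last): inside ``[...]`` an unescaped dash between two characters forms a range, not a literal dash. A quick self-contained check of both behaviors:

import re

# With the dash in the middle, '.-_' is a range from '.' to '_' that also
# matches '/', ':', digits, and uppercase letters.
assert re.findall('[.-_]', 'a:b/c') == [':', '/']

# With the dash last it is literal: keep alphanumerics, '.', '_', and '-'.
assert re.sub('[^a-zA-Z0-9._-]', '_', '1.2/3:beta') == '1.2_3_beta'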
Example #7
def export_dataset(
    client,
    id,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
):
    """Export data to 3rd party provider.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
             ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    config_key_secret = 'access_token'
    provider_id = provider

    dataset_ = client.load_dataset(id)
    if not dataset_:
        raise DatasetNotFound()

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ValueError('Unknown provider.')

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)

        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))

        selected_commit = selected_tag.commit
    elif dataset_.tags and handle_tag_selection_fn:
        tag_result = handle_tag_selection_fn(dataset_.tags)

        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(id)
        if not dataset_:
            raise DatasetNotFound()

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if not access_token:
                raise InvalidAccessToken()

            client.set_value(provider_id,
                             config_key_secret,
                             access_token,
                             global_only=True)
            exporter.set_access_token(access_token)

        try:
            destination = exporter.export(publish, selected_tag)
        except HTTPError as e:
            if 'unauthorized' in str(e):
                client.remove_value(provider_id,
                                    config_key_secret,
                                    global_only=True)

            raise

    result = 'Exported to: {0}'.format(destination)
    return result
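
The tag lookup above uses ``next`` with a default, which yields ``None`` instead of raising ``StopIteration`` when no tag matches. A minimal stand-alone illustration; ``Tag`` is a hypothetical stand-in for the dataset tag objects:

from collections import namedtuple

Tag = namedtuple('Tag', ['name', 'commit'])
tags = [Tag('v1', 'abc123'), Tag('v2', 'def456')]

# First tag whose name matches, or None if nothing matches.
selected_tag = next((t for t in tags if t.name == 'v2'), None)
assert selected_tag == Tag('v2', 'def456')

missing = next((t for t in tags if t.name == 'v3'), None)
assert missing is None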
Example #8
def export_dataset(
    client,
    short_name,
    provider,
    publish,
    tag,
    handle_access_token_fn=None,
    handle_tag_selection_fn=None,
    commit_message=None,
    dataverse_server_url=None,
    dataverse_name=None,
):
    """Export data to 3rd party provider.

    :raises: ``ValueError``, ``HTTPError``, ``InvalidAccessToken``,
             ``DatasetNotFound``
    """
    # TODO: all these callbacks are ugly, improve in #737
    config_key_secret = 'access_token'
    provider_id = provider.lower()

    dataset_ = client.load_dataset(short_name)
    if not dataset_:
        raise DatasetNotFound()

    try:
        provider = ProviderFactory.from_id(provider_id)
    except KeyError:
        raise ValueError('Unknown provider.')

    selected_tag = None
    selected_commit = client.repo.head.commit

    if tag:
        selected_tag = next((t for t in dataset_.tags if t.name == tag), None)

        if not selected_tag:
            raise ValueError('Tag {} not found'.format(tag))

        selected_commit = selected_tag.commit
    elif dataset_.tags and handle_tag_selection_fn:
        tag_result = handle_tag_selection_fn(dataset_.tags)

        if tag_result:
            selected_tag = tag_result
            selected_commit = tag_result.commit

    with client.with_commit(selected_commit):
        dataset_ = client.load_dataset(short_name)
        if not dataset_:
            raise DatasetNotFound()

        access_token = client.get_value(provider_id, config_key_secret)
        exporter = provider.get_exporter(dataset_, access_token=access_token)

        if access_token is None:
            if handle_access_token_fn:
                access_token = handle_access_token_fn(exporter)

            if not access_token:
                raise InvalidAccessToken()

            client.set_value(provider_id,
                             config_key_secret,
                             access_token,
                             global_only=True)
            exporter.set_access_token(access_token)

        if provider_id == 'dataverse':
            if not dataverse_name:
                raise errors.ParameterError('Dataverse name is required.')

            CONFIG_BASE_URL = 'server_url'

            if not dataverse_server_url:
                dataverse_server_url = client.get_value(
                    provider_id, CONFIG_BASE_URL)
            else:
                client.set_value(provider_id,
                                 CONFIG_BASE_URL,
                                 dataverse_server_url,
                                 global_only=True)

        try:
            destination = exporter.export(publish=publish,
                                          tag=selected_tag,
                                          server_url=dataverse_server_url,
                                          dataverse_name=dataverse_name)
        except errors.AuthenticationError:
            client.remove_value(provider_id,
                                config_key_secret,
                                global_only=True)
            raise

    result = 'Exported to: {0}'.format(destination)
    return result
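
The Dataverse server-URL handling above is a read-through/write-back pattern: an explicit argument wins and is persisted for later runs, otherwise the stored value is used. A hedged sketch with a plain dict standing in for the client's persisted configuration:

config = {}  # stand-in for the client's persisted configuration store

def resolve_server_url(explicit_url, provider_id='dataverse'):
    """Return the server URL, persisting an explicit one for next time."""
    key = (provider_id, 'server_url')
    if explicit_url:
        config[key] = explicit_url  # write-back: remember the explicit choice
        return explicit_url
    return config.get(key)  # read-through: fall back to the stored value

assert resolve_server_url('https://demo.dataverse.org') == \
    'https://demo.dataverse.org'
assert resolve_server_url(None) == 'https://demo.dataverse.org'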