Example #1
0
    def from_uri(uri):
        """Get provider type based on uri.

        Returns a ``(provider, warning)`` tuple; ``provider`` is ``None``
        together with an error message when ``uri`` cannot be handled.
        """
        is_doi_ = is_doi(uri)
        if is_doi_ is None:
            url = urlparse(uri)
            # Non-DOI uris must at least look like a well-formed URL.
            if not (url.scheme and url.netloc and url.params == ''):
                return None, 'Cannot parse URL.'

        provider = None
        warning = ''

        for _, potential_provider in ProviderFactory.PROVIDERS.items():
            try:
                if potential_provider.supports(uri):
                    provider = potential_provider
                    break
            # Catch only regular errors; catching ``BaseException`` would
            # also swallow KeyboardInterrupt/SystemExit.
            except Exception as e:
                warning += 'Couldn\'t test provider {prov}: {err}\n'.format(
                    prov=potential_provider, err=e)

        supported_providers = ', '.join(ProviderFactory.PROVIDERS.keys())

        if is_doi_ and provider is None:
            return None, (
                warning + 'Provider {} not found. '.format(
                    uri.split('/')[1].split('.')[0]  # Get DOI provider name.
                ) + 'Currently supporting following providers: {}'.format(
                    supported_providers))
        elif provider is None:
            return None, (warning + 'Provider not found for {}. '.format(uri) +
                          'Currently supporting following providers: {}'.
                          format(supported_providers))
        else:
            return provider(is_doi=is_doi_), warning
Example #2
0
def _migrate_doi_identifier(data, client):
    """If the dataset _id is doi, make it a UUID."""
    from renku.core.utils.doi import is_doi
    from renku.core.utils.uuid import is_uuid

    dataset_id = data.get('_id', '')

    # Nothing to do when the id is already a UUID.
    if is_uuid(dataset_id):
        return data

    if not is_uuid(data.get('identifier', '')):
        # No usable UUID anywhere; mint a fresh one.
        data['identifier'] = str(uuid.uuid4())

    if is_doi(dataset_id):
        # Keep the DOI reachable via ``same_as`` before replacing _id.
        data['same_as'] = {'@type': ['schema:URL'], 'url': data['_id']}
        context = data.get('@context')
        if context:
            context.setdefault(
                'same_as', {
                    '@id': 'schema:sameAs',
                    '@type': 'schema:URL',
                    '@context': {
                        '@version': '1.1',
                        'url': 'schema:url',
                        'schema': 'http://schema.org/'
                    }
                })

    data['_id'] = data['identifier']
    return data
Example #3
0
def _migrate_doi_identifier(data, client):
    """If the dataset _id is doi, make it a UUID.

    Replaces a non-UUID ``_id`` with a UUID-based identifier, preserving a
    DOI ``_id`` (when present) as a ``same_as`` link.  ``client`` is unused
    here but kept for the migration-function signature.
    """
    from renku.core.utils.doi import is_doi
    from renku.core.utils.uuid import is_uuid

    _id = data.get("_id", "")
    identifier = data.get("identifier", "")

    if not is_uuid(_id):
        if not is_uuid(identifier):
            # No usable UUID anywhere; mint a fresh one.
            data["identifier"] = str(uuid.uuid4())
        if is_doi(data.get("_id", "")):
            # Keep the DOI reachable via ``same_as`` before replacing _id.
            data["same_as"] = {"@type": ["schema:URL"], "url": data["_id"]}
            if data.get("@context"):
                # Ensure the JSON-LD context can resolve ``same_as``.
                data["@context"].setdefault(
                    "same_as",
                    {
                        "@id": "schema:sameAs",
                        "@type": "schema:URL",
                        "@context": {
                            "@version": "1.1",
                            "url": "schema:url",
                            "schema": "http://schema.org/"
                        },
                    },
                )
        data["_id"] = data["identifier"]
    return data
Example #4
0
    def supports(uri):
        """Check if provider supports a given uri."""
        doi_match = is_doi(uri)

        # A plain (non-DOI) url is checked directly; a DOI is resolved and
        # checked against dataverse.
        matches_url = doi_match is None and check_dataverse_uri(uri)
        matches_doi = doi_match and check_dataverse_doi(doi_match.group(0))

        return matches_url or matches_doi
Example #5
0
def migrate_doi_identifier(data):
    """If the dataset has a doi, make identifier be based on it."""
    from renku.core.utils.doi import is_doi, extract_doi

    dataset_id = data.get('_id', '')
    if is_doi(dataset_id):
        # Derive the identifier from the DOI and record the DOI itself.
        data['identifier'] = extract_doi(data.get('_id'))
        data['same_as'] = data['_id']
        context = data.get('@context')
        if context:
            context.setdefault('same_as', 'schema:sameAs')
    return data
Example #6
0
def test_doi_migration(dataset_metadata):
    """Test migration of id with doi."""
    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient('.'))

    identifier = dataset.identifier
    assert is_doi(identifier)

    expected_id = urljoin(
        'https://localhost', 'datasets/' + quote(identifier, safe=''))
    assert expected_id == dataset._id
    assert dataset.same_as == urljoin('https://doi.org', identifier)
Example #7
0
def test_dataset_doi_metadata(dataset_metadata):
    """Check dataset metadata for correct DOI."""
    from renku.core.utils.doi import is_doi

    dataset = Dataset.from_jsonld(dataset_metadata, client=LocalClient('.'))

    # A DOI identifier must be mirrored in ``same_as``.
    if is_doi(dataset.identifier):
        assert dataset.same_as == urljoin(
            'https://doi.org', dataset.identifier)

    expected_suffix = 'datasets/{}'.format(quote(dataset.identifier, safe=''))
    assert dataset._id.endswith(expected_suffix)
Example #8
0
    def update_metadata(self, other_dataset):
        """Updates instance attributes with other dataset attributes.

        :param other_dataset: `Dataset`
        :return: self
        """
        identifier = other_dataset.identifier
        if is_doi(identifier):
            # Record the canonical DOI url of the other dataset.
            self.same_as = urllib.parse.urljoin('https://doi.org', identifier)

        # Copy over every editable field that has a truthy value.
        for name in self.EDITABLE_FIELDS:
            value = getattr(other_dataset, name)
            if value:
                setattr(self, name, value)

        return self
Example #9
0
    def from_uri(uri):
        """Get provider type based on uri.

        Returns a ``(provider, warning)`` tuple; ``provider`` is ``None``
        together with an error message when no provider supports ``uri``.
        """
        is_doi_ = is_doi(uri)
        if is_doi_ is None:
            url = urlparse(uri)
            # Non-DOI uris must at least look like a well-formed URL.
            if not (url.scheme and url.netloc and url.params == ""):
                return None, "Cannot parse URL."

        provider = None
        warning = ""

        for _, potential_provider in ProviderFactory.PROVIDERS.items():
            try:
                if potential_provider.supports(uri):
                    provider = potential_provider
                    break
            # Catch only regular errors; catching ``BaseException`` would
            # also swallow KeyboardInterrupt/SystemExit.
            except Exception as e:
                warning += "Couldn't test provider {prov}: {err}\n".format(
                    prov=potential_provider, err=e)

        supported_providers = ", ".join(ProviderFactory.PROVIDERS.keys())

        if is_doi_ and provider is None:
            return (
                None,
                (
                    warning + "Reason: provider {} not found".format(
                        uri.split("/")[1].split(".")
                        [0])  # Get DOI provider name.
                    + "\nHint: Supported providers are: {}".format(
                        supported_providers)),
            )
        elif provider is None:
            return (
                None,
                (warning + "Reason: provider not found for {} ".format(uri) +
                 "\nHint: Supported providers are: {}".format(
                     supported_providers)),
            )
        else:
            return provider(is_doi=is_doi_), warning
Example #10
0
 def short_id(self):
     """Shorter version of identifier."""
     identifier = self.identifier
     # DOIs are already short enough; only UUIDs get truncated.
     if is_doi(identifier):
         return identifier
     return str(self.uid)[:8]
Example #11
0
 def uid(self):
     """UUID part of identifier."""
     identifier = self.identifier
     # A DOI identifier has no UUID part; return it unchanged.
     if is_doi(identifier):
         return identifier
     return identifier.split('/')[-1]
Example #12
0
def _extract_doi(value):
    """Return either a string or the doi part of a URL."""
    text = str(value)
    # Only DOI-like values are reduced to their DOI part.
    return extract_doi(text) if is_doi(text) else text
Example #13
0
 def supports(uri):
     """Whether or not this provider supports a given uri."""
     # ``is_doi`` returns a truthy match for DOIs, ``None`` otherwise.
     return is_doi(uri) is not None
Example #14
0
    def supports(uri):
        """Whether or not this provider supports a given uri."""
        # ``is_doi`` returns a truthy match for DOIs, ``None`` otherwise;
        # return the boolean directly instead of an if/return-True/False.
        return is_doi(uri) is not None
Example #15
0
def import_dataset(
    client,
    uri,
    short_name='',
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project.

    :param client: client used to find records and create the dataset.
    :param uri: URL or DOI of the dataset to import.
    :param short_name: optional short name for the created dataset.
    :param extract: whether to extract downloaded archives.
    :param with_prompt: show a file listing and confirmation prompt.
    :param yes: skip the confirmation prompt.
    :param commit_message: optional commit message (presumably consumed by a
        decorator; not used directly here — TODO confirm).
    :param progress: optional progress reporter for the download.
    :raises ParameterError: when the uri cannot be processed, metadata
        cannot be fetched, or the dataset has no files.
    """
    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError('Could not process {0}.\n{1}'.format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            # Show the file listing so the user can review before download.
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ('checksum', None),
                        ('filename', 'name'),
                        ('size_in_mb', 'size (mb)'),
                        ('filetype', 'type'),
                    )),
                    floatfmt='.2f'
                )
            )

            text_prompt = 'Do you wish to download this version?'
            if record.is_last_version(uri) is False:
                # Warn when a newer version of the record exists.
                text_prompt = WARNING + 'Newer version found at {}\n'.format(
                    record.links.get('latest_html')
                ) + text_prompt

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            # Convert accumulated size from megabytes to bytes.
            total_size *= 2**20

    except KeyError as e:
        raise ParameterError((
            'Could not process {0}.\n'
            'Unable to fetch metadata due to {1}'.format(uri, e)
        ))

    except LookupError as e:
        raise ParameterError(
            ('Could not process {0}.\n'
             'Reason: {1}'.format(uri, str(e)))
        )

    if not files:
        raise ParameterError('Dataset {} has no files.'.format(uri))

    # Default ``same_as`` is the (credential-stripped) import uri.
    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not short_name:
            short_name = generate_default_short_name(
                dataset.name, dataset.version
            )

        if is_doi(dataset.identifier):
            # Prefer the canonical DOI url as the ``same_as`` reference.
            dataset.same_as = Url(
                url_str=urllib.parse.
                urljoin('https://doi.org', dataset.identifier)
            )

        urls, names = zip(*[(f.url, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            short_name=short_name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Sanitize the version so it is a valid git tag name.
            tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
            tag_dataset(
                client, short_name, tag_name,
                'Tag {} created by renku import'.format(dataset.version)
            )
    else:
        # Git-based providers: add files straight from the source project.
        short_name = short_name or dataset.short_name

        _add_to_dataset(
            client,
            urls=[record.project_url],
            short_name=short_name,
            sources=[f.path for f in files],
            with_metadata=dataset,
            create=True
        )
Example #16
0
def import_dataset(
    client,
    uri,
    name="",
    extract=False,
    with_prompt=False,
    yes=False,
    commit_message=None,
    progress=None,
):
    """Import data from a 3rd party provider or another renku project.

    :param client: client used to find records and create the dataset.
    :param uri: URL or DOI of the dataset to import.
    :param name: optional name for the created dataset.
    :param extract: whether to extract downloaded archives.
    :param with_prompt: show a file listing and confirmation prompt.
    :param yes: skip the confirmation prompt.
    :param commit_message: optional commit message (presumably consumed by a
        decorator; not used directly here — TODO confirm).
    :param progress: optional progress reporter for the download.
    :raises ParameterError: when the uri cannot be processed, metadata
        cannot be fetched, or the dataset has no files.
    :raises OperationError: when a git-based dataset has no data directory.
    """
    u = urllib.parse.urlparse(uri)
    if u.scheme not in ("", "file", "git+https", "git+ssh", "doi"):
        # NOTE: Check if the url is a redirect.
        uri = requests.head(uri, allow_redirects=True).url

    provider, err = ProviderFactory.from_uri(uri)
    if err and provider is None:
        raise ParameterError("Could not process {0}.\n{1}".format(uri, err))

    try:
        record = provider.find_record(uri, client)
        dataset = record.as_dataset(client)
        files = dataset.files
        total_size = 0

        if with_prompt and not yes:
            # Show the file listing so the user can review before download.
            click.echo(
                tabulate(
                    files,
                    headers=OrderedDict((
                        ("checksum", None),
                        ("filename", "name"),
                        ("size_in_mb", "size (mb)"),
                        ("filetype", "type"),
                    )),
                    floatfmt=".2f",
                ))

            text_prompt = "Do you wish to download this version?"
            if record.is_last_version(uri) is False:
                # Warn when a newer version of the record exists.
                text_prompt = (WARNING + "Newer version found at {}\n".format(
                    record.links.get("latest_html")) + text_prompt)

            click.confirm(text_prompt, abort=True)

            for file_ in files:
                if file_.size_in_mb is not None:
                    total_size += file_.size_in_mb

            # Convert accumulated size from megabytes to bytes.
            total_size *= 2**20

    except KeyError as e:
        raise ParameterError(
            ("Could not process {0}.\n"
             "Unable to fetch metadata due to {1}".format(uri, e)))

    except LookupError as e:
        raise ParameterError(("Could not process {0}.\n"
                              "Reason: {1}".format(uri, str(e))))

    if not files:
        raise ParameterError("Dataset {} has no files.".format(uri))

    # Default ``same_as`` is the (credential-stripped) import uri.
    dataset.same_as = Url(url_id=remove_credentials(uri))

    if not provider.is_git_based:
        if not name:
            name = generate_default_name(dataset.title, dataset.version)

        if is_doi(dataset.identifier):
            # Prefer the canonical DOI url as the ``same_as`` reference.
            dataset.same_as = Url(url_str=urllib.parse.urljoin(
                "https://doi.org", dataset.identifier))

        urls, names = zip(*[(f.source, f.filename) for f in files])

        _add_to_dataset(
            client,
            urls=urls,
            name=name,
            create=True,
            with_metadata=dataset,
            force=True,
            extract=extract,
            all_at_once=True,
            destination_names=names,
            progress=progress,
            interactive=with_prompt,
            total_size=total_size,
        )

        if dataset.version:
            # Sanitize the version so it is a valid git tag name.
            tag_name = re.sub("[^a-zA-Z0-9.-_]", "_", dataset.version)
            tag_dataset(
                client, name, tag_name,
                "Tag {} created by renku import".format(dataset.version))
    else:
        # Git-based providers: add files straight from the source project.
        name = name or dataset.name

        if not dataset.data_dir:
            raise OperationError(
                f"Data directory for dataset must be set: {dataset.name}")

        # Include everything under the data directory, plus any dataset
        # files that live outside of it.
        sources = [f"{dataset.data_dir}/**"]
        for file_ in dataset.files:
            try:
                Path(file_.path).relative_to(dataset.data_dir)
            except ValueError:  # Files that are not in dataset's data directory
                sources.append(file_.path)

        _add_to_dataset(
            client,
            urls=[record.project_url],
            name=name,
            sources=sources,
            with_metadata=dataset,
            create=True,
        )