Exemples Python de DataDir (cldfbench.datadir.DataDir)

Exemple #1

0

Afficher le fichier

 def __init__(self):
     """
     Set up the dataset directory and load the dataset metadata.

     When no `dir` is set, the directory containing the file that defines the
     instance's class is used. Metadata is read from `metadata.json` in that
     directory if the file exists; otherwise a fresh `metadata_cls` instance is
     created. In both cases the metadata `id` is set to the dataset's `id`.
     """
     base_dir = self.dir or pathlib.Path(inspect.getfile(self.__class__)).parent
     self.dir = DataDir(base_dir)

     metadata_path = self.dir / 'metadata.json'
     if metadata_path.exists():
         self.metadata = self.metadata_cls.from_file(metadata_path)
     else:
         self.metadata = self.metadata_cls()
     # The dataset's own id always overrides whatever the metadata carried.
     self.metadata.id = self.id

Exemple #2

0

Afficher le fichier

class Dataset(object):
    """
    A cldfbench dataset ties together three directories:

    - `raw`: the source data, from which
    - `cldf`: the CLDF data is created, using configuration data from
    - `etc`.

    To use the cldfbench infrastructure, sub-class `Dataset`.

    Supported workflow:
    - a `download` command populates the dataset's `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        """
        Resolve the dataset directory and load the metadata.

        When `dir` is not set, the directory of the file defining the instance's class is
        used. Metadata is loaded from `metadata.json` if that file exists, otherwise a fresh
        `metadata_cls` instance is created; its `id` is set to the dataset's `id`.
        """
        base_dir = self.dir or pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(base_dir)

        metadata_path = self.dir / 'metadata.json'
        if metadata_path.exists():
            self.metadata = self.metadata_cls.from_file(metadata_path)
        else:
            self.metadata = self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        """Return class name, dataset id and resolved directory as one line."""
        location = self.dir.resolve()
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, location)

    def cldf_specs(self):
        """
        Declare the CLDF datasets that are derived from this `Dataset`.

        :return: A single `CLDFSpec` instance, or a `dict` mapping names to `CLDFSpec`
            instances; the names are the keys used by `cldf_reader`/`cldf_writer` to look
            up a spec. This default returns one spec located in `cldf_dir`.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self):
        """
        The result of `cldf_specs`, normalized to a `dict` for simpler lookup.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances; a single spec is
            stored under the key `None`.
        """
        specs = self.cldf_specs()
        if not isinstance(specs, CLDFSpec):
            assert isinstance(specs, dict)
            return specs
        return {None: specs}

    @lazyproperty
    def cldf_dir(self):
        """The `cldf` sub-directory of the dataset directory."""
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self):
        """The `raw` sub-directory of the dataset directory."""
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self):
        """The `etc` sub-directory of the dataset directory."""
        return self.dir / 'etc'

    def cldf_writer(self, args, cldf_spec=None, clean=True):
        """
        Get a writer for one of the dataset's CLDF specs.

        :param args: Passed through to the spec's `get_writer`.
        :param cldf_spec: A `CLDFSpec` instance, or the key of the relevant `CLDFSpec` in
            `cldf_specs_dict`.
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing.
            `False` must be passed for subsequent calls to `cldf_writer` in case the spec
            re-uses a directory.
        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data. It should
            be used in a with-statement, and will then return a `CLDFWriter` with an empty
            working directory.
        """
        if isinstance(cldf_spec, CLDFSpec):
            spec = cldf_spec
        else:
            spec = self.cldf_specs_dict[cldf_spec]
        return spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(self, cldf_spec=None):
        """
        Get read-access to one of the dataset's CLDF datasets.

        :param cldf_spec: A `CLDFSpec` instance, or the key of the relevant `CLDFSpec` in
            `cldf_specs_dict`.
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        """
        if isinstance(cldf_spec, CLDFSpec):
            spec = cldf_spec
        else:
            spec = self.cldf_specs_dict[cldf_spec]
        return spec.get_dataset()

    @lazyproperty
    def repo(self):
        """A `Repository` for the dataset directory, or `None` if creating it raises `ValueError`."""
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return None

    #
    # Workflow commands are implemented with two methods for each command:
    # - cmd_<command>: The implementation of the command, typically overwritten by datasets.
    # - _cmd_<command>: An (optional) wrapper providing setup and teardown functionality, calling
    #   cmd_<command> in between.
    #
    # Workflow commands must accept an `argparse.Namespace` as sole positional argument.
    #
    def _cmd_download(self, args):
        """Ensure `raw_dir` exists, run `cmd_download` and record the time in `raw/README.md`."""
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        readme = self.raw_dir / 'README.md'
        note = 'Raw data downloaded {0}'.format(datetime.utcnow().isoformat())
        readme.write_text(note, encoding='utf8')

    def cmd_download(self, args):
        """Default implementation: log a warning that the command is not implemented."""
        message = 'cmd_{0} not implemented for dataset {1}'.format('download', self.id)
        args.log.warning(message)
        return NOOP

    def _cmd_readme(self, args):
        """Write the result of `cmd_readme` to `README.md`, unless the metadata is falsy."""
        if not self.metadata:
            return
        readme = self.dir.joinpath('README.md')
        readme.write_text(self.cmd_readme(args), encoding='utf8')

    def cmd_readme(self, args):
        """Return the metadata rendered as markdown, or an empty string without metadata."""
        if self.metadata:
            return self.metadata.markdown()
        return ''

    def _cmd_makecldf(self, args):
        """
        Run `cmd_makecldf` and afterwards write the `LICENSE` file if a license text is known.

        With exactly one CLDF spec, the writer is created here and injected as `args.writer`.
        """
        specs = list(self.cldf_specs_dict.values())
        if len(specs) != 1:
            self.cmd_makecldf(args)
        else:
            # There's only one CLDF spec! We instantiate the writer now and inject it into `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)

        if not (self.metadata and self.metadata.known_license):
            return
        legalcode = self.metadata.known_license.legalcode
        if legalcode:
            license_path = self.dir / 'LICENSE'
            license_path.write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args):
        """
        Default implementation: log a warning that the command is not implemented.

        :param args: An `argparse.Namespace` including attributes:
        - `writer`: `CLDFWriter` instance
        """
        message = 'cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id)
        args.log.warning(message)
        return NOOP

Exemple #3

0

Afficher le fichier

Fichier : media.py Projet : SimonGreenhill/cldfbench

def run(args):
    """
    List, download, package or publish the files referenced in a dataset's `media.csv`.

    Depending on the options in `args`, the command

    - prints per-mimetype counts and sizes (`args.list`),
    - downloads the (optionally mimetype-filtered) files and writes an index CSV,
    - creates a release directory with a ZIP archive, a Zenodo metadata file and a
      README (`args.create_release`),
    - updates an existing Zenodo deposit from the release metadata (`args.update_zenodo`).

    :param args: `argparse.Namespace`; the attributes read here are `out`, `log`, `list`, \
    `mimetype`, `debug`, `parent_doi`, `communities`, `create_release` and `update_zenodo` \
    (plus whatever `get_dataset` requires).
    :raises ParserError: if the dataset has no `media.csv` or the options are inconsistent.
    """
    ds = get_dataset(args)
    ds_cldf = ds.cldf_reader()
    release_dir = args.out / '{0}_{1}'.format(ds.id, MEDIA)

    # Validate the option combination before doing any work.
    if ds_cldf.get('media.csv', None) is None:  # pragma: no cover
        args.log.error('Dataset has no media.csv')
        raise ParserError
    if args.parent_doi and not Zenodo.DOI_PATTERN.match(args.parent_doi):
        args.log.error('Invalid passed DOI')
        raise ParserError
    if args.update_zenodo:
        if not release_dir.exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir))
            raise ParserError
        if not (release_dir / ZENODO_FILE_NAME).exists():
            args.log.error(
                '"{0}" not found -- run --create-release first?'.format(
                    release_dir / ZENODO_FILE_NAME))
            raise ParserError
        if args.create_release:
            args.log.error(
                'You cannot create the release and update zenodo at the same time.'
            )
            raise ParserError
    if args.create_release:
        if not args.parent_doi:
            args.log.error(
                'The corresponding DOI is required (via --parent-doi).')
            raise ParserError

    mime_types = None
    if args.mimetype:
        mime_types = [m.strip() for m in nfilter(args.mimetype.split(','))]

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        media_dir = args.out / MEDIA
        media_dir.mkdir(exist_ok=True)
        media = []

    if not args.update_zenodo:
        used_file_extensions = set()
        # The header must precede the first row that is actually written, which is not
        # necessarily row 0 of media.csv when a mimetype filter is active.
        header_written = False
        with UnicodeWriter(media_dir /
                           INDEX_CSV if not args.list else None) as w:
            for i, row in enumerate(
                    tqdm.tqdm(list(ds_cldf['media.csv']),
                              desc='Getting {0} items'.format(MEDIA))):
                url = ds_cldf.get_row_url('media.csv', row)
                if isinstance(url, rfc3986.URIReference):
                    url = url.normalize().unsplit()
                    row['URL'] = url
                f_ext = url.split('.')[-1].lower()
                # In debug mode only a sample of the items is processed.
                if args.debug and i > 500:
                    break
                if ((mime_types is None) or f_ext in mime_types
                        or any(row['mimetype'].startswith(x) for x in mime_types)):
                    if args.list:
                        m = '{0} ({1})'.format(row['mimetype'], f_ext)
                        size[m] += int(row['size'])
                        number.update([m])
                    else:
                        used_file_extensions.add(f_ext.lower())
                        # Files are sharded into directories named by the first two
                        # characters of the row ID.
                        d = media_dir / row['ID'][:2]
                        d.mkdir(exist_ok=True)
                        fn = '.'.join([row['ID'], f_ext])
                        target = d / fn
                        row['local_path'] = pathlib.Path(row['ID'][:2]) / fn
                        if not header_written:
                            # Passing the row itself writes its keys (row is used
                            # dict-like here), i.e. the column names.
                            w.writerow(row)
                            header_written = True
                        w.writerow(row.values())
                        media.append(target)
                        # The row ID is compared against the file's md5 to detect
                        # missing or incomplete downloads.
                        if (not target.exists()) or md5(target) != row['ID']:
                            _create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k.ljust(20), str(number[k]), format_size(v)]))
        return

    # Waiting for the download threads to finish
    if 'download_threads' in globals():
        for t in download_threads:
            t.join()

    if args.create_release:
        assert media_dir.exists(), 'No folder "{0}" found in {1}'.format(
            MEDIA, media_dir.resolve())

        release_dir.mkdir(exist_ok=True)

        media.append(media_dir / INDEX_CSV)

        try:
            # The context manager closes the archive even if adding a file fails.
            with zipfile.ZipFile(str(release_dir / '{0}.zip'.format(MEDIA)),
                                 'w', zipfile.ZIP_DEFLATED) as zipf:
                fp = args.out
                for f in tqdm.tqdm(media, desc='Creating {0}.zip'.format(MEDIA)):
                    zipf.write(str(f), str(os.path.relpath(str(f), str(fp))))
        except Exception as e:
            args.log.error(e)
            raise

        def _contrib(d):
            # Reduce a contributor record to the keys stored in the Zenodo metadata.
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        version_v = git_describe('.').split('-')[0]
        version = version_v.replace('v', '')
        git_url = [r for r in ds.repo.repo.remotes
                   if r.name == 'origin'][0].url.replace('.git', '')
        with jsonlib.update(release_dir / ZENODO_FILE_NAME,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = ds.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [_contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [_contrib(p) for p in contributors]
            # Merge communities already in the metadata file, those passed on the command
            # line and the default ones.
            communities = [r["identifier"] for r in md.get("communities", [])] + \
                [c.strip() for c in nfilter(args.communities.split(','))] + \
                COMMUNITIES
            if communities and not args.debug:
                md['communities'] = [{
                    "identifier": community_id
                } for community_id in sorted(set(communities))]
            md.update({
                'title':
                '{0} {1} Files'.format(ds.metadata.title, MEDIA.title()),
                'access_right':
                'open',
                'keywords':
                sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type':
                'dataset',
                'publication_date':
                datetime.today().strftime('%Y-%m-%d'),
                'version':
                version,
                'related_identifiers': [
                    {
                        'scheme': 'url',
                        'identifier':
                        '{0}/tree/{1}'.format(git_url, version_v),
                        'relation': 'isSupplementTo'
                    },
                ],
            })
            # Bound unconditionally so that the description below never hits an unbound
            # name, independent of the option validation at the top.
            supplement_to = ''
            if args.parent_doi:
                md['related_identifiers'].append({
                    'scheme': 'doi',
                    'identifier': args.parent_doi,
                    'relation': 'isPartOf'
                })
                supplement_to = " - Supplement to dataset " \
                                "<a href='https://doi.org/{0}'>{1}</a> ".format(
                    args.parent_doi, ds.metadata.title)  # noqa: E122
            if ds.metadata.url:
                md['related_identifiers'].append({
                    'scheme':
                    'url',
                    'identifier':
                    ds.metadata.url,
                    'relation':
                    'isAlternateIdentifier'
                })

            formats = ', '.join(sorted(used_file_extensions))
            descr = '<br /><br />' + ds.metadata.description if ds.metadata.description else ''
            online_url, online = '', ''
            if ds.metadata.url:
                online_url = ds.metadata.url
                online = "<br /><br />Available online at: <a href='{0}'>{0}</a>".format(
                    online_url)
            md['description'] = html.escape(
                DESCRIPTION.format(
                    url=online_url,
                    formats=' ({0})'.format(formats) if formats else '',
                    title=md['title'],
                    supplement_to=supplement_to,
                    descr=descr,
                    online=online))

            license_md = ''
            if ds.metadata.zenodo_license:
                md['license'] = {'id': ds.metadata.zenodo_license}
                license_md = LICENCE.format(ds.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                README.format(
                    title=md['title'],
                    doi='https://doi.org/{0}'.format(args.parent_doi),
                    ds_title=ds.metadata.title,
                    license=license_md,
                    formats=' ({0})'.format(formats) if formats else '',
                    media=MEDIA,
                    index=INDEX_CSV))

    if args.update_zenodo:

        md = {}
        md.update(jsonlib.load(release_dir / ZENODO_FILE_NAME))

        # Debug mode talks to the sandbox API, with a token read from the environment.
        if args.debug:
            api_url = API_URL_SANDBOX
            access_token = os.environ.get('ZENODO_SANDBOX_ACCESS_TOKEN')
        else:
            api_url = API_URL
            access_token = ACCESS_TOKEN
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=access_token)
        try:
            rec = api.record_from_id('{0}record/{1}'.format(
                zenodo_url, args.update_zenodo))
        except Exception as e:
            args.log.error(
                'Check connection and credentials for accessing Zenodo.\n{0}'.
                format(e))
            return
        latest_version = rec.links['latest'].split('/')[-1]
        if latest_version != args.update_zenodo:
            args.log.warning(
                'Passed deposit ID does not refer to latest version {0}!'.
                format(latest_version))
        args.log.info('  DOI:     ' + rec.metadata.doi)
        args.log.info('  Title:   ' + rec.metadata.title)
        args.log.info('  Version: ' + rec.metadata.version)
        args.log.info('  Date:    ' + rec.metadata.publication_date)
        args.log.info('  Files:   ' + ', '.join([f.key for f in rec.files]))
        # Updating the deposit requires explicit interactive confirmation.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')

Exemple #4

0

Afficher le fichier

def run(args):
    """
    List, download, package or publish the audio files referenced in the dataset's `media.csv`.

    Depending on the options in `args`, the command

    - prints per-mimetype counts and sizes (`args.list`),
    - downloads missing or changed files into per-parameter folders below `<out>/audio`,
    - creates a release directory with `audio.zip`, a Zenodo metadata file and a README
      (`args.create_release`),
    - updates an existing Zenodo deposit from the release metadata (`args.update_zenodo`).

    :param args: `argparse.Namespace`; the attributes read here are `out`, `log`, `list`, \
    `mimetype`, `create_release` and `update_zenodo`.
    """
    # One instance is enough: every `Dataset()` call would run the constructor again.
    dataset = Dataset()
    ds = dataset.cldf_reader()

    release_dir = args.out / '{0}_audio'.format(dataset.id)
    zenodo_file_name = 'zenodo.json'

    if args.list:
        size = collections.Counter()
        number = collections.Counter()
    else:
        # Map form IDs to parameter IDs; downloads are grouped in one folder per parameter.
        f2c = {r['ID']: r['Parameter_ID'] for r in ds['FormTable']}
        audio = args.out / 'audio'
        audio.mkdir(exist_ok=True)

    if not args.update_zenodo:
        for row in tqdm.tqdm(list(ds['media.csv'])):
            if args.list:
                size[row['mimetype']] += int(row['size'])
                number.update([row['mimetype']])
            else:
                d = audio / f2c[row['Form_ID']]
                d.mkdir(exist_ok=True)
                url = ds.get_row_url('media.csv', row)
                target = d / '{}.{}'.format(row['ID'], url.split('.')[-1])
                # The row ID is compared against the file's md5 to detect missing or
                # incomplete downloads.
                if (not target.exists()) or md5(target) != row['ID']:
                    if (args.mimetype is None) or target.suffix.endswith(
                            args.mimetype):
                        create_download_thread(url, target)

    if args.list:
        for k, v in size.most_common():
            print('\t'.join([k, str(number[k]), format_size(v)]))

    if args.create_release:
        assert audio.exists(), 'No folder "audio" found in {0}'.format(
            audio.resolve())

        release_dir.mkdir(exist_ok=True)

        args.log.info('creating audio ZIP archive per parameter folder ...')
        try:
            # The context manager closes the archive even if adding a file fails.
            with zipfile.ZipFile(str(release_dir / 'audio.zip'), 'w',
                                 zipfile.ZIP_DEFLATED) as zipf:
                fp = args.out
                for root, dirs, files in tqdm.tqdm(os.walk(audio)):
                    for f in files:
                        # Skip hidden and dunder-prefixed files, and files not matching
                        # the requested extension.
                        if (not f.startswith('.') and not f.startswith('__')
                                and ((args.mimetype is None) or f.endswith(args.mimetype))):
                            zipf.write(os.path.join(root, f),
                                       os.path.relpath(os.path.join(root, f), fp))
        except Exception as e:
            args.log.error(e)
            raise

        def contrib(d):
            # Reduce a contributor record to the keys stored in the Zenodo metadata.
            return {
                k: v
                for k, v in d.items()
                if k in {'name', 'affiliation', 'orcid', 'type'}
            }

        with jsonlib.update(release_dir / zenodo_file_name,
                            indent=4,
                            default=collections.OrderedDict()) as md:
            contribs = dataset.dir / 'CONTRIBUTORS.md'
            creators, contributors = get_creators_and_contributors(
                contribs.read_text(
                    encoding='utf8') if contribs.exists() else '',
                strict=False)
            if creators:
                md['creators'] = [contrib(p) for p in creators]
            if contributors:
                md['contributors'] = [contrib(p) for p in contributors]
            if COMMUNITIES:
                md['communities'] = [{
                    'id': community_id
                } for community_id in COMMUNITIES]
            md.update({
                'title':
                '{0} Audio Files'.format(dataset.metadata.title),
                'access_right':
                'open',
                'keywords':
                sorted(set(md.get('keywords', []) + ['linguistics'])),
                'upload_type':
                'video',
                'version':
                VERSION,
                'related_identifiers': [
                    {
                        'scheme': 'doi',
                        'identifier': '10.5281/zenodo.4309141',
                        'relation': 'isPartOf'
                    },
                    {
                        'scheme':
                        'url',
                        'identifier':
                        '{0}{1}/tree/v{2}'.format(GITHUB_PREFIX,
                                                  dataset.id, VERSION),
                        'relation':
                        'isSupplementTo'
                    },
                ],
            })
            if dataset.metadata.url:
                md['related_identifiers'].append({
                    'scheme':
                    'url',
                    'identifier':
                    dataset.metadata.url,
                    'relation':
                    'isAlternateIdentifier'
                })
            md['description'] = html.escape(
                DESCRIPTION.format(
                    GITHUB_PREFIX,
                    dataset.id,
                    dataset.metadata.url if dataset.metadata.url else '',
                    VERSION))

            license_md = ''
            if dataset.metadata.zenodo_license:
                md['license'] = {'id': dataset.metadata.zenodo_license}
                license_md = LISENCE.format(dataset.metadata.zenodo_license)

            DataDir(release_dir).write(
                'README.md',
                RELEASE_NOTE.format(md['title'], GITHUB_PREFIX,
                                    dataset.id,
                                    dataset.metadata.title, license_md))

    if args.update_zenodo:
        # The release must have been created by an earlier --create-release run.
        assert release_dir.exists()
        assert (release_dir / zenodo_file_name).exists()

        md = {}
        md.update(jsonlib.load(release_dir / zenodo_file_name))

        api_url = API_URL
        zenodo_url = api_url.replace('api/', '')

        args.log.info('Updating Deposit ID {0} on {1} with:'.format(
            args.update_zenodo, zenodo_url))
        api = Zenodo(api_url=api_url, access_token=ACCESS_TOKEN)
        rec = api.record_from_id('{0}record/{1}'.format(
            zenodo_url, args.update_zenodo))
        args.log.info('  DOI:   ' + rec.metadata.doi)
        args.log.info('  Title: ' + rec.metadata.title)
        args.log.info('  Date:  ' + rec.metadata.publication_date)
        args.log.info('  Files: ' + ', '.join([f.key for f in rec.files]))
        # Updating the deposit requires explicit interactive confirmation.
        p = input("Proceed? [y/N]: ")
        if p.lower() == 'y':
            dep = api.update_deposit(args.update_zenodo, **md)
            if dep.state != zenodoclient.models.PUBLISHED:
                api.publish_deposit(dep)
            args.log.info('Updated successfully')

Exemple #5

0

Afficher le fichier

Fichier : dataset.py Projet : SimonGreenhill/cldfbench

class Dataset(object):
    """
    A cldfbench dataset ties together

    - `raw` data, to be used as source for the
    - `cldf` data, which is created using config data from
    - `etc`.

    To use the cldfbench infrastructure, one should sub-class `Dataset`.

    cldfbench supports the following workflow:
    - a `download` command populates a `Dataset`'s `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.

    The following class attributes are supposed to be overwritten by subclasses:

    :ivar dir: `pathlib.Path` pointing to the root directory of the dataset.
    :ivar id: A `str` identifier for the dataset. No assumption about uniqueness properties of
        this identifier is made.
    :ivar metadata_cls: Subclass of :class:`Metadata` (or :class:`Metadata` if not overwritten)
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        """
        Resolve the dataset directory and load the metadata.

        When `dir` is not set, the directory of the file defining the instance's class is
        used. Metadata is loaded from `metadata.json` if that file exists, otherwise a fresh
        `metadata_cls` instance is created; its `id` is set to the dataset's `id`.
        """
        if not self.dir:
            self.dir = pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(self.dir)
        md = self.dir / 'metadata.json'
        self.metadata = self.metadata_cls.from_file(
            md) if md.exists() else self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        """Return class name, dataset id and resolved directory as one line."""
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(
            self, self.dir.resolve())

    @lazyproperty
    def cldf_dir(self) -> DataDir:
        """
        Directory where CLDF data generated from the Dataset will be stored (unless specified
        differently by a :class:`CLDFSpec`).
        """
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self) -> DataDir:
        """
        Directory where cldfbench expects the raw or source data.
        """
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self) -> DataDir:
        """
        Directory where cldfbench expects additional configuration or metadata.
        """
        return self.dir / 'etc'

    def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]:
        """
        A `Dataset` must declare all CLDF datasets that are derived from it.

        :return: A single :class:`CLDFSpec` instance, or a `dict`, mapping names to `CLDFSpec`
            instances, where the name will be used by `cldf_reader`/`cldf_writer` to look up
            the spec. This default returns one spec located in `cldf_dir`.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(
            self) -> typing.Dict[typing.Union[str, None], CLDFSpec]:
        """
        Turn :meth:`cldf_specs` into a `dict` for simpler lookup.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances; a single spec is stored
            under the key `None`.
        """
        specs = self.cldf_specs()
        if isinstance(specs, CLDFSpec):
            return {None: specs}
        assert isinstance(specs, dict)
        return specs

    def update_submodules(self):
        """
        Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated
        as git submodules.

        :raises subprocess.CalledProcessError: if the `git` command exits with a non-zero status.
        """
        # An argument list (no shell) keeps directory names containing spaces or shell
        # metacharacters intact and rules out shell injection via the path.
        subprocess.check_call(
            ['git', '-C', str(self.dir.resolve()), 'submodule', 'update', '--remote'])

    def cldf_writer(self, args, cldf_spec=None, clean=True) -> CLDFWriter:
        """
        :param args: Passed through to the spec's `get_writer`.
        :param cldf_spec: A `CLDFSpec` instance, or the key of the relevant `CLDFSpec` in
            `Dataset.cldf_specs`
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing.
            Note that `False` must be passed for subsequent calls to `cldf_writer` in case the
            spec re-uses a directory.
        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data.
            This method should be used in a with-statement, and will then return a `CLDFWriter`
            with an empty working directory.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(self,
                    cldf_spec: typing.Union[str,
                                            None] = None) -> pycldf.Dataset:
        """
        :param cldf_spec: Key of the relevant `CLDFSpec` in `cldf_specs_dict`; a `CLDFSpec`
            instance is used as is.
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_dataset()

    @lazyproperty
    def repo(self) -> typing.Union[Repository, None]:
        """
        The git repository cloned to the dataset's directory (or `None`).
        """
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return None

    #
    # Workflow commands come in pairs: `cmd_<command>` is the implementation, typically
    # overwritten by datasets; `_cmd_<command>` wraps it with setup and teardown.
    #
    def _cmd_download(self, args):
        """Ensure `raw_dir` exists, run `cmd_download` and record the time in `raw/README.md`."""
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        (self.raw_dir / 'README.md').write_text(
            'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()),
            encoding='utf8')

    def cmd_download(self, args: argparse.Namespace):
        """
        Implementations of this method should populate the dataset's `raw_dir` with the source
        data.

        This default only logs a warning that the command is not implemented.
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format(
            'download', self.id))
        return NOOP

    def _cmd_readme(self, args):
        """
        Write the dataset's `README.md`, unless the metadata is falsy.

        The markdown returned by `cmd_readme` gets a status badge inserted after its first
        top-level heading (if `build_status_badge` returns one) and a section listing the
        CLDF datasets whose metadata file exists.
        """
        if self.metadata:
            badge = build_status_badge(self)
            md = self.cmd_readme(args)
            if badge:
                lines, title_found = [], False
                for line in md.split('\n'):
                    lines.append(line)
                    # Only the first "# " heading is treated as the title.
                    if line.startswith('# ') and not title_found:
                        title_found = True
                        lines.extend(['', badge])
                md = '\n'.join(lines)

            # Links in the README are relative to the dataset directory.
            section = [
                '\n\n## CLDF Datasets\n',
                'The following CLDF datasets are available in [{0}]({0}):\n'.
                format(self.cldf_dir.resolve().relative_to(self.dir.resolve()))
            ]
            for ds in self.cldf_specs_dict.values():
                if ds.metadata_path.exists():
                    p = ds.metadata_path.resolve().relative_to(
                        self.dir.resolve())
                    section.append(
                        '- CLDF [{0}](https://github.com/cldf/cldf/tree/master/modules/{0}) '
                        'at [{1}]({1})'.format(ds.module, p))

            self.dir.joinpath('README.md').write_text(md + '\n'.join(section),
                                                      encoding='utf8')

    def cmd_readme(self, args: argparse.Namespace) -> str:
        """
        Implementations of this method should create the content for the dataset's README.md
        and return it as markdown formatted string.

        This default returns the metadata rendered as markdown, or an empty string.
        """
        return self.metadata.markdown() if self.metadata else ''

    def _cmd_makecldf(self, args):
        """
        Run `cmd_makecldf` and afterwards write the `LICENSE` file if a license text is known.

        With exactly one CLDF spec, the writer is created here and injected as `args.writer`.
        """
        specs = list(self.cldf_specs_dict.values())
        if len(specs) == 1:
            # There's only one CLDF spec! We instantiate the writer now and inject it into `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)
        else:
            self.cmd_makecldf(args)

        if self.metadata and self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                (self.dir / 'LICENSE').write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args: argparse.Namespace):
        """
        Implementations of this method should write the CLDF data curated by the dataset.

        This default only logs a warning that the command is not implemented.

        :param args: An `argparse.Namespace` including attributes:
        - `writer`: :class:`CLDFWriter` instance
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format(
            'makecldf', self.id))
        return NOOP