class Dataset(object):
    """
    Base class for cldfbench datasets.

    A dataset bundles three sub-directories of its root directory `dir`:

    - `raw`: the source data,
    - `etc`: configuration data used when converting the source data,
    - `cldf`: the CLDF data created from the two above.

    The cldfbench infrastructure is used by sub-classing `Dataset` and overwriting the `cmd_*`
    methods. The supported workflow is:

    - `download` fills the dataset's `raw` directory,
    - `makecldf` (re)creates the CLDF dataset in `cldf`.
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        # Without an explicit `dir`, the directory containing the module which defines the
        # (sub-)class is used as dataset root.
        root = self.dir or pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(root)

        metadata_path = self.dir / 'metadata.json'
        if metadata_path.exists():
            self.metadata = self.metadata_cls.from_file(metadata_path)
        else:
            self.metadata = self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        location = self.dir.resolve()
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, location)

    def cldf_specs(self):
        """
        Declare the CLDF datasets derived from this `Dataset`.

        :return: Either one `CLDFSpec` instance or a `dict` mapping names to `CLDFSpec`
            instances; the names are the keys accepted by `cldf_reader`/`cldf_writer`.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self):
        """
        The result of `cldf_specs`, normalized to a `dict`.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances. A single spec is stored
            under the key `None`.
        """
        declared = self.cldf_specs()
        if not isinstance(declared, CLDFSpec):
            assert isinstance(declared, dict)
            return declared
        return {None: declared}

    @lazyproperty
    def cldf_dir(self):
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self):
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self):
        return self.dir / 'etc'

    def cldf_writer(self, args, cldf_spec=None, clean=True):
        """
        Get a writer for one of the dataset's CLDF specs.

        :param args: Passed on to the spec's `get_writer`.
        :param cldf_spec: A `CLDFSpec` instance or the key of a spec in `cldf_specs_dict`.
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing.
            `False` must be passed for subsequent calls to `cldf_writer` in case the spec
            re-uses a directory.
        :return: The object returned by the spec's `get_writer`; it is meant to be used in a
            with-statement.
        """
        if isinstance(cldf_spec, CLDFSpec):
            spec = cldf_spec
        else:
            spec = self.cldf_specs_dict[cldf_spec]
        return spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(self, cldf_spec=None):
        """
        Get read-access to the CLDF data of one of the dataset's CLDF specs.

        :param cldf_spec: A `CLDFSpec` instance or the key of a spec in `cldf_specs_dict`.
        :return: The object returned by the spec's `get_dataset`.
        """
        if isinstance(cldf_spec, CLDFSpec):
            spec = cldf_spec
        else:
            spec = self.cldf_specs_dict[cldf_spec]
        return spec.get_dataset()

    @lazyproperty
    def repo(self):
        """
        `Repository` object for the dataset directory, or `None` if instantiating it raises
        `ValueError`.
        """
        try:
            repository = Repository(self.dir)
        except ValueError:  # pragma: no cover
            return None
        return repository

    #
    # Workflow commands are implemented with two methods for each command:
    # - cmd_<command>: The implementation of the command, typically overwritten by datasets.
    # - _cmd_<command>: An (optional) wrapper providing setup and teardown functionality, calling
    #   cmd_<command> in between.
    #
    # Workflow commands must accept an `argparse.Namespace` as sole positional argument.
    #
    def _cmd_download(self, args):
        """
        Wrapper around `cmd_download`: makes sure `raw_dir` exists and records the download
        time in `raw/README.md` afterwards.
        """
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        # The timestamp is taken after the download has finished.
        note = 'Raw data downloaded {0}'.format(datetime.utcnow().isoformat())
        readme = self.raw_dir / 'README.md'
        readme.write_text(note, encoding='utf8')

    def cmd_download(self, args):
        """
        Default implementation: logs a warning and returns `NOOP`.
        """
        message = 'cmd_{0} not implemented for dataset {1}'.format('download', self.id)
        args.log.warning(message)
        return NOOP

    def _cmd_readme(self, args):
        """
        Wrapper around `cmd_readme`: writes its result to `README.md` in the dataset
        directory - provided the dataset has (truthy) metadata.
        """
        if not self.metadata:
            return
        target = self.dir.joinpath('README.md')
        target.write_text(self.cmd_readme(args), encoding='utf8')

    def cmd_readme(self, args):
        """
        :return: The markdown rendering of the metadata, or `''` if there is no metadata.
        """
        if self.metadata:
            return self.metadata.markdown()
        return ''

    def _cmd_makecldf(self, args):
        """
        Wrapper around `cmd_makecldf`: provides `args.writer` if exactly one CLDF spec is
        declared and writes a `LICENSE` file if the legal code of the license is known.
        """
        declared = list(self.cldf_specs_dict.values())
        if len(declared) != 1:
            # Zero or multiple specs: the dataset has to call `cldf_writer` itself.
            self.cmd_makecldf(args)
        else:
            # There's only one CLDF spec! We instantiate the writer now and inject it into
            # `args`:
            with self.cldf_writer(args, cldf_spec=declared[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)

        if not (self.metadata and self.metadata.known_license):
            return
        legalcode = self.metadata.known_license.legalcode
        if legalcode:
            license_path = self.dir / 'LICENSE'
            license_path.write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args):
        """
        Default implementation: logs a warning and returns `NOOP`.

        :param args: An `argparse.Namespace` including attributes:
            - `writer`: `CLDFWriter` instance
        """
        message = 'cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id)
        args.log.warning(message)
        return NOOP
class Dataset(object):
    """
    A cldfbench dataset ties together

    - `raw` data, to be used as source for the
    - `cldf` data, which is created using config data from
    - `etc`.

    To use the cldfbench infrastructure, one should sub-class `Dataset`.

    cldfbench supports the following workflow:

    - a `download` command populates a `Dataset`'s `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.

    The following class attributes are supposed to be overwritten by subclasses:

    :ivar dir: `pathlib.Path` pointing to the root directory of the dataset. If not set, the
        directory containing the module which defines the class is used.
    :ivar id: A `str` identifier for the dataset. No assumption about uniqueness properties of
        this identifier is made.
    :ivar metadata_cls: Subclass of :class:`Metadata` (or :class:`Metadata` if not overwritten)
    """
    dir = None
    id = None
    metadata_cls = Metadata

    def __init__(self):
        """
        Resolve the dataset directory and load the metadata from `metadata.json` in this
        directory if the file exists (otherwise an empty `metadata_cls` instance is used).
        """
        if not self.dir:
            self.dir = pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = DataDir(self.dir)
        md = self.dir / 'metadata.json'
        self.metadata = self.metadata_cls.from_file(md) if md.exists() else self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        return '{0.__class__.__name__} "{0.id}" at {1}'.format(self, self.dir.resolve())

    @lazyproperty
    def cldf_dir(self) -> DataDir:
        """
        Directory where CLDF data generated from the Dataset will be stored (unless specified
        differently by a :class:`CLDFSpec`).
        """
        return self.dir / 'cldf'

    @lazyproperty
    def raw_dir(self) -> DataDir:
        """
        Directory where cldfbench expects the raw or source data.
        """
        return self.dir / 'raw'

    @lazyproperty
    def etc_dir(self) -> DataDir:
        """
        Directory where cldfbench expects additional configuration or metadata.
        """
        return self.dir / 'etc'

    def cldf_specs(self) -> typing.Union[CLDFSpec, typing.Dict[str, CLDFSpec]]:
        """
        A `Dataset` must declare all CLDF datasets that are derived from it.

        :return: A single :class:`CLDFSpec` instance, or a `dict`, mapping names to `CLDFSpec`
            instances, where the name will be used by `cldf_reader`/`cldf_writer` to look up
            the spec.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self) -> typing.Dict[typing.Union[str, None], CLDFSpec]:
        """
        Turn :meth:`cldf_specs` into a `dict` for simpler lookup.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances. A single spec is made
            available under the key `None`.
        """
        specs = self.cldf_specs()
        if isinstance(specs, CLDFSpec):
            return {None: specs}
        assert isinstance(specs, dict)
        return specs

    def update_submodules(self):
        """
        Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated
        as git submodules.

        :raises subprocess.CalledProcessError: if the `git` command exits with non-zero status.
        """
        # The command is passed as argument list and run without a shell, so a dataset
        # directory containing spaces or shell metacharacters is handed to git verbatim.
        subprocess.check_call(
            ['git', '-C', str(self.dir.resolve()), 'submodule', 'update', '--remote'])

    def cldf_writer(
            self,
            args,
            cldf_spec: typing.Union[CLDFSpec, str, None] = None,
            clean=True) -> CLDFWriter:
        """
        :param args: Passed on to the `get_writer` method of the spec.
        :param cldf_spec: A `CLDFSpec` instance or the key of the relevant `CLDFSpec` in
            `Dataset.cldf_specs`
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing.
            Note that `False` must be passed for subsequent calls to `cldf_writer` in case the
            spec re-uses a directory.
        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data.
            This method should be used in a with-statement, and will then return a `CLDFWriter`
            with an empty working directory.
        :raises KeyError: if `cldf_spec` is not a key in `cldf_specs_dict`.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(
            self,
            cldf_spec: typing.Union[CLDFSpec, str, None] = None) -> pycldf.Dataset:
        """
        :param cldf_spec: A `CLDFSpec` instance or the key of the relevant `CLDFSpec` in
            `Dataset.cldf_specs`
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        :raises KeyError: if `cldf_spec` is not a key in `cldf_specs_dict`.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_dataset()

    @lazyproperty
    def repo(self) -> typing.Union[Repository, None]:
        """
        The git repository cloned to the dataset's directory (or `None`).
        """
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return None

    def _cmd_download(self, args):
        """
        Wrapper for `cmd_download`: creates `raw_dir` if necessary, runs the download and then
        records the time of the download in `raw/README.md`.
        """
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        (self.raw_dir / 'README.md').write_text(
            'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()), encoding='utf8')

    def cmd_download(self, args: argparse.Namespace):
        """
        Implementations of this methods should populate the dataset's `raw_dir` with the source
        data.

        The default implementation only logs a warning and returns `NOOP`.
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('download', self.id))
        return NOOP

    def _cmd_readme(self, args):
        """
        Wrapper for `cmd_readme`: inserts the build status badge (if any) after the first
        top-level heading, appends a section listing the CLDF datasets and writes the result
        to `README.md` in the dataset directory.

        Nothing is written if the dataset has no (truthy) metadata.
        """
        if self.metadata:
            badge = build_status_badge(self)
            md = self.cmd_readme(args)
            if badge:
                # Put the badge right below the first "# " heading only.
                lines, title_found = [], False
                for line in md.split('\n'):
                    lines.append(line)
                    if line.startswith('# ') and not title_found:
                        title_found = True
                        lines.extend(['', badge])
                md = '\n'.join(lines)

            # All links in the README are relative to the dataset directory.
            root = self.dir.resolve()
            section = [
                '\n\n## CLDF Datasets\n',
                'The following CLDF datasets are available in [{0}]({0}):\n'.format(
                    self.cldf_dir.resolve().relative_to(root))
            ]
            for ds in self.cldf_specs_dict.values():
                # Only specs for which CLDF metadata has been written already are listed.
                if ds.metadata_path.exists():
                    p = ds.metadata_path.resolve().relative_to(root)
                    section.append(
                        '- CLDF [{0}](https://github.com/cldf/cldf/tree/master/modules/{0}) '
                        'at [{1}]({1})'.format(ds.module, p))
            self.dir.joinpath('README.md').write_text(md + '\n'.join(section), encoding='utf8')

    def cmd_readme(self, args: argparse.Namespace) -> str:
        """
        Implementations of this method should create the content for the dataset's README.md
        and return it as markdown formatted string.

        The default implementation returns the markdown rendering of the metadata, or `''` if
        there is no metadata.
        """
        return self.metadata.markdown() if self.metadata else ''

    def _cmd_makecldf(self, args):
        """
        Wrapper for `cmd_makecldf`: provides `args.writer` if exactly one CLDF spec is declared
        and writes a `LICENSE` file if the legal code of the dataset's license is known.
        """
        specs = list(self.cldf_specs_dict.values())
        if len(specs) == 1:
            # There's only one CLDF spec! We instantiate the writer now and inject it into
            # `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)
        else:
            # With zero or multiple specs, `cmd_makecldf` has to call `cldf_writer` itself.
            self.cmd_makecldf(args)

        if self.metadata and self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                (self.dir / 'LICENSE').write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args: argparse.Namespace):
        """
        Implementations of this method should write the CLDF data curated by the dataset.

        The default implementation only logs a warning and returns `NOOP`.

        :param args: An `argparse.Namespace` including attributes:
            - `writer`: :class:`CLDFWriter` instance
        """
        args.log.warning('cmd_{0} not implemented for dataset {1}'.format('makecldf', self.id))
        return NOOP