Code example #1
File: create.py  Project: datalad/datalad
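# Context assumed by this excerpt (not shown in the snippet): module-level
# imports such as `os.path as op`, `AnnexRepo`, and the `cfg` configuration
# manager are provided by the surrounding datalad module.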
def _setup_annex_repo(path, initopts=None, fake_dates=False, description=None):
    """Create and configure a repository at `path`

    This includes a default setup of annex.largefiles.

    Parameters
    ----------
    path: str or Path
      Path of the repository
    initopts: dict, optional
      Git options to be passed to the AnnexRepo constructor
    fake_dates: bool, optional
      Passed to the AnnexRepo constructor
    description: str, optional
      Passed to the AnnexRepo constructor

    Returns
    -------
    AnnexRepo, dict
      Created repository and records for any repo component that needs to be
      passed to git-add as a result of the setup procedure.
    """
    # always come with annex when created from scratch
    tbrepo = AnnexRepo(
        path,
        create=True,
        create_sanity_checks=False,
        # do not set backend here, to avoid a dedicated commit
        backend=None,
        # None causes version to be taken from config
        version=None,
        description=description,
        git_opts=initopts,
        fake_dates=fake_dates)
    # set the annex backend in .gitattributes as a staged change
    tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                               persistent=True,
                               commit=False)
    add_to_git = {
        tbrepo.pathobj / '.gitattributes': {
            'type': 'file',
            'state': 'added',
        }
    }
    # make sure that v6 annex repos never commit content under .datalad
    attrs_cfg = (
        ('config', 'annex.largefiles', 'nothing'),
        ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
        ('metadata/objects/**', 'annex.largefiles', '({})'.format(
            cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
    attrs = tbrepo.get_gitattributes(
        [op.join('.datalad', i[0]) for i in attrs_cfg])
    set_attrs = []
    for p, k, v in attrs_cfg:
        if attrs.get(op.join('.datalad', p), {}).get(k, None) != v:
            set_attrs.append((p, {k: v}))
    if set_attrs:
        tbrepo.set_gitattributes(set_attrs,
                                 attrfile=op.join('.datalad',
                                                  '.gitattributes'))

    # prevent git annex from ever annexing .git* stuff (gh-1597)
    attrs = tbrepo.get_gitattributes('.git')
    if attrs.get('.git', {}).get('annex.largefiles', None) != 'nothing':
        tbrepo.set_gitattributes(
            [('**/.git*', {'annex.largefiles': 'nothing'})])
        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'untracked'
        }
    return tbrepo, add_to_git
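
A minimal usage sketch, assuming a writable target directory (the path and
description below are hypothetical):

# hedged sketch, not part of the project source
repo, to_add = _setup_annex_repo('/tmp/new-ds', description='scratch dataset')
# the returned records name the repo components (here the staged
# .gitattributes) that must be committed to git proper
for p, props in to_add.items():
    print(p, props['type'], props['state'])
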
Code example #2
File: create.py  Project: oesteban/datalad
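# Context assumed by this excerpt: it is the body of a datalad command's
# `__call__` (a staticmethod on the command class, hence no `self`); names
# such as `op` (os.path), `isdir`, `listdir`, `uuid`, `random`, the `lgr`
# logger, `cfg`, and the module-level `_seed` come from the surrounding
# module.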
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 text_no_annex=None,
                 fake_dates=False):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # a dataset was given explicitly, but it does not point to an
            # installed dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "a description for the annex while also "
                                 "requesting no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "annex options while also requesting no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "annex init options while also requesting "
                                 "no annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else (getpwd() if dataset is None else None),
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status': 'error',
                    'message': (
                        'collision with known subdataset %s/ in dataset %s',
                        subs[0], path['parentds']),
                })
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status': 'error',
                'message': 'will not create a dataset in a non-empty '
                           'directory, use `force` option to ignore',
            })
            yield path
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = []

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path,
                    url=None,
                    create=True,
                    git_opts=git_opts,
                    fake_dates=fake_dates)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(tbds.path,
                               url=None,
                               create=True,
                               backend=annex_backend,
                               version=annex_version,
                               description=description,
                               git_opts=git_opts,
                               annex_opts=annex_opts,
                               annex_init_opts=annex_init_opts,
                               fake_dates=fake_dates)

            if text_no_annex:
                attrs = tbrepo.get_gitattributes('.')
                # some basic protection against useless duplication
                # on rerun with --force
                if attrs.get('.', {}).get('annex.largefiles', None) != \
                        '(not(mimetype=text/*))':
                    tbrepo.set_gitattributes(
                        [('*',
                          {'annex.largefiles': '(not(mimetype=text/*))'})])
                    add_to_git.append('.gitattributes')

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way: a time-based UUID
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # generate pre-seeded IDs from the (seeded) `random` state;
            # `_seed` is presumably a module-level knob used by tests
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else uuid_id,
                        where='dataset')

        add_to_git.append('.datalad')

        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if attrs.get(op.join('.datalad', p), {}).get(k, None) != v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(set_attrs,
                                        attrfile=op.join(
                                            '.datalad', '.gitattributes'))
            add_to_git.append('.datalad')

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if attrs.get('.git', {}).get('annex.largefiles', None) != 'nothing':
            tbds.repo.set_gitattributes(
                [('**/.git*', {'annex.largefiles': 'nothing'})])
            add_to_git.append('.gitattributes')

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add(add_to_git,
                 to_git=True,
                 save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path \
           and tbds.repo.get_hexsha():
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(tbds.path,
                                 save=save,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path
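
For comparison, a hedged sketch of invoking this command through datalad's
high-level API (paths are hypothetical; keyword names follow the signature
above):

import datalad.api as dl

# create a fresh annex-backed dataset
ds = dl.create(path='/tmp/demo-ds', description='demo dataset')
# create a plain-git subdataset registered in `ds`; the relative path is
# resolved against the top of the dataset
sub = dl.create(path='code', dataset=ds, no_annex=True)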