Example #1
 def time_ls_recursive_long_all(self):
     ls(self.ds.path, recursive=True, long_=True, all_=True)
Example #2
 def time_ls(self):
     ls(self.ds.path)
Example #3
 def time_ls_recursive(self):
     ls(self.ds.path, recursive=True)
Example #4
 def time_ls_recursive(self):
     ls(self.ds.path, recursive=True)
Example #5
 def time_ls_recursive_long_all(self):
     ls(self.ds.path, recursive=True, long_=True, all_=True)
Example #6
 def time_ls_recursive(self, tarfile_path):
     ls(self.ds.path, recursive=True)
Example #7
 def time_ls(self):
     ls(self.ds.path)
Example #8
 def time_ls(self, tarfile_path):
     ls(self.ds.path)
Example #9
File: api.py Project: hanke/datalad
 def time_ls_recursive(self, tarfile_path):
     ls(self.ds.path, recursive=True)
Example #10
File: api.py Project: hanke/datalad
 def time_ls(self, tarfile_path):
     ls(self.ds.path)
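
The time_* methods in Examples #1-#10 are asv-style benchmark methods excerpted from a larger suite; the import of ls and the self.ds attribute are set up elsewhere. A minimal self-contained sketch of the kind of harness they assume could look like the following (the LsSuite class name, the use of create() for setup, and the temporary-directory handling are illustrative assumptions, not code from the examples above):

import tempfile
import shutil

from datalad.api import create, ls


class LsSuite:
    """asv-style benchmark suite timing datalad's `ls` command (hypothetical harness)."""

    def setup(self):
        # create a throwaway dataset to list; a real suite would also
        # populate it so the timings are meaningful
        self.path = tempfile.mkdtemp(prefix='ls-bench-')
        self.ds = create(self.path)

    def teardown(self):
        # annexed content can be read-only, so a real suite would need a more
        # careful cleanup (e.g. datalad's own rmtree helper)
        shutil.rmtree(self.path, ignore_errors=True)

    def time_ls(self):
        ls(self.ds.path)

    def time_ls_recursive(self):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self):
        ls(self.ds.path, recursive=True, long_=True, all_=True)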
Example #11
def pipeline(dataset,
             versioned_urls=True, topurl=TOPURL,
             versions_overlay_level=2,
             leading_dirs_depth=1,
             prefix='',
             s3_prefix=None):
    """Pipeline to crawl/annex an openfmri dataset

    Parameters
    ----------
    dataset: str
      Id of the OpenfMRI dataset (e.g. ds000001)
    versioned_urls: bool, optional
      Request versioned URLs.  The OpenfMRI bucket is versioned, but if the
      original data resides elsewhere, set to False
    topurl: str, optional
      Top level URL to the datasets.
    versions_overlay_level: int, optional
      How many version components (e.g. 2 for major.minor) define an overlay
      when removing other versions
    leading_dirs_depth: int, optional
      Number of leading directories to strip when extracting archives
    prefix: str, optional
      Prefix regular expression in URLs identifying the subgroup of data to be
      fetched from the dataset
      (e.g. in the case of ds000017 there are A and B)
    s3_prefix: str or None, optional
      Per-dataset subdirectory in the bucket to crawl into the incoming-s3
      branch, so that the extracted files available from the openfmri bucket
      also get annexed.  If None, it is determined by the availability of the
      sub-directory in the S3 bucket
    """
    skip_no_changes = True    # when redoing incoming-processed we would otherwise finish with a dirty
                              # incoming-processed branch, since the commit would fail with nothing to commit
    leading_dirs_depth = int(leading_dirs_depth)
    versions_overlay_level = int(versions_overlay_level)
    dataset_url = '%s%s/' % (topurl, dataset)
    lgr.info("Creating a pipeline for the openfmri dataset %s" % dataset)

    special_remotes = [ARCHIVES_SPECIAL_REMOTE]

    if s3_prefix is None:
        # some datasets are available (fresh enough or old) from S3, so let's sense whether this one is
        # s3_prefix = re.sub('^ds0*([0-9]{3})/*', r'ds\1/', dataset)  # openfmri bucket
        s3_prefix = dataset
        # this was relevant only for the openfmri bucket; for openneuro it is all under the same
        # directory, separated deep inside between A and B, so we just crawl for both
        # if dataset == 'ds000017':
        #     # we had some custom prefixing going on
        #     assert(prefix)
        #     suf = prefix[-3]
        #     assert suf in 'AB'
        #     s3_prefix = 'ds017' + suf

        openfmri_s3_prefix = 's3://openneuro/'
        try:
            if not ls('%s%s' % (openfmri_s3_prefix, s3_prefix)):
                s3_prefix = None  # not there
        except Exception as exc:
            lgr.warning(
                "Failed to access %s, not attempting to crawl S3: %s",
                s3_prefix, exc_str(exc)
            )
            s3_prefix = None

    if s3_prefix:
        # actually not needed here since we are remapping them to public http
        #  urls
        # special_remotes += [DATALAD_SPECIAL_REMOTE]
        pass


    annex = Annexificator(
        create=False,  # must be already initialized etc
        # leave in Git only obvious descriptors and code snippets -- the rest goes to annex
        # so maybe eventually we could take advantage of git tags for changing layout
        statusdb='json',
        special_remotes=special_remotes,
        # all .txt and .json in root directory (only) go into git!
        options=["-c",
                 "annex.largefiles="
                 # ISSUES LICENSE Makefile
                 "exclude=Makefile and exclude=LICENSE* and exclude=ISSUES*"
                 " and exclude=CHANGES* and exclude=README* and exclude=ReadMe.txt"
                 " and exclude=*.[mc] and exclude=dataset*.json and exclude=license.txt"
                 " and (exclude=*.txt or include=*/*.txt)"
                 " and (exclude=*.json or include=*/*.json)"
                 " and (exclude=*.tsv or include=*/*.tsv)"
                 ])

    if s3_prefix:
        # a sub-pipeline to crawl s3 bucket
        s3_pipeline_here = \
            [
                [
                    annex.switch_branch('incoming-s3-openneuro'),
                    s3_pipeline(s3_prefix, bucket='openneuro', tag=False), # for 31 ;) skip_problematic=True),
                    annex.switch_branch('master'),
                ]
            ]
    else:
        s3_pipeline_here = []

    # common kwargs which would later be tuned up
    def add_archive_content(**kw):
        if 'leading_dirs_depth' not in kw:
            kw['leading_dirs_depth'] = leading_dirs_depth
        if 'strip_leading_dirs' not in kw:
            kw['strip_leading_dirs'] = bool(leading_dirs_depth)
        return annex.add_archive_content(
            existing='archive-suffix',
            delete=True,
            exclude=['(^|%s)\._' % os.path.sep],  # some files like '._whatever'
            **kw
        # overwrite=True,
        # TODO: we might need a safeguard for cases when multiple subdirectories within a single tarball
        )

    return s3_pipeline_here + [
        # optionally "log" to annex extracted content available on openfmri S3
        annex.switch_branch('incoming'),
        [   # nested pipeline so we could quit it early should we decide there is nothing to do in it
            # but then we would still return to 'master' branch
            crawl_url(dataset_url),
            [  # changelog XXX there might be multiple, e.g. in case of ds000017
               a_href_match(".*%srelease_history.txt" % prefix),  # , limit=1
               assign({'filename': 'changelog.txt'}),
               annex,
            ],
            # Moving to proper meta-data descriptors, so no need to generate and possibly conflict
            # with distributed one README
            # [  # README
            #    # Somewhat sucks here since 'url' from above would be passed all the way to annex
            #    # So nodes such as extract_readme should clean the data so that only relevant pieces are left
            #    extract_readme,
            #    annex,
            # ],
            [  # and collect all URLs pointing to tarballs
                a_href_match('.*/%s.*\.(tgz|tar.*|zip)' % prefix, min_count=1),
                # Since all content of openfmri is openly available anyway, there is no need atm
                # to use https, which complicates proxying etc.  Thus rewrite AWS URLs
                # to openfmri S3 from https to http
                # TODO: might want to become an option for get_versioned_url?
                sub({
                 'url': {
                   '(http)s?(://.*openfmri\.s3\.amazonaws.com/|://s3\.amazonaws\.com/openfmri/)': r'\1\2'}}),
                func_to_node(get_versioned_url,
                             data_args=['url'],
                             outputs=['url'],
                             kwargs={'guarantee_versioned': versioned_urls,
                                     'verify': True}),
                annex,
            ],
            # TODO: describe_dataset
            # Now some true magic -- possibly multiple commits, 1 per each detected **new** version!
            # this one doesn't go through all files, but only through the freshly staged!
            annex.commit_versions(
                '_R(?P<version>\d+[\.\d]*)(?=[\._])',
                always_versioned='ds\d\d+.*',
                unversioned='default',
                default='1.0.0'),
        ],
        annex.remove_obsolete(),  # should be called while still within incoming but only once
        # TODO: since it is a very common pattern -- consider absorbing into e.g. add_archive_content?
        [   # nested pipeline so we could skip it entirely if nothing new to be merged
            {'loop': not skip_no_changes},  # loop for multiple versions merges
            annex.switch_branch('incoming-processed'),
            annex.merge_branch('incoming',
                               one_commit_at_a_time=True, strategy='theirs', commit=False,
                               skip_no_changes=skip_no_changes
                               ),
            # still we would have all the versions present -- we need to restrict to only the current one!
            # TODO:  we often need the ability to augment the next node's options with checks etc in the previous ones
            # e.g. here the overlay option depending on which dataset/version is being processed
            annex.remove_other_versions('incoming',
                                        remove_unversioned=True,
                                        # ds001.tar.gz  could then become ds0000001.zip
                                        fpath_subs=[
                                            # ad-hoc fixups for some datasets
                                            ('ds005\.tgz', 'ds005_raw.tgz'),
                                            # it had been split (with derived data kept separately) and then joined into this one
                                            ('ds007_01-20\.tgz', 'ds007_raw.tgz'),
                                            ('ds000107_raw\.', 'ds000107.'),
                                            # generic
                                            ('^ds0*', '^ds'),
                                            ('\.(zip|tgz|tar\.gz)$', '.ext')
                                        ],
                                        # Had to do this manually for this one since there was a switch from
                                        # the overlay layout to an even bigger single one within a minor 2.0.1 "release"
                                        # 158 -- 1.0.1 changed the layout completely, so it should not be overlaid ATM
                                        overlay=None
                                            if dataset in ('ds000007', 'ds000114', 'ds000119', 'ds000158', 'ds000216')
                                            else versions_overlay_level,  # use major.minor to define overlays
                                        #overlay=None, # use major.minor to define overlays
                                        exclude='(README|changelog).*'),
            [   # Pipeline to augment content of the incoming and commit it to master
                # There might be archives within archives, so we need to loop
                {'loop': True},
                find_files("\.(zip|tgz|tar(\..+)?)$", fail_if_none=True),  #  we fail if none found -- there must be some! ;)),
                assign({'dataset_file': dataset + '///%(filename)s'}, interpolate=True),
                switch(
                    'dataset_file',
                    {
                        'ds0*158///aalmasks\.zip$': add_archive_content(add_archive_leading_dir=True),
                        '.*///ds000030_R1\.0\.1_metadata\.tgz': add_archive_content(leading_dirs_depth=4),
                    },
                    default=add_archive_content(),
                    re=True,
                ),
            ],
            [
                find_files("(\.(tsv|csv|txt|json|gz|bval|bvec|hdr|img|m|mat|pdf|png|zip|nii|jpg|fif|fig)|README|CHANGES)$"),
                fix_permissions(executable=False)
            ],
            annex.switch_branch('master'),
            annex.merge_branch('incoming-processed', commit=True, allow_unrelated=True),
            annex.finalize(tag=True, aggregate=True),
        ],
        annex.switch_branch('master'),
        annex.finalize(cleanup=True, aggregate=True),
        # TODO:  drop all files which aren't in master or incoming  since now many extracted arrive from s3
        #  - no need to keep all versions locally for all of them

    ]
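
Within this pipeline, ls is used only once: to probe whether the per-dataset prefix exists in the S3 bucket, which in turn decides whether the incoming-s3-openneuro sub-pipeline gets prepended. Pulled out of context, that probe amounts to the following sketch (probe_s3_prefix is a hypothetical helper introduced here for illustration; only the ls() call and the `not ls(...)` check come from the code above):

from datalad.api import ls


def probe_s3_prefix(dataset, bucket_url='s3://openneuro/'):
    """Return the per-dataset S3 prefix if it is listable, otherwise None."""
    s3_prefix = dataset
    try:
        # mirrors the `if not ls(...)` check in the pipeline: an empty/falsy
        # listing means the prefix is not there
        if not ls('%s%s' % (bucket_url, s3_prefix)):
            return None
    except Exception:
        # access problems (credentials, network, ...) also mean: do not crawl S3
        return None
    return s3_prefix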
Example #12
def pipeline(dataset,
             versioned_urls=True, topurl=TOPURL,
             versions_overlay_level=2,
             leading_dirs_depth=1,
             prefix='',
             s3_prefix=None):
    """Pipeline to crawl/annex an openfmri dataset

    Parameters
    ----------
    dataset: str
      Id of the OpenfMRI dataset (e.g. ds000001)
    versioned_urls: bool, optional
      Request versioned URLs.  The OpenfMRI bucket is versioned, but if the
      original data resides elsewhere, set to False
    topurl: str, optional
      Top level URL to the datasets.
    versions_overlay_level: int, optional
      How many version components (e.g. 2 for major.minor) define an overlay
      when removing other versions
    leading_dirs_depth: int, optional
      Number of leading directories to strip when extracting archives
    prefix: str, optional
      Prefix regular expression in URLs identifying the subgroup of data to be
      fetched from the dataset
      (e.g. in the case of ds000017 there are A and B)
    s3_prefix: str or None, optional
      Per-dataset subdirectory in the bucket to crawl into the incoming-s3
      branch, so that the extracted files available from the openfmri bucket
      also get annexed.  If None, it is determined by the availability of the
      sub-directory in the S3 bucket
    """
    skip_no_changes = True    # when redoing incoming-processed we would otherwise finish with a dirty
                              # incoming-processed branch, since the commit would fail with nothing to commit
    leading_dirs_depth = int(leading_dirs_depth)
    versions_overlay_level = int(versions_overlay_level)
    dataset_url = '%s%s/' % (topurl, dataset)
    lgr.info("Creating a pipeline for the openfmri dataset %s" % dataset)

    special_remotes = [ARCHIVES_SPECIAL_REMOTE]

    if s3_prefix is None:
        # some datasets are available (fresh enough or old) from S3, so let's sense whether this one is
        s3_prefix = re.sub('^ds0*([0-9]{3})/*', r'ds\1/', dataset)
        if dataset == 'ds000017':
            # we had some custom prefixing going on
            assert(prefix)
            suf = prefix[-3]
            assert suf in 'AB'
            s3_prefix = 'ds017' + suf

        openfmri_s3_prefix = 's3://openfmri/'
        try:
            if not ls('%s%s' % (openfmri_s3_prefix, s3_prefix)):
                s3_prefix = None  # not there
        except Exception as exc:
            lgr.warning(
                "Failed to access %s, not attempting to crawl S3: %s",
                s3_prefix, exc_str(exc)
            )
            s3_prefix = None

    if s3_prefix:
        # actually not needed here since we are remapping them to public http
        #  urls
        # special_remotes += [DATALAD_SPECIAL_REMOTE]
        pass


    annex = Annexificator(
        create=False,  # must be already initialized etc
        # leave in Git only obvious descriptors and code snippets -- the rest goes to annex
        # so maybe eventually we could take advantage of git tags for changing layout
        statusdb='json',
        special_remotes=special_remotes,
        # all .txt and .json in root directory (only) go into git!
        options=["-c",
                 "annex.largefiles="
                 # ISSUES LICENSE Makefile
                 "exclude=Makefile and exclude=LICENSE* and exclude=ISSUES*"
                 " and exclude=CHANGES* and exclude=README* and exclude=ReadMe.txt"
                 " and exclude=*.[mc] and exclude=dataset*.json and exclude=license.txt"
                 " and (exclude=*.txt or include=*/*.txt)"
                 " and (exclude=*.json or include=*/*.json)"
                 " and (exclude=*.tsv or include=*/*.tsv)"
                 ])

    if s3_prefix:
        # a sub-pipeline to crawl s3 bucket
        s3_pipeline_here = \
            [
                [
                    annex.switch_branch('incoming-s3'),
                    s3_pipeline(s3_prefix, tag=False), # for 31 ;) skip_problematic=True),
                    annex.switch_branch('master'),
                ]
            ]
    else:
        s3_pipeline_here = []

    # common kwargs which would later be tuned up
    def add_archive_content(**kw):
        if 'leading_dirs_depth' not in kw:
            kw['leading_dirs_depth'] = leading_dirs_depth
        if 'strip_leading_dirs' not in kw:
            kw['strip_leading_dirs'] = bool(leading_dirs_depth)
        return annex.add_archive_content(
            existing='archive-suffix',
            delete=True,
            exclude=['(^|%s)\._' % os.path.sep],  # some files like '._whatever'
            **kw
        # overwrite=True,
        # TODO: we might need a safeguard for cases when multiple subdirectories within a single tarball
        )

    return s3_pipeline_here + [
        # optionally "log" to annex extracted content available on openfmri S3
        annex.switch_branch('incoming'),
        [   # nested pipeline so we could quit it early should we decide there is nothing to do in it
            # but then we would still return to 'master' branch
            crawl_url(dataset_url),
            [  # changelog XXX there might be multiple, e.g. in case of ds000017
               a_href_match(".*%srelease_history.txt" % prefix),  # , limit=1
               assign({'filename': 'changelog.txt'}),
               annex,
            ],
            # Moving to proper meta-data descriptors, so no need to generate and possibly conflict
            # with distributed one README
            # [  # README
            #    # Somewhat sucks here since 'url' from above would be passed all the way to annex
            #    # So nodes such as extract_readme should clean the data so that only relevant pieces are left
            #    extract_readme,
            #    annex,
            # ],
            [  # and collect all URLs pointing to tarballs
                a_href_match('.*/%s.*\.(tgz|tar.*|zip)' % prefix, min_count=1),
                # Since all content of openfmri is openly available anyway, there is no need atm
                # to use https, which complicates proxying etc.  Thus rewrite AWS URLs
                # to openfmri S3 from https to http
                # TODO: might want to become an option for get_versioned_url?
                sub({
                 'url': {
                   '(http)s?(://.*openfmri\.s3\.amazonaws.com/|://s3\.amazonaws\.com/openfmri/)': r'\1\2'}}),
                func_to_node(get_versioned_url,
                             data_args=['url'],
                             outputs=['url'],
                             kwargs={'guarantee_versioned': versioned_urls,
                                     'verify': True}),
                annex,
            ],
            # TODO: describe_dataset
            # Now some true magic -- possibly multiple commits, 1 per each detected **new** version!
            # this one doesn't go through all files, but only through the freshly staged!
            annex.commit_versions(
                '_R(?P<version>\d+[\.\d]*)(?=[\._])',
                always_versioned='ds\d\d+.*',
                unversioned='default',
                default='1.0.0'),
        ],
        annex.remove_obsolete(),  # should be called while still within incoming but only once
        # TODO: since it is a very common pattern -- consider absorbing into e.g. add_archive_content?
        [   # nested pipeline so we could skip it entirely if nothing new to be merged
            {'loop': not skip_no_changes},  # loop for multiple versions merges
            annex.switch_branch('incoming-processed'),
            annex.merge_branch('incoming',
                               one_commit_at_a_time=True, strategy='theirs', commit=False,
                               skip_no_changes=skip_no_changes
                               ),
            # still we would have all the versions present -- we need to restrict to only the current one!
            # TODO:  we often need the ability to augment the next node's options with checks etc in the previous ones
            # e.g. here the overlay option depending on which dataset/version is being processed
            annex.remove_other_versions('incoming',
                                        remove_unversioned=True,
                                        # ds001.tar.gz  could then become ds0000001.zip
                                        fpath_subs=[
                                            # ad-hoc fixups for some datasets
                                            ('ds005\.tgz', 'ds005_raw.tgz'),
                                            # it had been split (with derived data kept separately) and then joined into this one
                                            ('ds007_01-20\.tgz', 'ds007_raw.tgz'),
                                            # generic
                                            ('^ds0*', '^ds'),
                                            ('\.(zip|tgz|tar\.gz)$', '.ext')
                                        ],
                                        # Had to do this manually for this one since there was a switch from
                                        # the overlay layout to an even bigger single one within a minor 2.0.1 "release"
                                        overlay=None
                                            if dataset in ('ds000007', 'ds000114', 'ds000119')
                                            else versions_overlay_level,  # use major.minor to define overlays
                                        #overlay=None, # use major.minor to define overlays
                                        exclude='(README|changelog).*'),
            [   # Pipeline to augment content of the incoming and commit it to master
                # There might be archives within archives, so we need to loop
                {'loop': True},
                find_files("\.(zip|tgz|tar(\..+)?)$", fail_if_none=True),  #  we fail if none found -- there must be some! ;)),
                assign({'dataset_file': dataset + '///%(filename)s'}, interpolate=True),
                switch(
                    'dataset_file',
                    {
                        'ds0*158///aalmasks\.zip$': add_archive_content(add_archive_leading_dir=True),
                        '.*///ds000030_R1\.0\.1_metadata\.tgz': add_archive_content(leading_dirs_depth=4),
                    },
                    default=add_archive_content(),
                    re=True,
                ),
            ],
            [
                find_files("(\.(tsv|csv|txt|json|gz|bval|bvec|hdr|img|m|mat|pdf|png|zip|nii|jpg|fif|fig)|README|CHANGES)$"),
                fix_permissions(executable=False)
            ],
            annex.switch_branch('master'),
            annex.merge_branch('incoming-processed', commit=True, allow_unrelated=True),
            annex.finalize(tag=True, aggregate=True),
        ],
        annex.switch_branch('master'),
        annex.finalize(cleanup=True, aggregate=True),
    ]
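
Note that both variants only construct and return a nested list of crawler nodes; nothing is executed at call time. A hedged sketch of how such a pipeline definition might be driven, assuming datalad-crawler's run_pipeline helper and import path (neither is shown in the examples above):

from datalad_crawler.pipeline import run_pipeline

# build the node graph for one dataset ...
nodes = pipeline('ds000001', versioned_urls=True)

# ... and hand it to the crawler framework for execution; the current
# working directory is assumed to be an already-initialized dataset
run_pipeline(nodes)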