Example #1
 def emit(self, record):
     from datalad.ui import ui
     pid = getattr(record, 'dlm_progress')
     update = getattr(record, 'dlm_progress_update', None)
     if pid not in self.pbars:
         # this is new
         pbar = ui.get_progressbar(label=getattr(record,
                                                 'dlm_progress_label', ''),
                                   unit=getattr(record, 'dlm_progress_unit',
                                                ''),
                                   total=getattr(record,
                                                 'dlm_progress_total',
                                                 None))
         self.pbars[pid] = pbar
     elif update is None:
         # not an update -> done
         # TODO if the other logging that is happening is less frontpage
         # we may want to actually "print" the completion message
         self.pbars.pop(pid).finish()
     else:
         # an update
         self.pbars[pid].update(update,
                                increment=getattr(record,
                                                  'dlm_progress_increment',
                                                  False))
         # Check for an updated label.
         label = getattr(record, 'dlm_progress_label', None)
         if label is not None:
             self.pbars[pid].set_desc(label)
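
For context, the `dlm_progress*` attributes read via `getattr()` in this handler are plain attributes of the log record, so they can be supplied through the standard `logging` `extra=` mechanism. A minimal sketch of the three record kinds the handler distinguishes, assuming the handler is attached to the logger together with a filter that keeps records without `dlm_progress` away from `emit()`; the logger name, pid value, and numbers are illustrative:

    import logging

    lgr = logging.getLogger("demo")  # illustrative logger name

    # Start: a record carrying 'dlm_progress' but no 'dlm_progress_update'
    # creates a new progress bar keyed by that pid.
    lgr.info("download started", extra={
        'dlm_progress': 'dl-1',
        'dlm_progress_label': 'Downloading',
        'dlm_progress_unit': ' Bytes',
        'dlm_progress_total': 1000,
    })

    # Update: the presence of 'dlm_progress_update' routes the record to
    # pbar.update(); increment=True makes the value relative.
    lgr.info("progress", extra={
        'dlm_progress': 'dl-1',
        'dlm_progress_update': 250,
        'dlm_progress_increment': True,
    })

    # Finish: same pid, no update field -> the bar is popped and finished.
    lgr.info("download done", extra={'dlm_progress': 'dl-1'})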
Example #2
 def emit(self, record):
     from datalad.ui import ui
     pid = getattr(record, 'dlm_progress')
     update = getattr(record, 'dlm_progress_update', None)
     # would be an actual message, not used ATM here,
     # and the record not passed to generic handler ATM
     # (filtered away by NoProgressLog)
     # so no final message is printed
     # msg = record.getMessage()
     if pid not in self.pbars:
         # this is new
         pbar = ui.get_progressbar(
             label=getattr(record, 'dlm_progress_label', ''),
             unit=getattr(record, 'dlm_progress_unit', ''),
             total=getattr(record, 'dlm_progress_total', None))
         self.pbars[pid] = pbar
     elif update is None:
         # not an update -> done
         # TODO if the other logging that is happening is less frontpage
         # we may want to actually "print" the completion message
         self.pbars.pop(pid).finish()
     else:
         # an update
         self.pbars[pid].update(
             update,
             increment=getattr(record, 'dlm_progress_increment', False))
         # Check for an updated label.
         label = getattr(record, 'dlm_progress_label', None)
         if label is not None:
             self.pbars[pid].set_desc(label)
Example #3
def add_extra_filename_values(filename_format, rows, urls, dry_run):
    """Extend `rows` with values for special formatting fields.
    """
    file_fields = list(get_fmt_names(filename_format))
    if any(i.startswith("_url") for i in file_fields):
        for row, url in zip(rows, urls):
            row.update(get_url_parts(url))

    if any(i.startswith("_url_filename") for i in file_fields):
        if dry_run:  # Don't waste time making requests.
            dummy = get_file_parts("BASE.EXT", "_url_filename")
            for idx, row in enumerate(rows):
                row.update(
                    {k: v + str(idx) for k, v in dummy.items()})
        else:
            pbar = ui.get_progressbar(total=len(urls),
                                      label="Requesting names", unit=" Files")
            for row, url in zip(rows, urls):
                # If we run into any issues here, we're just going to raise an
                # exception and then abort inside dlplugin.  It'd be good to
                # disentangle this from `extract` so that we could yield an
                # individual error, drop the row, and keep going.
                filename = get_url_filename(url)
                if filename:
                    row.update(get_file_parts(filename, "_url_filename"))
                else:
                    raise ValueError(
                        "{} does not contain a filename".format(url))
                pbar.update(1, increment=True)
            pbar.finish()
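
Stripped of the URL handling, the progress reporting in this function follows the basic `ui.get_progressbar` lifecycle used throughout these examples: create the bar with a known total, advance it by one per processed item, and finish it. A minimal sketch, with `items` and the per-item work as placeholders:

    from datalad.ui import ui

    items = ["a", "b", "c"]  # placeholder work items
    pbar = ui.get_progressbar(total=len(items),
                              label="Processing", unit=" Files")
    for item in items:
        # ... the per-item work would happen here ...
        pbar.update(1, increment=True)  # advance by one item
    pbar.finish()  # close the bar when all items are done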
Example #4
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import StandardAnalyzer
        from whoosh.analysis import SimpleAnalyzer

        # haven for terms that have been found to be undefined
        # (for faster decision-making upon next encounter)
        # this will harvest all discovered term definitions
        definitions = {
            '@id': 'unique identifier of an entity',
            # TODO make proper JSON-LD definition
            'path': 'path name of an entity relative to the searched base dataset',
            # TODO make proper JSON-LD definition
            'parentds': 'path of the datasets that contains an entity',
            # 'type' will not come from a metadata field, hence will not be detected
            'type': 'type of a record',
        }

        schema_fields = {
            n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
            for n in definitions}

        lgr.debug('Scanning for metadata keys')
        # quick 1st pass over all datasets to gather the needed schema fields
        pbar = ui.get_progressbar(
            label='Datasets',
            unit='ds',
            total=len(dsinfo))
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # no stringification of values for speed, we do not need/use the
            # actual values at this point, only the keys
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k in idxd:
                schema_fields[k] = wf.TEXT(stored=False,
                                           analyzer=SimpleAnalyzer())
            pbar.update(1, increment=True)
        pbar.finish()

        self.schema = wf.Schema(**schema_fields)
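
The fields collected here only become searchable once the resulting schema is written to an on-disk index, which the last example below does via `widx.create_in`. A minimal, self-contained sketch of that step; the index directory and field names are assumptions:

    import os
    from whoosh import fields as wf
    from whoosh import index as widx
    from whoosh.analysis import SimpleAnalyzer

    index_dir = '/tmp/whoosh-demo-index'   # assumed location
    os.makedirs(index_dir, exist_ok=True)  # create_in() needs an existing dir

    schema = wf.Schema(
        id=wf.ID(stored=True, unique=True),
        path=wf.ID(stored=True),
        comment=wf.TEXT(stored=False, analyzer=SimpleAnalyzer()),
    )
    idx_obj = widx.create_in(index_dir, schema)
    writer = idx_obj.writer()
    writer.add_document(id='ds-1', path='sub/ds', comment='some metadata')
    writer.commit()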
Example #5
 def wrapped(items, **kwargs):
     counts = defaultdict(int)
     pbar = ui.get_progressbar(total=len(items),
                               label=label, unit=" " + unit)
     results = []
     for res in fn(items, **kwargs):
         counts[res["status"]] += 1
         count_strs = (count_str(*args)
                       for args in [(counts["notneeded"], "skipped", False),
                                    (counts["error"], "failed", True)])
         pbar.update(1, increment=True)
         if counts["notneeded"] or counts["error"]:
             pbar.set_desc("{label} ({counts})".format(
                 label=label,
                 counts=", ".join(filter(None, count_strs))))
         pbar.refresh()
         results.append(res)
     pbar.finish()
     return results
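
`label`, `unit`, and `count_str` in this wrapper come from an enclosing scope that is not part of the excerpt; the code reads like the inner function of a decorator around a result-yielding function. A rough sketch of a plausible enclosing shape; every name below is an assumption, not the actual DataLad code:

    def count_str(count, verb, omg=False):
        # Hypothetical helper: renders e.g. "2 failed"; returns None when the
        # count is zero so that filter(None, ...) drops it from the label.
        return "{:d} {}".format(count, verb) if count else None

    def with_result_progress(fn, label="Total", unit=" Files"):
        # Hypothetical decorator around a result-yielding function `fn`;
        # its inner function would be the `wrapped` shown in this example.
        def wrapped(items, **kwargs):
            ...  # body as in the example above
        return wrapped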
Example #6
    def upload_file(self, fname, files_url):
        # In the v2 API there seems to be no easy way to "just upload": we
        # need to initiate the upload, send the individual parts, and finalize.
        # TODO: check whether a file with the same name is already available,
        # and offer to remove/prune it
        import os
        from datalad.utils import md5sum
        from datalad.ui import ui
        file_rec = {
            'md5': md5sum(fname),
            'name': os.path.basename(fname),
            'size': os.stat(fname).st_size
        }
        # Initiate upload
        j = self.post(files_url, file_rec)
        file_endpoint = j['location']
        file_info = self.get(file_endpoint)
        file_upload_info = self.get(file_info['upload_url'])

        pbar = ui.get_progressbar(
            label=fname,  # fill_text=f.name,
            total=file_rec['size'])
        with open(fname, 'rb') as f:
            for part in file_upload_info['parts']:
                udata = dict(file_info, **part)
                if part['status'] == 'PENDING':
                    f.seek(part['startOffset'])
                    data = f.read(part['endOffset'] - part['startOffset'] + 1)
                    url = '{upload_url}/{partNo}'.format(**udata)
                    ok = self.put(url,
                                  data=data,
                                  binary=True,
                                  return_json=False)
                    assert ok == b'OK'
                pbar.update(part['endOffset'], increment=False)
            pbar.finish()

        # complete upload
        jcomplete = self.post(file_endpoint, return_json=False)
        return file_info
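
The read length `endOffset - startOffset + 1` implies that the service reports inclusive byte offsets for each part (an assumption based on the `+ 1`), and the progress bar is driven with absolute positions (`increment=False`) rather than deltas. A tiny worked example of that arithmetic, with made-up numbers:

    part = {'startOffset': 0, 'endOffset': 1048575}       # first 1 MiB part
    nbytes = part['endOffset'] - part['startOffset'] + 1  # 1048576 bytes read
    # pbar.update(part['endOffset'], increment=False) then moves the bar to
    # the absolute position at the end of this part.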
Example #7
    def upload_file(self, fname, files_url):
        # In the v2 API there seems to be no easy way to "just upload": we
        # need to initiate the upload, send the individual parts, and finalize.
        # TODO: check whether a file with the same name is already available,
        # and offer to remove/prune it
        import os
        from datalad.utils import md5sum
        from datalad.ui import ui
        file_rec = {'md5': md5sum(fname),
                    'name': os.path.basename(fname),
                    'size': os.stat(fname).st_size
                    }
        # Initiate upload
        j = self.post(files_url, file_rec)
        file_endpoint = j['location']
        file_info = self.get(file_endpoint)
        file_upload_info = self.get(file_info['upload_url'])

        pbar = ui.get_progressbar(label=fname,  # fill_text=f.name,
                                  total=file_rec['size'])
        with open(fname, 'rb') as f:
            for part in file_upload_info['parts']:
                udata = dict(file_info, **part)
                if part['status'] == 'PENDING':
                    f.seek(part['startOffset'])
                    data = f.read(part['endOffset'] - part['startOffset'] + 1)
                    url = '{upload_url}/{partNo}'.format(**udata)
                    ok = self.put(url, data=data, binary=True, return_json=False)
                    assert ok == b'OK'
                pbar.update(part['endOffset'], increment=False)
            pbar.finish()

        # complete upload
        jcomplete = self.post(file_endpoint, return_json=False)
        return file_info
Example #8
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import agginfo_relpath
        # what is the latest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
        # use location common to all index types, they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug(
                    'Search index contains %i documents',
                    idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                # TODO log this
                pass
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise e
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pulls this info from a file
        dsinfo = self.ds.metadata(
            get_aggregates=True,
            return_type='list',
            result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        pbar = ui.get_progressbar(
            label='Datasets',
            unit='ds',
            total=len(dsinfo))
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural(
                            'document',
                            'documents',
                            idx_size - old_idx_size,
                            include_count=True),
                        old_ds_rpath)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)
                pbar.update(1, increment=True)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural(
                    'document',
                    'documents',
                    idx_size - old_idx_size,
                    include_count=True),
                old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        pbar.finish()


        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
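
Not part of the DataLad excerpt above, but for completeness: once such an index has been committed, it can be opened and queried through Whoosh's standard search API. A minimal sketch; the index directory, field name, and query text are assumptions:

    from whoosh import index as widx
    from whoosh.qparser import QueryParser

    index_dir = '/tmp/whoosh-demo-index'  # assumed location
    idx_obj = widx.open_dir(index_dir)
    with idx_obj.searcher() as searcher:
        query = QueryParser("path", schema=idx_obj.schema).parse("sub*")
        for hit in searcher.search(query, limit=10):
            print(hit.fields())  # stored fields of each matching document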