def emit(self, record):
    from datalad.ui import ui
    pid = getattr(record, 'dlm_progress')
    update = getattr(record, 'dlm_progress_update', None)
    # would be an actual message, not used ATM here,
    # and the record not passed to generic handler ATM
    # (filtered away by NoProgressLog)
    # so no final message is printed
    # msg = record.getMessage()
    if pid not in self.pbars:
        # this is new
        pbar = ui.get_progressbar(
            label=getattr(record, 'dlm_progress_label', ''),
            unit=getattr(record, 'dlm_progress_unit', ''),
            total=getattr(record, 'dlm_progress_total', None))
        self.pbars[pid] = pbar
    elif update is None:
        # not an update -> done
        # TODO if the other logging that is happening is less frontpage
        # we may want to actually "print" the completion message
        self.pbars.pop(pid).finish()
    else:
        # an update
        self.pbars[pid].update(
            update,
            increment=getattr(record, 'dlm_progress_increment', False))
        # Check for an updated label.
        label = getattr(record, 'dlm_progress_label', None)
        if label is not None:
            self.pbars[pid].set_desc(label)
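# Minimal sketch (not part of the original source) of how a log record could
# reach the handler above: the `dlm_progress*` values that `emit()` reads via
# getattr() are plain attributes on the LogRecord, which the standard
# `logging` module attaches from the `extra` mapping.  The logger name and
# the pid value below are made up for illustration; how DataLad itself wires
# these records is an assumption here.
import logging

example_lgr = logging.getLogger('datalad.example')
pid = 'download-job-1'  # hypothetical progress-bar identifier
# start a new bar
example_lgr.info('Downloading', extra={
    'dlm_progress': pid,
    'dlm_progress_label': 'Downloading',
    'dlm_progress_unit': ' Bytes',
    'dlm_progress_total': 1000})
# advance it by 100 units
example_lgr.info('chunk received', extra={
    'dlm_progress': pid,
    'dlm_progress_update': 100,
    'dlm_progress_increment': True})
# a record without 'dlm_progress_update' makes the handler finish the bar
example_lgr.info('done', extra={'dlm_progress': pid})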
def add_extra_filename_values(filename_format, rows, urls, dry_run):
    """Extend `rows` with values for special formatting fields.
    """
    file_fields = list(get_fmt_names(filename_format))
    if any(i.startswith("_url") for i in file_fields):
        for row, url in zip(rows, urls):
            row.update(get_url_parts(url))

    if any(i.startswith("_url_filename") for i in file_fields):
        if dry_run:  # Don't waste time making requests.
            dummy = get_file_parts("BASE.EXT", "_url_filename")
            for idx, row in enumerate(rows):
                row.update(
                    {k: v + str(idx) for k, v in dummy.items()})
        else:
            pbar = ui.get_progressbar(total=len(urls),
                                      label="Requesting names",
                                      unit=" Files")
            for row, url in zip(rows, urls):
                # If we run into any issues here, we're just going to raise an
                # exception and then abort inside dlplugin.  It'd be good to
                # disentangle this from `extract` so that we could yield an
                # individual error, drop the row, and keep going.
                filename = get_url_filename(url)
                if filename:
                    row.update(get_file_parts(filename, "_url_filename"))
                else:
                    raise ValueError(
                        "{} does not contain a filename".format(url))
                pbar.update(1, increment=True)
            pbar.finish()
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import StandardAnalyzer
    from whoosh.analysis import SimpleAnalyzer

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the dataset that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': 'type of a record',
    }

    schema_fields = {
        n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
        for n in definitions}

    lgr.debug('Scanning for metadata keys')
    # quick 1st pass over all datasets to gather the needed schema fields
    pbar = ui.get_progressbar(
        label='Datasets', unit='ds', total=len(dsinfo))
    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # no stringification of values for speed, we do not need/use the
        # actual values at this point, only the keys
        idxd = _meta2autofield_dict(meta, val2str=False)
        for k in idxd:
            schema_fields[k] = wf.TEXT(stored=False,
                                       analyzer=SimpleAnalyzer())
        pbar.update(1, increment=True)
    pbar.finish()

    self.schema = wf.Schema(**schema_fields)
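# Illustrative sketch (an assumption, not taken from the original source) of
# the shape of the Whoosh schema that _mk_schema() assembles: ID fields for
# the fixed "definitions" keys plus a non-stored TEXT field for every
# metadata key discovered during the scan.  The 'author' and 'name' fields
# below are hypothetical examples of such discovered keys.
from whoosh import fields as wf
from whoosh.analysis import SimpleAnalyzer

example_schema = wf.Schema(
    id=wf.ID(stored=True, unique=True),   # from '@id', leading '@' stripped
    path=wf.ID(stored=True),
    parentds=wf.ID(stored=True),
    type=wf.ID(stored=True),
    # discovered metadata keys become searchable, non-stored text fields
    author=wf.TEXT(stored=False, analyzer=SimpleAnalyzer()),
    name=wf.TEXT(stored=False, analyzer=SimpleAnalyzer()),
)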
def wrapped(items, **kwargs):
    counts = defaultdict(int)
    pbar = ui.get_progressbar(total=len(items),
                              label=label, unit=" " + unit)
    results = []
    for res in fn(items, **kwargs):
        counts[res["status"]] += 1
        count_strs = (count_str(*args)
                      for args in [(counts["notneeded"], "skipped", False),
                                   (counts["error"], "failed", True)])
        pbar.update(1, increment=True)
        if counts["notneeded"] or counts["error"]:
            pbar.set_desc("{label} ({counts})".format(
                label=label,
                counts=", ".join(filter(None, count_strs))))
            pbar.refresh()
        results.append(res)
    pbar.finish()
    return results
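# Hypothetical sketch (not the original source): `fn`, `label`, `unit`, and
# `count_str` are free variables in `wrapped` above, so presumably it is
# returned by an enclosing factory/decorator roughly along these lines.
# All names and signatures below are illustrative assumptions.
def with_result_progress(fn, label="Total", unit="Files"):

    def count_str(count, verb, highlight=False):
        # render e.g. "3 failed"; empty string when the count is zero
        return "{} {}".format(count, verb) if count else ""

    def wrapped(items, **kwargs):
        ...  # body as shown in the snippet above
    return wrapped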
def upload_file(self, fname, files_url):
    # In v2 API seems no easy way to "just upload".  Need to initiate,
    # do uploads and finalize
    # TODO: check if the file with the same name already available, and offer
    # to remove/prune it
    import os
    from datalad.utils import md5sum
    from datalad.ui import ui

    file_rec = {
        'md5': md5sum(fname),
        'name': os.path.basename(fname),
        'size': os.stat(fname).st_size
    }
    # Initiate upload
    j = self.post(files_url, file_rec)
    file_endpoint = j['location']
    file_info = self.get(file_endpoint)
    file_upload_info = self.get(file_info['upload_url'])
    pbar = ui.get_progressbar(
        label=fname,
        # fill_text=f.name,
        total=file_rec['size'])
    with open(fname, 'rb') as f:
        for part in file_upload_info['parts']:
            udata = dict(file_info, **part)
            if part['status'] == 'PENDING':
                f.seek(part['startOffset'])
                data = f.read(part['endOffset'] - part['startOffset'] + 1)
                url = '{upload_url}/{partNo}'.format(**udata)
                ok = self.put(url, data=data, binary=True, return_json=False)
                assert ok == b'OK'
            pbar.update(part['endOffset'], increment=False)
    pbar.finish()
    # complete upload
    jcomplete = self.post(file_endpoint, return_json=False)
    return file_info
def _mk_search_index(self, force_reindex):
    """Generic entrypoint to index generation

    The actual work that determines the structure and content of the index
    is done by functions that are passed in as arguments

    `meta2doc` - must return dict for index document from result input
    """
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
    # use location common to all index types, they would all invalidate
    # simultaneously
    stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
    index_dir = opj(self.index_dir, self._mode_label)

    if (not force_reindex) and \
            exists(index_dir) and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            self.idx_obj = idx
            return
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    # this is a pretty cheap call that just pulls this info from a file
    dsinfo = self.ds.metadata(
        get_aggregates=True,
        return_type='list',
        result_renderer='disabled')

    self._mk_schema(dsinfo)

    idx_obj = widx.create_in(index_dir, self.schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0

    pbar = ui.get_progressbar(
        label='Datasets', unit='ds', total=len(dsinfo))
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        doc = self._meta2doc(meta)
        admin = {
            'type': res['type'],
            'path': relpath(res['path'], start=self.ds.path),
        }
        if 'parentds' in res:
            admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
        if admin['type'] == 'dataset':
            if old_ds_rpath:
                lgr.debug(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = admin['path']
            admin['id'] = res.get('dsid', None)
            pbar.update(1, increment=True)

        doc.update({k: assure_unicode(v) for k, v in admin.items()})
        lgr.debug("Adding document to search index: {}".format(doc))
        # inject into index
        idx.add_document(**doc)
        idx_size += 1

    if old_ds_rpath:
        lgr.debug(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)
    pbar.finish()

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    lgr.info('Search index contains %i documents', idx_size)
    self.idx_obj = idx_obj
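# Sketch (an assumption, not part of the original source) of how the index
# written by _mk_search_index() could be queried later with plain Whoosh,
# independent of DataLad's own search command; the index path is made up.
from whoosh import index as widx
from whoosh.qparser import QueryParser

idx = widx.open_dir('/path/to/search_index_dir')  # hypothetical location
with idx.searcher() as searcher:
    query = QueryParser('path', idx.schema).parse(u'sub-01')
    for hit in searcher.search(query, limit=10):
        # stored fields only, e.g. 'path', 'type', 'parentds', 'id'
        print(hit.fields())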