def __call__(self, data):
    input = data.pop(self._input) if self._pop_input else data[self._input]

    if not Response:  # we have no scrapy
        raise MissingExternalDependency(
            "scrapy", msg="It is needed for this type of crawling")

    if isinstance(input, Response):
        selector = Selector(response=input)
        if hasattr(input, 'url') and input.url and ('url' not in data):
            # take the URL of the response object
            data = updated(data, {'url': input.url})
    else:
        selector = Selector(text=input)

    count = 0
    for entry, data_ in self._select_and_extract(selector, self.query, data):
        data_ = updated(data_, {self._output: entry.extract()})
        # now get associated xpaths, css, etc
        for selectors_dict, entry_method in ((self._xpaths, entry.xpath),
                                             (self._csss, entry.css)):
            if not selectors_dict:
                continue
            for key in selectors_dict:
                selector_ = selectors_dict[key]
                key_extracted = entry_method(selector_).extract()
                if not len(key_extracted):
                    # TODO: warning, make mandatory to have a hit if expected?
                    continue
                if len(key_extracted) > 1:
                    if self._allow_multiple:
                        data_[key] = key_extracted
                        # raise NotImplementedError("Don't know what to do yet with this one")
                    else:
                        lgr.warn(
                            "Got multiple selections for xpath query %s. "
                            "Keeping only the first one: %s"
                            % (repr(selector_), key_extracted[0]))
                        data_[key] = key_extracted[0]
                else:
                    data_[key] = key_extracted[0]
        count += 1
        yield data_

    if self._min_count and count < self._min_count:
        raise ValueError(
            "Did not match required %d matches (got %d) using %s"
            % (self._min_count, count, self))

    if self._max_count and count > self._max_count:
        raise ValueError(
            "Matched more than %d matches (got %d) using %s"
            % (self._max_count, count, self))
def test_updated():
    d = {}
    eq_(updated(d, {1: 2}), {1: 2})
    eq_(d, {})

    d = {'a': 'b'}
    eq_(updated(d, ((0, 1), (2, 3))), {0: 1, 'a': 'b', 2: 3})
    eq_(d, {'a': 'b'})

    # and that it would maintain the type
    d = OrderedDict(((99, 0), ('z', 0), ('a', 0)))
    d_ = updated(d, {0: 1})
    ok_(isinstance(d_, OrderedDict))
    eq_(d_, OrderedDict(((99, 0), ('z', 0), ('a', 0), (0, 1))))
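# NOTE: every snippet in this section relies on a small `updated()` helper that
# is not shown here. A minimal sketch consistent with test_updated() above
# (copy, then update, preserving the mapping type) could look like this; the
# actual implementation may differ in details.
def updated(d, update):
    """Return a copy of `d` with `update` applied; `d` itself stays unchanged."""
    d = d.copy()        # shallow copy preserves the mapping type (e.g. OrderedDict)
    d.update(update)    # accepts another mapping or an iterable of key/value pairs
    return d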
def __init__(self, label='', fill_text=None, maxval=None, unit='B', out=sys.stdout):
    super(tqdmProgressBar, self).__init__(maxval=maxval)
    self._pbar_params = updated(
        self._default_pbar_params,
        dict(desc=label, unit=unit, unit_scale=True, total=maxval, file=out))
    self._pbar = None
def __call__(self, data):
    jsdata = json.loads(data["response"])
    for candidate in jsdata["Images"]:
        filename = basename(candidate["Link"])
        self.meta[filename] = candidate
        yield updated(data, {"url": self.apibase + candidate["Link"]})
    return
def __init__(self, label='', fill_text=None,
             total=None, unit='B', out=sys.stdout, leave=False,
             frontend=None):
    """

    Parameters
    ----------
    label
    fill_text
    total
    unit
    out
    leave
    frontend: (None, 'ipython'), optional
      tqdm module to use.  Could be tqdm_notebook if under IPython
    """
    super(tqdmProgressBar, self).__init__(label=label, total=total, unit=unit)
    if frontend not in self._frontends:
        raise ValueError(
            "Know only about following tqdm frontends: %s. Got %s"
            % (', '.join(map(str, self._frontends)), frontend))

    tqdm_frontend = self._frontends[frontend]
    if not tqdm_frontend:
        if frontend == 'ipython':
            from tqdm import tqdm_notebook
            tqdm_frontend = self._frontends[frontend] = tqdm_notebook
        else:
            lgr.error(
                "Something went wrong here, using default tqdm frontend for %s",
                frontend)
            tqdm_frontend = self._frontends[frontend] = self._frontends[None]

    self._tqdm = tqdm_frontend
    self._pbar_params = updated(
        self._default_pbar_params,
        dict(desc=label, unit=unit,
             unit_scale=True, total=total, file=out,
             leave=leave,
             ))
    if label and 'total' in label.lower() \
            and 'smoothing' not in self._pbar_params:
        # ad-hoc: All tqdm totals will report total mean, and not some
        # momentary speed
        self._pbar_params['smoothing'] = 0
    self._pbar = None
def get_projects(data):
    xnat = XNATServer(url)
    for p in xnat.get_projects(
            asdict=False,
            limit=limit or PROJECT_ACCESS_TYPES,
            drop_empty=drop_empty):
        yield updated(data, p)
def _visit_url(self, url, data):
    if url in self._seen:
        return
    # this is just a cruel first attempt
    lgr.debug("Visiting %s" % url)
    try:
        retry = 0
        orig_url = url
        if self._redirects_cache is not None:
            url = self._redirects_cache.get(url, url)
        while True:
            retry += 1
            if retry > 100:
                raise DownloadError(
                    "We have followed 100 redirects already. Something is wrong!")
            try:
                self._seen.add(url)
                page = self._providers.fetch(url, allow_redirects=False)
                break
            except UnhandledRedirectError as exc:
                # since we care about tracking URL for proper full url construction
                # we should disallow redirects and handle them manually here
                lgr.debug("URL %s was redirected to %s" % (url, exc.url))
                if url == exc.url:
                    raise DownloadError(
                        "Was redirected to the same url upon %s" % exc_str(exc))
                url = exc.url
                if self._redirects_cache is not None:
                    self._redirects_cache[orig_url] = exc.url
    except DownloadError as exc:
        lgr.warning("URL %s failed to download: %s" % (url, exc_str(exc)))
        if self.failed in {None, 'skip'}:
            # TODO: config -- crawl.failed='skip' should be a config option, for now always skipping
            return
        raise  # otherwise -- kaboom

    data_ = updated(data, zip(self._output, (page, url)))
    yield data_

    # now recurse if matchers were provided
    matchers = self._matchers
    if matchers:
        lgr.debug("Looking for more URLs at %s using %s", url, matchers)
        for matcher in (matchers
                        if isinstance(matchers, (list, tuple))
                        else [matchers]):
            for data_matched in matcher(data_):
                if 'url' not in data_matched:
                    lgr.warning("Got data without a url from %s" % matcher)
                    continue
                # proxy findings
                for data_matched_ in self._visit_url(
                        data_matched['url'], data_matched):
                    yield data_matched_
def _args(**kwargs):
    return Namespace(
        # ATM duplicates definitions done by cmdline.main and
        # required by code logic to be defined. (should they?)
        #
        # TODO: The common options are now added by
        # cmdline.helpers.parser_add_common_options(), which can be reused by
        # tests.
        **updated(
            dict(
                common_output_format="default"
            ),
            kwargs))
def _new_args(**kwargs):
    # A few more must be specified
    return _args(
        **updated(
            dict(
                common_on_failure=None,     # ['ignore', 'continue', 'stop']
                common_report_status=None,  # ['all', 'success', 'failure', 'ok', 'notneeded', 'impossible', 'error']
                common_report_type=None,    # ['dataset', 'file']
            ),
            kwargs))
def _get_pipeline_opts(pipeline):
    """Return options and pipeline steps to be run given the pipeline "definition"

    The definition might have options as its first element
    """
    opts = PIPELINE_OPTS.copy()
    if isinstance(pipeline[0], dict):
        newopts, pipeline = (pipeline[0], pipeline[1:])
        opts = updated(opts, newopts)
    return opts, pipeline
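# A hedged usage sketch for _get_pipeline_opts() above. PIPELINE_OPTS and the
# node callables below are hypothetical stand-ins (not taken from this code
# base); they only illustrate how a leading options dict is split off and
# merged over the defaults.
PIPELINE_OPTS = {'loop': False, 'output': 'input'}   # assumed defaults

def node_a(data):
    yield updated(data, {'a': 1})

def node_b(data):
    yield updated(data, {'b': 2})

opts, steps = _get_pipeline_opts([{'output': 'outputs'}, node_a, node_b])
# opts  == {'loop': False, 'output': 'outputs'}   (defaults overridden)
# steps == [node_a, node_b]                       (remaining pipeline nodes)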
def __init__(self, label='', fill_text=None,
             total=None, unit='B', out=sys.stdout, leave=False):
    super(tqdmProgressBar, self).__init__(total=total)
    self._pbar_params = updated(
        self._default_pbar_params,
        dict(desc=label, unit=unit,
             unit_scale=True, total=total, file=out,
             leave=leave))
    self._pbar = None
def __call__(self, data):
    count = 0
    for fpath in _find_files(self.regex, dirs=self.dirs, topdir=self.topdir):
        lgr.log(5, "Found file %s" % fpath)
        count += 1
        path, filename = ops(fpath)
        yield updated(data, {'path': path, 'filename': filename})
    self._total_count += count
    if not self._total_count and self.fail_if_none:
        raise RuntimeError("We did not match any file using regex %r" % self.regex)
def _args(**kwargs):
    return Namespace(
        # ATM duplicates definitions done by cmdline.main and
        # required by code logic to be defined. (should they?)
        **updated(
            dict(
                common_output_format="default"
            ),
            kwargs
        )
    )
def visits(self, data):
    response = json.loads(data["response"])
    for visit in response["Visits"]:
        if not os.path.exists(visit):
            os.mkdir(visit)
        yield updated(data, {
            "url": self.url + "/" + visit,
            "visit": visit,
        })
def images(self, data):
    response = json.loads(data["response"])
    for file_ in response["Files"]:
        filename = file_["Filename"]
        yield updated(
            data,
            {
                "url": data["url"] + "/" + filename,
                "filename": "{}/images/{}".format(data["visit"], filename),
            },
        )
def __call__(self, data):
    # ??? for some reason didn't work when I made entire thing a list
    if self.keys:
        raise NotImplementedError("Jason will do it")
    else:
        data_ = {k: v for k, v in data.items()
                 if not any(k.startswith(p) for p in self.ignore_prefixes)}
        self.data.append(data_)
    if self.output:
        data = updated(data, {self.output: self.data})
    yield data
def __call__(self, data):
    # we do not take anything from data
    meta = get_metadata(self.dataset)
    if meta:
        meta_encoded = meta.encode('utf-8')
        if not os.path.exists('.datalad'):
            os.makedirs('.datalad')
        path_ = _path_('.datalad', 'meta.datacite.xml')
        # open in binary mode since the metadata was explicitly encoded to bytes
        with open(path_, 'wb') as f:
            f.write(meta_encoded)
        yield updated(data, {'filename': path_})
    else:
        yield data
def _new_args(**kwargs):
    # A few more must be specified
    return _args(
        **updated(
            dict(
                common_on_failure=None,     # ['ignore', 'continue', 'stop']
                common_report_status=None,  # ['all', 'success', 'failure', 'ok', 'notneeded', 'impossible', 'error']
                common_report_type=None,    # ['dataset', 'file']
                common_proc_pre=None,
                common_proc_post=None,
            ),
            kwargs
        )
    )
def __call__(self, data):
    if self.keys:
        self.data.append({key: data[key] for key in self.keys if key in data})
    else:
        data_ = {k: v for k, v in data.items()
                 if not any(k.startswith(p) for p in self.ignore_prefixes)}
        self.data.append(data_)
    if self.output:
        data = updated(data, {self.output: self.data})
    yield data
def test_continue_if():
    d = {'v1': 'done'}
    n = continue_if(d)
    #eq_(list(n(d)), [d])
    eq_(list(n(dict(v1='not done'))), [])
    eq_(list(n(dict(v1='done', someother=123))),
        [dict(v1='done', someother=123)])

    tdict = dict(v1='not yet', someother=123)
    # and that we would interrupt while matching multiple values
    eq_(list(n(tdict)), [])
    eq_(list(continue_if(tdict)(tdict)), [tdict])

    # regexp
    eq_(list(continue_if({'v1': '^(?P<negate>not +)yet$'}, re=True)(tdict)),
        [updated(tdict, {'negate': 'not '})])
def instruments(self, data):
    response = json.loads(data["response"])
    meta = response["Meta"]
    for instrument in response["Instruments"]:
        filename = "{}_{}_{}".format(meta["CandID"], data["visit"], instrument)
        yield updated(
            data,
            {
                "url": data["url"] + "/" + instrument,
                "filename": "{}/instruments/{}".format(data["visit"], filename),
            },
        )
def get_all_files_for_project(self, project, subjects=None, experiments=None):
    # TODO: grow the dictionary with all the information about subject/experiment/file
    #       to be yielded so we could tune up file name anyway we like
    for subject in (subjects or self.get_subjects(project)):
        subject_url = 'data/projects/%s/subjects/%s' % (project, subject)
        subject_data = self(subject_url, return_plain=True)
        subject_info = extract_subject_info(subject_data)
        for experiment in (experiments or self.get_experiments(project, subject)):
            experiment_data = self('data/experiments/%s' % experiment,
                                   return_plain=True)
            experiment_info = extract_experiment_info(experiment_data)
            for file_ in self.get_files(project, subject, experiment):
                file_info = updated(
                    file_,
                    {'subject_id': subject,
                     'subject_info': subject_info,
                     'experiment_id': experiment,
                     'experiment_info': experiment_info})
                yield file_info
def get_files(data):
    for f in xnat.get_all_files_for_project(project, subjects=subjects):
        # TODO: tune up filename
        # TODO: get url
        prefix = '/data/experiments/'
        assert f['uri'].startswith('%s' % prefix)
        # TODO: use label for subject/experiment
        # TODO: might want to allow for
        # XNAT2BIDS whenever that one is available:
        # http://reproducibility.stanford.edu/accepted-projects-for-the-2nd-crn-coding-sprint/
        exp_label = xnat.experiment_labels[f['experiment_id']]
        yield updated(data, {'url': url + f['uri'],
                             'path': f['uri'][len(prefix):],
                             'name': '%s-%s' % (exp_label, f['name'])
                             })
def __init__(self, label='', fill_text=None,
             total=None, unit='B', out=sys.stdout, leave=False,
             frontend=None):
    """

    Parameters
    ----------
    label
    fill_text
    total
    unit
    out
    leave
    frontend: (None, 'ipython'), optional
      tqdm module to use.  Could be tqdm_notebook if under IPython
    """
    super(tqdmProgressBar, self).__init__(label=label, total=total, unit=unit)
    if frontend not in self._frontends:
        raise ValueError(
            "Know only about following tqdm frontends: %s. Got %s"
            % (', '.join(map(str, self._frontends)), frontend))

    tqdm_frontend = self._frontends[frontend]
    if not tqdm_frontend:
        if frontend == 'ipython':
            from tqdm import tqdm_notebook
            tqdm_frontend = self._frontends[frontend] = tqdm_notebook
        else:
            lgr.error(
                "Something went wrong here, using default tqdm frontend for %s",
                frontend)
            tqdm_frontend = self._frontends[frontend] = self._frontends[None]

    self._tqdm = tqdm_frontend
    self._pbar_params = updated(
        self._default_pbar_params,
        dict(desc=label, unit=unit,
             unit_scale=True, total=total, file=out,
             leave=leave))
    self._pbar = None
def __call__(self, data):
    db = get_oracle_db()
    query = "SELECT %s FROM IMAGE03 WHERE COLLECTION_ID=%s" \
            % (','.join(image03_fields), self.collection)
    c = db.cursor()
    c.execute(query)
    # query and wrap into named tuples to ease access
    #import ipdb; ipdb.set_trace()
    for rec in c.fetchall():
        # TODO -- better access method?
        rec = image03_Record(*rec)
        for field in image03_file_fields:
            url = getattr(rec, field)
            if url:
                # generate a new data item for this file URL
                yield updated(data, {
                    'url': url,
                })
    c.close()
def _parse_checksums(data):
    url = data['url']
    urlsplit = url.split('/')
    topurl = '/'.join(urlsplit[:-1])
    if digest is None:
        # deduce from URL's file extension
        filename = urlsplit[-1]
        base, ext = splitext(filename)
        digest_ = ext if ext else digest
    content = data['response']
    # split into separate lines, first entry is checksum, 2nd file path
    for line in content.split('\n'):
        if not line:  # empty line
            continue
        checksum, fpath = line.split(None, 1)
        yield updated(
            data,
            {
                'digest': digest or digest_,
                'checksum': checksum,
                'path': dirname(fpath),
                'filename': basename(fpath),
                'url': "%s/%s" % (topurl, fpath)
            })
def __call__(self, data={}):
    for i in range(self.n):
        yield updated(data, {self.output: i})
def node(d):
    yield updated(d, {'debugged': True})
def get_url_filename(data):
    yield updated(data, {'filename': get_url_straight_filename(data['url'])})
def n2(data):
    for i in range(2):
        ran.append(len(ran))
        yield updated(data, {'f2': 'x_%d' % i})
def get_disposition_filename(data):
    yield updated(
        data,
        {'filename': balsa_downloader.get_status(data['url']).filename})
def get_disposition_filename(data):
    """Yield data with 'filename' taken from the URL's content disposition
    """
    yield updated(data, {'filename': get_url_disposition_filename(data['url'])})
def __call__(self, data):
    stats = data.get('datalad_stats', None)
    url = "s3://%s" % self.bucket
    if self.prefix:
        url += "/" + self.prefix.lstrip('/')
    providers = Providers.from_config_files()
    downloader = providers.get_provider(url).get_downloader(url)

    # bucket = provider.authenticator.authenticate(bucket_name, provider.credential)
    try:
        _ = downloader.get_status(url)  # just to authenticate and establish connection
    except TargetFileAbsent as exc:
        lgr.debug(
            "Initial URL %s lead to not something downloader could fetch: %s",
            url, exc_str(exc))
        pass
    bucket = downloader.bucket
    assert (bucket is not None)

    if self.repo:
        versions_db = SingleVersionDB(self.repo)
        prev_version = versions_db.version
        if prev_version and not prev_version.get('version-id', None):
            # Situation might arise when a directory contains no files, only
            # directories which we place into subdatasets
            # see https://github.com/datalad/datalad-crawler/issues/68
            # Workaround -- start from scratch
            lgr.warning("stored version-id is empty. Crawling from the beginning")
            prev_version = None
    else:
        prev_version, versions_db = None, None

    # TODO: we could probably use headers to limit from previously crawled last-modified
    # for now will be inefficient -- fetch all, sort, proceed
    kwargs = {} if self.recursive else {'delimiter': '/'}
    all_versions = (bucket.list_versions if self.versioned else bucket.list)(
        self.prefix, **kwargs)

    # Comparison becomes tricky whenever as if in our test bucket we have a collection
    # of rapid changes within the same ms, so they couldn't be sorted by last_modified,
    # so we resolve based on them being marked latest, or not being null (as could
    # happen originally), and placing Delete after creation
    # In real life last_modified should be enough, but life can be as tough as we made
    # it for 'testing'
    def kf(k, f):
        """Some elements, such as Prefix wouldn't have any of attributes to sort by"""
        return getattr(k, f, '')
    # So ATM it would sort Prefixes first, but that is not necessarily correct...
    # Theoretically the only way to sort Prefix'es with the rest is traverse that Prefix
    # and take latest last_modified there but it is expensive, so -- big TODO if ever ;)
    # ACTUALLY -- may be there is an API call to return sorted by last_modified, then we
    # would need only a single entry in result to determine the last_modified for the
    # Prefix, thus TODO
    cmp = lambda k: (
        kf(k, 'last_modified'),
        k.name,
        kf(k, 'is_latest'),
        kf(k, 'version_id') != 'null',
        isinstance(k, DeleteMarker))
    versions_sorted = sorted(all_versions, key=cmp)  # attrgetter('last_modified'))
    # print '\n'.join(map(str, [cmp(k) for k in versions_sorted]))
    version_fields = ['last-modified', 'name', 'version-id']

    def get_version_cmp(k):
        # this one will return action version_id so we could uniquely identify
        return kf(k, 'last_modified'), k.name, kf(k, 'version_id')

    if prev_version:
        last_modified_, name_, version_id_ = [
            prev_version[f] for f in version_fields
        ]
        # roll forward until we get to the element > this
        # to not breed list copies
        for i, k in enumerate(versions_sorted):
            lm, n, vid = get_version_cmp(k)
            if lm > last_modified_:
                start = i
                break
            elif lm == last_modified_:
                # go by name/version_id to be matched and then switch to the next one
                if (n, vid) == (name_, version_id_):
                    start = i + 1  # from the next one
                    if stats:
                        stats.increment('skipped')
                    break
            stats.increment('skipped')
        versions_sorted = versions_sorted[start:]

    # a set of items which we have already seen/yielded so hitting any of them again
    # would mean conflict/versioning is necessary since two actions came for the same item
    staged = set()
    strategy = self.strategy
    e_prev = None
    ncommits = self.ncommits or 0

    # adding None so we could deal with the last commit within the loop without
    # duplicating logic later outside
    def update_versiondb(e, force=False):
        # this way we could recover easier after a crash
        # TODO: config  crawl.crawl_s3.versiondb.saveaftereach=True
        if e is not None and (force or True):
            versions_db.version = dict(zip(version_fields, get_version_cmp(e)))

    for e in versions_sorted + [None]:
        filename = e.name if e is not None else None
        if (self.strip_prefix and self.prefix):
            filename = _strip_prefix(filename, self.prefix)

        if filename and self.exclude and re.search(self.exclude, filename):
            stats.skipped += 1
            continue

        if filename in staged or e is None:
            # we should finish this one and commit
            if staged:
                if self.versionfx and e_prev is not None:
                    version = self.versionfx(e_prev)
                    if version is not None and version not in stats.versions:
                        stats.versions.append(version)
                if versions_db:
                    # save current "version" DB so we would know where to pick up from
                    # upon next rerun.  Record should contain
                    # last_modified, name, versionid
                    # TODO?  what if e_prev was a DeleteMarker???
                    update_versiondb(e_prev, force=True)
                if strategy == 'commit-versions':
                    yield updated(data, {'datalad_action': 'commit'})
                    if self.ncommits:
                        ncommits += 1
                        if self.ncommits <= ncommits:
                            lgr.debug(
                                "Interrupting on %dth commit since asked to do %d",
                                ncommits, self.ncommits)
                            break
                staged.clear()
            if e is None:
                break  # we are done

        if filename:
            # might be empty if e.g. it was the self.prefix directory removed
            staged.add(filename)
        if isinstance(e, Key):
            if e.name.endswith('/'):
                # signals a directory for which we don't care explicitly
                # (git doesn't -- we don't! ;) )
                continue
            url = get_key_url(e, schema=self.url_schema, versioned=self.versioned)
            # generate and pass along the status right away since we can
            yield updated(
                data,
                {
                    'url': url,
                    'url_status': S3Downloader.get_key_status(e, dateformat='iso8601'),
                    'filename': filename,
                    'datalad_action': 'annex',
                })
            update_versiondb(e)
        elif isinstance(e, DeleteMarker):
            if strategy == 'commit-versions':
                # Since git doesn't care about empty directories for us makes sense only
                # in the case when DeleteMarker is not pointing to the subdirectory
                # and not empty (if original directory was removed)
                if filename and not filename.endswith('/'):
                    yield updated(data, {
                        'filename': filename,
                        'datalad_action': 'remove'
                    })
                else:
                    # Situation there is much trickier since it seems that "directory"
                    # could also be a key itself and created/removed which somewhat
                    # interferes with all our logic here
                    # For an interesting example see
                    # s3://openneuro/ds000217/ds000217_R1.0.0/compressed
                    lgr.info("Ignoring DeleteMarker for %s", filename)
            update_versiondb(e)
        elif isinstance(e, Prefix):
            # so we were provided a directory (in non-recursive traversal)
            assert (not self.recursive)
            yield updated(
                data,
                {
                    'url': url,
                    'filename': filename.rstrip('/'),
                    'datalad_action': 'directory',
                })
        else:
            raise ValueError("Don't know how to treat %s" % e)
        e_prev = e