def op_count(cls, crawler, stage=None):
    """Total operations performed for this crawler"""
    if stage:
        total_ops = conn.get(make_key(crawler, stage))
    else:
        total_ops = conn.get(make_key(crawler, "total_ops"))
    return unpack_int(total_ops)

def timeout_expiration_check(self):
    stages_on_timeout_key = make_key('memorious', 'timeout_stages')
    stages_on_timeout = conn.smembers(stages_on_timeout_key)
    for stage in stages_on_timeout:
        key = make_key('memorious', 'timeout', stage)
        if not conn.get(key):
            conn.srem(stages_on_timeout_key, stage)

def operation_end(cls, crawler, run_id):
    conn.set(make_key(crawler, "last_run"), pack_now(), ex=REDIS_LONG)
    pending = conn.decr(make_key(crawler, "run", run_id))
    if unpack_int(pending) == 0:
        conn.set(make_key(crawler, "run", run_id, "end"), pack_now(), ex=REDIS_LONG)

def __init__(self, conn, dataset, job_id):  # noqa
    self.conn = conn
    self.id = job_id
    self.dataset = Dataset.ensure(conn, dataset)
    self.start_key = make_key(PREFIX, 'qd', self.id, dataset, 'start')
    self.end_key = make_key(PREFIX, 'qd', self.id, dataset, 'end')
    self.active_jobs_key = make_key(PREFIX, 'qdja')

def __init__(self, conn, dataset, job_id):  # noqa
    self.conn = conn
    self.id = job_id
    self.dataset = Dataset.ensure(conn, dataset)
    self.start_key = make_key(PREFIX, "qd", self.id, dataset, "start")
    self.end_key = make_key(PREFIX, "qd", self.id, dataset, "end")
    self.active_jobs_key = make_key(PREFIX, "qdja")

def aleph_emit_entity(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    entity_id = data.get("entity_id", data.get("id"))
    if not entity_id:
        context.emit_warning(
            "Error: Cannot create entity. `id` is not defined")
        return
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))

    # Fetch entity from cache
    cached_entity = context.get_tag(
        make_key(collection_id, foreign_id, entity_id))

    if cached_entity and isinstance(cached_entity, dict):
        context.log.info("Skip entity creation: {}".format(foreign_id))
        data["aleph_id"] = cached_entity["id"]
        data["aleph_collection_id"] = collection_id
        data["aleph_entity"] = cached_entity
        context.emit(data=data, optional=True)
        return

    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.write_entity(
                collection_id,
                {
                    "schema": data.get("schema"),
                    "properties": data.get("properties"),
                },
                entity_id,
            )
            entity = {
                "id": res.get("id"),
                "schema": res.get("schema"),
                "properties": res.get("properties"),
            }
            context.log.info("Aleph entity ID: %s", entity["id"])

            # Save the entity in cache for future use
            context.set_tag(make_key(collection_id, foreign_id, entity_id), entity)

            data["aleph_id"] = entity["id"]
            data["aleph_collection_id"] = collection_id
            data["aleph_entity"] = entity
            context.emit(data=data, optional=True)
            return
        except AlephException as exc:
            if try_number > api.retries or not exc.transient:
                context.emit_warning("Error: %s" % exc)
                return
            backoff(exc, try_number)

def timeout(cls, stage, rate_limit):
    stages_on_timeout = make_key("memorious", "timeout_stages")
    conn.sadd(stages_on_timeout, stage.namespaced_name)
    stage_timeout_key = make_key("memorious", "timeout", stage.namespaced_name)
    expiry = (rate_limit.interval * rate_limit.unit) / rate_limit.limit
    conn.set(stage_timeout_key, "true", ex=math.ceil(expiry))
    # Delay the current task without further adding to call count
    rate_limit.comply(amount=0)

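# A hedged worked example of the expiry arithmetic above (the numbers are
# illustrative, not taken from the source): a limit of 30 calls per one
# 60-second window puts the stage's timeout key on a two-second expiry,
# after which timeout_expiration_check() removes the stage from the set.
interval, unit, limit = 1, 60, 30       # hypothetical rate limit settings
expiry = (interval * unit) / limit      # 60 / 30 = 2.0 seconds
assert math.ceil(expiry) == 2
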
def __init__(self, job, stage):  # noqa
    self.job = job
    self.conn = job.conn
    self.stage = stage
    self.queue_key = make_key(PREFIX, 'q', job.dataset, stage, job.id)
    self.stages_key = self._get_stage_jobs_key(stage)
    self.pending_key = make_key(self.queue_key, 'pending')
    self.running_key = make_key(self.queue_key, 'running')
    self.finished_key = make_key(self.queue_key, 'finished')

def delete_counts(cls, crawler):
    for level in cls.LEVELS:
        conn.delete(make_key(crawler, "events", "count", level))

    for run_id in Crawl.run_ids(crawler):
        for level in cls.LEVELS:
            conn.delete(make_key(crawler, "events", "count", run_id, level))  # noqa

    for stage in crawler.stages.keys():
        for level in cls.LEVELS:
            conn.delete(make_key(crawler, "events", "count", stage, level))

def __init__(self, job, stage):  # noqa
    self.job = job
    self.conn = job.conn
    self.stage = stage
    self.stages_key = self._get_stage_jobs_key(stage)
    queue_id = (PREFIX, "q", job.dataset, stage, job.id)
    self.queue_key = make_key(*queue_id)
    self.pending_key = make_key(*queue_id, "pending")
    self.running_key = make_key(*queue_id, "running")
    self.finished_key = make_key(*queue_id, "finished")

def save(cls, crawler, stage, level, run_id, error=None, message=None):
    """Create an event, possibly based on an exception."""
    event = {
        'stage': stage.name,
        'level': level,
        'timestamp': pack_now(),
        'error': error,
        'message': message
    }
    data = dump_json(event)
    keys = [
        make_key(crawler, "events"),
        make_key(crawler, "events", level),
        make_key(crawler, "events", stage),
        make_key(crawler, "events", stage, level),
        make_key(crawler, "events", run_id),
        make_key(crawler, "events", run_id, level),
    ]
    for key in keys:
        conn.lpush(key, data)
        conn.expire(key, REDIS_EXPIRE)

    # Persist the counts for longer
    count_keys = [
        make_key(crawler, "events", "count", level),
        make_key(crawler, "events", "count", stage, level),
        make_key(crawler, "events", "count", run_id, level),
    ]
    for key in count_keys:
        conn.incr(key)
        conn.expire(key, REDIS_LONG)

    return event

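# Hedged sketch (the helper name is hypothetical, not part of the source):
# reading back one of the per-level counters persisted by save() above,
# mirroring the unpack_int() pattern used by op_count().
def event_count(crawler, level):
    count = conn.get(make_key(crawler, "events", "count", level))
    return unpack_int(count)
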
def operation_start(cls, crawler, stage, run_id):
    if not conn.sismember(make_key(crawler, "runs"), run_id):
        conn.sadd(make_key(crawler, "runs"), run_id)
        conn.expire(make_key(crawler, "runs"), REDIS_LONG)
        conn.set(make_key(crawler, "run", run_id, "start"), pack_now(), ex=REDIS_LONG)  # noqa
    conn.incr(make_key(crawler, "run", run_id))
    conn.incr(make_key(crawler, "run", run_id, "total_ops"))
    conn.incr(make_key(crawler, stage))
    conn.incr(make_key(crawler, "total_ops"))
    conn.set(make_key(crawler, "last_run"), pack_now(), ex=REDIS_LONG)
    conn.set(make_key(crawler, "current_run"), run_id, ex=REDIS_LONG)

def aleph_emit_document(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))

    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return

    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)

    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)

                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)

                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                if try_number > api.retries or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)

def runs(cls, crawler):
    runs = []
    for run_id in cls.run_ids(crawler):
        start = conn.get(make_key(crawler, "run", run_id, "start"))
        end = conn.get(make_key(crawler, "run", run_id, "end"))
        total_ops = conn.get(make_key(crawler, "run", run_id, "total_ops"))
        runs.append({
            "run_id": run_id,
            "total_ops": unpack_int(total_ops),
            "start": unpack_datetime(start, datetime.utcnow()),
            "end": unpack_datetime(end),
        })
    return runs

def flush(cls, crawler):
    for stage in crawler.stages:
        conn.delete(make_key(crawler, stage))

    for run_id in cls.run_ids(crawler):
        conn.delete(make_key(crawler, run_id))
        conn.delete(make_key(crawler, run_id, "start"))
        conn.delete(make_key(crawler, run_id, "end"))
        conn.delete(make_key(crawler, run_id, "total_ops"))

    conn.delete(make_key(crawler, "runs"))
    conn.delete(make_key(crawler, "current_run"))
    conn.delete(make_key(crawler, "total_ops"))
    conn.delete(make_key(crawler, "last_run"))
    conn.delete(make_key(crawler, "runs_abort"))

def save(self):
    session = pickle.dumps(self.session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(self.context.crawler, "session", self.context.run_id, key)  # noqa
    conn.set(key, session, ex=REDIS_SHORT)
    self.context.state[self.STATE_SESSION] = key

def test_redis(self):
    key = make_key('test', uuid4())
    conn = get_redis()
    assert not conn.exists(key)
    conn.set(key, 'banana')
    assert conn.get(key) == 'banana', conn.get(key)
    assert conn.exists(key)

def get_stages(self):
    all_stages = set({stage.namespaced_name for _, stage in manager.stages})  # noqa
    stages_on_timeout_key = make_key('memorious', 'timeout_stages')
    stages_on_timeout = conn.smembers(stages_on_timeout_key)
    if stages_on_timeout:
        return list(all_stages - set(stages_on_timeout))
    return all_stages

def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get('url')
    attempt = data.pop('retry_attempt', 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get('rules', {'match_all': {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info('Fetch skip: %r', result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            if not context.params.get('emit_errors', False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get('retry', 3))
        if retries >= attempt:
            context.log.warn("Retry: %s (error: %s)", url, ce)
            data['retry_attempt'] = attempt + 1
            context.recurse(data=data, delay=2**attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))

def save(self):
    session = pickle.dumps(self.session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(self.context.run_id, "session", key)
    conn.set(key, session, ex=QUEUE_EXPIRE)
    self.context.state[self.STATE_SESSION] = key

def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get("url")
    if urlparse(url).scheme not in ("http", "https", ""):
        context.log.info("Fetch skipped. Unsupported scheme: %r", url)
        return
    attempt = data.pop("retry_attempt", 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get("rules", {"match_all": {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info("Fetch skip: %r", result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            if not context.params.get("emit_errors", False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get("retry", 3))
        if retries >= attempt:
            context.log.warn("Retry: %s (error: %s)", url, ce)
            data["retry_attempt"] = attempt + 1
            context.recurse(data=data, delay=2 ** attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))

def test_redis(self):
    key = make_key("test", uuid4())
    conn = get_redis()
    assert not conn.exists(key)
    conn.set(key, "banana")
    assert conn.get(key) == "banana", conn.get(key)
    assert conn.exists(key)

def aleph_folder(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return

    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)

    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)

            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)

            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            if try_number > api.retries or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)

def flush_tags(self):
    pipe = conn.pipeline()
    count = 0
    for key in conn.scan_iter(make_key(self, 'tag', '*')):
        pipe.delete(key)
        count += 1
    pipe.execute()
    log.info("Deleted %d tags", count)

def load_session(self):
    if self.STATE_SESSION not in self.context.state:
        return
    key = self.context.state.get(self.STATE_SESSION)
    value = conn.get(make_key(self.context.run_id, "session", key))
    if value is not None:
        session = codecs.decode(bytes(value, 'utf-8'), 'base64')
        return pickle.loads(session)

def _remove(self, pipe):
    for stage in self.get_stages():
        stage._remove(pipe)
    pipe.srem(self.dataset.key, self.dataset.name)
    pipe.srem(self.dataset.jobs_key, self.id)
    pipe.srem(self.active_jobs_key, make_key(self.dataset.name, self.id))
    pipe.delete(self.start_key)
    pipe.setnx(self.end_key, pack_now())
    pipe.expire(self.end_key, REDIS_EXPIRE)

def request_id(self):
    if self._request_id is not None:
        return self._request_id
    if self.request is not None:
        parts = [self.request.method, self.url]
        if self.request.data:
            parts.append(hash_data(self.request.data))
        if self.request.json:
            parts.append(hash_data(self.request.json))
        return make_key(*parts)

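# Hedged example (the URL is illustrative, not from the source): a plain GET
# for "https://example.org/docs" with no body yields
# make_key("GET", "https://example.org/docs"); requests carrying a data or
# json payload also mix in hash_data() of that payload, so repeats of the
# same request map to the same request_id.
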
def documentcloud_mark_processed(context, data):
    """Create a persistent tag to indicate that a document has been fully
    processed. On subsequent runs, we can check and skip processing this
    document earlier in the pipeline.
    """
    key = make_key(
        context.crawler.name,
        data["foreign_id"],
        data["content_hash"],
    )
    context.log.info(
        f"Document with foreign id {data['foreign_id']} has been processed")
    context.set_tag(key, "processed")

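# Hedged sketch (the stage name is hypothetical, not from the source): an
# earlier pipeline stage could consult the tag written above and drop
# documents that have already been fully processed.
def documentcloud_skip_processed(context, data):
    key = make_key(
        context.crawler.name,
        data["foreign_id"],
        data["content_hash"],
    )
    if context.check_tag(key):
        context.log.info("Already processed: %s", data["foreign_id"])
        return
    context.emit(data=data)
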
def parse_html(context, data, result):
    context.log.info("Parse: %r", result.url)

    for title in result.html.xpath(".//title/text()"):
        if title is not None and "title" not in data:
            data["title"] = title

    include = context.params.get("include_paths")
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.xpath(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.xpath(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = urljoin(result.url, attr)
                    key = url
                except Exception:
                    log.warning("Invalid URL: %r", attr)
                    continue

                if url is None or key is None or key in seen:
                    continue
                seen.add(key)

                tag = make_key(context.run_id, key)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)

                data["url"] = url

                if data.get("title") is None:
                    # Option to set the document title from the link text.
                    if context.get("link_title", False):
                        data["title"] = collapse_spaces(element.text_content())
                    elif element.get("title"):
                        data["title"] = collapse_spaces(element.get("title"))

                context.http.session.headers["Referer"] = url
                context.emit(rule="fetch", data=data)

def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.xpath('.//title/text()')[0]
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.xpath(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.xpath(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = urljoin(result.url, attr)
                    key = url
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue

                if url is None or key is None or key in seen:
                    continue
                seen.add(key)

                tag = make_key(context.run_id, key)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)

                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))

                context.http.session.headers['Referer'] = url
                context.emit(rule='fetch', data=data)

def key(self, *parts):
    return make_key(self.prefix, *parts)

def place_key(name):
    return make_key(PLACE_KEY, name)
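
# Hedged usage sketch, assuming make_key joins its non-empty parts into a
# single colon-separated Redis key (the make_key implementation itself is
# not shown in this listing):
#   place_key("Berlin")     -> e.g. "<PLACE_KEY>:Berlin"
#   self.key("tag", "foo")  -> e.g. "<prefix>:tag:foo"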