def tasks(cls):
    queues = [make_key('queue', c, s) for c, s in manager.stages]
    random.shuffle(queues)
    while True:
        task_data_tuple = cls.conn.blpop(queues)
        # blpop blocks until it finds something. But fakeredis has no
        # blocking support. So it just returns None.
        if not task_data_tuple:
            return

        key, json_data = task_data_tuple
        # Shift the queues list so that the matching key is at the
        # very end of the list, prioritising all other crawlers.
        # queues = list(reversed(queues))
        deq = deque(queues)
        deq.rotate((queues.index(key) * -1) - 1)
        queues = list(deq)

        task_data = load_json(json_data)
        stage = task_data["stage"]
        state = task_data["state"]
        data = task_data["data"]
        next_time = task_data.get("next_allowed_exec_time")
        next_time = unpack_datetime(next_time)
        crawler = state.get('crawler')
        cls.conn.decr(make_key('queue_pending', crawler))
        yield (stage, state, data, next_time)
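# Illustrative sketch (not part of the codebase): how the deque rotation above
# de-prioritises the queue that just produced a task. The list is rotated left
# so the matched key lands at the very end, giving every other queue a turn
# before it is polled again.
from collections import deque

def rotate_to_end(queues, key):
    # Rotate left so that `key` ends up last in the list.
    deq = deque(queues)
    deq.rotate((queues.index(key) * -1) - 1)
    return list(deq)

assert rotate_to_end(['a', 'b', 'c', 'd'], 'b') == ['c', 'd', 'a', 'b']
assert rotate_to_end(['a', 'b', 'c', 'd'], 'a') == ['b', 'c', 'd', 'a']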
def flush(cls, crawler):
    prefix = make_key('queue', crawler, '*')
    for key in cls.conn.scan_iter(prefix):
        cls.conn.delete(key)
        cls.conn.ltrim(key, 0, -1)
        cls.conn.srem("queues_set", key)
    cls.conn.delete(make_key('queue_pending', crawler))
def op_count(cls, crawler, stage=None):
    """Total operations performed for this crawler"""
    if stage:
        total_ops = cls.conn.get(make_key(crawler, stage))
    else:
        total_ops = cls.conn.get(make_key(crawler, "total_ops"))
    return unpack_int(total_ops)
def runs(cls, crawler):
    for run_id in cls.run_ids(crawler):
        start = cls.conn.get(make_key("run", run_id, "start"))
        end = cls.conn.get(make_key("run", run_id, "end"))
        total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
        yield {
            'run_id': run_id,
            'total_ops': unpack_int(total_ops),
            'start': unpack_datetime(start, datetime.utcnow()),
            'end': unpack_datetime(end)
        }
def runs(cls, crawler):
    for run_id in cls.conn.lrange(make_key(crawler, "runs_list"), 0, -1):
        start = cls.conn.get(make_key("run", run_id, "start"))
        end = cls.conn.get(make_key("run", run_id, "end"))
        total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
        yield {
            'run_id': run_id,
            'total_ops': unpack_int(total_ops),
            'start': unpack_datetime(start),
            'end': unpack_datetime(end)
        }
def operation_start(cls, crawler, stage, run_id):
    if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
        cls.conn.sadd(make_key(crawler, "runs"), run_id)
        cls.conn.set(make_key("run", run_id, "start"), pack_now())
    cls.conn.incr(make_key("run", run_id))
    cls.conn.incr(make_key("run", run_id, "total_ops"))
    cls.conn.incr(make_key(crawler, stage))
    cls.conn.incr(make_key(crawler, "total_ops"))
    cls.conn.set(make_key(crawler, "last_run"), pack_now())
    cls.conn.set(make_key(crawler, "current_run"), run_id)
def flush(cls, crawler):
    for stage in crawler.stages:
        cls.conn.delete(make_key(crawler, stage))

    for run_id in cls.run_ids(crawler):
        cls.conn.delete(make_key(crawler, run_id))
        cls.conn.delete(make_key(crawler, run_id, "start"))
        cls.conn.delete(make_key(crawler, run_id, "end"))
        cls.conn.delete(make_key(crawler, run_id, "total_ops"))

    cls.conn.delete(make_key(crawler, "runs"))
    cls.conn.delete(make_key(crawler, "current_run"))
    cls.conn.delete(make_key(crawler, "total_ops"))
    cls.conn.delete(make_key(crawler, "last_run"))
    cls.conn.delete(make_key(crawler, "runs_abort"))
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    seen = set()
    for tag_query, attr_name in URL_TAGS:
        for element in result.html.findall(tag_query):
            attr = element.get(attr_name)
            if attr is None:
                continue

            url = normalize_url(urljoin(result.url, attr))
            if url is None or url in seen:
                continue
            seen.add(url)

            tag = make_key((context.run_id, url))
            if context.check_tag(tag):
                continue
            context.set_tag(tag, None)

            data = {'url': url}
            # Option to set the document title from the link text.
            if context.get('link_title', False):
                data['title'] = collapse_spaces(element.text_content())
            elif element.get('title'):
                data['title'] = collapse_spaces(element.get('title'))
            context.emit(rule='fetch', data=data)
def tasks(cls):
    queues = [make_key('queue', c, s) for c, s in manager.stages]
    while True:
        timeout = 1 if settings.DEBUG else 0
        task_data_tuple = conn.blpop(queues, timeout=timeout)
        # blpop blocks until it finds something. But fakeredis has no
        # blocking support. So it just returns None.
        if task_data_tuple is None:
            return

        key, json_data = task_data_tuple
        # Shift the queues list so that the matching key is at the
        # very end of the list, prioritising all other crawlers.
        # queues = list(reversed(queues))
        deq = deque(queues)
        deq.rotate((queues.index(key) * -1) - 1)
        queues = list(deq)

        task_data = load_json(json_data)
        stage = task_data["stage"]
        state = task_data["state"]
        data = task_data["data"]
        next_time = task_data.get("next_allowed_exec_time")
        next_time = unpack_datetime(next_time)
        yield (stage, state, data, next_time)
def save(self):
    session = pickle.dumps(self.session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(self.context.run_id, "session", key)
    conn.set(key, session, ex=QUEUE_EXPIRE)
    self.context.state[self.STATE_SESSION] = key
def skip_incremental(self, *criteria):
    """Perform an incremental check on a set of criteria.

    This can be used to execute a part of a crawler only once per
    interval (which is specified by the ``expire`` setting). If the
    operation has already been performed (and should thus be skipped),
    this will return ``True``. If the operation needs to be executed,
    the returned value will be ``False``.
    """
    if not self.incremental:
        return False

    # this is pure convenience, and will probably backfire at some point.
    key = make_key(criteria)
    if key is None:
        return False

    # this is used to re-run parts of a scrape after a certain interval,
    # e.g. half a year, or a year. Tags set before `since` are treated
    # as expired, so the operation runs again.
    since = None
    if self.crawler.expire > 0:
        delta = timedelta(days=self.crawler.expire)
        since = datetime.utcnow() - delta

    if Tag.exists(self.crawler, key, since=since):
        return True
    self.set_tag(key, None)
    return False
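# Hypothetical usage sketch (the operation name `crawl_document` and the
# `doc_id` field are illustrative, not taken from the codebase): an operation
# can bail out early when the same criteria were already processed within the
# configured expiry window.
def crawl_document(context, data):
    doc_id = data.get('doc_id')
    # True means this (crawler, "document", doc_id) combination was tagged
    # recently enough that the work can be skipped on this run.
    if context.skip_incremental('document', doc_id):
        context.log.info('Skipping already-crawled document: %s', doc_id)
        return
    context.emit(data=data)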
def save(cls, crawler, session):
    session = pickle.dumps(session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(crawler, "session", key)
    cls.conn.set(key, session, ex=QUEUE_EXPIRE)
    return key
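# Standalone sketch of the serialisation round trip used by `save` above and
# `load_session` below (pure Python, no Redis involved): pickle the object,
# base64-encode the bytes, and derive a short content-addressed key from the
# digest of the encoded blob.
import codecs
import pickle
from hashlib import sha1

session = {'cookies': {'sid': 'abc123'}}
blob = codecs.encode(pickle.dumps(session), 'base64')
key = sha1(blob).hexdigest()[:15]          # short, deterministic key for the blob
restored = pickle.loads(codecs.decode(blob, 'base64'))
assert restored == session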
def fetch(context, data): """Do an HTTP GET on the ``url`` specified in the inbound data.""" url = data.get('url') attempt = data.pop('retry_attempt', 1) try: result = context.http.get(url, lazy=True) rules = context.get('rules', {'match_all': {}}) if not Rule.get_rule(rules).apply(result): context.log.info('Fetch skip: %r', result.url) return if not result.ok: err = (result.url, result.status_code) context.emit_warning("Fetch fail [%s]: HTTP %s" % err) if not context.params.get('emit_errors', False): return else: context.log.info("Fetched [%s]: %r", result.status_code, result.url) data.update(result.serialize()) if url != result.url: tag = make_key(context.run_id, url) context.set_tag(tag, None) context.emit(data=data) except RequestException as ce: retries = int(context.get('retry', 3)) if retries >= attempt: context.log.warn("Retry: %s (error: %s)", url, ce) data['retry_attempt'] = attempt + 1 context.recurse(data=data, delay=2**attempt) else: context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
def load_session(self):
    if self.STATE_SESSION not in self.context.state:
        return

    key = self.context.state.get(self.STATE_SESSION)
    value = conn.get(make_key(self.context.run_id, "session", key))
    if value is not None:
        session = codecs.decode(bytes(value, 'utf-8'), 'base64')
        return pickle.loads(session)
def flush(cls, crawler):
    for run_id in cls.conn.smembers(make_key(crawler, "runs")):
        cls.conn.delete(make_key("run", run_id, "start"))
        cls.conn.delete(make_key("run", run_id, "end"))
        cls.conn.delete(make_key("run", run_id, "total_ops"))
        cls.conn.delete(make_key("run", run_id))

    cls.conn.delete(make_key(crawler, "runs"))
    cls.conn.delete(make_key(crawler, "runs_list"))
def record_operation_start(cls, crawler, run_id):
    if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
        cls.conn.sadd(make_key(crawler, "runs"), run_id)
        cls.conn.lpush(make_key(crawler, "runs_list"), run_id)
        cls.conn.set(make_key("run", run_id, "start"), pack_now())
    cls.conn.incr(make_key("run", run_id))
    cls.conn.incr(make_key("run", run_id, "total_ops"))
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.findall(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.findall(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = normalize_url(urljoin(result.url, attr))
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue
                if url is None or url in seen:
                    continue
                seen.add(url)

                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)

                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))
                context.http.session.headers['Referer'] = url
                if re.findall('publicId|firstResult', url):
                    print("----------------PRINTING URL----------------")
                    print(url)
                context.emit(rule='fetch', data=data)
def delete(cls, crawler): cls.conn.delete(make_key(crawler, "events")) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", level)) for run_id in Crawl.run_ids(crawler): cls.conn.delete(make_key(crawler, "events", run_id)) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", run_id, level)) for stage in crawler.stages.keys(): cls.conn.delete(make_key(crawler, "events", stage)) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", stage, level))
def fetch(context, data): """Do an HTTP GET on the ``url`` specified in the inbound data.""" url = data.get('url') result = context.http.get(url, lazy=True) rules = context.get('rules', {'match_all': {}}) if not Rule.get_rule(rules).apply(result): context.log.info('Fetch skip: %r', result.url) return if not result.ok: context.emit_warning("Fetch fail [%s]: %s", result.status_code, result.url) return context.log.info("Fetched [%s]: %r", result.status_code, result.url) data.update(result.serialize()) if url != result.url: tag = make_key((context.run_id, url)) context.set_tag(tag, None) context.emit(data=data)
def skip_incremental(self, *criteria): """Perform an incremental check on a set of criteria. This can be used to execute a part of a crawler only once per an interval (which is specified by the ``expire`` setting). If the operation has already been performed (and should thus be skipped), this will return ``True``. If the operation needs to be executed, the returned value will be ``False``. """ if not self.incremental: return False # this is pure convenience, and will probably backfire at some point. key = make_key(*criteria) if key is None: return False if Tag.exists(self.crawler, key): return True self.set_tag(key, None) return False
def save(cls, crawler, stage, level, run_id, error=None, message=None):
    """Create an event, possibly based on an exception."""
    event = {
        'stage': stage.name,
        'level': level,
        'timestamp': pack_now(),
        'error': error,
        'message': message
    }
    data = dump_json(event)
    cls.conn.lpush(make_key(crawler, "events"), data)
    cls.conn.lpush(make_key(crawler, "events", level), data)
    cls.conn.lpush(make_key(crawler, "events", stage), data)
    cls.conn.lpush(make_key(crawler, "events", stage, level), data)
    cls.conn.lpush(make_key(crawler, "events", run_id), data)
    cls.conn.lpush(make_key(crawler, "events", run_id, level), data)
    return event
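# Illustrative note (not part of the codebase): a single event is fanned out to
# six Redis lists so it can be looked up globally, by level, by stage and by
# run, which is what the `get_*_events` and `get_*_counts` helpers below rely
# on. Assuming make_key simply joins its parts with a separator, the keys for
# one event look roughly like:
#
#   <crawler>:events
#   <crawler>:events:<level>
#   <crawler>:events:<stage>
#   <crawler>:events:<stage>:<level>
#   <crawler>:events:<run_id>
#   <crawler>:events:<run_id>:<level>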
def get_crawler_events(cls, crawler, start, end, level=None):
    key = make_key(crawler, "events", level)
    return cls.event_list(key, start, end)
def get_run_counts(cls, crawler, run_id):
    counts = {}
    for level in cls.LEVELS:
        key = make_key(crawler, "events", run_id, level)
        counts[level] = cls.conn.llen(key) or 0
    return counts
def get_stage_counts(cls, crawler, stage):
    counts = {}
    for level in cls.LEVELS:
        key = make_key(crawler, "events", stage, level)
        counts[level] = cls.conn.llen(key) or 0
    return counts
def delete(cls, crawler):
    for key in cls.conn.scan_iter(make_key(crawler, "events", "*")):
        cls.conn.delete(key)
def size(cls, crawler): """Total operations pending for this crawler""" key = make_key('queue_pending', crawler) return unpack_int(cls.conn.get(key))
def is_aborted(cls, crawler, run_id):
    key = make_key(crawler, "runs_abort")
    return cls.conn.sismember(key, run_id)
def get_stage_events(cls, crawler, stage_name, start, end, level=None):
    """Events from a particular stage"""
    key = make_key(crawler, "events", stage_name, level)
    return cls.event_list(key, start, end)
def get_run_events(cls, crawler, run_id, start, end, level=None):
    """Events from a particular run"""
    key = make_key(crawler, "events", run_id, level)
    return cls.event_list(key, start, end)
def queue(cls, stage, state, data, delay=None):
    crawler = state.get('crawler')
    task_data = cls.serialize_task_data(stage, state, data, delay)
    cls.conn.rpush(make_key('queue', crawler, stage), task_data)
    cls.conn.incr(make_key('queue_pending', crawler))
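# Sketch of the payload shape implied by the reader in `tasks` above.
# `serialize_task_data` itself is not shown here, so this is an assumption
# based purely on the keys that `tasks` reads back from the queue, and the
# datetime encoding used for `next_allowed_exec_time` is illustrative.
import json
from datetime import datetime, timedelta

def serialize_task_data_sketch(stage, state, data, delay=None):
    task = {
        'stage': stage,
        'state': state,  # must carry 'crawler' so the reader can decrement queue_pending
        'data': data,
        'next_allowed_exec_time': None,
    }
    if delay is not None:
        # Interpreted by the consumer as "do not execute before this time".
        next_time = datetime.utcnow() + timedelta(seconds=delay)
        task['next_allowed_exec_time'] = next_time.isoformat()
    return json.dumps(task)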