Example no. 1
    def tasks(cls):
        queues = [make_key('queue', c, s) for c, s in manager.stages]
        random.shuffle(queues)
        while True:
            task_data_tuple = cls.conn.blpop(queues)
            # blpop blocks until it finds something. But fakeredis has no
            # blocking support, so it just returns None.
            if not task_data_tuple:
                return

            key, json_data = task_data_tuple
            # Shift the queues list so that the matching key ends up at the
            # very end of the list, prioritising all other crawlers.
            # queues = list(reversed(queues))
            deq = deque(queues)
            deq.rotate((queues.index(key) * -1) - 1)
            queues = list(deq)

            task_data = load_json(json_data)
            stage = task_data["stage"]
            state = task_data["state"]
            data = task_data["data"]
            next_time = task_data.get("next_allowed_exec_time")
            next_time = unpack_datetime(next_time)
            crawler = state.get('crawler')
            cls.conn.decr(make_key('queue_pending', crawler))
            yield (stage, state, data, next_time)
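The deque.rotate call above is what pushes the queue that was just served to the back of the scan order. A minimal standalone sketch of that reordering (the queue names are made up for illustration):

from collections import deque

# Hypothetical queue keys, in the order blpop scans them.
queues = ['queue:a:fetch', 'queue:b:fetch', 'queue:c:fetch']
key = 'queue:b:fetch'  # the queue blpop just returned a task from

deq = deque(queues)
# rotate(-(index + 1)) left-rotates the list so the served key lands last,
# giving every other crawler's queue a turn before it is polled again.
deq.rotate((queues.index(key) * -1) - 1)
print(list(deq))  # ['queue:c:fetch', 'queue:a:fetch', 'queue:b:fetch']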
Example no. 2
 def flush(cls, crawler):
     prefix = make_key('queue', crawler, '*')
     for key in cls.conn.scan_iter(prefix):
         cls.conn.delete(key)
         cls.conn.ltrim(key, 0, -1)
         cls.conn.srem("queues_set", key)
     cls.conn.delete(make_key('queue_pending', crawler))
Example no. 3
 def op_count(cls, crawler, stage=None):
     """Total operations performed for this crawler"""
     if stage:
         total_ops = cls.conn.get(make_key(crawler, stage))
     else:
         total_ops = cls.conn.get(make_key(crawler, "total_ops"))
     return unpack_int(total_ops)
Example no. 4
 def runs(cls, crawler):
     for run_id in cls.run_ids(crawler):
         start = cls.conn.get(make_key("run", run_id, "start"))
         end = cls.conn.get(make_key("run", run_id, "end"))
         total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
         yield {
             'run_id': run_id,
             'total_ops': unpack_int(total_ops),
             'start': unpack_datetime(start, datetime.utcnow()),
             'end': unpack_datetime(end)
         }
Example no. 5
 def runs(cls, crawler):
     for run_id in cls.conn.lrange(make_key(crawler, "runs_list"), 0, -1):
         start = cls.conn.get(make_key("run", run_id, "start"))
         end = cls.conn.get(make_key("run", run_id, "end"))
         total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
         yield {
             'run_id': run_id,
             'total_ops': unpack_int(total_ops),
             'start': unpack_datetime(start),
             'end': unpack_datetime(end)
         }
Example no. 6
 def operation_start(cls, crawler, stage, run_id):
     if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
         cls.conn.sadd(make_key(crawler, "runs"), run_id)
         cls.conn.set(make_key("run", run_id, "start"), pack_now())
     cls.conn.incr(make_key("run", run_id))
     cls.conn.incr(make_key("run", run_id, "total_ops"))
     cls.conn.incr(make_key(crawler, stage))
     cls.conn.incr(make_key(crawler, "total_ops"))
     cls.conn.set(make_key(crawler, "last_run"), pack_now())
     cls.conn.set(make_key(crawler, "current_run"), run_id)
Example no. 7
    def flush(cls, crawler):
        for stage in crawler.stages:
            cls.conn.delete(make_key(crawler, stage))

        for run_id in cls.run_ids(crawler):
            cls.conn.delete(make_key(crawler, run_id))
            cls.conn.delete(make_key(crawler, run_id, "start"))
            cls.conn.delete(make_key(crawler, run_id, "end"))
            cls.conn.delete(make_key(crawler, run_id, "total_ops"))

        cls.conn.delete(make_key(crawler, "runs"))
        cls.conn.delete(make_key(crawler, "current_run"))
        cls.conn.delete(make_key(crawler, "total_ops"))
        cls.conn.delete(make_key(crawler, "last_run"))
        cls.conn.delete(make_key(crawler, "runs_abort"))
Example no. 8
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    seen = set()
    for tag_query, attr_name in URL_TAGS:
        for element in result.html.findall(tag_query):
            attr = element.get(attr_name)
            if attr is None:
                continue

            url = normalize_url(urljoin(result.url, attr))
            if url is None or url in seen:
                continue
            seen.add(url)

            tag = make_key((context.run_id, url))
            if context.check_tag(tag):
                continue
            context.set_tag(tag, None)

            data = {'url': url}
            # Option to set the document title from the link text.
            if context.get('link_title', False):
                data['title'] = collapse_spaces(element.text_content())
            elif element.get('title'):
                data['title'] = collapse_spaces(element.get('title'))
            context.emit(rule='fetch', data=data)
Example no. 9
    def tasks(cls):
        queues = [make_key('queue', c, s) for c, s in manager.stages]
        while True:
            timeout = 1 if settings.DEBUG else 0
            task_data_tuple = conn.blpop(queues, timeout=timeout)
            # blpop blocks until it finds something. But fakeredis has no
            # blocking support, so it just returns None.
            if task_data_tuple is None:
                return

            key, json_data = task_data_tuple
            # Shift the queues list so that the matching key ends up at the
            # very end of the list, prioritising all other crawlers.
            # queues = list(reversed(queues))
            deq = deque(queues)
            deq.rotate((queues.index(key) * -1) - 1)
            queues = list(deq)

            task_data = load_json(json_data)
            stage = task_data["stage"]
            state = task_data["state"]
            data = task_data["data"]
            next_time = task_data.get("next_allowed_exec_time")
            next_time = unpack_datetime(next_time)
            yield (stage, state, data, next_time)
Example no. 10
 def save(self):
     session = pickle.dumps(self.session)
     session = codecs.encode(session, 'base64')
     key = sha1(session).hexdigest()[:15]
     key = make_key(self.context.run_id, "session", key)
     conn.set(key, session, ex=QUEUE_EXPIRE)
     self.context.state[self.STATE_SESSION] = key
Example no. 11
    def skip_incremental(self, *criteria):
        """Perform an incremental check on a set of criteria.

        This can be used to execute a part of a crawler only once per
        interval (which is specified by the ``expire`` setting). If the
        operation has already been performed (and should thus be skipped),
        this will return ``True``. If the operation needs to be executed,
        the returned value will be ``False``.
        """
        if not self.incremental:
            return False

        # this is pure convenience, and will probably backfire at some point.
        key = make_key(criteria)
        if key is None:
            return False

        # this is used to re-run parts of a scrape after a certain interval,
        # e.g. half a year, or a year
        since = None
        if self.crawler.expire > 0:
            delta = timedelta(days=self.crawler.expire)
            since = datetime.utcnow() - delta

        if Tag.exists(self.crawler, key, since=since):
            return True
        self.set_tag(key, None)
        return False
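A hedged sketch of how a crawler stage might use this check; the index function and its fields are hypothetical, but the call pattern follows the docstring above (return early when the key was already tagged within the expire window):

def index(context, data):
    # Hypothetical stage: skip records already handled during this interval.
    url = data.get('url')
    if context.skip_incremental(url):
        context.log.info('Skip (incremental): %r', url)
        return
    # ...do the expensive work only for new or expired records...
    context.emit(data=data)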
Example no. 12
 def save(cls, crawler, session):
     session = pickle.dumps(session)
     session = codecs.encode(session, 'base64')
     key = sha1(session).hexdigest()[:15]
     key = make_key(crawler, "session", key)
     cls.conn.set(key, session, ex=QUEUE_EXPIRE)
     return key
Example no. 13
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get('url')
    attempt = data.pop('retry_attempt', 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get('rules', {'match_all': {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info('Fetch skip: %r', result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            if not context.params.get('emit_errors', False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code,
                             result.url)

        data.update(result.serialize())
        if url != result.url:
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get('retry', 3))
        if retries >= attempt:
            context.log.warn("Retry: %s (error: %s)", url, ce)
            data['retry_attempt'] = attempt + 1
            context.recurse(data=data, delay=2**attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
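The retry branch re-queues the task with an exponentially growing delay: retry_attempt travels inside the task data, so every failure doubles the wait. A small standalone sketch of the schedule this produces, assuming the default of 3 retries:

retries = 3  # matches int(context.get('retry', 3)) above

# The delay passed to context.recurse grows as 2**attempt, so a task that
# keeps failing is retried after roughly 2, 4 and 8 seconds before the
# final emit_warning gives up.
delays = [2 ** attempt for attempt in range(1, retries + 1)]
print(delays)  # [2, 4, 8]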
Example no. 14
 def load_session(self):
     if self.STATE_SESSION not in self.context.state:
         return
     key = self.context.state.get(self.STATE_SESSION)
     value = conn.get(make_key(self.context.run_id, "session", key))
     if value is not None:
         session = codecs.decode(bytes(value, 'utf-8'), 'base64')
         return pickle.loads(session)
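Examples no. 10, 12 and 14 are the two halves of the same session cache: pickle plus base64 on the way in, base64 plus pickle on the way out, with the key derived from a SHA-1 of the encoded payload. A standalone round-trip sketch without the Redis connection (the session object here is a plain dict used as a stand-in):

import codecs
import pickle
from hashlib import sha1

session = {'cookies': {'sid': 'abc123'}}  # stand-in for a requests session

# save(): serialize, base64-encode, and derive a short content-based key.
blob = codecs.encode(pickle.dumps(session), 'base64')
key = sha1(blob).hexdigest()[:15]

# load_session(): decode and unpickle what was stored under that key.
restored = pickle.loads(codecs.decode(blob, 'base64'))
assert restored == session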
Example no. 15
 def flush(cls, crawler):
     for run_id in cls.conn.smembers(make_key(crawler, "runs")):
         cls.conn.delete(make_key("run", run_id, "start"))
         cls.conn.delete(make_key("run", run_id, "end"))
         cls.conn.delete(make_key("run", run_id, "total_ops"))
         cls.conn.delete(make_key("run", run_id))
     cls.conn.delete(make_key(crawler, "runs"))
     cls.conn.delete(make_key(crawler, "runs_list"))
Example no. 16
 def record_operation_start(cls, crawler, run_id):
     if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
         cls.conn.sadd(make_key(crawler, "runs"), run_id)
         cls.conn.lpush(make_key(crawler, "runs_list"), run_id)
         cls.conn.set(make_key("run", run_id, "start"), pack_now())
     cls.conn.incr(make_key("run", run_id))
     cls.conn.incr(make_key("run", run_id, "total_ops"))
Example no. 17
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.findall(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.findall(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = normalize_url(urljoin(result.url, attr))
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue

                if url is None or url in seen:
                    continue
                seen.add(url)

                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))

                context.http.session.headers['Referer'] = url
                if re.findall('publicId|firstResult', url):
                    context.log.info('Emit: %r', url)
                    context.emit(rule='fetch', data=data)
Example no. 18
 def delete(cls, crawler):
     cls.conn.delete(make_key(crawler, "events"))
     for level in cls.LEVELS:
         cls.conn.delete(make_key(crawler, "events", level))
     for run_id in Crawl.run_ids(crawler):
         cls.conn.delete(make_key(crawler, "events", run_id))
         for level in cls.LEVELS:
             cls.conn.delete(make_key(crawler, "events", run_id, level))
     for stage in crawler.stages.keys():
         cls.conn.delete(make_key(crawler, "events", stage))
         for level in cls.LEVELS:
             cls.conn.delete(make_key(crawler, "events", stage, level))
Example no. 19
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get('url')
    result = context.http.get(url, lazy=True)

    rules = context.get('rules', {'match_all': {}})
    if not Rule.get_rule(rules).apply(result):
        context.log.info('Fetch skip: %r', result.url)
        return

    if not result.ok:
        context.emit_warning("Fetch fail [%s]: %s", result.status_code,
                             result.url)
        return

    context.log.info("Fetched [%s]: %r", result.status_code, result.url)
    data.update(result.serialize())
    if url != result.url:
        tag = make_key((context.run_id, url))
        context.set_tag(tag, None)
    context.emit(data=data)
Example no. 20
    def skip_incremental(self, *criteria):
        """Perform an incremental check on a set of criteria.

        This can be used to execute a part of a crawler only once per
        interval (which is specified by the ``expire`` setting). If the
        operation has already been performed (and should thus be skipped),
        this will return ``True``. If the operation needs to be executed,
        the returned value will be ``False``.
        """
        if not self.incremental:
            return False

        # this is pure convenience, and will probably backfire at some point.
        key = make_key(*criteria)
        if key is None:
            return False

        if Tag.exists(self.crawler, key):
            return True

        self.set_tag(key, None)
        return False
Example no. 21
 def save(cls, crawler, stage, level, run_id, error=None, message=None):
     """Create an event, possibly based on an exception."""
     event = {
         'stage': stage.name,
         'level': level,
         'timestamp': pack_now(),
         'error': error,
         'message': message
     }
     data = dump_json(event)
     cls.conn.lpush(make_key(crawler, "events"), data)
     cls.conn.lpush(make_key(crawler, "events", level), data)
     cls.conn.lpush(make_key(crawler, "events", stage), data)
     cls.conn.lpush(make_key(crawler, "events", stage, level), data)
     cls.conn.lpush(make_key(crawler, "events", run_id), data)
     cls.conn.lpush(make_key(crawler, "events", run_id, level), data)
     return event
Example no. 22
 def get_crawler_events(cls, crawler, start, end, level=None):
     key = make_key(crawler, "events", level)
     return cls.event_list(key, start, end)
Example no. 23
 def get_run_counts(cls, crawler, run_id):
     counts = {}
     for level in cls.LEVELS:
         key = make_key(crawler, "events", run_id, level)
         counts[level] = cls.conn.llen(key) or 0
     return counts
Example no. 24
 def get_stage_counts(cls, crawler, stage):
     counts = {}
     for level in cls.LEVELS:
         key = make_key(crawler, "events", stage, level)
         counts[level] = cls.conn.llen(key) or 0
     return counts
Example no. 25
 def delete(cls, crawler):
     for key in cls.conn.scan_iter(make_key(crawler, "events", "*")):
         cls.conn.delete(key)
Example no. 26
 def size(cls, crawler):
     """Total operations pending for this crawler"""
     key = make_key('queue_pending', crawler)
     return unpack_int(cls.conn.get(key))
Example no. 27
 def is_aborted(cls, crawler, run_id):
     key = make_key(crawler, "runs_abort")
     return cls.conn.sismember(key, run_id)
Example no. 28
 def get_stage_events(cls, crawler, stage_name, start, end, level=None):
     """events from a particular stage"""
     key = make_key(crawler, "events", stage_name, level)
     return cls.event_list(key, start, end)
Example no. 29
 def get_run_events(cls, crawler, run_id, start, end, level=None):
     """Events from a particular run"""
     key = make_key(crawler, "events", run_id, level)
     return cls.event_list(key, start, end)
Example no. 30
 def queue(cls, stage, state, data, delay=None):
     crawler = state.get('crawler')
     task_data = cls.serialize_task_data(stage, state, data, delay)
     cls.conn.rpush(make_key('queue', crawler, stage), task_data)
     cls.conn.incr(make_key('queue_pending', crawler))
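queue is the producer half of Example no. 1/9: the pending counter it increments here is the one tasks() in Example no. 1 decrements after each blpop. serialize_task_data is not shown, but judging from the fields the consumer unpacks, the JSON payload pushed onto the list presumably looks roughly like this (a hedged sketch; the timestamp format is an assumption):

import json
from datetime import datetime, timedelta

# Hypothetical payload matching the keys tasks() reads back in Example no. 1:
# stage, state, data and an optional next_allowed_exec_time.
task_data = json.dumps({
    'stage': 'fetch',
    'state': {'crawler': 'example_crawler'},
    'data': {'url': 'https://example.com/'},
    'next_allowed_exec_time': (datetime.utcnow() + timedelta(seconds=5)).isoformat(),
})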