def tasks(cls):
    queues = [make_key('queue', c, s) for c, s in manager.stages]
    random.shuffle(queues)
    while True:
        task_data_tuple = cls.conn.blpop(queues)
        # blpop blocks until it finds something. But fakeredis has no
        # blocking support. So it just returns None.
        if not task_data_tuple:
            return

        key, json_data = task_data_tuple
        # Shift the queues list so that the matching key is at the
        # very end of the list, prioritising all other crawlers.
        # queues = list(reversed(queues))
        deq = deque(queues)
        deq.rotate((queues.index(key) * -1) - 1)
        queues = list(deq)

        task_data = load_json(json_data)
        stage = task_data["stage"]
        state = task_data["state"]
        data = task_data["data"]
        next_time = task_data.get("next_allowed_exec_time")
        next_time = unpack_datetime(next_time)
        crawler = state.get('crawler')
        cls.conn.decr(make_key('queue_pending', crawler))
        yield (stage, state, data, next_time)
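# Illustrative sketch (not part of the codebase): how the deque rotation above
# de-prioritises the queue that just produced a task. The list is rotated left
# so the matched key lands at the very end, giving every other queue a turn
# before it is polled again.
from collections import deque

def rotate_to_end(queues, key):
    # Rotate left so that `key` ends up last in the list.
    deq = deque(queues)
    deq.rotate((queues.index(key) * -1) - 1)
    return list(deq)

assert rotate_to_end(['a', 'b', 'c', 'd'], 'b') == ['c', 'd', 'a', 'b']
assert rotate_to_end(['a', 'b', 'c', 'd'], 'a') == ['b', 'c', 'd', 'a']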
def flush(cls, crawler):
    prefix = make_key('queue', crawler, '*')
    for key in cls.conn.scan_iter(prefix):
        cls.conn.delete(key)
        cls.conn.ltrim(key, 0, -1)
        cls.conn.srem("queues_set", key)
    cls.conn.delete(make_key('queue_pending', crawler))
def op_count(cls, crawler, stage=None):
    """Total operations performed for this crawler"""
    if stage:
        total_ops = cls.conn.get(make_key(crawler, stage))
    else:
        total_ops = cls.conn.get(make_key(crawler, "total_ops"))
    return unpack_int(total_ops)
def runs(cls, crawler):
    for run_id in cls.run_ids(crawler):
        start = cls.conn.get(make_key("run", run_id, "start"))
        end = cls.conn.get(make_key("run", run_id, "end"))
        total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
        yield {
            'run_id': run_id,
            'total_ops': unpack_int(total_ops),
            'start': unpack_datetime(start, datetime.utcnow()),
            'end': unpack_datetime(end)
        }
def runs(cls, crawler):
    for run_id in cls.conn.lrange(make_key(crawler, "runs_list"), 0, -1):
        start = cls.conn.get(make_key("run", run_id, "start"))
        end = cls.conn.get(make_key("run", run_id, "end"))
        total_ops = cls.conn.get(make_key("run", run_id, "total_ops"))
        yield {
            'run_id': run_id,
            'total_ops': unpack_int(total_ops),
            'start': unpack_datetime(start),
            'end': unpack_datetime(end)
        }
def operation_start(cls, crawler, stage, run_id):
    if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
        cls.conn.sadd(make_key(crawler, "runs"), run_id)
        cls.conn.set(make_key("run", run_id, "start"), pack_now())
    cls.conn.incr(make_key("run", run_id))
    cls.conn.incr(make_key("run", run_id, "total_ops"))
    cls.conn.incr(make_key(crawler, stage))
    cls.conn.incr(make_key(crawler, "total_ops"))
    cls.conn.set(make_key(crawler, "last_run"), pack_now())
    cls.conn.set(make_key(crawler, "current_run"), run_id)
def flush(cls, crawler):
    for stage in crawler.stages:
        cls.conn.delete(make_key(crawler, stage))

    for run_id in cls.run_ids(crawler):
        cls.conn.delete(make_key(crawler, run_id))
        cls.conn.delete(make_key(crawler, run_id, "start"))
        cls.conn.delete(make_key(crawler, run_id, "end"))
        cls.conn.delete(make_key(crawler, run_id, "total_ops"))

    cls.conn.delete(make_key(crawler, "runs"))
    cls.conn.delete(make_key(crawler, "current_run"))
    cls.conn.delete(make_key(crawler, "total_ops"))
    cls.conn.delete(make_key(crawler, "last_run"))
    cls.conn.delete(make_key(crawler, "runs_abort"))
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    seen = set()
    for tag_query, attr_name in URL_TAGS:
        for element in result.html.findall(tag_query):
            attr = element.get(attr_name)
            if attr is None:
                continue

            url = normalize_url(urljoin(result.url, attr))
            if url is None or url in seen:
                continue
            seen.add(url)

            tag = make_key((context.run_id, url))
            if context.check_tag(tag):
                continue
            context.set_tag(tag, None)

            data = {'url': url}
            # Option to set the document title from the link text.
            if context.get('link_title', False):
                data['title'] = collapse_spaces(element.text_content())
            elif element.get('title'):
                data['title'] = collapse_spaces(element.get('title'))
            context.emit(rule='fetch', data=data)
def tasks(cls):
    queues = [make_key('queue', c, s) for c, s in manager.stages]
    while True:
        timeout = 1 if settings.DEBUG else 0
        task_data_tuple = conn.blpop(queues, timeout=timeout)
        # blpop blocks until it finds something. But fakeredis has no
        # blocking support. So it just returns None.
        if task_data_tuple is None:
            return

        key, json_data = task_data_tuple
        # Shift the queues list so that the matching key is at the
        # very end of the list, prioritising all other crawlers.
        # queues = list(reversed(queues))
        deq = deque(queues)
        deq.rotate((queues.index(key) * -1) - 1)
        queues = list(deq)

        task_data = load_json(json_data)
        stage = task_data["stage"]
        state = task_data["state"]
        data = task_data["data"]
        next_time = task_data.get("next_allowed_exec_time")
        next_time = unpack_datetime(next_time)
        yield (stage, state, data, next_time)
def save(self):
    session = pickle.dumps(self.session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(self.context.run_id, "session", key)
    conn.set(key, session, ex=QUEUE_EXPIRE)
    self.context.state[self.STATE_SESSION] = key
def skip_incremental(self, *criteria):
    """Perform an incremental check on a set of criteria.

    This can be used to execute a part of a crawler only once per
    interval (which is specified by the ``expire`` setting). If the
    operation has already been performed (and should thus be skipped),
    this will return ``True``. If the operation needs to be executed,
    the returned value will be ``False``.
    """
    if not self.incremental:
        return False

    # this is pure convenience, and will probably backfire at some point.
    key = make_key(criteria)
    if key is None:
        return False

    # this is used to re-run parts of a scrape after a certain interval,
    # e.g. half a year, or a year. Tags set before `since` are treated
    # as expired, so the operation runs again.
    since = None
    if self.crawler.expire > 0:
        delta = timedelta(days=self.crawler.expire)
        since = datetime.utcnow() - delta

    if Tag.exists(self.crawler, key, since=since):
        return True
    self.set_tag(key, None)
    return False
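# Hypothetical usage sketch (the operation name `crawl_document` and the
# `doc_id` field are illustrative, not taken from the codebase): an operation
# can bail out early when the same criteria were already processed within the
# configured expiry window.
def crawl_document(context, data):
    doc_id = data.get('doc_id')
    # True means this (crawler, "document", doc_id) combination was tagged
    # recently enough that the work can be skipped on this run.
    if context.skip_incremental('document', doc_id):
        context.log.info('Skipping already-crawled document: %s', doc_id)
        return
    context.emit(data=data)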
def save(cls, crawler, session):
    session = pickle.dumps(session)
    session = codecs.encode(session, 'base64')
    key = sha1(session).hexdigest()[:15]
    key = make_key(crawler, "session", key)
    cls.conn.set(key, session, ex=QUEUE_EXPIRE)
    return key
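# Standalone sketch of the serialisation round trip used by `save` above and
# `load_session` below (pure Python, no Redis involved): pickle the object,
# base64-encode the bytes, and derive a short content-addressed key from the
# digest of the encoded blob.
import codecs
import pickle
from hashlib import sha1

session = {'cookies': {'sid': 'abc123'}}
blob = codecs.encode(pickle.dumps(session), 'base64')
key = sha1(blob).hexdigest()[:15]          # short, deterministic key for the blob
restored = pickle.loads(codecs.decode(blob, 'base64'))
assert restored == session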
def fetch(context, data): """Do an HTTP GET on the ``url`` specified in the inbound data.""" url = data.get('url') attempt = data.pop('retry_attempt', 1) try: result = context.http.get(url, lazy=True) rules = context.get('rules', {'match_all': {}}) if not Rule.get_rule(rules).apply(result): context.log.info('Fetch skip: %r', result.url) return if not result.ok: err = (result.url, result.status_code) context.emit_warning("Fetch fail [%s]: HTTP %s" % err) if not context.params.get('emit_errors', False): return else: context.log.info("Fetched [%s]: %r", result.status_code, result.url) data.update(result.serialize()) if url != result.url: tag = make_key(context.run_id, url) context.set_tag(tag, None) context.emit(data=data) except RequestException as ce: retries = int(context.get('retry', 3)) if retries >= attempt: context.log.warn("Retry: %s (error: %s)", url, ce) data['retry_attempt'] = attempt + 1 context.recurse(data=data, delay=2**attempt) else: context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
def load_session(self):
    if self.STATE_SESSION not in self.context.state:
        return

    key = self.context.state.get(self.STATE_SESSION)
    value = conn.get(make_key(self.context.run_id, "session", key))
    if value is not None:
        session = codecs.decode(bytes(value, 'utf-8'), 'base64')
        return pickle.loads(session)
def flush(cls, crawler):
    for run_id in cls.conn.smembers(make_key(crawler, "runs")):
        cls.conn.delete(make_key("run", run_id, "start"))
        cls.conn.delete(make_key("run", run_id, "end"))
        cls.conn.delete(make_key("run", run_id, "total_ops"))
        cls.conn.delete(make_key("run", run_id))

    cls.conn.delete(make_key(crawler, "runs"))
    cls.conn.delete(make_key(crawler, "runs_list"))
def record_operation_start(cls, crawler, run_id):
    if not cls.conn.sismember(make_key(crawler, "runs"), run_id):
        cls.conn.sadd(make_key(crawler, "runs"), run_id)
        cls.conn.lpush(make_key(crawler, "runs_list"), run_id)
        cls.conn.set(make_key("run", run_id, "start"), pack_now())
    cls.conn.incr(make_key("run", run_id))
    cls.conn.incr(make_key("run", run_id, "total_ops"))
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.findtext('.//title')
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.findall(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.findall(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = normalize_url(urljoin(result.url, attr))
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue
                if url is None or url in seen:
                    continue
                seen.add(url)

                tag = make_key(context.run_id, url)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)

                data = {'url': url}
                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))
                context.http.session.headers['Referer'] = url
                if re.findall('publicId|firstResult', url):
                    print("----------------PRINTING URL----------------")
                    print(url)
                context.emit(rule='fetch', data=data)
def delete(cls, crawler): cls.conn.delete(make_key(crawler, "events")) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", level)) for run_id in Crawl.run_ids(crawler): cls.conn.delete(make_key(crawler, "events", run_id)) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", run_id, level)) for stage in crawler.stages.keys(): cls.conn.delete(make_key(crawler, "events", stage)) for level in cls.LEVELS: cls.conn.delete(make_key(crawler, "events", stage, level))
def fetch(context, data): """Do an HTTP GET on the ``url`` specified in the inbound data.""" url = data.get('url') result = context.http.get(url, lazy=True) rules = context.get('rules', {'match_all': {}}) if not Rule.get_rule(rules).apply(result): context.log.info('Fetch skip: %r', result.url) return if not result.ok: context.emit_warning("Fetch fail [%s]: %s", result.status_code, result.url) return context.log.info("Fetched [%s]: %r", result.status_code, result.url) data.update(result.serialize()) if url != result.url: tag = make_key((context.run_id, url)) context.set_tag(tag, None) context.emit(data=data)
def skip_incremental(self, *criteria): """Perform an incremental check on a set of criteria. This can be used to execute a part of a crawler only once per an interval (which is specified by the ``expire`` setting). If the operation has already been performed (and should thus be skipped), this will return ``True``. If the operation needs to be executed, the returned value will be ``False``. """ if not self.incremental: return False # this is pure convenience, and will probably backfire at some point. key = make_key(*criteria) if key is None: return False if Tag.exists(self.crawler, key): return True self.set_tag(key, None) return False
def save(cls, crawler, stage, level, run_id, error=None, message=None):
    """Create an event, possibly based on an exception."""
    event = {
        'stage': stage.name,
        'level': level,
        'timestamp': pack_now(),
        'error': error,
        'message': message
    }
    data = dump_json(event)
    cls.conn.lpush(make_key(crawler, "events"), data)
    cls.conn.lpush(make_key(crawler, "events", level), data)
    cls.conn.lpush(make_key(crawler, "events", stage), data)
    cls.conn.lpush(make_key(crawler, "events", stage, level), data)
    cls.conn.lpush(make_key(crawler, "events", run_id), data)
    cls.conn.lpush(make_key(crawler, "events", run_id, level), data)
    return event
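# Illustrative note (not part of the codebase): a single event is fanned out to
# six Redis lists so it can be looked up globally, by level, by stage and by
# run, which is what the `get_*_events` and `get_*_counts` helpers below rely
# on. Assuming make_key simply joins its parts with a separator, the keys for
# one event look roughly like:
#
#   <crawler>:events
#   <crawler>:events:<level>
#   <crawler>:events:<stage>
#   <crawler>:events:<stage>:<level>
#   <crawler>:events:<run_id>
#   <crawler>:events:<run_id>:<level>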
def get_crawler_events(cls, crawler, start, end, level=None):
    key = make_key(crawler, "events", level)
    return cls.event_list(key, start, end)
def get_run_counts(cls, crawler, run_id):
    counts = {}
    for level in cls.LEVELS:
        key = make_key(crawler, "events", run_id, level)
        counts[level] = cls.conn.llen(key) or 0
    return counts
def get_stage_counts(cls, crawler, stage):
    counts = {}
    for level in cls.LEVELS:
        key = make_key(crawler, "events", stage, level)
        counts[level] = cls.conn.llen(key) or 0
    return counts
def delete(cls, crawler):
    for key in cls.conn.scan_iter(make_key(crawler, "events", "*")):
        cls.conn.delete(key)
def size(cls, crawler): """Total operations pending for this crawler""" key = make_key('queue_pending', crawler) return unpack_int(cls.conn.get(key))
def is_aborted(cls, crawler, run_id):
    key = make_key(crawler, "runs_abort")
    return cls.conn.sismember(key, run_id)
def get_stage_events(cls, crawler, stage_name, start, end, level=None):
    """Events from a particular stage"""
    key = make_key(crawler, "events", stage_name, level)
    return cls.event_list(key, start, end)
def get_run_events(cls, crawler, run_id, start, end, level=None):
    """Events from a particular run"""
    key = make_key(crawler, "events", run_id, level)
    return cls.event_list(key, start, end)
def queue(cls, stage, state, data, delay=None):
    crawler = state.get('crawler')
    task_data = cls.serialize_task_data(stage, state, data, delay)
    cls.conn.rpush(make_key('queue', crawler, stage), task_data)
    cls.conn.incr(make_key('queue_pending', crawler))
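# Sketch of the payload shape implied by the reader in `tasks` above.
# `serialize_task_data` itself is not shown here, so this is an assumption
# based purely on the keys that `tasks` reads back from the queue, and the
# datetime encoding used for `next_allowed_exec_time` is illustrative.
import json
from datetime import datetime, timedelta

def serialize_task_data_sketch(stage, state, data, delay=None):
    task = {
        'stage': stage,
        'state': state,  # must carry 'crawler' so the reader can decrement queue_pending
        'data': data,
        'next_allowed_exec_time': None,
    }
    if delay is not None:
        # Interpreted by the consumer as "do not execute before this time".
        next_time = datetime.utcnow() + timedelta(seconds=delay)
        task['next_allowed_exec_time'] = next_time.isoformat()
    return json.dumps(task)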