Example #1
 def op_count(cls, crawler, stage=None):
     """Total operations performed for this crawler"""
     if stage:
         total_ops = conn.get(make_key(crawler, stage))
     else:
         total_ops = conn.get(make_key(crawler, "total_ops"))
     return unpack_int(total_ops)
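Every example on this page builds namespaced Redis keys with make_key. As a point of reference, here is a minimal sketch of what such a helper might look like, assuming it simply joins the non-empty parts with a ":" separator (the actual servicelayer implementation may differ in detail):

def make_key(*parts):
    """Join the parts into a colon-delimited Redis key, skipping None values."""
    return ":".join(str(p) for p in parts if p is not None)

Under that assumption, make_key(crawler, "total_ops") in the example above would produce a key of the form <crawler>:total_ops.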
Example #2
 def timeout_expiration_check(self):
     stages_on_timeout_key = make_key('memorious', 'timeout_stages')
     stages_on_timeout = conn.smembers(stages_on_timeout_key)
     for stage in stages_on_timeout:
         key = make_key('memorious', 'timeout', stage)
         if not conn.get(key):
             conn.srem(stages_on_timeout_key, stage)
Example #3
 def operation_end(cls, crawler, run_id):
     conn.set(make_key(crawler, "last_run"), pack_now(), ex=REDIS_LONG)
     pending = conn.decr(make_key(crawler, "run", run_id))
     if unpack_int(pending) == 0:
         conn.set(make_key(crawler, "run", run_id, "end"),
                  pack_now(),
                  ex=REDIS_LONG)
Example #4
 def __init__(self, conn, dataset, job_id):  # noqa
     self.conn = conn
     self.id = job_id
     self.dataset = Dataset.ensure(conn, dataset)
     self.start_key = make_key(PREFIX, 'qd', self.id, dataset, 'start')
     self.end_key = make_key(PREFIX, 'qd', self.id, dataset, 'end')
     self.active_jobs_key = make_key(PREFIX, 'qdja')
Example #5
 def __init__(self, conn, dataset, job_id):  # noqa
     self.conn = conn
     self.id = job_id
     self.dataset = Dataset.ensure(conn, dataset)
     self.start_key = make_key(PREFIX, "qd", self.id, dataset, "start")
     self.end_key = make_key(PREFIX, "qd", self.id, dataset, "end")
     self.active_jobs_key = make_key(PREFIX, "qdja")
Example #6
def aleph_emit_entity(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    entity_id = data.get("entity_id", data.get("id"))
    if not entity_id:
        context.emit_warning(
            "Error: Cannot create entity. `id` is not defined")
        return
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch entity from cache
    cached_entity = context.get_tag(
        make_key(collection_id, foreign_id, entity_id))

    if cached_entity and isinstance(cached_entity, dict):
        context.log.info("Skip entity creation: {}".format(foreign_id))
        data["aleph_id"] = cached_entity["id"]
        data["aleph_collection_id"] = collection_id
        data["aleph_entity"] = cached_entity
        context.emit(data=data, optional=True)
        return

    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.write_entity(
                collection_id,
                {
                    "schema": data.get("schema"),
                    "properties": data.get("properties"),
                },
                entity_id,
            )

            entity = {
                "id": res.get("id"),
                "schema": res.get("schema"),
                "properties": res.get("properties"),
            }
            context.log.info("Aleph entity ID: %s", entity["id"])

            # Save the entity in cache for future use
            context.set_tag(make_key(collection_id, foreign_id, entity_id),
                            entity)

            data["aleph_id"] = entity["id"]
            data["aleph_collection_id"] = collection_id
            data["aleph_entity"] = entity
            context.emit(data=data, optional=True)
            return
        except AlephException as exc:
            if try_number >= api.retries - 1 or not exc.transient:
                context.emit_warning("Error: %s" % exc)
                return
            backoff(exc, try_number)
Example #7
 def timeout(cls, stage, rate_limit):
     stages_on_timeout = make_key("memorious", "timeout_stages")
     conn.sadd(stages_on_timeout, stage.namespaced_name)
     stage_timeout_key = make_key("memorious", "timeout", stage.namespaced_name)
     expiry = (rate_limit.interval * rate_limit.unit) / rate_limit.limit
     conn.set(stage_timeout_key, "true", ex=math.ceil(expiry))
     # Delay the current task without further adding to call count
     rate_limit.comply(amount=0)
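To make the expiry formula concrete: a hypothetical rate limit of 10 calls per 60-second interval with a unit of 1 second gives (60 * 1) / 10 = 6, so the stage stays flagged for roughly 6 seconds before the key expires and the timeout_expiration_check from Example #2 drops it from the set again (the numbers here are purely illustrative).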
Example #8
 def __init__(self, job, stage):  # noqa
     self.job = job
     self.conn = job.conn
     self.stage = stage
     self.queue_key = make_key(PREFIX, 'q', job.dataset, stage, job.id)
     self.stages_key = self._get_stage_jobs_key(stage)
     self.pending_key = make_key(self.queue_key, 'pending')
     self.running_key = make_key(self.queue_key, 'running')
     self.finished_key = make_key(self.queue_key, 'finished')
Example #9
 def delete_counts(cls, crawler):
     for level in cls.LEVELS:
         conn.delete(make_key(crawler, "events", "count", level))
     for run_id in Crawl.run_ids(crawler):
         for level in cls.LEVELS:
             conn.delete(make_key(crawler, "events", "count", run_id,
                                  level))  # noqa
     for stage in crawler.stages.keys():
         for level in cls.LEVELS:
             conn.delete(make_key(crawler, "events", "count", stage, level))
Example #10
 def __init__(self, job, stage):  # noqa
     self.job = job
     self.conn = job.conn
     self.stage = stage
     self.stages_key = self._get_stage_jobs_key(stage)
     queue_id = (PREFIX, "q", job.dataset, stage, job.id)
     self.queue_key = make_key(*queue_id)
     self.pending_key = make_key(*queue_id, "pending")
     self.running_key = make_key(*queue_id, "running")
     self.finished_key = make_key(*queue_id, "finished")
Example #11
 def save(cls, crawler, stage, level, run_id, error=None, message=None):
     """Create an event, possibly based on an exception."""
     event = {
         'stage': stage.name,
         'level': level,
         'timestamp': pack_now(),
         'error': error,
         'message': message
     }
     data = dump_json(event)
     keys = [
         make_key(crawler, "events"),
         make_key(crawler, "events", level),
         make_key(crawler, "events", stage),
         make_key(crawler, "events", stage, level),
         make_key(crawler, "events", run_id),
         make_key(crawler, "events", run_id, level),
     ]
     for key in keys:
         conn.lpush(key, data)
         conn.expire(key, REDIS_EXPIRE)
     # Persist the counts for longer
     count_keys = [
         make_key(crawler, "events", "count", level),
         make_key(crawler, "events", "count", stage, level),
         make_key(crawler, "events", "count", run_id, level),
     ]
     for key in count_keys:
         conn.incr(key)
         conn.expire(key, REDIS_LONG)
     return event
Example #12
 def operation_start(cls, crawler, stage, run_id):
     if not conn.sismember(make_key(crawler, "runs"), run_id):
         conn.sadd(make_key(crawler, "runs"), run_id)
         conn.expire(make_key(crawler, "runs"), REDIS_LONG)
         conn.set(make_key(crawler, "run", run_id, "start"), pack_now(), ex=REDIS_LONG)  # noqa
     conn.incr(make_key(crawler, "run", run_id))
     conn.incr(make_key(crawler, "run", run_id, "total_ops"))
     conn.incr(make_key(crawler, stage))
     conn.incr(make_key(crawler, "total_ops"))
     conn.set(make_key(crawler, "last_run"), pack_now(), ex=REDIS_LONG)
     conn.set(make_key(crawler, "current_run"), run_id, ex=REDIS_LONG)
Example #13
def aleph_emit_document(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        context.log.info("Skip aleph upload: %s", foreign_id)
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return

    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))

    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)
                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
Example #14
 def runs(cls, crawler):
     runs = []
     for run_id in cls.run_ids(crawler):
         start = conn.get(make_key(crawler, "run", run_id, "start"))
         end = conn.get(make_key(crawler, "run", run_id, "end"))
         total_ops = conn.get(make_key(crawler, "run", run_id, "total_ops"))
         runs.append({
             "run_id": run_id,
             "total_ops": unpack_int(total_ops),
             "start": unpack_datetime(start, datetime.utcnow()),
             "end": unpack_datetime(end),
         })
     return runs
Example #15
    def flush(cls, crawler):
        for stage in crawler.stages:
            conn.delete(make_key(crawler, stage))

        for run_id in cls.run_ids(crawler):
            conn.delete(make_key(crawler, run_id))
            conn.delete(make_key(crawler, run_id, "start"))
            conn.delete(make_key(crawler, run_id, "end"))
            conn.delete(make_key(crawler, run_id, "total_ops"))

        conn.delete(make_key(crawler, "runs"))
        conn.delete(make_key(crawler, "current_run"))
        conn.delete(make_key(crawler, "total_ops"))
        conn.delete(make_key(crawler, "last_run"))
        conn.delete(make_key(crawler, "runs_abort"))
Example #16
 def save(self):
     session = pickle.dumps(self.session)
     session = codecs.encode(session, 'base64')
     key = sha1(session).hexdigest()[:15]
     key = make_key(self.context.crawler, "session", self.context.run_id, key)  # noqa
     conn.set(key, session, ex=REDIS_SHORT)
     self.context.state[self.STATE_SESSION] = key
Example #17
 def test_redis(self):
     key = make_key('test', uuid4())
     conn = get_redis()
     assert not conn.exists(key)
     conn.set(key, 'banana')
     assert conn.get(key) == 'banana', conn.get(key)
     assert conn.exists(key)
Example #18
 def get_stages(self):
     all_stages = set({stage.namespaced_name for _, stage in manager.stages})  # noqa
     stages_on_timeout_key = make_key('memorious', 'timeout_stages')
     stages_on_timeout = conn.smembers(stages_on_timeout_key)
     if stages_on_timeout:
         return list(all_stages - set(stages_on_timeout))
     return all_stages
Example #19
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get('url')
    attempt = data.pop('retry_attempt', 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get('rules', {'match_all': {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info('Fetch skip: %r', result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            if not context.params.get('emit_errors', False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code,
                             result.url)

        data.update(result.serialize())
        if url != result.url:
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get('retry', 3))
        if retries >= attempt:
            context.log.warn("Retry: %s (error: %s)", url, ce)
            data['retry_attempt'] = attempt + 1
            context.recurse(data=data, delay=2**attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
Example #20
 def save(self):
     session = pickle.dumps(self.session)
     session = codecs.encode(session, 'base64')
     key = sha1(session).hexdigest()[:15]
     key = make_key(self.context.run_id, "session", key)
     conn.set(key, session, ex=QUEUE_EXPIRE)
     self.context.state[self.STATE_SESSION] = key
Example #21
def fetch(context, data):
    """Do an HTTP GET on the ``url`` specified in the inbound data."""
    url = data.get("url")
    if urlparse(url).scheme not in ("http", "https", ""):
        context.log.info("Fetch skipped. Unsupported scheme: %r", url)
        return
    attempt = data.pop("retry_attempt", 1)
    try:
        result = context.http.get(url, lazy=True)
        rules = context.get("rules", {"match_all": {}})
        if not Rule.get_rule(rules).apply(result):
            context.log.info("Fetch skip: %r", result.url)
            return

        if not result.ok:
            err = (result.url, result.status_code)
            context.emit_warning("Fetch fail [%s]: HTTP %s" % err)
            if not context.params.get("emit_errors", False):
                return
        else:
            context.log.info("Fetched [%s]: %r", result.status_code, result.url)

        data.update(result.serialize())
        if url != result.url:
            tag = make_key(context.run_id, url)
            context.set_tag(tag, None)
        context.emit(data=data)
    except RequestException as ce:
        retries = int(context.get("retry", 3))
        if retries >= attempt:
            context.log.warn("Retry: %s (error: %s)", url, ce)
            data["retry_attempt"] = attempt + 1
            context.recurse(data=data, delay=2 ** attempt)
        else:
            context.emit_warning("Fetch fail [%s]: %s" % (url, ce))
Example #22
 def test_redis(self):
     key = make_key("test", uuid4())
     conn = get_redis()
     assert not conn.exists(key)
     conn.set(key, "banana")
     assert conn.get(key) == "banana", conn.get(key)
     assert conn.exists(key)
Example #23
def aleph_folder(context, data):
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return

    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)
            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)
            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            if try_number >= api.retries - 1 or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)
Example #24
 def flush_tags(self):
     pipe = conn.pipeline()
     count = 0
     for key in conn.scan_iter(make_key(self, 'tag', '*')):
         pipe.delete(key)
         count += 1
     pipe.execute()
     log.info("Deleted %d tags", count)
Example #25
 def load_session(self):
     if self.STATE_SESSION not in self.context.state:
         return
     key = self.context.state.get(self.STATE_SESSION)
     value = conn.get(make_key(self.context.run_id, "session", key))
     if value is not None:
         session = codecs.decode(bytes(value, 'utf-8'), 'base64')
         return pickle.loads(session)
Example #26
 def _remove(self, pipe):
     for stage in self.get_stages():
         stage._remove(pipe)
     pipe.srem(self.dataset.key, self.dataset.name)
     pipe.srem(self.dataset.jobs_key, self.id)
     pipe.srem(self.active_jobs_key, make_key(self.dataset.name, self.id))
     pipe.delete(self.start_key)
     pipe.setnx(self.end_key, pack_now())
     pipe.expire(self.end_key, REDIS_EXPIRE)
Example #27
 def request_id(self):
     if self._request_id is not None:
         return self._request_id
     if self.request is not None:
         parts = [self.request.method, self.url]
         if self.request.data:
             parts.append(hash_data(self.request.data))
         if self.request.json:
             parts.append(hash_data(self.request.json))
         return make_key(*parts)
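For illustration, a plain GET of https://example.org with no body would, under the joining assumption from Example #1, yield a request ID like GET:https://example.org; when a data or json payload is present, a hash of that body is mixed into the key so that requests with different payloads get distinct IDs.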
Example #28
def documentcloud_mark_processed(context, data):
    """Create a persistent tag to indicate that a document has been fully processed

    On subsequent runs, we can check and skip processing this document earlier in the
    pipeline.
    """
    key = make_key(
        context.crawler.name,
        data["foreign_id"],
        data["content_hash"],
    )
    context.log.info(
        f"Document with foreign id {data['foreign_id']} has been processed")
    context.set_tag(key, "processed")
Example #29
def parse_html(context, data, result):
    context.log.info("Parse: %r", result.url)

    for title in result.html.xpath(".//title/text()"):
        if title is not None and "title" not in data:
            data["title"] = title

    include = context.params.get("include_paths")
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.xpath(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.xpath(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = urljoin(result.url, attr)
                    key = url
                except Exception:
                    log.warning("Invalid URL: %r", attr)
                    continue

                if url is None or key is None or key in seen:
                    continue
                seen.add(key)

                tag = make_key(context.run_id, key)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data["url"] = url

                if data.get("title") is None:
                    # Option to set the document title from the link text.
                    if context.get("link_title", False):
                        data["title"] = collapse_spaces(element.text_content())
                    elif element.get("title"):
                        data["title"] = collapse_spaces(element.get("title"))

                context.http.session.headers["Referer"] = url
                context.emit(rule="fetch", data=data)
Example #30
def parse_html(context, data, result):
    context.log.info('Parse: %r', result.url)

    title = result.html.xpath('.//title/text()')[0]
    if title is not None and 'title' not in data:
        data['title'] = title

    include = context.params.get('include_paths')
    if include is None:
        roots = [result.html]
    else:
        roots = []
        for path in include:
            roots = roots + result.html.xpath(path)

    seen = set()
    for root in roots:
        for tag_query, attr_name in URL_TAGS:
            for element in root.xpath(tag_query):
                attr = element.get(attr_name)
                if attr is None:
                    continue

                try:
                    url = urljoin(result.url, attr)
                    key = url
                except Exception:
                    log.warning('Invalid URL: %r', attr)
                    continue

                if url is None or key is None or key in seen:
                    continue
                seen.add(key)

                tag = make_key(context.run_id, key)
                if context.check_tag(tag):
                    continue
                context.set_tag(tag, None)
                data = {'url': url}

                # Option to set the document title from the link text.
                if context.get('link_title', False):
                    data['title'] = collapse_spaces(element.text_content())
                elif element.get('title'):
                    data['title'] = collapse_spaces(element.get('title'))

                context.http.session.headers['Referer'] = url
                context.emit(rule='fetch', data=data)
Example #31
 def key(self, *parts):
     return make_key(self.prefix, *parts)
Example #32
def place_key(name):
    return make_key(PLACE_KEY, name)
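Assuming PLACE_KEY is a constant prefix string (say "place") and the ":" joining behavior sketched under Example #1, place_key("Berlin") would return "place:Berlin", giving each place name a stable, namespaced cache key.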