Example 1
def aleph_folder(context, data):
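    """Create a folder object in an Aleph collection via ingest_upload and
    cache the resulting document id for reuse by later stages."""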
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return

    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)
            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)
            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            # Give up after the final attempt or on permanent errors; the
            # original `try_number > api.retries` could never be true inside
            # range(api.retries), so failures went unreported.
            if try_number >= api.retries - 1 or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)
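All of the Aleph examples share the same retry pattern: comply with a shared "aleph" rate limit, attempt the API call, and back off when a transient AlephException occurs. The backoff helper is imported from the surrounding codebase; a minimal sketch of what it might look like, assuming a simple exponential schedule (the real implementation may differ):

import logging
import time

log = logging.getLogger(__name__)


def backoff(exc, try_number):
    """Hypothetical stand-in for the backoff helper used above: sleep with
    an exponential delay before the next retry attempt."""
    delay = 2 ** try_number  # 1s, 2s, 4s, ... per failed attempt
    log.warning("Transient error: %s, backing off for %ss", exc, delay)
    time.sleep(delay)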
Example 2
def aleph_emit_entity(context, data):
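    """Write a single entity (schema and properties) to an Aleph collection,
    reusing the cached entity when it has already been created."""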
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    entity_id = data.get("entity_id", data.get("id"))
    if not entity_id:
        context.emit_warning(
            "Error: Cannot create entity: `id` is not defined")
        return
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch entity from cache
    cached_entity = context.get_tag(
        make_key(collection_id, foreign_id, entity_id))

    if cached_entity and isinstance(cached_entity, dict):
        context.log.info("Skip entity creation: {}".format(foreign_id))
        data["aleph_id"] = cached_entity["id"]
        data["aleph_collection_id"] = collection_id
        data["aleph_entity"] = cached_entity
        context.emit(data=data, optional=True)
        return

    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.write_entity(
                collection_id,
                {
                    "schema": data.get("schema"),
                    "properties": data.get("properties"),
                },
                entity_id,
            )

            entity = {
                "id": res.get("id"),
                "schema": res.get("schema"),
                "properties": res.get("properties"),
            }
            context.log.info("Aleph entity ID: %s", entity["id"])

            # Save the entity in cache for future use
            context.set_tag(make_key(collection_id, foreign_id, entity_id),
                            entity)

            data["aleph_id"] = entity["id"]
            data["aleph_collection_id"] = collection_id
            data["aleph_entity"] = entity
            context.emit(data=data, optional=True)
            return
        except AlephException as exc:
            if try_number >= api.retries - 1 or not exc.transient:
                context.emit_warning("Error: %s" % exc)
                return
            backoff(exc, try_number)
Example 3
def aleph_emit_document(context, data):
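    """Upload a crawled file to an Aleph collection, skipping the upload
    when the document's metadata is already cached."""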
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        context.log.info("Skip aleph upload: %s", foreign_id)
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return

    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))

    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)
                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
Example 4
def _upsert(context, params, data):
    """Insert or update data and add/update appropriate timestamps"""
    table = params.get("table")
    table = datastore.get_table(table, primary_id=False)
    unique_keys = ensure_list(params.get("unique"))
    data["__last_seen"] = datetime.datetime.utcnow()
    if len(unique_keys):
        updated = table.update(data, unique_keys, return_count=True)
        if updated:
            return
    data["__first_seen"] = data["__last_seen"]
    rate_limit = get_rate_limit("db", limit=settings.DB_RATE_LIMIT)
    rate_limit.comply()
    table.insert(data)
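A hypothetical invocation of _upsert, assuming a params mapping like the one a crawler stage would receive; the table and column names here are made up for illustration:

# Hypothetical example: upsert one scraped record, deduplicated by "url".
params = {"table": "articles", "unique": ["url"]}
record = {"url": "https://example.com/a", "title": "Example article"}
_upsert(context, params, record)
# The first call inserts the row and sets __first_seen and __last_seen;
# later calls with the same "url" update the row without touching
# __first_seen.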
Example 5
def ftp_fetch(context, data):
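    """Fetch a URL over FTP. Successful retrievals are stored and emitted;
    otherwise the URL is listed as a directory and each child is emitted."""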
    url = data.get("url")
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get("username", "Anonymous")
    password = context.get("password", "anonymous@ftp")

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get("http_rate_limit", settings.HTTP_RATE_LIMIT)
    limit = limit / 60  # per minute to per second for stricter enforcement
    rate_limit = get_rate_limit(resource, limit=limit, interval=1, unit=1)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule="pass", data=cached)
        return

    context.enforce_rate_limit(rate_limit)
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update(
            {
                "status_code": resp.status_code,
                "retrieved_at": datetime.utcnow().isoformat(),
                "content_hash": context.store_data(data=resp.content),
            }
        )
        context.set_tag(url, data)
        context.emit(rule="pass", data=data)
    else:
        context.enforce_rate_limit(rate_limit)
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data["url"] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule="child", data=child_data)
Example 6
def ftp_fetch(context, data):
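    """Fetch a URL over FTP (an earlier variant of the function above,
    calling rate_limit.comply() directly rather than going through
    context.enforce_rate_limit())."""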
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    rate_limit = get_rate_limit(resource, limit=limit)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return

    rate_limit.comply()
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        rate_limit.comply()
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
Example 7
def _rate_limit(self, url):
    """Comply with a per-host rate limit before fetching the given URL."""
    resource = urlparse(url).netloc or url
    limit = self.context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    rate_limit = get_rate_limit(resource, limit=limit)
    rate_limit.comply()
Example 8
def aleph_emit(context, data):
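    """Upload a crawled file to an Aleph collection, building the document
    metadata inline and caching only the resulting document id."""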
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document_id = context.get_tag(make_key(collection_id, foreign_id, content_hash))
    if document_id:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document_id
        context.emit(data=data, optional=True)
        return

    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": data.get("headers", {}),
    }

    languages = context.params.get("languages")
    meta["languages"] = data.get("languages", languages)
    countries = context.params.get("countries")
    meta["countries"] = data.get("countries", countries)
    mime_type = context.params.get("mime_type")
    meta["mime_type"] = data.get("mime_type", mime_type)

    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}

    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document entity ID: %s", document_id)
                # Save the document id in cache for future use
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), document_id
                )
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                if try_number >= api.retries - 1 or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
Example 9
def _rate_limit(self, url):
    """Enforce a per-host rate limit, converting the configured per-minute
    HTTP limit to a per-second limit for stricter enforcement."""
    resource = urlparse(url).netloc or url
    limit = self.context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    limit = limit / 60  # per minute to per second for stricter enforcement
    rate_limit = get_rate_limit(resource, limit=limit, interval=1, unit=1)
    self.context.enforce_rate_limit(rate_limit)
Example 10
def boot(self):
    """Set up the scheduler's rate limits: one scheduler tick every
    SCHEDULER_INTERVAL minutes and one hourly tick per hour."""
    self.scheduler = get_rate_limit('scheduler',
                                    unit=60,
                                    interval=settings.SCHEDULER_INTERVAL,
                                    limit=1)
    self.hourly = get_rate_limit('hourly', unit=3600, interval=1, limit=1)
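The examples pass different limit, interval, and unit combinations to get_rate_limit. The semantics these snippets assume is that a limiter allows `limit` calls per `interval * unit` seconds, shared under a resource name. A minimal in-process sketch of a compatible limiter (the real helper is backed by shared state and may differ):

import time

_LIMITS = {}


class SimpleRateLimit:
    """Sketch only: allow `limit` calls per `interval * unit` seconds."""

    def __init__(self, resource, limit, interval, unit):
        self.resource = resource
        self.delay = (interval * unit) / float(limit)  # seconds per call
        self._next = 0.0

    def comply(self):
        # Block until the next call slot is available.
        now = time.monotonic()
        if now < self._next:
            time.sleep(self._next - now)
        self._next = max(now, self._next) + self.delay


def get_rate_limit(resource, limit=10, interval=1, unit=60):
    # One shared limiter per resource name, as the examples above expect.
    # The default values here are illustrative, not the library's.
    if resource not in _LIMITS:
        _LIMITS[resource] = SimpleRateLimit(resource, limit, interval, unit)
    return _LIMITS[resource]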
Example 11
def aleph_emit(context, data):
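    """Upload a crawled file to an Aleph collection, using
    skip_incremental() instead of the tag cache to avoid re-uploads."""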
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return

    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }

    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)

    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}

    meta = clean_dict(meta)
    # pprint(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit('aleph', limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get('id')
                context.log.info("Aleph document entity ID: %s", document_id)
                data['aleph_id'] = document_id
                data['aleph_document'] = meta
                data['aleph_collection_id'] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as ae:
                if try_number >= api.retries - 1 or not ae.transient:
                    context.emit_warning("Error: %s" % ae)
                    return
                backoff(ae, try_number)