Exemple #1
0
def aleph_folder(context, data):
    """Create a folder in the Aleph collection and emit its ID downstream.

    Does nothing when ``get_api`` returns ``None``. When ``data`` has no
    ``foreign_id`` a warning is logged and nothing is emitted.

    On success the new folder's ID is cached under a tag built from the
    collection ID and the foreign ID, written to ``data`` as
    ``aleph_folder_id`` (together with ``aleph_collection_id``), and
    ``data`` is emitted.

    The upload is attempted at most ``api.retries`` times. A non-transient
    ``AlephException``, or a transient one on the final attempt, is reported
    through ``context.emit_warning`` and ends the operation.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    foreign_id = data.get("foreign_id")
    if foreign_id is None:
        context.log.warning("No folder foreign ID!")
        return

    meta = clean_dict(_create_meta_object(context, data))
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Make folder: %s", label)
    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.ingest_upload(collection_id, metadata=meta, sync=True)
            document_id = res.get("id")
            context.log.info("Aleph folder entity ID: %s", document_id)
            # Save the document id in cache for future use
            context.set_tag(make_key(collection_id, foreign_id), document_id)
            data["aleph_folder_id"] = document_id
            data["aleph_collection_id"] = collection_id
            context.emit(data=data, optional=True)
            return
        except AlephException as ae:
            # try_number is zero-based, so the final attempt is
            # api.retries - 1. Give up there with a warning instead of
            # backing off once more and leaving the loop silently.
            is_last_attempt = try_number + 1 >= api.retries
            if is_last_attempt or not ae.transient:
                context.emit_warning("Error: %s" % ae)
                return
            backoff(ae, try_number)
Exemple #2
0
    def ingest_upload(self, collection_id, file_path=None, metadata=None):
        """
        Upload a document to a collection, or create an empty folder in it.

        params
        ------
        collection_id: id of the collection to upload to
        file_path: path of the file to upload. None while creating folders
        metadata: dict containing metadata for the file or folders. In case of
        files, metadata contains foreign_id of the parent. Metadata for a
        directory contains foreign_id for itself as well as its parent and the
        name of the directory.

        Folder requests (no path, or a path that is a directory) are sent
        once. File uploads are retried on transient AlephException errors
        until the attempt counter exceeds ``self.retries``; any other
        AlephException is re-raised.
        """
        endpoint = self._make_url(
            "collections/{0}/ingest".format(collection_id))

        folder_only = not file_path or file_path.is_dir()
        if folder_only:
            payload = {"meta": json.dumps(metadata)}
            return self._request("POST", endpoint, data=payload)

        for attempt in count(1):
            try:
                with file_path.open('rb') as stream:
                    # Stream through a multipart encoder so that very large
                    # files can be uploaded.
                    fields = {
                        'meta': json.dumps(metadata),
                        'file': (file_path.name, stream, MIME),
                    }
                    encoder = MultipartEncoder(fields=fields)
                    return self._request(
                        "POST",
                        endpoint,
                        data=encoder,
                        headers={'Content-Type': encoder.content_type},
                    )
            except AlephException as exc:
                if exc.transient and attempt <= self.retries:
                    backoff(exc, attempt)
                    continue
                raise exc
Exemple #3
0
def aleph_emit_entity(context, data):
    """Write an entity to the Aleph collection and emit the result.

    Does nothing when ``get_api`` returns ``None``. The entity ID is read
    from ``data["entity_id"]`` (falling back to ``data["id"]``); when it is
    missing or empty a warning is emitted and the operation stops.

    A previously stored entity is looked up under a tag built from the
    collection ID, foreign ID and entity ID. When a dict is found there, it
    is re-emitted without calling the API. Otherwise the entity is written
    with ``api.write_entity``, cached under that tag and emitted. In both
    cases ``data`` gains ``aleph_id``, ``aleph_collection_id`` and
    ``aleph_entity``.

    The write is attempted at most ``api.retries`` times. A non-transient
    ``AlephException``, or a transient one on the final attempt, is reported
    through ``context.emit_warning`` and ends the operation.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    entity_id = data.get("entity_id", data.get("id"))
    if not entity_id:
        context.emit_warning(
            "Error: Can not create entity. `id` is not definied")
        return
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch entity from cache
    cached_entity = context.get_tag(
        make_key(collection_id, foreign_id, entity_id))

    if cached_entity and isinstance(cached_entity, dict):
        context.log.info("Skip entity creation: {}".format(foreign_id))
        data["aleph_id"] = cached_entity["id"]
        data["aleph_collection_id"] = collection_id
        data["aleph_entity"] = cached_entity
        context.emit(data=data, optional=True)
        return

    for try_number in range(api.retries):
        rate = settings.MEMORIOUS_RATE_LIMIT
        rate_limit = get_rate_limit("aleph", limit=rate)
        rate_limit.comply()
        try:
            res = api.write_entity(
                collection_id,
                {
                    "schema": data.get("schema"),
                    "properties": data.get("properties"),
                },
                entity_id,
            )

            # Keep only the fields that are cached and passed downstream.
            entity = {
                "id": res.get("id"),
                "schema": res.get("schema"),
                "properties": res.get("properties"),
            }
            context.log.info("Aleph entity ID: %s", entity["id"])

            # Save the entity in cache for future use
            context.set_tag(make_key(collection_id, foreign_id, entity_id),
                            entity)

            data["aleph_id"] = entity["id"]
            data["aleph_collection_id"] = collection_id
            data["aleph_entity"] = entity
            context.emit(data=data, optional=True)
            return
        except AlephException as exc:
            # try_number is zero-based, so the final attempt is
            # api.retries - 1. Give up there with a warning instead of
            # backing off once more and leaving the loop silently.
            is_last_attempt = try_number + 1 >= api.retries
            if is_last_attempt or not exc.transient:
                context.emit_warning("Error: %s" % exc)
                return
            backoff(exc, try_number)
Exemple #4
0
def aleph_emit(context, data):
    """Upload the file referenced by ``data`` to Aleph and emit the result.

    Does nothing when ``get_api`` returns ``None``. The upload is skipped
    when ``context.skip_incremental`` returns a truthy value for the
    collection ID, foreign ID and content hash, and also when
    ``context.load_file`` yields ``None`` for the content hash.

    Document metadata is assembled from ``data``; ``languages``,
    ``countries`` and ``mime_type`` fall back to the crawler parameters.
    On success ``data`` gains ``aleph_id``, ``aleph_document`` and
    ``aleph_collection_id`` and is emitted.

    The upload is attempted at most ``api.retries`` times. A non-transient
    ``AlephException``, or a transient one on the final attempt, is reported
    through ``context.emit_warning`` and ends the operation.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get('content_hash')
    source_url = data.get('source_url', data.get('url'))
    foreign_id = data.get('foreign_id', data.get('request_id', source_url))
    if context.skip_incremental(collection_id, foreign_id, content_hash):
        context.log.info("Skip aleph upload: %s", foreign_id)
        return

    meta = {
        'crawler': context.crawler.name,
        'foreign_id': foreign_id,
        'source_url': source_url,
        'title': data.get('title'),
        'author': data.get('author'),
        'file_name': data.get('file_name'),
        'retrieved_at': data.get('retrieved_at'),
        'modified_at': data.get('modified_at'),
        'published_at': data.get('published_at'),
        'headers': data.get('headers', {})
    }

    # Values carried by the item take precedence over crawler-wide params.
    languages = context.params.get('languages')
    meta['languages'] = data.get('languages', languages)
    countries = context.params.get('countries')
    meta['countries'] = data.get('countries', countries)
    mime_type = context.params.get('mime_type')
    meta['mime_type'] = data.get('mime_type', mime_type)

    if data.get('parent_foreign_id'):
        meta['parent'] = {'foreign_id': data.get('parent_foreign_id')}

    meta = clean_dict(meta)
    label = meta.get('file_name', meta.get('source_url'))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()
        for try_number in range(api.retries):
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get('id')
                context.log.info("Aleph document entity ID: %s", document_id)
                data['aleph_id'] = document_id
                data['aleph_document'] = meta
                data['aleph_collection_id'] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as ae:
                # try_number is zero-based, so the final attempt is
                # api.retries - 1. Give up there with a warning instead of
                # backing off once more and leaving the loop silently.
                is_last_attempt = try_number + 1 >= api.retries
                if is_last_attempt or not ae.transient:
                    context.emit_warning("Error: %s" % ae)
                    return
                backoff(ae, try_number)
Exemple #5
0
def aleph_emit_document(context, data):
    """Upload a document to Aleph, or reuse the cached upload, and emit it.

    Does nothing when ``get_api`` returns ``None``. A previously uploaded
    document is looked up under a tag built from the collection ID, foreign
    ID and content hash. When one is found it is re-emitted without
    uploading again. Otherwise the file is loaded by content hash, uploaded
    with ``api.ingest_upload``, and its metadata (including the new ``id``)
    is cached under the same tag. In both cases ``data`` gains ``aleph_id``,
    ``aleph_document`` and ``aleph_collection_id``.

    Nothing is emitted when ``context.load_file`` yields ``None``.

    The upload is attempted at most ``api.retries`` times. A non-transient
    ``AlephException``, or a transient one on the final attempt, is reported
    through ``context.emit_warning`` and ends the operation.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document = context.get_tag(
        make_key(collection_id, foreign_id, content_hash))
    if document:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document["id"]
        data["aleph_document"] = document
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return

    meta = clean_dict(_create_meta_object(context, data))
    meta.update(_create_document_metadata(context, data))

    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document ID: %s", document_id)
                # Save the document id in cache for future use
                meta["id"] = document_id
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), meta)
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # try_number is zero-based, so the final attempt is
                # api.retries - 1. Give up there with a warning instead of
                # backing off once more and leaving the loop silently.
                is_last_attempt = try_number + 1 >= api.retries
                if is_last_attempt or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)
def _upload(q: Queue, api: AlephAPI, collection_id: str, root_path: Path):
    """Process queued paths until the queue reports empty.

    Each queue item is a ``(path, parent_id, try_number)`` tuple that is
    handed to ``_crawl_path``. A transient ``AlephException`` is backed off
    and the item is re-queued with an incremented ``try_number`` while that
    number is below ``api.retries``; otherwise the error message is logged
    and the item is dropped. Any other ``Exception`` is logged with its
    traceback and the item is dropped.
    """
    while not q.empty():
        path, parent_id, try_number = q.get()
        try:
            _crawl_path(q, api, collection_id, parent_id, root_path, path)
        except AlephException as exc:
            if exc.transient and try_number < api.retries:
                backoff(exc, try_number)
                q.put((path, parent_id, try_number + 1))
            else:
                log.error(exc.message)
        except Exception:
            log.exception('Failed [%s]: %s', collection_id, path)
        finally:
            # Every get() must be matched by a task_done(), even if something
            # escapes the handlers above; otherwise the queue's unfinished
            # task count never returns to zero.
            q.task_done()
Exemple #7
0
 def execute(self):
     """Process queued paths until the queue reports empty.

     Each queue item is a ``(path, parent_id, try_number)`` tuple that is
     passed to ``self.crawl_path``. A transient ``AlephException`` is backed
     off and the item re-queued with an incremented try number while that
     number is below ``self.api.retries``; otherwise its message is logged.
     Any other ``Exception`` is logged with its traceback. The item is
     always marked done on the queue.
     """
     while True:
         if self.queue.empty():
             break
         target, parent, attempt = self.queue.get()
         try:
             self.crawl_path(parent, target)
         except AlephException as err:
             if not err.transient or attempt >= self.api.retries:
                 # Out of retries, or not worth retrying: report and drop.
                 log.error(err.message)
             else:
                 backoff(err, attempt)
                 self.queue.put((target, parent, attempt + 1))
         except Exception:
             log.exception('Failed [%s]: %s', self.collection_id, target)
         finally:
             self.queue.task_done()
Exemple #8
0
 def _bulk_chunk(self, collection_id, chunk, force=False, unsafe=False):
     """POST one chunk to the collection's ``_bulk`` endpoint, with retries.

     Request errors are wrapped in ``AlephException``. Transient errors are
     backed off and retried until the attempt counter exceeds
     ``self.retries``. Once retrying stops, the error is raised unless
     ``force`` is set, in which case it is logged and the method returns.
     """
     for attempt in count(1):
         endpoint = self._make_url(
             "collections/{0}/_bulk".format(collection_id))
         query = {'unsafe': unsafe}
         try:
             reply = self.session.post(endpoint, json=chunk, params=query)
             reply.raise_for_status()
         except RequestException as exc:
             ae = AlephException(exc)
             if ae.transient and attempt <= self.retries:
                 backoff(ae, attempt)
                 continue
             if force:
                 # Best-effort mode: record the failure and move on.
                 log.error(ae)
                 return
             raise ae
         else:
             return
Exemple #9
0
 def backoff_ingest_upload(self, path: Path, parent_id: str,
                           foreign_id: str) -> Optional[str]:
     """Call ``self.ingest_upload`` for ``path``, retrying transient errors.

     Returns whatever ``self.ingest_upload`` returns on success. A transient
     ``AlephException`` is backed off and retried while the try counter is
     below ``self.api.retries``. Once that limit is reached, or on a
     non-transient error, the message is logged and ``None`` is returned.
     Any other ``Exception`` is logged with its traceback and also yields
     ``None``.
     """
     attempt = 1
     while True:
         try:
             return self.ingest_upload(Path(path), parent_id, foreign_id)
         except AlephException as exc:
             if not exc.transient or attempt >= self.api.retries:
                 log.error(exc.message)
                 return None
             # The counter is bumped before backing off, so the first
             # backoff is called with 2.
             attempt += 1
             backoff(exc, attempt)
         except Exception:
             log.exception("Failed [%s]: %s", self.collection_id, path)
             return None
Exemple #10
0
    def ingest_upload(
        self,
        collection_id: str,
        file_path: Optional[Path] = None,
        metadata: Optional[Dict] = None,
        sync: bool = False,
        index: bool = True,
    ) -> Dict:
        """
        Upload a document to a collection, or create an empty folder in it.

        params
        ------
        collection_id: id of the collection to upload to
        file_path: path of the file to upload. None while creating folders
        metadata: dict containing metadata for the file or folders. In case of
        files, metadata contains foreign_id of the parent. Metadata for a
        directory contains foreign_id for itself as well as its parent and the
        name of the directory.
        sync, index: passed as the ``sync`` and ``index`` parameters when
        building the ingest URL.

        Folder requests (no path, or a path that is a directory) are sent
        once. File uploads are retried on transient AlephException errors
        until the attempt counter exceeds ``self.retries``; any other
        AlephException is re-raised.
        """
        query = {"sync": sync, "index": index}
        endpoint = self._make_url(
            "collections/{0}/ingest".format(collection_id), params=query)

        folder_only = not file_path or file_path.is_dir()
        if folder_only:
            payload = {"meta": json.dumps(metadata)}
            return self._request("POST", endpoint, data=payload)

        for attempt in count(1):
            try:
                with file_path.open("rb") as stream:
                    # Stream through a multipart encoder so that very large
                    # files can be uploaded.
                    fields = {
                        "meta": json.dumps(metadata),
                        "file": (file_path.name, stream, MIME),
                    }
                    encoder = MultipartEncoder(fields=fields)
                    return self._request(
                        "POST",
                        endpoint,
                        data=encoder,
                        headers={"Content-Type": encoder.content_type},
                    )
            except AlephException as exc:
                if exc.transient and attempt <= self.retries:
                    backoff(exc, attempt)
                    continue
                raise exc
        return {}
Exemple #11
0
 def _bulk_chunk(
     self,
     collection_id: str,
     chunk: List,
     entityset_id: Optional[str] = None,
     force: bool = False,
     unsafe: bool = False,
 ):
     """POST one chunk to the collection's ``_bulk`` endpoint, with retries.

     ``unsafe`` and ``entityset_id`` are sent as query parameters. Request
     errors are wrapped in ``AlephException``. Transient errors are backed
     off and retried until the attempt counter exceeds ``self.retries``.
     Once retrying stops, the error is raised unless ``force`` is set, in
     which case it is logged and the method returns.
     """
     for attempt in count(1):
         endpoint = self._make_url(f"collections/{collection_id}/_bulk")
         query = {"unsafe": unsafe, "entityset_id": entityset_id}
         try:
             reply = self.session.post(endpoint, json=chunk, params=query)
             reply.raise_for_status()
         except RequestException as exc:
             ae = AlephException(exc)
             if ae.transient and attempt <= self.retries:
                 backoff(ae, attempt)
                 continue
             if force:
                 # Best-effort mode: record the failure and move on.
                 log.error(ae)
                 return
             raise ae
         else:
             return
Exemple #12
0
    def write_entity(self,
                     collection_id: str,
                     entity: Dict,
                     entity_id: str = None,
                     **kw) -> Dict:
        """Create a single entity in the given collection via the API.

        params
        ------
        collection_id: id of the collection to use. This will overwrite any
        existing collection specified in the entity dict
        entity_id: id for the entity to be created. This will overwrite any
        existing entity specified in the entity dict
        entity: A dict object containing the values of the entity. It is
        modified in place with the collection id and, when given, the
        entity id.

        Extra keyword arguments are accepted but not used here. Transient
        request errors are backed off and retried until the attempt counter
        exceeds ``self.retries``; otherwise the error is logged and the
        original request exception is re-raised.
        """
        has_explicit_id = entity_id is not None

        entity["collection_id"] = collection_id
        if has_explicit_id:
            entity["id"] = entity_id

        for attempt in count(1):
            url = (
                self._make_url("entities/{}").format(entity_id)
                if has_explicit_id
                else self._make_url("entities")
            )
            try:
                return self._request("POST", url, json=entity)
            except RequestException as exc:
                ae = AlephException(exc)
                if ae.transient and attempt <= self.retries:
                    backoff(ae, attempt)
                    continue
                log.error(ae)
                raise exc

        return {}
Exemple #13
0
def aleph_emit(context, data):
    """Upload the file referenced by ``data`` to Aleph and emit the result.

    Does nothing when ``get_api`` returns ``None``. A previously uploaded
    document ID is looked up under a tag built from the collection ID,
    foreign ID and content hash. When one is found, ``data`` is emitted with
    ``aleph_id`` and ``aleph_collection_id`` and no upload happens.

    Otherwise document metadata is assembled from ``data``; ``languages``,
    ``countries`` and ``mime_type`` fall back to the crawler parameters.
    The file is loaded by content hash and uploaded, the new document ID is
    cached under the same tag, and ``data`` is emitted with ``aleph_id``,
    ``aleph_document`` and ``aleph_collection_id``. Nothing is emitted when
    ``context.load_file`` yields ``None``.

    The upload is attempted at most ``api.retries`` times. A non-transient
    ``AlephException``, or a transient one on the final attempt, is reported
    through ``context.emit_warning`` and ends the operation.
    """
    api = get_api(context)
    if api is None:
        return
    collection_id = get_collection_id(context, api)
    content_hash = data.get("content_hash")
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    # Fetch document id from cache
    document_id = context.get_tag(make_key(collection_id, foreign_id, content_hash))
    if document_id:
        context.log.info("Skip aleph upload: %s", foreign_id)
        data["aleph_id"] = document_id
        # Match the fresh-upload path, which also reports the collection.
        data["aleph_collection_id"] = collection_id
        context.emit(data=data, optional=True)
        return

    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": data.get("headers", {}),
    }

    # Values carried by the item take precedence over crawler-wide params.
    languages = context.params.get("languages")
    meta["languages"] = data.get("languages", languages)
    countries = context.params.get("countries")
    meta["countries"] = data.get("countries", countries)
    mime_type = context.params.get("mime_type")
    meta["mime_type"] = data.get("mime_type", mime_type)

    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}

    meta = clean_dict(meta)
    label = meta.get("file_name", meta.get("source_url"))
    context.log.info("Upload: %s", label)
    with context.load_file(content_hash) as fh:
        if fh is None:
            return
        file_path = Path(fh.name).resolve()

        for try_number in range(api.retries):
            rate = settings.MEMORIOUS_RATE_LIMIT
            rate_limit = get_rate_limit("aleph", limit=rate)
            rate_limit.comply()
            try:
                res = api.ingest_upload(collection_id, file_path, meta)
                document_id = res.get("id")
                context.log.info("Aleph document entity ID: %s", document_id)
                # Save the document id in cache for future use
                context.set_tag(
                    make_key(collection_id, foreign_id, content_hash), document_id
                )
                data["aleph_id"] = document_id
                data["aleph_document"] = meta
                data["aleph_collection_id"] = collection_id
                context.emit(data=data, optional=True)
                return
            except AlephException as exc:
                # try_number is zero-based, so the final attempt is
                # api.retries - 1. Give up there with a warning instead of
                # backing off once more and leaving the loop silently.
                is_last_attempt = try_number + 1 >= api.retries
                if is_last_attempt or not exc.transient:
                    context.emit_warning("Error: %s" % exc)
                    return
                backoff(exc, try_number)