Ejemplo n.º 1
0
    def stream_entities(self, collection_id=None, include=None, schema=None):
        """Iterate over all entities in the given collection.

        Entities are read line by line from a streaming HTTP response; each
        non-empty line is parsed as one JSON document. Every yielded entity
        gets an ``alephUrl`` value pushed onto it via ``prop_push``.

        params
        ------
        collection_id: id of the collection to stream. When omitted, the
        global ``entities/_stream`` endpoint is used instead.
        include: an array of fields from the index to include.
        schema: value sent as the ``schema`` query parameter.

        raises
        ------
        AlephException: when the HTTP request fails with a RequestException.
        """
        url = self._make_url('entities/_stream')
        if collection_id is not None:
            url = "collections/{0}/_stream".format(collection_id)
            url = self._make_url(url)
        params = {'include': include, 'schema': schema}
        try:
            res = self.session.get(url, params=params, stream=True)
            try:
                res.raise_for_status()
                for line in res.iter_lines():
                    # Streaming responses may contain blank keep-alive
                    # lines; feeding those to json.loads would crash.
                    if not line:
                        continue
                    if isinstance(line, bytes):
                        line = line.decode('utf-8')
                    entity = json.loads(line)
                    aleph_url = 'entities/%s' % entity.get('id')
                    prop_push(entity, 'alephUrl', self._make_url(aleph_url))
                    yield entity
            finally:
                # Release the connection even if the consumer stops
                # iterating before the stream is exhausted.
                res.close()
        except RequestException as exc:
            raise AlephException(exc) from exc
Ejemplo n.º 2
0
    def stream_entities(self, collection: Optional[Dict] = None,
                        include: Optional[List] = None,
                        schema: Optional[str] = None,
                        publisher: bool = False) -> Iterator[Dict]:
        """Iterate over all entities in the given collection.

        Entities are read line by line from a streaming HTTP response; each
        non-empty line is parsed as one JSON document and passed through
        ``self._patch_entity`` before being yielded.

        params
        ------
        collection: dict describing the collection to stream; its ``id`` key
        selects the collection. When omitted, the global ``entities/_stream``
        endpoint is used instead.
        include: an array of fields from the index to include.
        schema: value sent as the ``schema`` query parameter.
        publisher: forwarded unchanged to ``self._patch_entity``.

        raises
        ------
        AlephException: when the HTTP request fails with a RequestException.
        """
        url = self._make_url('entities/_stream')
        if collection is not None:
            collection_id = collection.get('id')
            url = f"collections/{collection_id}/_stream"
            url = self._make_url(url)
        params = {'include': include, 'schema': schema}
        try:
            res = self.session.get(url, params=params, stream=True)
            try:
                res.raise_for_status()
                for line in res.iter_lines():
                    # Streaming responses may contain blank keep-alive
                    # lines; feeding those to json.loads would crash.
                    if not line:
                        continue
                    entity = json.loads(line)
                    yield self._patch_entity(entity,
                                             publisher=publisher,
                                             collection=collection)
            finally:
                # Release the connection even if the consumer stops
                # iterating before the stream is exhausted.
                res.close()
        except RequestException as exc:
            raise AlephException(exc) from exc
Ejemplo n.º 3
0
    def stream_entities(self,
                        collection_id=None,
                        include=None,
                        decode_json=True):
        """Iterate over all entities in the given collection.

        Entities are read line by line from a streaming HTTP response; each
        non-empty line is parsed as one JSON document. Entities that carry
        both a ``properties`` mapping and an ``id`` get the entity URL
        appended to their ``alephUrl`` property values.

        params
        ------
        collection_id: id of the collection to stream. When omitted, the
        global ``entities/_stream`` endpoint is used instead.
        include: an array of fields from the index to include.
        decode_json: accepted but not consulted by this method; every line is
        JSON-decoded regardless. NOTE(review): confirm whether callers rely
        on this flag before giving it an effect.

        raises
        ------
        AlephException: when the HTTP request fails with a RequestException.
        """
        url = self._make_url('entities/_stream')
        if collection_id is not None:
            url = "collections/{0}/_stream".format(collection_id)
            url = self._make_url(url)
        params = {'include': include}
        try:
            res = self.session.get(url, params=params, stream=True)
            try:
                res.raise_for_status()
                for line in res.iter_lines():
                    # Streaming responses may contain blank keep-alive
                    # lines; feeding those to json.loads would crash.
                    if not line:
                        continue
                    entity = json.loads(line)
                    properties = entity.get('properties')
                    if properties is not None and 'id' in entity:
                        values = properties.get('alephUrl', [])
                        aleph_url = 'entities/%s' % entity.get('id')
                        values.append(self._make_url(aleph_url))
                        properties['alephUrl'] = values
                    yield entity
            finally:
                # Release the connection even if the consumer stops
                # iterating before the stream is exhausted.
                res.close()
        except RequestException as exc:
            raise AlephException(exc) from exc
Ejemplo n.º 4
0
def bulk_load(api: AlephAPI, mapping_file: str):
    """Run a bulk mapping for every collection listed in a mapping file.

    The file is read with ``load_config_file`` and must produce a dict keyed
    by collection foreign id. For each entry the collection is loaded via
    ``api.load_collection_by_foreign_id`` and ``api.map_collection`` is
    invoked with that collection's id.

    Raises AlephException when the loaded file is not a dictionary.
    """
    data = load_config_file(mapping_file)
    if not isinstance(data, dict):
        raise AlephException('mapping_file has to be a json dictionary')

    for foreign_id, collection_config in data.items():
        collection_id = api.load_collection_by_foreign_id(
            foreign_id, collection_config
        )['id']
        log.info(f"Bulk mapping collection ID: {collection_id}")
        # NOTE(review): the complete mapping document is sent for each
        # collection, not only this collection's entry - confirm this is
        # what map_collection expects.
        api.map_collection(collection_id, data)
Ejemplo n.º 5
0
 def match(self, entity, collection_ids=None, url=None):
     """Yield the entries of the ``results`` list returned for a sample entity.

     The entity is POSTed as JSON to ``url`` (or to the ``match`` endpoint
     when no url is given), with ``collection_ids`` normalised through
     ``ensure_list`` and sent as a query parameter. A response without a
     ``results`` key yields nothing.

     Raises AlephException when the request fails with a RequestException.
     """
     query = {'collection_ids': ensure_list(collection_ids)}
     target = self._make_url('match') if url is None else url
     try:
         reply = self.session.post(target, json=entity, params=query)
         reply.raise_for_status()
         yield from reply.json().get('results', [])
     except RequestException as exc:
         raise AlephException(exc)
Ejemplo n.º 6
0
 def match(self, entity: Dict, collection_ids: Optional[str] = None,
           url: str = None, publisher: bool = False) -> Iterator[List]:
     """Yield patched entries of the ``results`` list for a sample entity.

     The entity is POSTed as JSON to ``url`` (or to the ``match`` endpoint
     when no url is given), with ``collection_ids`` normalised through
     ``ensure_list`` and sent as a query parameter. Each result is passed
     through ``self._patch_entity`` with the given ``publisher`` flag before
     being yielded.

     Raises AlephException when the request fails with a RequestException.
     """
     query = {'collection_ids': ensure_list(collection_ids)}
     target = self._make_url('match') if url is None else url
     try:
         reply = self.session.post(target, json=entity, params=query)
         reply.raise_for_status()
         hits = reply.json().get('results', [])
         for hit in hits:
             patched = self._patch_entity(hit, publisher=publisher)
             yield patched
     except RequestException as exc:
         raise AlephException(exc)
Ejemplo n.º 7
0
def _upload_path(api: AlephAPI, path: Path, collection_id: str, parent_id: str,
                 foreign_id: str) -> str:
    """Upload one path through ``api.ingest_upload`` and return the new id.

    The upload metadata carries the foreign id and the path's file name,
    plus ``parent_id`` when one is given.

    Raises AlephException when the upload result contains no ``id``.
    """
    metadata = dict(foreign_id=foreign_id, file_name=path.name)
    log.info('Upload [%s->%s]: %s', collection_id, parent_id, foreign_id)
    if parent_id is not None:
        metadata['parent_id'] = parent_id

    result = api.ingest_upload(collection_id, path, metadata=metadata)
    if 'id' in result:
        return result['id']
    raise AlephException('Upload failed')
Ejemplo n.º 8
0
 def upload_path(self, path: Path, parent_id: str, foreign_id: str) -> str:
     """Upload one path into ``self.collection_id`` and return the new id.

     The upload metadata carries the foreign id and the path's file name,
     plus ``parent_id`` when one is given. ``self.index`` is forwarded to
     ``ingest_upload`` as the ``index`` argument.

     Raises AlephException when the upload result contains no ``id``.
     """
     metadata = dict(foreign_id=foreign_id, file_name=path.name)
     log.info("Upload [%s->%s]: %s", self.collection_id, parent_id, foreign_id)
     if parent_id is not None:
         metadata["parent_id"] = parent_id

     result = self.api.ingest_upload(
         self.collection_id,
         path,
         metadata=metadata,
         index=self.index,
     )
     if "id" in result:
         return result["id"]
     raise AlephException("Upload failed")
Ejemplo n.º 9
0
    def _request(self, method, url, **kwargs):
        """A single point to make the http requests.

        Having a single point to make all requests let's us set headers, manage
        successful and failed responses and possibly manage session etc
        conviniently in a single place.
        """
        try:
            response = self.session.request(method=method, url=url, **kwargs)
            response.raise_for_status()
        except RequestException as exc:
            raise AlephException(exc)

        if len(response.text):
            return response.json()
Ejemplo n.º 10
0
    def __init__(self,
                 host: Optional[str] = settings.HOST,
                 api_key: Optional[str] = settings.API_KEY,
                 session_id: Optional[str] = None,
                 retries: int = settings.MAX_TRIES):
        """Set up the base URL, retry budget and HTTP session of the client.

        params
        ------
        host: server address the ``/api/2/`` base URL is joined onto.
        api_key: when not None, sent as an ``ApiKey`` Authorization header.
        session_id: value for the ``X-Aleph-Session`` header; a random UUID
        is generated when it is empty or None.
        retries: stored on the instance as ``self.retries``.

        raises
        ------
        AlephException: when ``host`` is empty or None.
        """
        if not host:
            raise AlephException('No host environment variable found')

        self.base_url = urljoin(host, '/api/2/')
        self.retries = retries

        effective_session_id = session_id or str(uuid.uuid4())
        http: Session = Session()
        self.session = http
        http.headers['X-Aleph-Session'] = effective_session_id
        http.headers['User-Agent'] = 'alephclient/%s' % VERSION
        if api_key is not None:
            http.headers['Authorization'] = 'ApiKey %s' % api_key
Ejemplo n.º 11
0
 def _bulk_chunk(self, collection_id, chunk, force=False, unsafe=False):
     for attempt in count(1):
         url = self._make_url("collections/{0}/_bulk".format(collection_id))
         params = {'unsafe': unsafe}
         try:
             response = self.session.post(url, json=chunk, params=params)
             response.raise_for_status()
             return
         except RequestException as exc:
             ae = AlephException(exc)
             if not ae.transient or attempt > self.retries:
                 if not force:
                     raise ae
                 log.error(ae)
                 return
             backoff(ae, attempt)
Ejemplo n.º 12
0
 def _bulk_chunk(
     self,
     collection_id: str,
     chunk: List,
     entityset_id: Optional[str] = None,
     force: bool = False,
     unsafe: bool = False,
 ):
     for attempt in count(1):
         url = self._make_url(f"collections/{collection_id}/_bulk")
         params = {"unsafe": unsafe, "entityset_id": entityset_id}
         try:
             response = self.session.post(url, json=chunk, params=params)
             response.raise_for_status()
             return
         except RequestException as exc:
             ae = AlephException(exc)
             if not ae.transient or attempt > self.retries:
                 if not force:
                     raise ae
                 log.error(ae)
                 return
             backoff(ae, attempt)
Ejemplo n.º 13
0
    def write_entity(self,
                     collection_id: str,
                     entity: Dict,
                     entity_id: str = None,
                     **kw) -> Dict:
        """Create a single entity via the API, in the given collection.

        The passed ``entity`` dict is modified in place before it is sent.
        Extra keyword arguments are accepted but not used here.

        params
        ------
        collection_id: id of the collection to use. This overwrites any
        collection already specified in the entity dict.
        entity: a dict object containing the values of the entity.
        entity_id: id for the entity to be created. When given, it overwrites
        any id in the entity dict and the request goes to
        ``entities/<entity_id>`` instead of ``entities``.

        returns
        -------
        Whatever ``self._request`` returns for the POST.
        """
        entity["collection_id"] = collection_id
        has_explicit_id = entity_id is not None
        if has_explicit_id:
            entity["id"] = entity_id

        for attempt in count(1):
            url = (
                self._make_url("entities/{}").format(entity_id)
                if has_explicit_id
                else self._make_url("entities")
            )
            try:
                return self._request("POST", url, json=entity)
            except RequestException as exc:
                # NOTE(review): if self._request already converts
                # RequestException into AlephException, this handler (and
                # with it the retry) may never trigger - confirm.
                failure = AlephException(exc)
                if failure.transient and not (attempt > self.retries):
                    backoff(failure, attempt)
                    continue
                log.error(failure)
                raise exc

        return {}