def stream_entities(self, collection_id=None, include=None, schema=None):
    """Iterate over all entities in the given collection.

    params
    ------
    collection_id: id of the collection to stream
    include: an array of fields from the index to include.
    """
    # Default to the global stream endpoint; narrow to one collection if given.
    if collection_id is None:
        url = self._make_url('entities/_stream')
    else:
        url = self._make_url("collections/{0}/_stream".format(collection_id))
    params = {'include': include, 'schema': schema}
    try:
        res = self.session.get(url, params=params, stream=True)
        res.raise_for_status()
        # The endpoint streams one JSON document per line.
        for line in res.iter_lines():
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            entity = json.loads(line)
            # Attach this instance's canonical URL for the entity.
            aleph_url = 'entities/%s' % entity.get('id')
            prop_push(entity, 'alephUrl', self._make_url(aleph_url))
            yield entity
    except RequestException as exc:
        raise AlephException(exc)
def stream_entities(self, collection: Optional[Dict] = None,
                    include: Optional[List] = None,
                    schema: Optional[str] = None,
                    publisher: bool = False) -> Iterator[Dict]:
    """Iterate over all entities, optionally limited to one collection.

    params
    ------
    collection: dict describing the collection to stream (its 'id' is used)
    include: an array of fields from the index to include.
    schema: restrict the stream to entities of the given schema.
    publisher: forwarded to _patch_entity.
    """
    if collection is None:
        url = self._make_url('entities/_stream')
    else:
        collection_id = collection.get('id')
        url = self._make_url(f"collections/{collection_id}/_stream")
    params = {'include': include, 'schema': schema}
    try:
        res = self.session.get(url, params=params, stream=True)
        res.raise_for_status()
        # One JSON document per streamed line.
        for line in res.iter_lines():
            entity = json.loads(line)
            yield self._patch_entity(entity, publisher=publisher,
                                     collection=collection)
    except RequestException as exc:
        raise AlephException(exc)
def stream_entities(self, collection_id=None, include=None, decode_json=True):
    """Iterate over all entities in the given collection.

    params
    ------
    collection_id: id of the collection to stream
    include: an array of fields from the index to include.
    """
    # NOTE(review): `decode_json` is accepted but never read in this body;
    # kept for interface compatibility.
    if collection_id is None:
        url = self._make_url('entities/_stream')
    else:
        url = self._make_url("collections/{0}/_stream".format(collection_id))
    try:
        res = self.session.get(url, params={'include': include}, stream=True)
        res.raise_for_status()
        # One JSON document per streamed line.
        for raw in res.iter_lines():
            entity = json.loads(raw)
            props = entity.get('properties')
            if props is not None and 'id' in entity:
                # Append this instance's canonical URL to the alephUrl prop.
                links = props.get('alephUrl', [])
                links.append(self._make_url('entities/%s' % entity.get('id')))
                props['alephUrl'] = links
            yield entity
    except RequestException as exc:
        raise AlephException(exc)
def bulk_load(api: AlephAPI, mapping_file: str):
    """Run every mapping in a bulk mapping file against its collection.

    params
    ------
    api: an AlephAPI client instance
    mapping_file: path of a config file keyed by collection foreign_id,
        each value being that collection's mapping config

    Raises AlephException if the file does not contain a dictionary.
    """
    data = load_config_file(mapping_file)
    if not isinstance(data, dict):
        raise AlephException('mapping_file has to be a json dictionary')
    for foreign_id, config in data.items():
        collection = api.load_collection_by_foreign_id(foreign_id, config)
        collection_id = collection['id']
        log.info(f"Bulk mapping collection ID: {collection_id}")
        # BUG FIX: map each collection with its own `config`, not the whole
        # `data` dict containing every collection's mapping.
        api.map_collection(collection_id, config)
def match(self, entity, collection_ids=None, url=None):
    """Find similar entities given a sample entity."""
    # Allow the caller to override the endpoint (e.g. for cross-instance
    # matching); default to this instance's 'match' endpoint.
    if url is None:
        url = self._make_url('match')
    params = {'collection_ids': ensure_list(collection_ids)}
    try:
        response = self.session.post(url, json=entity, params=params)
        response.raise_for_status()
        yield from response.json().get('results', [])
    except RequestException as exc:
        raise AlephException(exc)
def match(self, entity: Dict, collection_ids: Optional[str] = None,
          url: Optional[str] = None,
          publisher: bool = False) -> Iterator[Dict]:
    """Find similar entities given a sample entity.

    params
    ------
    entity: the sample entity to match against
    collection_ids: collection id(s) to restrict the search to
    url: override the matching endpoint (defaults to this host's 'match')
    publisher: forwarded to _patch_entity

    Raises AlephException on request failure.
    """
    params = {'collection_ids': ensure_list(collection_ids)}
    if url is None:
        url = self._make_url('match')
    try:
        response = self.session.post(url, json=entity, params=params)
        response.raise_for_status()
        for result in response.json().get('results', []):
            # FIX: annotated return is Iterator[Dict] (was Iterator[List]) —
            # each yielded value is a patched entity dict.
            yield self._patch_entity(result, publisher=publisher)
    except RequestException as exc:
        raise AlephException(exc)
def _upload_path(api: AlephAPI, path: Path, collection_id: str,
                 parent_id: str, foreign_id: str) -> str:
    """Ingest a single file into the collection and return the document id.

    Raises AlephException when the upload response carries no document id.
    """
    log.info('Upload [%s->%s]: %s', collection_id, parent_id, foreign_id)
    metadata = {'foreign_id': foreign_id, 'file_name': path.name}
    # A missing parent means a top-level document; omit the key entirely.
    if parent_id is not None:
        metadata['parent_id'] = parent_id
    result = api.ingest_upload(collection_id, path, metadata=metadata)
    if 'id' in result:
        return result['id']
    raise AlephException('Upload failed')
def upload_path(self, path: Path, parent_id: str, foreign_id: str) -> str:
    """Ingest one file into self.collection_id and return the document id.

    Raises AlephException when the upload response carries no document id.
    """
    log.info("Upload [%s->%s]: %s", self.collection_id, parent_id, foreign_id)
    metadata = {"foreign_id": foreign_id, "file_name": path.name}
    # A missing parent means a top-level document; omit the key entirely.
    if parent_id is not None:
        metadata["parent_id"] = parent_id
    result = self.api.ingest_upload(
        self.collection_id, path, metadata=metadata, index=self.index
    )
    if "id" in result:
        return result["id"]
    raise AlephException("Upload failed")
def _request(self, method, url, **kwargs): """A single point to make the http requests. Having a single point to make all requests let's us set headers, manage successful and failed responses and possibly manage session etc conviniently in a single place. """ try: response = self.session.request(method=method, url=url, **kwargs) response.raise_for_status() except RequestException as exc: raise AlephException(exc) if len(response.text): return response.json()
def __init__(self, host: Optional[str] = settings.HOST,
             api_key: Optional[str] = settings.API_KEY,
             session_id: Optional[str] = None,
             retries: int = settings.MAX_TRIES):
    """Configure the API client: base URL, retry budget and session headers.

    Raises AlephException when no host is configured.
    """
    if not host:
        raise AlephException('No host environment variable found')
    self.base_url = urljoin(host, '/api/2/')
    self.retries = retries
    self.session: Session = Session()
    # Every request carries a stable session id so the server can group them.
    self.session.headers['X-Aleph-Session'] = session_id or str(uuid.uuid4())
    self.session.headers['User-Agent'] = 'alephclient/%s' % VERSION
    if api_key is not None:
        self.session.headers['Authorization'] = 'ApiKey %s' % api_key
def _bulk_chunk(self, collection_id, chunk, force=False, unsafe=False):
    """POST one chunk of entities to the collection's _bulk endpoint,
    retrying transient failures up to self.retries times.

    With force=True, a permanent failure is logged and swallowed instead
    of raised.
    """
    # The target URL and params do not change between attempts.
    url = self._make_url("collections/{0}/_bulk".format(collection_id))
    params = {'unsafe': unsafe}
    for attempt in count(1):
        try:
            response = self.session.post(url, json=chunk, params=params)
            response.raise_for_status()
            return
        except RequestException as exc:
            ae = AlephException(exc)
            if ae.transient and attempt <= self.retries:
                backoff(ae, attempt)
                continue
            if force:
                log.error(ae)
                return
            raise ae
def _bulk_chunk(
    self,
    collection_id: str,
    chunk: List,
    entityset_id: Optional[str] = None,
    force: bool = False,
    unsafe: bool = False,
):
    """POST one chunk of entities to the collection's _bulk endpoint,
    retrying transient failures up to self.retries times.

    With force=True, a permanent failure is logged and swallowed instead
    of raised.
    """
    # The target URL and params do not change between attempts.
    url = self._make_url(f"collections/{collection_id}/_bulk")
    params = {"unsafe": unsafe, "entityset_id": entityset_id}
    for attempt in count(1):
        try:
            response = self.session.post(url, json=chunk, params=params)
            response.raise_for_status()
            return
        except RequestException as exc:
            ae = AlephException(exc)
            if ae.transient and attempt <= self.retries:
                backoff(ae, attempt)
                continue
            if force:
                log.error(ae)
                return
            raise ae
def write_entity(self, collection_id: str, entity: Dict,
                 entity_id: Optional[str] = None, **kw) -> Dict:
    """Create a single entity via the API, in the given collection.

    params
    ------
    collection_id: id of the collection to use. This will overwrite any
        existing collection specified in the entity dict
    entity_id: id for the entity to be created. This will overwrite any
        existing entity specified in the entity dict
    entity: A dict object containing the values of the entity

    Raises AlephException when the request fails permanently.
    """
    entity["collection_id"] = collection_id
    if entity_id is not None:
        entity["id"] = entity_id
    # FIX: build the URL once (it is invariant across retries) and format
    # the path *before* joining it, instead of the original
    # self._make_url("entities/{}").format(entity_id).
    if entity_id is not None:
        url = self._make_url("entities/{}".format(entity_id))
    else:
        url = self._make_url("entities")
    for attempt in count(1):
        try:
            return self._request("POST", url, json=entity)
        except RequestException as exc:
            ae = AlephException(exc)
            if not ae.transient or attempt > self.retries:
                log.error(ae)
                # FIX: raise the wrapped AlephException (was `raise exc`),
                # so callers can catch the same exception type as the rest
                # of this client's methods.
                raise ae
            backoff(ae, attempt)
    return {}