def get_collection(collection_id):
    """Fetch a collection from the index."""
    if collection_id is None:
        return
    key = cache.object_key(Collection, collection_id)
    data = cache.get_complex(key)
    if data is not None:
        return data
    collection = Collection.by_id(collection_id)
    if collection is None:
        return
    data = collection.to_dict()
    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']
    data['schemata'] = stats['schemata']
    # if no countries or langs are given, take the most common from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)
    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)
    cache.set_complex(key, data, expire=cache.EXPIRE)
    return data
def entities(collection_id=None):
    require(request.authz.can_stream())
    log.debug("Stream entities [%r] begins... (coll: %s)",
              request.authz, collection_id)
    schemata = ensure_list(request.args.getlist('schema'))
    excludes = ['text', 'roles', 'fingerprints']
    includes = ensure_list(request.args.getlist('include'))
    includes = [f for f in includes if f not in excludes]
    if collection_id is not None:
        get_db_collection(collection_id, request.authz.READ)
        record_audit(Audit.ACT_COLLECTION, id=collection_id)
    entities = iter_entities(authz=request.authz,
                             collection_id=collection_id,
                             schemata=schemata,
                             excludes=excludes,
                             includes=includes)
    return stream_ijson(entities)
def __init__(self, role_id, roles, is_admin=False):
    self.id = role_id
    self.logged_in = role_id is not None
    self.roles = set(ensure_list(roles))
    self.is_admin = is_admin
    self.in_maintenance = settings.MAINTENANCE
    self.session_write = not self.in_maintenance and self.logged_in
    self._collections = {}
def __init__(self, label, span, countries):
    super(LocationResult, self).__init__(label, span, countries)
    if self.key is not None:
        try:
            value = kv.lrange(place_key(self.key), 0, -1)
            self.countries = ensure_list(value)
        except KeyError:
            pass
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    query = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity
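# Illustrative usage sketch (not part of the original source): iter_entities
# is a generator backed by the Elasticsearch scroll API, so results can be
# consumed lazily. The authz object and collection id below are hypothetical.
for entity in iter_entities(authz=request.authz,
                            collection_id=42,
                            schemata=['LegalEntity'],
                            includes=['schema', 'properties']):
    log.info("Got entity: %s", entity.get('id'))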
def update(self, data, creator=None):
    self.updated_at = datetime.utcnow()
    self.label = data.get('label', self.label)
    self.summary = data.get('summary', self.summary)
    self.publisher = data.get('publisher', self.publisher)
    self.publisher_url = data.get('publisher_url', self.publisher_url)
    self.info_url = data.get('info_url', self.info_url)
    self.data_url = data.get('data_url', self.data_url)
    self.category = data.get('category', self.category)
    self.casefile = as_bool(data.get('casefile'), default=self.casefile)
    self.countries = ensure_list(data.get('countries', self.countries))
    self.languages = ensure_list(data.get('languages', self.languages))
    if creator is None:
        creator = Role.by_id(data.get('creator_id'))
    if creator is not None:
        self.creator = creator
    db.session.add(self)
    db.session.flush()
    if self.creator is not None:
        Permission.grant(self, self.creator, True, True)
def schema_scope(schema, expand=True):
    schemata = set()
    names = ensure_list(schema) or model.schemata.values()
    for schema in names:
        schema = model.get(schema)
        if schema is not None:
            schemata.add(schema)
            if expand:
                schemata.update(schema.descendants)
    for schema in schemata:
        if not schema.abstract:
            yield schema
def update(self):
    """Apply the outcome of the result to the document."""
    doc = self.document
    if self.status == self.STATUS_SUCCESS:
        doc.status = Document.STATUS_SUCCESS
        doc.error_message = None
    else:
        doc.status = Document.STATUS_FAIL
        doc.error_message = stringify(self.error_message)
    schema = model['Document']
    for flag, name in self.SCHEMATA:
        if flag in self.flags:
            schema = model[name]
    doc.schema = schema.name
    doc.foreign_id = safe_string(self.id)
    doc.content_hash = self.checksum or doc.content_hash
    doc.pdf_version = self.pdf_checksum
    doc.title = self.title or doc.meta.get('title')
    doc.file_name = self.file_name or doc.meta.get('file_name')
    doc.file_size = self.size or doc.meta.get('file_size')
    doc.summary = self.summary or doc.meta.get('summary')
    doc.author = self.author or doc.meta.get('author')
    doc.generator = self.generator or doc.meta.get('generator')
    doc.mime_type = self.mime_type or doc.meta.get('mime_type')
    doc.encoding = self.encoding or doc.meta.get('encoding')
    doc.date = self.date or doc.meta.get('date')
    doc.authored_at = self.created_at or doc.meta.get('authored_at')
    doc.modified_at = self.modified_at or doc.meta.get('modified_at')
    doc.published_at = self.published_at or doc.meta.get('published_at')
    doc.message_id = self.message_id or doc.meta.get('message_id')
    doc.in_reply_to = ensure_list(self.in_reply_to)
    doc.columns = list(self.columns.keys())
    doc.body_raw = self.body_html
    doc.body_text = self.body_text
    doc.headers = self.headers
    for kw in self.keywords:
        doc.add_keyword(safe_string(kw))
    for lang in self.languages:
        doc.add_language(safe_string(lang))
    db.session.flush()
    collector = DocumentTagCollector(doc, 'ingestors')
    for entity in self.entities:
        collector.emit(entity, DocumentTag.TYPE_PERSON)
    for email in self.emails:
        collector.emit(email, DocumentTag.TYPE_EMAIL)
    collector.save()
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)
    # This will disable certain security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
def url_for(*a, **kw):
    """Overwrite Flask url_for to force external paths."""
    try:
        kw['_external'] = False
        query = kw.pop('_query', None)
        authorize = kw.pop('_authorize', False)
        relative = kw.pop('_relative', False)
        path = flask_url_for(*a, **kw)
        if authorize is True and hasattr(request, 'authz'):
            token = request.authz.to_token(scope=path)
            query = list(ensure_list(query))
            query.append(('api_key', token))
        return url_external(path, query, relative=relative)
    except RuntimeError:
        return None
def field_filter_query(field, values):
    """Need to define work-around for full-text fields."""
    values = ensure_list(values)
    if not len(values):
        return {'match_all': {}}
    if field in ['_id', 'id']:
        return {'ids': {'values': values}}
    if field in ['names']:
        field = 'fingerprints'
    if len(values) == 1:
        # if field in ['addresses']:
        #     field = '%s.text' % field
        #     return {'match_phrase': {field: values[0]}}
        return {'term': {field: values[0]}}
    return {'terms': {field: values}}
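# Illustrative usage sketch (not part of the original source): assuming
# field_filter_query is importable as defined above, it maps a field and
# value(s) to an Elasticsearch filter clause. Scalars are wrapped by
# ensure_list, empty input falls back to match_all, and the 'names' field
# is transparently rewritten to 'fingerprints'.
assert field_filter_query('countries', None) == {'match_all': {}}
assert field_filter_query('id', ['a', 'b']) == {'ids': {'values': ['a', 'b']}}
assert field_filter_query('names', 'John Smith') == \
    {'term': {'fingerprints': 'John Smith'}}
assert field_filter_query('languages', ['en', 'de']) == \
    {'terms': {'languages': ['en', 'de']}}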
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()
    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)
    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })
    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))
    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))
    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))
    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()
    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
def publish(event, actor_id=None, params=None, channels=None):
    """Publish a notification to the given channels, while storing the
    parameters and initiating actor for the event."""
    assert isinstance(event, Event), event
    params = params or {}
    outparams = {}
    channels = ensure_list(channels)
    channels.append(channel(actor_id, clazz=Role))
    for name, clazz in event.params.items():
        obj = params.get(name)
        outparams[name] = get_entity_id(obj)
        channels.append(channel(obj, clazz=clazz))
    Notification.publish(event, actor_id=actor_id,
                         params=outparams, channels=channels)
    db.session.flush()
def generate_document(document, shallow=False):
    """Generate bulk index actions for all records and the main document."""
    data = document.to_dict()
    data['text'] = ensure_list(data.get('text'))
    total_len = sum((len(t) for t in data['text']))
    if document.supports_records:
        q = db.session.query(DocumentRecord)
        q = q.filter(DocumentRecord.document_id == document.id)
        for idx, record in enumerate(q.yield_per(BULK_PAGE)):
            texts = list(record.texts)
            if total_len < INDEX_MAX_LEN:
                total_len += sum((len(t) for t in texts))
                data['text'].extend(texts)
            record = record.to_dict()
            record['collection_id'] = document.collection_id
            record['created_at'] = document.created_at
            record['updated_at'] = document.updated_at
            record['text'] = texts
            if not shallow:
                entity_id, index, body = index_operation(record)
                yield {
                    '_id': entity_id,
                    '_index': index,
                    '_source': body
                }
            if idx > 0 and idx % 1000 == 0:
                log.info("Indexed [%s]: %s records...", document.id, idx)
    # log.debug("Text length [%s]: %s", document.id, total_len)
    entity_id, index, body = index_operation(data)
    for other in entities_index_list(Document.SCHEMA):
        if other != index:
            yield {
                '_id': entity_id,
                '_index': other,
                '_op_type': 'delete'
            }
    yield {
        '_id': entity_id,
        '_index': index,
        '_source': body
    }
def export_entity_properties(g, uri, entity):
    properties = entity.get('properties', {})
    schema = model.get(entity.get('schema'))
    for name, prop in schema.properties.items():
        for value in ensure_list(properties.get(name)):
            if prop.type_name == 'date':
                obj = date_lit(value)
            elif prop.type_name == 'country':
                obj = country_uri(value)
            elif prop.type_name == 'email':
                obj = email_uri(value)
            elif prop.type_name == 'phone':
                obj = phone_uri(value)
            elif prop.type_name == 'url':
                obj = URIRef(value)
            elif prop.type_name == 'entity':
                obj = entity_uri(value)
            else:
                obj = Literal(value)
            g.add((uri, FTM[name], obj))
def iter_tokens(limit=1000000000):
    """Go through all the names in the index."""
    query = {'_source': {'include': 'names'}}
    index = entities_read_index(schema=Entity.LEGAL_ENTITY)
    seen = 0
    try:
        for res in scan(es, index=index, query=query, scroll='1440m'):
            names = ensure_list(res.get('_source', {}).get('names'))
            tokens = set()
            for name in names:
                tokens.update(name_tokens(name))
            yield from tokens
            seen += 1
            if seen % 1000 == 0:
                log.info("Entities: %s", seen)
            if limit is not None and seen > limit:
                return
    except Exception as ex:
        log.warning("Token iterator aborted: %s", ex)
def parse_for_metadata(context, data, html):
    meta = context.params.get('meta', {})
    meta_date = context.params.get('meta_date', {})
    meta_paths = meta
    meta_paths.update(meta_date)
    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            element = html.find(xpath)
            if element is None:
                continue
            value = collapse_spaces(element.text_content())
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
                break
    return data
def convert_record(record, country=None):
    published_date = clean_date(record.pop('publishedDate', None))
    publisher = record.pop('publisher', {}).get('name')
    if record.get('tag'):
        for entity in convert_item(record, country):
            entity.add('publisher', publisher, quiet=True)
            entity.add('modifiedAt', published_date, quiet=True)
            yield entity
    compiled_release = record.get('compiledRelease', {})
    for entity in convert_item(compiled_release, country):
        entity.add('publisher', publisher, quiet=True)
        entity.add('modifiedAt', published_date, quiet=True)
        yield entity
    for release in ensure_list(record.get('releases', [])):
        for entity in convert_item(release, country):
            entity.add('publisher', publisher, quiet=True)
            entity.add('modifiedAt', published_date, quiet=True)
            yield entity
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text
    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at
    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric
    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
def generate_document(document, shallow=False):
    """Generate bulk index actions for all records and the main document."""
    data = document.to_dict()
    data['text'] = ensure_list(data.get('text'))
    total_len = sum((len(t) for t in data['text']))
    if document.supports_records:
        q = db.session.query(DocumentRecord)
        q = q.filter(DocumentRecord.document_id == document.id)
        for idx, record in enumerate(q.yield_per(BULK_PAGE)):
            texts = list(record.texts)
            if total_len < INDEX_MAX_LEN:
                total_len += sum((len(t) for t in texts))
                data['text'].extend(texts)
            record = record.to_dict()
            record['collection_id'] = document.collection_id
            record['created_at'] = document.created_at
            record['updated_at'] = document.updated_at
            record['text'] = texts
            if not shallow:
                entity_id, index, body = index_operation(record)
                yield {
                    '_id': entity_id,
                    '_index': index,
                    '_type': 'doc',
                    '_source': body
                }
            if idx > 0 and idx % 1000 == 0:
                log.info("Indexed [%s]: %s records...", document.id, idx)
    # log.debug("Text length [%s]: %s", document.id, total_len)
    entity_id, index, body = index_operation(data)
    for other in entities_index_list(Document.SCHEMA):
        if other != index:
            yield {
                '_id': entity_id,
                '_index': other,
                '_type': 'doc',
                '_op_type': 'delete'
            }
    yield {'_id': entity_id, '_index': index, '_type': 'doc', '_source': body}
def update_many(self, rows, keys, chunk_size=1000, ensure=None, types=None):
    """Update many rows in the table at a time.

    This is significantly faster than updating them one by one. Per default
    the rows are processed in chunks of 1000 per commit, unless you specify
    a different ``chunk_size``.

    See :py:meth:`update() <dataset.Table.update>` for details on the other
    parameters.
    """
    keys = ensure_list(keys)
    chunk = []
    columns = []
    for index, row in enumerate(rows):
        chunk.append(row)
        for col in row.keys():
            if col not in columns:
                columns.append(col)

        # bindparam requires names to not conflict (cannot be "id" for id)
        for key in keys:
            row["_%s" % key] = row[key]

        # Update when chunk_size is fulfilled or this is the last row
        if len(chunk) == chunk_size or index == len(rows) - 1:
            cl = [self.table.c[k] == bindparam("_%s" % k) for k in keys]
            stmt = self.table.update(
                whereclause=and_(*cl),
                values={col: bindparam(col, required=False) for col in columns},
            )
            self.db.executable.execute(stmt, chunk)
            chunk = []
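# Illustrative usage sketch (not part of the original source), following the
# dataset library's Table API: rows matching on the ``id`` key are updated in
# bulk, committing in chunks of ``chunk_size``. The table and rows here are
# hypothetical.
import dataset

db = dataset.connect('sqlite:///:memory:')
table = db['people']
table.insert_many([{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bo'}])
table.update_many([{'id': 2, 'name': 'Bob'}], keys=['id'], chunk_size=500)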
def _serialize(self, obj):
    pk = obj.get('id')
    obj['links'] = {
        'self': url_for('collections_api.view', collection_id=pk),
        'xref': url_for('xref_api.index', collection_id=pk),
        'xref_export': url_for('xref_api.export', collection_id=pk,
                               _authorize=obj.get('secret')),
        'reconcile': url_for('reconcile_api.reconcile',
                             collection_id=pk,
                             _authorize=obj.get('secret')),
        'ui': collection_url(pk)
    }
    obj['writeable'] = request.authz.can(pk, request.authz.WRITE)
    creator_id = obj.pop('creator_id', None)
    obj['creator'] = self.resolve(Role, creator_id, RoleSerializer)
    obj['team'] = []
    for role_id in ensure_list(obj.pop('team_id', [])):
        if request.authz.can_read_role(role_id):
            role = self.resolve(Role, role_id, RoleSerializer)
            obj['team'].append(role)
    return obj
def match_company(self, proxy):
    MatchCriteria = self.client.get_type('ns0:MatchCriteria')
    # SelectionResult = self.client.get_type('ns0:SelectionResult')
    countries = list(proxy.countries)
    if len(countries) == 1:
        ct = MatchCriteria(Name=proxy.caption, Country=countries[0])
    else:
        ct = MatchCriteria(Name=proxy.caption)
    data = self.cache.get(ct)
    if data is None:
        try:
            res = self.service.Match(self.session, ct, ['None'])
            data = zeep.helpers.serialize_object(res)
            # pprint(data)
            data = json.loads(json.dumps(data))
            self.cache.store(ct, data)
        except Exception:
            log.exception("Orbis match call error.")
            self.close()
    return ensure_list(data)
def export_entity(entity, collection_uri):
    g = Graph()
    uri = registry.entity.rdf(entity.get('id'))
    g.add((uri, DCTERMS.isPartOf, collection_uri))
    g.add((collection_uri, DCTERMS.hasPart, uri))
    if 'properties' not in entity:
        entity.update(Document.doc_data_to_schema(entity))
    schema = model.get(entity.get('schema'))
    for schema_ in schema.schemata:
        g.add((uri, RDF.type, schema_.uri))
    properties = entity.get('properties', {})
    for name, prop in schema.properties.items():
        for value in ensure_list(properties.get(name)):
            obj = prop.type.rdf(value)
            g.add((uri, prop.uri, obj))
    if entity.get('name'):
        g.add((uri, SKOS.prefLabel, Literal(entity.get('name'))))
    return g
def _index_form(collection, matches):
    now = datetime.utcnow().isoformat()
    for (score, entity, match_collection_id, match) in matches:
        xref_id = hash_data((entity.id, collection.id, match.id))
        text = ensure_list(entity.get_type_values(registry.name))
        text.extend(match.get_type_values(registry.name))
        yield {
            "_id": xref_id,
            "_index": xref_index(),
            "_source": {
                "score": score,
                "entity_id": entity.id,
                "collection_id": collection.id,
                "match_id": match.id,
                "match_collection_id": match_collection_id,
                "countries": match.get_type_values(registry.country),
                "schema": match.schema.name,
                "text": text,
                "created_at": now,
            },
        }
def generate(self):
    for parent in ensure_list(self.data.get('extends')):
        parent = self.model.get(parent)
        parent.generate()
        for name, prop in parent.properties.items():
            if name not in self.properties:
                self.properties[name] = prop
        self.extends.add(parent)
        for ancestor in parent.schemata:
            self.schemata.add(ancestor)
            self.names.add(ancestor.name)
            ancestor.descendants.add(self)
    for prop in self.properties.values():
        prop.generate()
    for featured in self.featured:
        if self.get(featured) is None:
            raise InvalidModel("Missing featured property: %s" % featured)
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field, func in (("created_at", min), ("updated_at", max)):
            ts = func(ensure_list(data.get(field)), default=None)
            dt = registry.date.to_datetime(ts)
            if dt is not None:
                entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        yield entity.id
    writer.flush()
def generate(self): """While loading the schema, this function will validate and fully load the hierarchy, properties and flags of the definition.""" for parent in ensure_list(self.data.get("extends")): parent = self.model.get(parent) parent.generate() for name, prop in parent.properties.items(): if name not in self.properties: self.properties[name] = prop self.extends.add(parent) for ancestor in parent.schemata: self.schemata.add(ancestor) self.names.add(ancestor.name) ancestor.descendants.add(self) for prop in list(self.properties.values()): prop.generate() for featured in self.featured: if self.get(featured) is None: raise InvalidModel("Missing featured property: %s" % featured) for caption in self.caption: if self.get(caption) is None: raise InvalidModel("Missing caption property: %s" % caption) for required in self.required: if self.get(required) is None: raise InvalidModel("Missing required property: %s" % required) if self.edge: if self.source_prop is None: msg = "Missing edge source: %s" % self.edge_source raise InvalidModel(msg) if self.target_prop is None: msg = "Missing edge target: %s" % self.edge_target raise InvalidModel(msg)
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            if cached:
                _cache_entity(entity)
            yield entity
def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/XrefGenerate'
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    data = parse_request('XrefGenerate')
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()
    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)
    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })
    required = []
    scoring = []
    for (prop, value) in proxy.itervalues():
        queries = list(_make_queries(prop, value))
        if prop.type in REQUIRED:
            required.extend(queries)
        else:
            scoring.extend(queries)
    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()
    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
def search(
    self,
    query: str,
    schema: Optional[str] = None,
    schemata: Optional[str] = None,
    filters: Optional[List] = None,
    publisher: bool = False,
    params: Optional[Mapping[str, Any]] = None,
) -> "EntityResultSet":
    """Conduct a search and return the search results."""
    filters_list: List = ensure_list(filters)
    if schema is not None:
        filters_list.append(("schema", schema))
    if schemata is not None:
        filters_list.append(("schemata", schemata))
    if schema is None and schemata is None:
        filters_list.append(("schemata", "Thing"))
    url = self._make_url("entities", query=query,
                         filters=filters_list, params=params)
    return EntityResultSet(self, url, publisher)
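# Illustrative usage sketch (not part of the original source), assuming the
# method above belongs to an alephclient-style AlephAPI client. The host and
# API key are hypothetical placeholders and a live Aleph instance would be
# needed for this to return results.
from alephclient.api import AlephAPI

api = AlephAPI(host="https://aleph.example.org", api_key="hypothetical-key")
for entity in api.search("john smith", schema="Person"):
    print(entity.get("id"), entity.get("properties", {}).get("name"))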
def index_operation(data):
    """Apply final denormalisations to the index."""
    data['bulk'] = data.get('bulk', False)
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]
    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts
    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')
    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data
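# Illustrative usage sketch (not part of the original source): the returned
# triple can be turned into an Elasticsearch bulk action, as the
# generate_document() helpers above do. The input dict here is hypothetical.
entity_id, index, body = index_operation({
    'id': 'deadbeef',
    'schema': 'Person',
    'names': ['John Smith'],
    'properties': {'indexText': ['some extracted text']},
    'updated_at': '2020-01-01T00:00:00',
})
action = {'_id': entity_id, '_index': index, '_source': body}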
def set_entity_properties(g, uri, entity):
    if entity.get('name'):
        g.add((uri, SKOS.prefLabel, Literal(entity.get('name'))))
    for name in entity.get('names', []):
        g.add((uri, RDFS.label, Literal(name)))
    for country in entity.get('countries', []):
        if len(country) != 2:
            continue
        g.add((uri, ALEPH.country, country_uri(country)))
    for phone in entity.get('phones', []):
        g.add((uri, ALEPH.phone, tel_uri(phone)))
    for email in entity.get('emails', []):
        g.add((uri, ALEPH.email, email_uri(email)))
    properties = entity.get('properties', {})
    for name, values in properties.items():
        pred = FTM[name]
        for value in ensure_list(values):
            obj = typed_object(name, value)
            g.add((uri, pred, obj))
def _serialize(self, obj):
    pk = obj.get("id")
    authz = request.authz if obj.get("secret") else None
    obj["links"] = {
        "self": url_for("collections_api.view", collection_id=pk),
        "xref_export": url_for("xref_api.export", collection_id=pk,
                               _authz=authz),
        "reconcile": url_for("reconcile_api.reconcile", collection_id=pk),
        "ui": collection_url(pk),
    }
    obj["shallow"] = obj.get("shallow", True)
    obj["writeable"] = request.authz.can(pk, request.authz.WRITE)
    creator_id = obj.pop("creator_id", None)
    obj["creator"] = self.resolve(Role, creator_id, RoleSerializer)
    obj["team"] = []
    for role_id in ensure_list(obj.pop("team_id", [])):
        if request.authz.can_read_role(role_id):
            role = self.resolve(Role, role_id, RoleSerializer)
            obj["team"].append(role)
    return obj
def _create_meta_object(context, data) -> dict:
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))
    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "publisher": data.get("publisher"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": ensure_dict(data.get("headers")),
        "keywords": ensure_list(data.get("keywords")),
    }
    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}
    return meta
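# Illustrative usage sketch (not part of the original source): a crawl
# context and data dict (both hypothetical here) are reduced to the metadata
# mapping submitted alongside a document; note how scalar keywords are
# normalised into a list by ensure_list.
meta = _create_meta_object(context, {
    "url": "https://example.org/report.pdf",
    "title": "Annual report",
    "keywords": "budget",
    "headers": {"Content-Type": "application/pdf"},
})
assert meta["keywords"] == ["budget"]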
def entities_read_index(schema=None, descendants=True, exclude=None):
    """Combined index to run all queries against."""
    if not settings.ENTITIES_INDEX_SPLIT:
        indexes = set(settings.ENTITIES_INDEX_SET)
        indexes.add(settings.ENTITIES_INDEX)
        return ','.join(indexes)
    schemata = set()
    names = ensure_list(schema) or model.schemata.values()
    for schema in names:
        schema = model.get(schema)
        if schema is None:
            continue
        schemata.add(schema)
        if descendants:
            schemata.update(schema.descendants)
    exclude = model.get(exclude)
    indexes = list(settings.ENTITIES_INDEX_SET)
    for schema in schemata:
        if not schema.abstract and schema != exclude:
            indexes.append(schema_index(schema))
    # log.info("Read index: %r", indexes)
    return ','.join(indexes)
def _serialize(self, obj):
    pk = obj.get('id')
    obj['id'] = str(pk)
    schema = model.get(obj.get('schema'))
    if schema is None:
        return None
    properties = obj.get('properties', {})
    for prop in schema.properties.values():
        if prop.type != registry.entity:
            continue
        values = ensure_list(properties.get(prop.name))
        if values:
            properties[prop.name] = []
            for value in values:
                entity = self.resolve(Entity, value, DiagramEntitySerializer)  # noqa
                if entity is None:
                    entity = value
                properties[prop.name].append(entity)
    obj.pop('_index', None)
    collection_id = obj.pop('collection_id', None)
    obj['collection_id'] = str(collection_id)
    return self._clean_response(obj)
def entities_by_ids(
    ids, schemata=None, cached=False, includes=PROXY_INCLUDES, excludes=None
):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    cached = cached and excludes is None and includes == PROXY_INCLUDES
    entities = {}
    if cached:
        keys = [cache.object_key(Entity, i) for i in ids]
        for _, entity in cache.get_many_complex(keys):
            if entity is not None:
                entities[entity.get("id")] = entity
    missing = [i for i in ids if entities.get(i) is None]
    index = entities_read_index(schema=schemata)
    query = {
        "query": {"ids": {"values": missing}},
        "_source": _source_spec(includes, excludes),
        "size": MAX_PAGE,
    }
    result = es.search(index=index, body=query)
    for doc in result.get("hits", {}).get("hits", []):
        entity = unpack_result(doc)
        if entity is not None:
            entity_id = entity.get("id")
            entities[entity_id] = entity
            if cached:
                key = cache.object_key(Entity, entity_id)
                cache.set_complex(key, entity, expires=60 * 60 * 2)
    for i in ids:
        entity = entities.get(i)
        if entity is not None:
            yield entity
def index():
    """Returns a list of diagrams for the role
    ---
    get:
      summary: List diagrams
      parameters:
      - description: The collection id.
        in: query
        name: 'filter:collection_id'
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '200':
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/Diagram'
          description: OK
      tags:
      - Diagram
    """
    parser = QueryParser(request.args, request.authz)
    q = Diagram.by_authz(request.authz)
    collection_ids = ensure_list(parser.filters.get('collection_id'))
    if len(collection_ids):
        q = q.filter(Diagram.collection_id.in_(collection_ids))
    result = DatabaseQueryResult(request, q)
    return DiagramSerializer.jsonify_result(result)
def ingest_component(self, entity, idx, comp):
    if comp.name == "VCALENDAR":
        entity.add("generator", comp.get("PRODID"))
    if comp.name == "VEVENT":
        event = self.manager.make_entity("Event")
        self.manager.apply_context(event, entity)
        uid = sanitize_text(comp.get("UID"))
        if uid is not None:
            event.make_id(uid)
        else:
            event.make_id(entity.id, idx)
        event.add("proof", entity)
        event.add("name", comp.get("SUMMARY"))
        event.add("description", comp.get("DESCRIPTION"))
        event.add("location", comp.get("LOCATION"))
        event.add("sourceUrl", comp.get("URL"))
        event.add("startDate", cal_date(comp.get("DTSTART")))
        event.add("endDate", cal_date(comp.get("DTEND")))
        event.add("date", cal_date(comp.get("CREATED")))
        event.add("modifiedAt", cal_date(comp.get("LAST-MODIFIED")))
        event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
        for attendee in ensure_list(comp.get("ATTENDEE")):
            event.add("involved", self.address_entity(attendee))
        self.manager.emit_entity(event, fragment=idx)
def extract_text(self, data, languages=None):
    key = sha1(data).hexdigest()
    text = Cache.get_cache(key)
    if text is not None:
        log.info('%s chars cached', len(text))
        return text
    data = self.ensure_size(data)
    if data is None:
        return
    for attempt in range(1000):
        try:
            service = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            image = Image(data=data, languages=languages)
            response = service.Recognize(image)
            log.info('%s chars recognized', len(response.text))
            if response.text is not None:
                Cache.set_cache(key, response.text)
            return response.text
        except self.Error as e:
            log.warning("gRPC [%s]: %s", e.code(), e.details())
            backoff(failures=attempt)
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity
def parse_for_metadata(context, data, html):
    meta = context.params.get('meta', {})
    meta_date = context.params.get('meta_date', {})
    meta_paths = meta
    meta_paths.update(meta_date)
    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            # Guard against empty XPath results instead of indexing blindly:
            matches = html.xpath(xpath)
            element = matches[0] if len(matches) else None
            if element is None:
                continue
            try:
                value = collapse_spaces(element.text_content())
            except AttributeError:
                # useful when element is an attribute
                value = collapse_spaces(str(element))
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
                break
    return data
def match(self, roles):
    """See if there's overlap in roles."""
    roles = ensure_list(roles)
    if not len(roles):
        return False
    return len(self.roles.intersection(roles)) > 0
def __init__(self, label, span, countries):
    args = dict(countries=countries)
    label = self.type.clean(label, **args)
    super(TypedResult, self).__init__(label, span, countries)
    self.countries = ensure_list(self.type.country_hint(label))
def _source_spec(includes, excludes):
    includes = ensure_list(includes)
    excludes = ensure_list(excludes)
    if not len(excludes):
        excludes = EXCLUDE_DEFAULT
    return {'includes': includes, 'excludes': excludes}
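# Illustrative usage sketch (not part of the original source): this helper
# builds the `_source` clause used by the search functions above; the
# EXCLUDE_DEFAULT fallback applies only when no explicit excludes are given.
assert _source_spec(['schema', 'properties'], None) == \
    {'includes': ['schema', 'properties'], 'excludes': EXCLUDE_DEFAULT}
assert _source_spec(None, ['text']) == {'includes': [], 'excludes': ['text']}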