def __init__(self, model, data):
    """Initialise the object from a raw ``data`` mapping.

    ``data`` is only ever read through ``.get()``, so every key is optional.
    """
    self.model = model
    self._data = data

    # Support output from Aleph's linkage API (profile_id): 'profile_id' is
    # consulted only when the 'canonical_id' key is absent. If the result is
    # falsy, fall back to the ID of the embedded 'canonical' object.
    canonical_id = data.get('canonical_id', data.get('profile_id'))
    self.id = canonical_id or get_entity_id(data.get('canonical'))
    self._canonical = None

    # Same fallback for the entity: explicit ID first, embedded object second.
    entity_id = data.get('entity_id')
    self.entity_id = entity_id or get_entity_id(data.get('entity'))
    self._entity = None

    self.decision = data.get('decision')
    self._score = data.get('score')
def __init__(self, model, data):
    """Initialise the object from a raw ``data`` mapping.

    All keys are looked up with ``.get()``; none of them is required.
    """
    self.model = model
    self._data = data

    # Support output from Aleph's linkage API (profile_id): "profile_id" is
    # used only when the "canonical_id" key is missing. A falsy result falls
    # back to the ID of the embedded "canonical" object.
    canonical_id = data.get("canonical_id", data.get("profile_id"))
    self.id = canonical_id or get_entity_id(data.get("canonical"))
    self._canonical = None

    # Explicit entity ID first, embedded "entity" object as the fallback.
    entity_id = data.get("entity_id")
    self.entity_id = entity_id or get_entity_id(data.get("entity"))
    self._entity = None

    self.decision = data.get("decision")
    self._score = data.get("score")
def map(self, proxy, record, entities, **kwargs):
    """Apply this property mapping for ``record`` and add the result to ``proxy``.

    ``kwargs`` are passed on to the type's ``clean()``; entries from
    ``self.data`` override any caller-supplied keys of the same name.
    """
    kwargs.update(self.data)

    if self.entity is not None:
        referenced = entities.get(self.entity)
        if referenced is not None:
            proxy.add(self.prop, get_entity_id(referenced))
            # This is really bad in theory, but really useful in practice:
            # copy the referenced entity's caption values into the proxy's
            # 'indexText' property when the proxy schema has one.
            index_text = proxy.schema.get('indexText')
            if index_text is not None:
                for caption in referenced.schema.caption:
                    proxy.add(index_text, referenced.get(caption))

    # Clean the values returned by the query, or by using literals, or
    # formats; values that clean to None are dropped.
    cleaned = (self.type.clean(raw, **kwargs)
               for raw in self.record_values(record))
    values = [value for value in cleaned if value is not None]

    # Joining collapses everything into a single value ...
    if self.join is not None:
        values = [self.join.join(values)]

    # ... and splitting then expands each value on the separator.
    if self.split is not None:
        values = [part
                  for value in values
                  for part in value.split(self.split)]

    proxy.add(self.prop, values)
def map(self, record, entities, **kwargs):
    """Compute the list of values this property mapping yields for ``record``.

    When the mapping refers to another entity, the result is that entity's
    ID wrapped by ``ensure_list``. Otherwise the record values are cleaned,
    optionally joined and split, and returned through ``unique_list``.
    """
    kwargs.update(self.data)

    if self.entity is not None:
        referenced = entities.get(self.entity)
        return ensure_list(get_entity_id(referenced))

    # Clean the values returned by the query, or by using literals, or
    # formats; values that clean to None are dropped.
    cleaned = (self.type.clean(raw, **kwargs)
               for raw in self.record_values(record))
    values = [value for value in cleaned if value is not None]

    if self.join is not None:
        values = [self.join.join(values)]

    if self.split is not None:
        values = [part
                  for value in values
                  for part in value.split(self.split)]

    return unique_list(values)
def map(self, proxy, record, entities):
    """Apply this property mapping for ``record`` and add the result to ``proxy``.

    The type's ``clean()`` receives the proxy plus every entry of
    ``self.data`` as keyword arguments.
    """
    if self.entity is not None:
        referenced = entities.get(self.entity)
        if referenced is not None:
            proxy.add(self.prop, get_entity_id(referenced))
            inline_names(proxy, referenced)

    # Clean the values returned by the query, or by using literals, or
    # formats; values that clean to None are dropped.
    cleaned = (self.type.clean(raw, proxy=proxy, **self.data)
               for raw in self.record_values(record))
    values = [value for value in cleaned if value is not None]

    # Joining collapses everything into a single value ...
    if self.join is not None:
        values = [self.join.join(values)]

    # ... and splitting then expands each value on the separator.
    if self.split is not None:
        values = [part
                  for value in values
                  for part in value.split(self.split)]

    proxy.add(self.prop, values)
def clean(self, text, **kwargs):
    """Return ``text`` as an entity ID string, or ``None`` if it is not one.

    The value is resolved with ``get_entity_id``, converted to ``str`` and
    accepted only when ``self.REGEX`` matches it. Extra keyword arguments
    are accepted and ignored.
    """
    resolved = get_entity_id(text)
    if resolved is None:
        return None
    candidate = str(resolved)
    if self.REGEX.match(candidate) is None:
        return None
    return candidate
def resolve(self, subject):
    """Map an entity or entity ID to the canonical ID to use from now on.

    If the ID belongs to a known cluster, that cluster's ``id`` is
    returned; otherwise the ID itself is returned unchanged.
    """
    entity_id = get_entity_id(subject)
    cluster = self.clusters.get(entity_id)
    return entity_id if cluster is None else cluster.id
def channel(obj, clazz=None):
    """Build the channel name for ``obj``.

    ``clazz`` defaults to ``type(obj)``. When the effective class is ``str``
    the object is returned untouched. Otherwise the result is
    ``"<ClassName>:<id>"``, or ``None`` when ``get_entity_id`` yields no ID.
    """
    kind = clazz or type(obj)
    if kind == str:
        # Strings are taken to be ready-made channel names.
        return obj
    entity_id = get_entity_id(obj)
    if entity_id is None:
        return None
    return '%s:%s' % (kind.__name__, entity_id)
def channel(obj, clazz=None):
    """Return the channel identifier for ``obj``.

    The class used for naming is ``clazz`` or, failing that, ``type(obj)``.
    A ``str`` class short-circuits and hands ``obj`` back as-is. For any
    other class the ID from ``get_entity_id`` is formatted as
    ``"<ClassName>:<id>"``; a missing ID gives ``None``.
    """
    effective_class = clazz or type(obj)
    if effective_class == str:
        return obj
    object_id = get_entity_id(obj)
    if object_id is not None:
        return '%s:%s' % (effective_class.__name__, object_id)
    return None
def add(self, subject, canonical):
    """Record that ``subject`` and ``canonical`` belong to the same cluster.

    Both arguments are resolved with ``get_entity_id`` and reduced to the
    first element returned by ``Namespace.parse``. Nothing happens when
    either ID is missing or both are equal.

    Fix: previously, if ``canonical`` already had a cluster and ``subject``
    had none, ``subject`` was never added to that cluster nor registered in
    ``self.clusters``, so the link was silently dropped.
    """
    subject, _ = Namespace.parse(get_entity_id(subject))
    canonical, _ = Namespace.parse(get_entity_id(canonical))

    # Don't do no-ops.
    if subject == canonical:
        return
    if subject is None or canonical is None:
        return

    cluster = self.clusters.get(canonical)
    if cluster is None:
        cluster = Cluster(canonical, subject)
    else:
        # Make sure a pre-existing cluster also absorbs the new subject.
        # NOTE(review): assumes Cluster.update(iterable) adds the given IDs
        # to cluster.entities, as the merge just below relies on - confirm.
        cluster.update([subject])

    # Merge whatever cluster the subject was part of before.
    previous = self.clusters.get(subject)
    if previous is not None:
        cluster.update(previous.entities)

    # Point every member at the (possibly merged) cluster.
    for entity in cluster.entities:
        self.clusters[entity] = cluster
def clean(
    self,
    raw: Any,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Resolve ``raw`` to an entity ID and clean it as text.

    Returns ``None`` when ``get_entity_id`` finds no ID; otherwise the
    result of ``self.clean_text`` called with the same options.
    """
    entity_id = get_entity_id(raw)
    if entity_id is not None:
        return self.clean_text(
            entity_id,
            fuzzy=fuzzy,
            format=format,
            proxy=proxy,
        )
    return None
def apply(self, proxy, shallow=False):
    """Return a clone of ``proxy`` whose IDs are limited to the namespace.

    The clone's own ID is always signed. Unless ``shallow`` is true, every
    value of every entity-typed property is signed as well.
    """
    signed = proxy.clone()
    signed.id = self.sign(proxy.id)
    if shallow:
        return signed

    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        # Take the current values off the clone and re-add them signed.
        for value in signed.pop(prop):
            signed.add(prop, self.sign(get_entity_id(value)))
    return signed
def save(cls, session, subject, candidate, score=None, judgement=None, priority=None):
    """Create or update the record for a ``subject``/``candidate`` pair.

    An existing record is looked up with ``cls.by_id``; if there is none, a
    new one is created and its id, subject and candidate are filled in.
    ``score`` and ``judgement`` are only written when they are not ``None``.
    The record is added to ``session`` and returned.
    """
    record = cls.by_id(session, subject, candidate)
    if record is None:
        record = cls()
        record.id = cls.make_id(subject, candidate)
        record.subject, _ns = Namespace.parse(get_entity_id(subject))
        record.candidate, _ns = Namespace.parse(get_entity_id(candidate))

    # A falsy priority (None, 0) falls back to the default.
    effective_priority = priority or DEFAULT_PRIORITY
    if score is not None:
        record.score = score
        record.priority = score * effective_priority

    if judgement is not None:
        record.judgement = judgement

    record.updated_at = now()
    session.add(record)
    return record
def __init__(self, subject, prop, value, weight=1.0, inverted=False, inferred=False):
    """Store the given fields on the instance.

    ``value`` is passed through ``get_entity_id`` before being stored; all
    other arguments are kept as given.
    """
    self.subject, self.prop = subject, prop
    self.value = get_entity_id(value)
    self.weight, self.inverted, self.inferred = weight, inverted, inferred
def index_notification(event, actor_id, params, channels, sync=False):
    """Index a notification.

    Only the parameter names declared in ``event.params`` are kept; each
    value is reduced with ``get_entity_id``. ``None`` channels are dropped
    and duplicates removed. Returns the result of ``index_safe``.
    """
    supplied = params or {}
    params = {}
    for name in event.params:
        params[name] = get_entity_id(supplied.get(name))

    channels = list(set(c for c in channels if c is not None))
    document = {
        'actor_id': actor_id,
        'params': params,
        'event': event.name,
        'channels': channels,
        'created_at': datetime.utcnow(),
    }
    index = notifications_index()
    # The document ID is derived from the notification's content.
    doc_id = hash_data((actor_id, event.name, channels, params))
    return index_safe(index, doc_id, document, refresh=refresh_sync(sync))
def validate(self, data):
    """Validate that the data should be stored.

    Since the types system doesn't really have validation, this currently
    tries to normalize the value to see if it passes strict parsing.

    Each value in ``data`` is reduced with ``get_entity_id`` and checked by
    ``self.type.validate``. Returns a ``gettext`` message for the first
    problem found, or ``None`` when nothing is wrong (including when
    ``data`` is empty).
    """
    for val in data:
        # The stub check stays inside the loop: with no values to write,
        # a stub property is not reported as an error.
        if self.stub:
            return gettext('Property cannot be written')
        if not self.type.validate(get_entity_id(val)):
            return gettext('Invalid value')
    return None
def publish(event, actor_id=None, params=None, channels=None):
    """Publish a notification for ``event`` to the given channels.

    The initiating actor and the event parameters are stored with the
    notification. Only parameter names declared in ``event.params`` are
    kept, each reduced with ``get_entity_id``; channels are normalised with
    ``channel_tag``. The database session is flushed afterwards.
    """
    assert isinstance(event, Event), event
    supplied = params or {}
    channels = [channel_tag(c) for c in ensure_list(channels)]
    outparams = {
        name: get_entity_id(supplied.get(name))
        for name in event.params
    }
    Notification.publish(
        event,
        actor_id=actor_id,
        params=outparams,
        channels=channels,
    )
    db.session.flush()
def add(self, subject, canonical):
    """Link ``subject`` to ``canonical`` in ``self.linkages``.

    Both arguments are reduced with ``get_entity_id``. Nothing happens when
    they are equal or either is ``None``. Existing linkages that pointed at
    ``subject`` are redirected to the resolved target as well.
    """
    subject = get_entity_id(subject)
    canonical = get_entity_id(canonical)

    # Don't do no-ops.
    if subject == canonical or subject is None or canonical is None:
        return

    target = self.resolve(canonical)
    if target == subject:
        # Circular dependencies: the larger ID becomes the target and the
        # smaller one the subject.
        target, subject = max(subject, canonical), min(subject, canonical)

    # Find existing references
    sources = [subject]
    sources.extend(src for src, dst in self.linkages.items() if dst == subject)

    for source in sources:
        if source != target:
            self.linkages[source] = target
def _normalize_data(data):
    """Turn entities in properties into entity ids.

    Walks ``data['layout']['entities']`` and, for every entity-typed
    property, replaces the values with what ``get_entity_id`` returns for
    them. ``data`` is modified in place and also returned.

    Raises:
        InvalidData: if an entity names an unknown schema, or carries a
            property its schema does not define (previously the latter
            surfaced as an AttributeError on ``None``).
    """
    entities = data['layout']['entities']
    for obj in entities:
        schema = model.get(obj.get('schema'))
        if schema is None:
            raise InvalidData("Invalid schema %s" % obj.get('schema'))
        properties = obj.get('properties', {})
        # Iterate over a snapshot: keys may be (re)assigned in the loop.
        for name, values in list(properties.items()):
            prop = schema.get(name)
            if prop is None:
                raise InvalidData("Invalid property %s" % name)
            if prop.type == registry.entity:
                properties[prop.name] = [
                    get_entity_id(value) for value in ensure_list(values)
                ]
    return data
def apply(self, proxy):
    """Return a clone of ``proxy`` whose IDs are limited to the namespace.

    The clone's ID and all values of entity-typed properties are signed.
    An exception is made for sameAs declarations: the clone's own signed ID
    is removed from its 'sameAs' property at the end.
    """
    signed = proxy.clone()
    signed.id = self.sign(proxy.id)

    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        # Take the current values off the clone and re-add them signed.
        for value in signed.pop(prop):
            signed.add(prop, self.sign(get_entity_id(value)))

    signed.remove('sameAs', signed.id)
    return signed
def publish(event, actor_id=None, params=None, channels=None):
    """Publish a notification for ``event`` to the given channels.

    The initiating actor and the event parameters are stored with the
    notification. In addition to the supplied channels, the notification
    goes to the actor's channel and to the channel of every parameter
    object declared in ``event.params``. The session is flushed afterwards.
    """
    assert isinstance(event, Event), event
    supplied = params or {}
    targets = ensure_list(channels)
    targets.append(channel(actor_id, clazz=Role))

    outparams = {}
    for name, clazz in event.params.items():
        value = supplied.get(name)
        outparams[name] = get_entity_id(value)
        targets.append(channel(value, clazz=clazz))

    Notification.publish(
        event,
        actor_id=actor_id,
        params=outparams,
        channels=targets,
    )
    db.session.flush()
def index_notification(event, actor_id, params, channels, sync=False):
    """Index a notification.

    Parameter values are reduced with ``get_entity_id`` and stored as
    strings; parameters without an ID are left out of the document.
    ``None`` channels are dropped and duplicates removed. The document ID
    is hashed from the original ``params``, not the stringified ones.
    """
    params = params or {}
    indexed_params = {}
    for name, raw in params.items():
        entity_id = get_entity_id(raw)
        if entity_id is None:
            continue
        indexed_params[name] = str(entity_id)

    channels = list(set(c for c in channels if c is not None))
    document = {
        "actor_id": actor_id,
        "params": indexed_params,
        "event": event.name,
        "channels": channels,
        "created_at": datetime.utcnow(),
    }
    index = notifications_index()
    doc_id = hash_data((actor_id, event.name, channels, params))
    return index_safe(index, doc_id, document, sync=sync)
def resolve(self, subject):
    """Map an entity or entity ID to the canonical ID to use from now on.

    IDs without an entry in ``self.linkages`` resolve to themselves.
    """
    entity_id = get_entity_id(subject)
    return self.linkages.get(entity_id, entity_id)
def has(self, subject):
    """Tell whether the ID of ``subject`` has an entry in ``self.linkages``."""
    entity_id = get_entity_id(subject)
    return entity_id in self.linkages
def clean(self, text, **kwargs):
    """Return the entity ID of ``text`` if it validates, else ``None``.

    Extra keyword arguments are accepted and ignored.
    """
    entity_id = get_entity_id(text)
    if not self.validate(entity_id):
        return None
    return entity_id
def __init__(self, subject, canonical, judgement):
    """Store the IDs of ``subject`` and ``canonical`` plus a judgement.

    A falsy ``judgement`` is replaced by ``self.UNSURE``.
    """
    subject_id = get_entity_id(subject)
    canonical_id = get_entity_id(canonical)
    self.subject, self.canonical = subject_id, canonical_id
    self.judgement = judgement if judgement else self.UNSURE
def entity(self, entity):
    """Remember ``entity`` and keep ``entity_id`` in sync with it."""
    self._entity = entity
    resolved_id = get_entity_id(entity)
    self.entity_id = resolved_id
def canonical(self, entity):
    """Remember ``entity`` as the canonical one and keep ``id`` in sync."""
    self._canonical = entity
    resolved_id = get_entity_id(entity)
    self.id = resolved_id
def clean(self, text, **kwargs):
    """Return whatever ``get_entity_id`` yields for ``text``.

    Extra keyword arguments are accepted and ignored.
    """
    entity_id = get_entity_id(text)
    return entity_id
def make_id(cls, subject, candidate):
    """Build the record ID for a ``subject``/``candidate`` pair.

    Each argument is reduced with ``get_entity_id`` and then to the first
    element returned by ``Namespace.parse``; the two parts are joined with
    a dot.
    """
    subject_part, _ns = Namespace.parse(get_entity_id(subject))
    candidate_part, _ns = Namespace.parse(get_entity_id(candidate))
    parts = (subject_part, candidate_part)
    return '.'.join(parts)
def __init__(self, type_, value):
    """Store the type, the ID of ``value`` and the matching RDF term.

    ``uri`` is whatever the type's ``rdf()`` returns for the stored value.
    """
    self.type = type_
    entity_id = get_entity_id(value)
    self.value = entity_id
    self.uri = self.type.rdf(self.value)
def has(self, subject):
    """Tell whether the ID of ``subject`` has an entry in ``self.clusters``."""
    entity_id = get_entity_id(subject)
    return entity_id in self.clusters