class Link(db.Model, UuidModel, SoftDeleteModel):
    type = db.Column(db.String(255), index=True)
    source_id = db.Column(db.String(254), index=True)
    target_id = db.Column(db.String(254), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('links', lazy='dynamic'))

    @property
    def schema(self):
        return schemata.get(self.type)

    def to_dict(self):
        data = super(Link, self).to_dict()
        data.update({
            'schema': self.type,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def __repr__(self):
        return '<Link(%r, %r, %r)>' % (self.id, self.source_id, self.target_id)

class Reference(db.Model, IdModel, DatedModel):
    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)
    entity = db.relationship('Entity',
                             backref=db.backref('references', lazy='dynamic'))
    document = db.relationship('Document',
                               backref=db.backref('references', lazy='dynamic'))

    def to_dict(self):
        return {
            'entity': {
                'id': self.entity.id,
                'name': self.entity.name,
                '$schema': self.entity.type
            },
            'weight': self.weight,
            'origin': self.origin
        }

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)

class Match(db.Model, IdModel, DatedModel):
    entity_id = db.Column(db.String(64))
    document_id = db.Column(db.BigInteger())
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    match_id = db.Column(db.String(64))
    match_collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                                    index=True)
    score = db.Column(db.Float(), nullable=True)

    @classmethod
    def find_by_collection(cls, collection_id, other_id):
        q = Match.all()
        q = q.filter(Match.collection_id == collection_id)
        q = q.filter(Match.document_id == None)  # noqa
        q = q.filter(Match.match_collection_id == other_id)
        q = q.order_by(Match.score.desc())
        q = q.order_by(Match.id)
        return q

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        q = db.session.query(cls)
        q = q.filter(or_(cls.collection_id == collection_id,
                         cls.match_collection_id == collection_id))
        q.delete(synchronize_session=False)

    @classmethod
    def group_by_collection(cls, collection_id, authz=None):
        from aleph.model import Collection, Permission
        cnt = func.count(Match.id).label('matches')
        parent = Match.collection_id.label('parent')
        coll = aliased(Collection, name='collection')
        q = db.session.query(cnt, parent)
        q = q.filter(Match.collection_id == collection_id)
        q = q.filter(Match.document_id == None)  # noqa
        q = q.filter(Match.match_collection_id != collection_id)
        q = q.join(coll, Match.match_collection_id == coll.id)
        q = q.filter(coll.deleted_at == None)  # noqa
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       Match.match_collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        q = q.add_entity(coll)
        q = q.group_by(coll, parent)
        q = q.order_by(cnt.desc())
        q = q.order_by(parent.asc())
        return q

    def __repr__(self):
        return 'Match(%r, %r, %r, %r)' % (self.entity_id, self.document_id,
                                          self.match_id, self.score)

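A minimal usage sketch of the cross-reference queries above, assuming a Flask application context and two existing collection ids; the variable and helper names are illustrative only, not part of the class.

# Illustrative only: assumes an app context and that collections with the
# given ids exist in the database.
def summarize_xref(collection_id, other_id):
    # Highest-scoring entity/entity matches between the two collections.
    best = Match.find_by_collection(collection_id, other_id).limit(10).all()
    for match in best:
        print(match.entity_id, match.match_id, match.score)
    # Per-collection match counts; each row is (matches, parent, collection).
    for matches, parent, collection in Match.group_by_collection(collection_id):
        print(parent, collection.id, matches)
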
class EntityIdentity(db.Model, IdModel, DatedModel):
    CONFIRMED = 1
    REJECTED = 2
    UNDECIDED = 3
    JUDGEMENTS = [1, 2, 3]

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship('Entity',
                             backref=db.backref('identities', lazy='dynamic'))
    match_id = db.Column(db.String(254), index=True, nullable=False)
    judgement = db.Column(db.Integer(), nullable=False)
    judge_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def judgements_by_entity(cls, entity_id):
        q = db.session.query(cls.match_id, cls.judgement)
        q = q.filter(cls.entity_id == entity_id)
        return {k: v for k, v in q.all()}

    @classmethod
    def entity_ids(cls, entity_id):
        q = db.session.query(cls.match_id)
        q = q.filter(cls.entity_id == entity_id)
        q = q.filter(cls.judgement == cls.CONFIRMED)
        ids = [entity_id]
        for mapped_id, in q.all():
            ids.append(mapped_id)
        return ids

    @classmethod
    def by_entity_match(cls, entity_id, match_id):
        q = db.session.query(cls)
        q = q.filter(cls.entity_id == entity_id)
        q = q.filter(cls.match_id == match_id)
        return q.first()

    @classmethod
    def save(cls, entity_id, match_id, judgement, judge=None):
        obj = cls.by_entity_match(entity_id, match_id)
        if obj is None:
            obj = cls()
            obj.entity_id = entity_id
            obj.match_id = match_id
        obj.judgement = judgement
        obj.judge = judge
        db.session.add(obj)
        return obj

    def __repr__(self):
        return 'EntityIdentity(%r, %r, %r)' % (self.entity_id, self.match_id,
                                               self.judgement)

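A minimal sketch of recording a de-duplication decision with the class above; the entity ids and the `role` object are placeholders and a database session is assumed.

# Illustrative only: mark two entity ids as confirmed duplicates.
identity = EntityIdentity.save('entity-a', 'entity-b',
                               EntityIdentity.CONFIRMED, judge=role)
db.session.commit()
# All ids that should now be treated as the same entity:
assert 'entity-b' in EntityIdentity.entity_ids('entity-a')
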
class Notification(db.Model, IdModel, DatedModel):
    GLOBAL = 'Global'

    _event = db.Column('event', db.String(255), nullable=False)
    channels = db.Column(ARRAY(db.String(255)), index=True)
    params = db.Column(JSONB)
    actor_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    actor = db.relationship(Role)

    @hybrid_property
    def event(self):
        return Events.get(self._event)

    @event.setter
    def event(self, event):
        self._event = event.name

    def iterparams(self):
        if self.actor_id is not None:
            yield 'actor', Role, self.actor_id
        if self.event is None:
            return
        for name, clazz in self.event.params.items():
            value = self.params.get(name)
            if value is not None:
                yield name, clazz, value

    @classmethod
    def publish(cls, event, actor_id=None, channels=[], params={}):
        notf = cls()
        notf.event = event
        notf.actor_id = actor_id
        notf.params = params
        notf.channels = list(set([c for c in channels if c is not None]))
        db.session.add(notf)
        return notf

    @classmethod
    def by_role(cls, role):
        sq = db.session.query(Subscription.channel)
        sq = sq.filter(Subscription.deleted_at == None)  # noqa
        sq = sq.filter(Subscription.role_id == role.id)
        sq = sq.cte('sq')
        q = cls.all()
        q = q.filter(or_(
            cls.actor_id != role.id,
            cls.actor_id == None  # noqa
        ))
        q = q.filter(cls.channels.any(sq.c.channel))
        q = q.filter(cls._event.in_(Events.names()))
        q = q.order_by(cls.created_at.desc())
        q = q.order_by(cls.id.desc())
        return q

class EntityOtherName(db.Model, EntityDetails):
    _schema = '/entity/other_name.json#'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship(
        'Entity',
        primaryjoin="and_(Entity.id == foreign(EntityOtherName.entity_id), "
                    "EntityOtherName.deleted_at == None)",  # noqa
        backref=db.backref('other_names', lazy='dynamic',
                           cascade='all, delete-orphan'))
    name = db.Column(db.Unicode)
    note = db.Column(db.Unicode)
    family_name = db.Column(db.Unicode)
    given_name = db.Column(db.Unicode)
    additional_name = db.Column(db.Unicode)
    honorific_prefix = db.Column(db.Unicode)
    honorific_suffix = db.Column(db.Unicode)
    patronymic_name = db.Column(db.Unicode)
    start_date = db.Column(db.DateTime)
    end_date = db.Column(db.DateTime)

    @property
    def display_name(self):
        if self.name is not None:
            return self.name
        return ''

    @property
    def terms(self):
        return [self.display_name]

    def to_dict(self):
        data = super(EntityOtherName, self).to_dict()
        data['display_name'] = self.display_name
        return data

class UuidModel(object):
    id = db.Column(db.String(32), primary_key=True, default=make_textid,
                   nullable=False, unique=False)

    def to_dict(self):
        parent = super(UuidModel, self)
        data = parent.to_dict() if hasattr(parent, 'to_dict') else {}
        data['id'] = self.id
        return data

class Reference(db.Model, IdModel, DatedModel):
    id = db.Column(db.Integer(), primary_key=True)
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship('Document',
                               backref=db.backref('references', lazy='dynamic'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship('Entity',
                             backref=db.backref('references', lazy='dynamic'))

    @classmethod
    def index_references(cls, document_id):
        """Helper function to get reference data for indexing."""
        # cf. aleph.index.entities.generate_entities()
        from aleph.model.entity import Entity
        q = db.session.query(Reference.entity_id, Entity.collection_id)
        q = q.filter(Reference.document_id == document_id)
        q = q.filter(Entity.id == Reference.entity_id)
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        return q.all()

    def to_dict(self):
        return {
            'entity': {
                'id': self.entity.id,
                'name': self.entity.name,
                '$schema': self.entity.type
            },
            'weight': self.weight,
            'origin': self.origin
        }

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)

class EntityIdentifier(db.Model, EntityDetails):
    _schema = '/entity/identifier.json#'
    __tablename__ = 'entity_identifier'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship(
        'Entity',
        primaryjoin="and_(Entity.id == foreign(EntityIdentifier.entity_id), "
                    "EntityIdentifier.deleted_at == None)",  # noqa
        backref=db.backref('identifiers', lazy='dynamic',
                           cascade='all, delete-orphan'))
    identifier = db.Column(db.Unicode)
    scheme = db.Column(db.Unicode)

class EntityBuilding(EntityAsset):
    _schema = '/entity/building.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    building_address_id = db.Column(db.String(32),
                                    db.ForeignKey('entity_address.id'))
    building_address = db.relationship(
        'EntityAddress',
        primaryjoin="and_(EntityAddress.id == foreign(EntityBuilding.building_address_id), "  # noqa
                    "EntityAddress.deleted_at == None)")

class EntityLegalPerson(Entity):
    _schema = '/entity/legal_person.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    image = db.Column(db.Unicode, nullable=True)
    postal_address_id = db.Column(db.String(32),
                                  db.ForeignKey('entity_address.id'))
    postal_address = db.relationship(
        'EntityAddress',
        primaryjoin="and_(EntityAddress.id == foreign(EntityLegalPerson.postal_address_id), "  # noqa
                    "EntityAddress.deleted_at == None)")

class Reference(db.Model, IdModel, DatedModel):
    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)
    entity = db.relationship(Entity,
                             backref=db.backref('references', lazy='dynamic'))
    document = db.relationship(Document,
                               backref=db.backref('references', lazy='dynamic'))

    @classmethod
    def delete_document(cls, document_id, origin=None):
        q = cls.all().filter_by(document_id=document_id)
        if origin is not None:
            q = q.filter_by(origin=origin)
        q.delete(synchronize_session='fetch')

    def __repr__(self):
        return '<Reference(%r, %r)>' % (self.document_id, self.entity_id)

class EntityContactDetail(db.Model, EntityDetails):
    _schema = '/entity/contact_detail.json#'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship(
        'EntityLegalPerson',
        primaryjoin="and_(Entity.id == foreign(EntityContactDetail.entity_id), "
                    "EntityContactDetail.deleted_at == None)",  # noqa
        backref=db.backref('contact_details', lazy='dynamic',
                           cascade='all, delete-orphan'))
    label = db.Column(db.Unicode)
    type = db.Column(db.Unicode)
    note = db.Column(db.Unicode)
    valid_from = db.Column(db.DateTime)
    valid_until = db.Column(db.DateTime)

@classmethod
def by_channels(cls, channels, since=None, exclude_actor_id=None):
    channels = cast(channels, ARRAY(db.String(255)))
    q = cls.all()
    q = q.filter(cls.channels.overlap(channels))
    # q = q.filter(cls.channels.any(channel))
    q = q.filter(cls._event.in_(Events.names()))
    if exclude_actor_id is not None:
        q = q.filter(cls.actor_id != exclude_actor_id)
    if since is not None:
        q = q.filter(cls.created_at >= since)
    q = q.order_by(cls.created_at.desc())
    q = q.order_by(cls.id.desc())
    return q

class EntityPerson(EntityLegalPerson):
    _schema = '/entity/person.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    gender = db.Column(db.Unicode, nullable=True)
    birth_date = db.Column(db.Unicode, nullable=True)
    death_date = db.Column(db.Unicode, nullable=True)
    residential_address_id = db.Column(db.String(32),
                                       db.ForeignKey('entity_address.id'))
    residential_address = db.relationship(
        'EntityAddress',
        primaryjoin="and_(EntityAddress.id == foreign(EntityPerson.residential_address_id), "  # noqa
                    "EntityAddress.deleted_at == None)")

class EntityOrganization(EntityLegalPerson):
    _schema = '/entity/organization.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    classification = db.Column(db.Unicode, nullable=True)
    founding_date = db.Column(db.Unicode, nullable=True)
    dissolution_date = db.Column(db.Unicode, nullable=True)
    current_status = db.Column(db.Unicode, nullable=True)

    registered_address_id = db.Column(db.String(32),
                                      db.ForeignKey('entity_address.id'))
    registered_address = db.relationship(
        'EntityAddress',
        primaryjoin="and_(EntityAddress.id == foreign(EntityOrganization.registered_address_id), "  # noqa
                    "EntityAddress.deleted_at == None)")

    headquarters_address_id = db.Column(db.String(32),
                                        db.ForeignKey('entity_address.id'))
    headquarters_address = db.relationship(
        'EntityAddress',
        primaryjoin="and_(EntityAddress.id == foreign(EntityOrganization.headquarters_address_id), "  # noqa
                    "EntityAddress.deleted_at == None)")

@classmethod
def by_channels(cls, channels, role, since=None):
    channels = cast(channels, ARRAY(db.String(255)))
    q = cls.all()
    q = q.filter(cls.channels.overlap(channels))
    q = q.filter(cls._event.in_(Events.names()))
    q = q.filter(or_(
        cls.actor_id != role.id,
        cls.actor_id == None  # noqa
    ))
    since = since or role.notified_at
    if since is not None and role.notified_at is not None:
        since = max(since, role.notified_at)
    if since is not None:
        q = q.filter(cls.created_at >= since)
    q = q.order_by(cls.created_at.desc())
    return q

class Subscription(db.Model, IdModel, SoftDeleteModel):
    channel = db.Column(db.String(255), index=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(Role)

    @classmethod
    def find(cls, channel=None, role_id=None, deleted=False):
        q = cls.all(deleted=deleted)
        if channel is not None:
            q = q.filter(cls.channel == channel)
        if role_id is not None:
            q = q.filter(cls.role_id == role_id)
        return q.first()

    @classmethod
    def subscribe(cls, role, channel):
        subscription = cls.find(channel=channel, role_id=role.id)
        if subscription is None:
            subscription = cls()
            subscription.channel = channel
            subscription.role_id = role.id
        subscription.deleted_at = None
        db.session.add(subscription)
        return subscription

    @classmethod
    def unsubscribe(cls, role=None, channel=None, deleted_at=None):
        assert role is not None or channel is not None
        if deleted_at is None:
            deleted_at = datetime.utcnow()
        q = db.session.query(cls)
        if role is not None:
            q = q.filter(cls.role_id == role.id)
        if channel is not None:
            q = q.filter(cls.channel == channel)
        q.update({cls.deleted_at: deleted_at}, synchronize_session=False)

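A minimal sketch of subscribing a role to a channel and later removing the subscription; `role` and the channel string are placeholders, and a database session is assumed.

# Illustrative only: the channel naming convention is an assumption.
channel = 'Collection:123'
Subscription.subscribe(role, channel)
db.session.commit()
# ...later, drop every subscription this role holds on that channel:
Subscription.unsubscribe(role=role, channel=channel)
db.session.commit()
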
class Entity(db.Model, DatedModel):
    THING = "Thing"
    LEGAL_ENTITY = "LegalEntity"

    id = db.Column(
        db.String(ENTITY_ID_LEN),
        primary_key=True,
        default=make_textid,
        nullable=False,
        unique=False,
    )
    schema = db.Column(db.String(255), index=True)
    data = db.Column("data", JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), nullable=True)
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("entities", lazy="dynamic"))

    @property
    def model(self):
        return model.get(self.schema)

    def update(self, data, collection):
        proxy = model.get_proxy(data, cleaned=False)
        proxy = collection.ns.apply(proxy)
        self.id = collection.ns.sign(self.id)
        self.schema = proxy.schema.name
        self.updated_at = datetime.utcnow()
        previous = self.to_proxy()
        for prop in proxy.schema.properties.values():
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            if prop.type == registry.checksum:
                prev = previous.get(prop)
                proxy.set(prop, prev, cleaned=True, quiet=True)
        self.data = proxy.properties
        db.session.add(self)

    def to_proxy(self):
        data = {
            "id": self.id,
            "schema": self.schema,
            "properties": self.data,
            "created_at": iso_text(self.created_at),
            "updated_at": iso_text(self.updated_at),
            "role_id": self.role_id,
            "mutable": True,
        }
        return model.get_proxy(data, cleaned=False)

    @classmethod
    def create(cls, data, collection, role_id=None):
        entity = cls()
        entity_id = data.get("id") or make_textid()
        if not registry.entity.validate(entity_id):
            raise InvalidData(gettext("Invalid entity ID"))
        entity.id = collection.ns.sign(entity_id)
        entity.collection_id = collection.id
        entity.role_id = role_id
        entity.update(data, collection)
        return entity

    @classmethod
    def by_id(cls, entity_id, collection=None):
        q = cls.all().filter(cls.id == entity_id)
        if collection is not None:
            q = q.filter(cls.collection_id == collection.id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id):
        q = cls.all()
        q = q.filter(Entity.collection_id == collection_id)
        q = q.yield_per(5000)
        return q

    @classmethod
    def delete_by_collection(cls, collection_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    def __repr__(self):
        return "<Entity(%r, %r)>" % (self.id, self.schema)

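A minimal sketch of creating a followthemoney entity in a collection with the class above; `collection` and `role` are placeholders for existing objects, and an application context is assumed.

# Illustrative only: property values follow followthemoney conventions.
data = {
    "schema": "Person",
    "properties": {"name": ["Jane Doe"], "nationality": ["us"]},
}
entity = Entity.create(data, collection, role_id=role.id)
db.session.commit()
proxy = entity.to_proxy()  # EntityProxy used for indexing and export
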
class Alert(db.Model, SoftDeleteModel):
    """A subscription to notifications on a given query."""
    __tablename__ = 'alert'

    id = db.Column(db.Integer, primary_key=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    custom_label = db.Column(db.Unicode, nullable=True)
    query_text = db.Column(db.Unicode, nullable=True)
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          nullable=True)
    entity = db.relationship(Entity,
                             backref=db.backref('alerts', lazy='dynamic'))
    notified_at = db.Column(db.DateTime, nullable=True)

    @property
    def label(self):
        if self.custom_label is not None:
            return self.custom_label
        if self.entity:
            return self.entity.name
        return self.query_text

    def delete(self, deleted_at=None):
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def update(self):
        self.notified_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def is_same(self, other):
        if other.role_id == self.role_id:
            if other.entity_id == self.entity_id:
                if other.query_text == self.query_text:
                    return True
        return False

    @classmethod
    def by_id(cls, id, role=None):
        q = cls.all().filter_by(id=id)
        if role is not None:
            q = q.filter(cls.role_id == role.id)
        return q.first()

    @classmethod
    def by_role(cls, role):
        return cls.all().filter(cls.role_id == role.id)

    @classmethod
    def create(cls, data, role):
        validate(data, 'alert.json#')
        alert = cls()
        alert.role_id = role.id
        alert.query_text = data.get('query_text')
        if alert.query_text is not None:
            alert.query_text = alert.query_text.strip()
            alert.query_text = alert.query_text or None
        alert.entity_id = data.get('entity_id') or None
        alert.custom_label = data.get('label')
        alert.update()
        return alert

    @classmethod
    def exists(cls, query, role):
        q = cls.all_ids().filter(cls.role_id == role.id)
        query_text = query.get('q')
        if query_text is not None:
            query_text = query_text.strip()
            if not len(query_text):
                query_text = None
        q = q.filter(cls.query_text == query_text)
        entities = query.getlist('entity')
        if len(entities) == 1:
            q = q.filter(cls.entity_id == entities[0])
        else:
            q = q.filter(cls.entity_id == None)  # noqa
        q = q.limit(1)
        return q.scalar()

    @classmethod
    def dedupe(cls, entity_id):
        alerts = cls.all().filter_by(entity_id=entity_id).all()
        for left in alerts:
            for right in alerts:
                if left.id >= right.id:
                    continue
                if left.is_same(right):
                    left.delete()

    def __repr__(self):
        return '<Alert(%r, %r)>' % (self.id, self.label)

    def to_query(self):
        return MultiDict({
            'q': self.query_text or '',
            'entity': self.entity_id
        })

    def to_dict(self):
        return {
            'id': self.id,
            'label': self.label,
            'role_id': self.role_id,
            'query_text': self.query_text,
            'entity_id': self.entity_id,
            'created_at': self.created_at,
            'notified_at': self.notified_at,
            'updated_at': self.updated_at
        }

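A minimal sketch of creating a query alert and checking whether one already exists for the same query; `role` is a placeholder, the query payload is assumed to satisfy the `alert.json#` schema, and `MultiDict` comes from werkzeug as in `to_query()` above.

# Illustrative only: register an alert on a free-text search query.
alert = Alert.create({'query_text': 'offshore leaks'}, role)
db.session.commit()
existing_id = Alert.exists(MultiDict({'q': 'offshore leaks'}), role)
assert existing_id == alert.id
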
class Entity(db.Model, UuidModel, SoftDeleteModel):
    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column(db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE,
                      index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('entities', lazy='dynamic'))

    def delete_references(self, origin=None):
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_identities(self):
        pq = db.session.query(EntityIdentity)
        pq = pq.filter(EntityIdentity.entity_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        self.delete_references()
        self.delete_identities()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_dangling(cls, collection_id):
        """Delete dangling entities.

        Entities can dangle in pending state while they have no references
        pointing to them, thus making it impossible to enable them. This is
        a routine cleanup function.
        """
        q = db.session.query(cls)
        q = q.filter(cls.collection_id == collection_id)
        q = q.filter(cls.state == cls.STATE_PENDING)
        q = q.outerjoin(Reference)
        q = q.group_by(cls)
        q = q.having(func.count(Reference.id) == 0)
        for entity in q.all():
            entity.delete()

    def merge(self, other):
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError("Cannot merge entities from different collections.")

        data = merge_data(self.data, other.data)
        if self.name.lower() != other.name.lower():
            data = merge_data(data, {'alias': [other.name]})
        self.data = data
        self.state = self.STATE_ACTIVE
        self.foreign_ids = self.foreign_ids or []
        self.foreign_ids += other.foreign_ids or []
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        # update document references
        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        data = entity.get('data') or {}
        data['name'] = entity.get('name')
        self.data = self.schema.validate(data)
        self.name = self.data.pop('name')
        fid = [string_value(f) for f in entity.get('foreign_ids') or []]
        self.foreign_ids = list(set([f for f in fid if f is not None]))
        self.state = entity.pop('state', self.STATE_ACTIVE)
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def save(cls, data, collection, merge=False):
        ent = cls.by_id(data.get('id'))
        if ent is None:
            ent = cls()
            ent.type = data.pop('schema', None)
            if ent.type is None:
                raise ValueError("No schema provided.")
            ent.id = make_textid()
        if merge:
            data = merge_data(data, ent.to_dict())
        if collection is None:
            raise ValueError("No collection specified.")
        ent.collection = collection
        ent.update(data)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        q = q.filter(Entity.collection_id.in_(collection_ids))
        return q

    @classmethod
    def by_id_set(cls, ids, collections=None):
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collection'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        foreign_id = string_value(foreign_id)
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def latest(cls):
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @property
    def schema(self):
        return schemata.get(self.type)

    @property
    def terms(self):
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def to_dict(self):
        data = super(Entity, self).to_dict()
        data.update({
            'schema': self.type,
            'name': self.name,
            'state': self.state,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def to_index(self):
        entity = self.to_dict()
        entity['properties'] = {'name': [self.name]}
        for k, v in self.data.items():
            v = ensure_list(v)
            if len(v):
                entity['properties'][k] = v
        return entity

    def to_ref(self):
        return {
            'id': self.id,
            'label': self.name,
            'schema': self.type,
            'collection_id': self.collection_id
        }

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

class Entity(db.Model, UuidModel, SoftDeleteModel, SchemaModel):
    _schema = '/entity/entity.json#'
    _schema_recurse = True

    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column('type', db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE)
    summary = db.Column(db.Unicode, nullable=True)
    description = db.Column(db.Unicode, nullable=True)
    jurisdiction_code = db.Column(db.Unicode, nullable=True)
    register_name = db.Column(db.Unicode, nullable=True)
    register_url = db.Column(db.Unicode, nullable=True)

    __mapper_args__ = {
        'polymorphic_on': type,
        'polymorphic_identity': _schema
    }

    collections = db.relationship(Collection,
                                  secondary=collection_entity_table,
                                  backref=db.backref('entities', lazy='dynamic'))

    def delete_references(self, origin=None):
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        self.delete_references()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    def update(self, data, merge=False):
        self.schema_update(data, merge=merge)

    def merge(self, other):
        if self.id == other.id:
            return

        # De-dupe todo:
        # 1. merge identifiers
        # 2. merge properties
        # 3. merge names, make merged names into a.k.a's
        # 4. merge collections
        # 5. update references
        # 6. update alerts
        # 7. delete source entities
        # 8. update source entities
        # 9. update target entity
        collections = list(self.collections)
        for collection in other.collections:
            if collection not in collections:
                self.collections.append(collection)

        if self.name.lower() != other.name.lower():
            aka = EntityOtherName()
            aka.update({'name': other.name})
            aka.entity = self
            db.session.add(aka)

        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        db.session.commit()
        db.session.refresh(other)
        self.schema_merge(other)

    def schema_merge(self, other):
        """Attempt to merge other onto self via JSON schema."""
        # TODO: figure out if we want to change schema
        for prop in self.schema_visitor.properties:
            if prop.name == 'id':
                continue
            self_value = getattr(self, prop.name) if \
                hasattr(self, prop.name) else None
            other_value = getattr(other, prop.name) if \
                hasattr(other, prop.name) else None
            if self_value is None and other_value is None:
                continue
            if prop.is_value and self_value is None:
                # update local properties
                setattr(self, prop.name, other_value)
            elif prop.is_object and self._schema_recurse:
                # update associated objects which are not set on the
                # existing object.
                rel = self._get_relationship(prop.name, 'MANYTOONE')
                if self_value is not None or other_value is None:
                    continue
                data = other_value.to_dict()
                obj = type(other_value)()
                obj.update(data)
                for local, remote in self._get_associations(obj, rel):
                    other_id = getattr(obj, remote)
                    setattr(self, local, other_id)
            elif prop.is_array and self._schema_recurse \
                    and other_value is not None:
                # merge array associations
                rel = self._get_relationship(prop.name, 'ONETOMANY')
                full_list = list(self_value)
                for new_item in other_value:
                    data = new_item.to_dict()
                    existing = [o for o in full_list if o.merge_compare(data)]
                    if len(existing):
                        continue
                    obj = type(new_item)()
                    obj.update(data)
                    for local, remote in self._get_associations(obj, rel):
                        setattr(obj, remote, getattr(self, local))
                    db.session.add(obj)
                    full_list.append(obj)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()
        other.delete()
        db.session.flush()

    @classmethod
    def save(cls, data, collections, merge=False):
        ent = cls.by_id(data.get('id'))
        if 'state' not in data:
            data['state'] = cls.STATE_ACTIVE
        for identifier in data.get('identifiers', []):
            if ent is None:
                ent = cls.by_identifier(identifier.get('scheme'),
                                        identifier.get('identifier'),
                                        collections=collections)
        if ent is None:
            schema = data.get('$schema', cls._schema)
            cls = cls.get_schema_class(schema)
            ent = cls()
            ent.id = make_textid()
        if merge:
            for collection in ent.collections:
                if collection.id not in [c.id for c in collections]:
                    collections.append(collection)
        if not len(collections):
            raise AttributeError("No collection specified.")
        ent.collections = collections
        ent.update(data, merge=merge)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        coll = aliased(Collection)
        q = q.join(coll, Entity.collections)
        q = q.filter(coll.id.in_(collection_ids))
        q = q.filter(coll.deleted_at == None)  # noqa
        return q

    @classmethod
    def by_identifier(cls, scheme, identifier, collections=None):
        q = db.session.query(Entity)
        q = q.filter(Entity.deleted_at == None)  # noqa
        q = cls.filter_collections(q, collections=collections)
        ident = aliased(EntityIdentifier)
        q = q.join(ident, Entity.identifiers)
        q = q.filter(ident.deleted_at == None)  # noqa
        q = q.filter(ident.scheme == scheme)
        q = q.filter(ident.identifier == identifier)
        return q.first()

    @classmethod
    def by_id_set(cls, ids, collections=None):
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collections'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def latest(cls):
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @classmethod
    def all_by_document(cls, document_id):
        from aleph.model.reference import Reference
        q = cls.all()
        q = q.options(joinedload('collections'))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        q = q.join(Reference)
        q = q.filter(Reference.document_id == document_id)
        return q.distinct()

    @property
    def fingerprint(self):
        return make_fingerprint(self.name)

    @property
    def terms(self):
        terms = set([self.name])
        for other_name in self.other_names:
            terms.update(other_name.terms)
        return [t for t in terms if t is not None and len(t)]

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = [' %s ' % normalize_strong(t) for t in self.terms]
        regex_terms = set()
        for term in terms:
            if len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term.strip())
        return regex_terms

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        data = super(Entity, self).to_dict()
        data['collection_id'] = [c.id for c in self.collections]
        return data

    def to_ref(self):
        return {
            'id': self.id,
            'name': self.name,
            '$schema': self.type,
            'collection_id': [c.id for c in self.collections]
        }

class Entity(db.Model, UuidModel, SoftDeleteModel):
    THING = 'Thing'

    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('entities', lazy='dynamic'))

    @property
    def model(self):
        return model.get(self.schema)

    @property
    def terms(self):
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([match_form(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def delete_matches(self):
        pq = db.session.query(Match)
        pq = pq.filter(or_(Match.entity_id == self.id,
                           Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        from aleph.model import Alert
        deleted_at = deleted_at or datetime.utcnow()
        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        pq = db.session.query(Alert)
        pq = pq.filter(Alert.entity_id.in_(entities))
        pq.update({Alert.deleted_at: deleted_at}, synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def merge(self, other):
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError("Cannot merge entities from different collections.")

        self.schema = model.precise_schema(self.schema, other.schema)
        self.foreign_ids = string_set(self.foreign_ids, other.foreign_ids)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        data = merge_data(self.data, other.data)
        if self.name != other.name:
            data = merge_data(data, {'alias': [other.name]})
        self.data = data

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({Alert.entity_id: self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        self.schema = entity.get('schema')
        data = entity.get('properties')
        if is_mapping(data):
            data['name'] = [entity.get('name')]
            self.data = self.model.validate(data)
        elif self.data is None:
            self.data = {}
        self.data.pop('name', None)
        self.name = entity.get('name')
        # TODO: should this be mutable?
        # self.foreign_ids = string_set(entity.get('foreign_ids'))
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def create(cls, data, collection):
        foreign_ids = string_set(data.get('foreign_ids'))
        ent = cls.by_foreign_ids(foreign_ids, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_ids = foreign_ids
        ent.update(data)
        ent.deleted_at = None
        return ent

    @classmethod
    def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
        if not len(foreign_ids):
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast(foreign_ids, ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def all_ids(cls, deleted=False, authz=None):
        q = super(Entity, cls).all_ids(deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       cls.collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def latest(cls):
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.deleted_at == None)  # noqa
        return q.scalar()

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

class EntitySetItem(db.Model, SoftDeleteModel):
    __tablename__ = "entityset_item"

    id = db.Column(db.Integer, primary_key=True)
    entityset_id = db.Column(db.String(ENTITY_ID_LEN),
                             db.ForeignKey("entityset.id"), index=True)
    entity_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True)
    compared_to_entity_id = db.Column(db.String(ENTITY_ID_LEN))
    added_by_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    judgement = db.Column(db.Enum(Judgement))

    entityset = db.relationship(EntitySet)
    collection = db.relationship(Collection)
    added_by = db.relationship(Role)

    @classmethod
    def by_entity_id(cls, entityset, entity_id):
        q = cls.all()
        q = q.filter(cls.entityset_id == entityset.id)
        q = q.filter(cls.entity_id == entity_id)
        q = q.order_by(cls.created_at.desc())
        return q.first()

    @classmethod
    def save(cls, entityset, entity_id, judgement=None, collection_id=None,
             **data):
        if judgement is None:
            judgement = Judgement.POSITIVE
        else:
            judgement = Judgement(judgement)
        existing = cls.by_entity_id(entityset, entity_id)
        if existing is not None:
            if existing.judgement == judgement:
                return existing
            existing.delete()
        if judgement == Judgement.NO_JUDGEMENT:
            return
        item = cls(
            entityset_id=entityset.id,
            entity_id=entity_id,
            judgement=judgement,
            compared_to_entity_id=data.get("compared_to_entity_id"),
            collection_id=collection_id or entityset.collection_id,
            added_by_id=data.get("added_by_id"),
        )
        db.session.add(item)
        return item

    @classmethod
    def delete_by_collection(cls, collection_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(EntitySet.collection_id == collection_id)
        pq = pq.filter(EntitySet.id == cls.entityset_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def delete_by_entity(cls, entity_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.entity_id == entity_id)
        pq.delete(synchronize_session=False)

    def to_dict(self):
        data = self.to_dict_dates()
        data.update({
            "entityset_id": self.entityset_id,
            "entity_id": self.entity_id,
            "collection_id": self.collection_id,
            "added_by_id": self.added_by_id,
            "compared_to_entity_id": self.compared_to_entity_id,
        })
        if self.judgement:
            data["judgement"] = self.judgement.value
        return data

    def __repr__(self):
        return "<EntitySetItem(%r, %r)>" % (self.entityset_id, self.entity_id)

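A minimal sketch of adding an entity to an existing entity set with an explicit judgement; `entityset`, `role`, and the entity id are placeholders, and a database session is assumed.

# Illustrative only: attach an entity to a set as a positive match.
item = EntitySetItem.save(entityset, "deadbeef" * 4,
                          judgement=Judgement.POSITIVE,
                          added_by_id=role.id)
db.session.commit()
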
class Notification(db.Model, IdModel, DatedModel):
    GLOBAL = 'Global'

    _event = db.Column('event', db.String(255), nullable=False)
    channels = db.Column(ARRAY(db.String(255)), index=True)
    params = db.Column(JSONB)
    actor_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    actor = db.relationship(Role)

    @hybrid_property
    def event(self):
        return Events.get(self._event)

    @event.setter
    def event(self, event):
        self._event = event.name

    def iterparams(self):
        if self.actor_id is not None:
            yield 'actor', Role, self.actor_id
        if self.event is None:
            return
        for name, clazz in self.event.params.items():
            value = self.params.get(name)
            if value is not None:
                yield name, clazz, value

    def to_dict(self):
        data = self.to_dict_dates()
        data.update({
            'id': self.id,
            'actor_id': self.actor_id,
            'event': self._event,
            'params': self.params
        })
        return data

    @classmethod
    def publish(cls, event, actor_id=None, channels=[], params={}):
        notf = cls()
        notf.event = event
        notf.actor_id = actor_id
        notf.params = params
        notf.channels = list(set([c for c in channels if c is not None]))
        db.session.add(notf)
        return notf

    @classmethod
    def by_channels(cls, channels, role, since=None):
        channels = cast(channels, ARRAY(db.String(255)))
        q = cls.all()
        q = q.filter(cls.channels.overlap(channels))
        q = q.filter(cls._event.in_(Events.names()))
        q = q.filter(or_(
            cls.actor_id != role.id,
            cls.actor_id == None  # noqa
        ))
        since = since or role.notified_at
        if since is not None and role.notified_at is not None:
            since = max(since, role.notified_at)
        if since is not None:
            q = q.filter(cls.created_at >= since)
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def delete_by_channel(cls, channel):
        q = cls.all()
        q = q.filter(cls.channels.any(channel))
        q.delete(synchronize_session=False)

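A minimal sketch of publishing a notification and reading it back for a role; `event` stands for one of the declared `Events` objects, and `role` and the channel strings are placeholders.

# Illustrative only: fan a notification out to a channel plus the global feed.
Notification.publish(event,
                     actor_id=role.id,
                     channels=[Notification.GLOBAL, 'Collection:123'],
                     params={'collection': 123})
db.session.commit()
unseen = Notification.by_channels(['Collection:123'], role).all()
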
class Entity(db.Model, SoftDeleteModel):
    THING = 'Thing'
    LEGAL_ENTITY = 'LegalEntity'

    id = db.Column(db.String(ENTITY_ID_LEN), primary_key=True,
                   default=make_textid, nullable=False, unique=False)
    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_id = db.Column(db.Unicode)
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('entities', lazy='dynamic'))

    @property
    def model(self):
        return model.get(self.schema)

    @property
    def signed_id(self):
        return self.collection.ns.sign(self.id)

    def delete_matches(self):
        pq = db.session.query(Match)
        pq = pq.filter(or_(Match.entity_id == self.id,
                           Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        super(Entity, self).delete(deleted_at=deleted_at)

    def update(self, entity):
        proxy = model.get_proxy(entity)
        proxy.schema.validate(entity)
        self.schema = proxy.schema.name
        previous = self.to_proxy()
        for prop in proxy.iterprops():
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            if prop.type == registry.checksum:
                proxy.set(prop, previous.get(prop), cleaned=True, quiet=True)
        self.data = proxy.properties
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def to_proxy(self):
        proxy = model.get_proxy({
            'id': self.id,
            'schema': self.schema,
            'properties': self.data
        })
        proxy.add('name', self.name)
        proxy.set('indexUpdatedAt', self.updated_at)
        return proxy

    @classmethod
    def create(cls, data, collection):
        foreign_id = data.get('foreign_id')
        ent = cls.by_foreign_id(foreign_id, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_id = foreign_id
            ent.data = {}
        ent.deleted_at = None
        ent.update(data)
        return ent

    @classmethod
    def by_id(cls, entity_id, collection_id=None):
        entity_id, _ = Namespace.parse(entity_id)
        q = cls.all()
        q = q.filter(cls.id == entity_id)
        return q.first()

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        q = q.filter(cls.foreign_id == foreign_id)
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def by_collection(cls, collection_id):
        return cls.all().filter(Entity.collection_id == collection_id)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        deleted_at = deleted_at or datetime.utcnow()
        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)

class EntitySet(db.Model, SoftDeleteModel):
    __tablename__ = "entityset"

    # set types
    LIST = "list"
    DIAGRAM = "diagram"
    TIMELINE = "timeline"
    PROFILE = "profile"
    TYPES = frozenset([LIST, DIAGRAM, TIMELINE, PROFILE])

    id = db.Column(db.String(ENTITY_ID_LEN), primary_key=True)
    label = db.Column(db.Unicode)
    type = db.Column(db.String(10), index=True, default=LIST)
    summary = db.Column(db.Unicode, nullable=True)
    layout = db.Column("layout", JSONB, nullable=True)

    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), index=True)
    role = db.relationship(Role)

    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True)
    collection = db.relationship(Collection)

    parent_id = db.Column(db.String(ENTITY_ID_LEN),
                          db.ForeignKey("entityset.id"))
    parent = db.relationship("EntitySet", backref="children", remote_side=[id])

    @property
    def entities(self):
        q = db.session.query(EntitySetItem.entity_id)
        q = q.filter(EntitySetItem.entityset_id == self.id)
        q = q.filter(EntitySetItem.judgement == Judgement.POSITIVE)
        q = q.filter(EntitySetItem.deleted_at == None)  # noqa
        return [entity_id for entity_id, in q.all()]

    @classmethod
    def create(cls, data, collection, authz):
        entityset = cls()
        entityset.id = make_textid()
        entityset.layout = {}
        entityset.role_id = authz.id
        entityset.collection_id = collection.id
        entityset.update(data)
        return entityset

    @classmethod
    def by_authz(cls, authz, types=None, prefix=None):
        ids = authz.collections(authz.READ)
        q = cls.by_type(types)
        q = q.filter(cls.collection_id.in_(ids))
        if prefix is not None:
            q = q.filter(query_like(cls.label, prefix))
        return q

    @classmethod
    def by_type(cls, types):
        """Returns EntitySets of a particular type."""
        q = EntitySet.all()
        types = ensure_list(types)
        if len(types) and types != cls.TYPES:
            q = q.filter(EntitySet.type.in_(types))
        return q

    @classmethod
    def by_collection_id(cls, collection_id, types=None):
        """Returns EntitySets within a given collection_id."""
        q = cls.by_type(types)
        q = q.filter(EntitySet.collection_id == collection_id)
        return q

    @classmethod
    def by_entity_id(cls, entity_id, collection_ids=None, judgements=None,
                     types=None, labels=None):
        """Returns EntitySets that include EntitySetItems with the provided
        entity_id.

        NOTE: This only considers EntitySetItems that haven't been deleted.
        """
        q = cls.by_type(types)
        if labels is not None:
            q = q.filter(EntitySet.label.in_(ensure_list(labels)))
        q = q.join(EntitySetItem)
        q = q.filter(EntitySetItem.deleted_at == None)  # noqa
        q = q.filter(EntitySetItem.entity_id == entity_id)
        if collection_ids:
            q = q.filter(EntitySet.collection_id.in_(collection_ids))
        if judgements is not None:
            q = q.filter(EntitySetItem.judgement.in_(ensure_list(judgements)))
        return q

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at):
        EntitySetItem.delete_by_collection(collection_id)
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def items(self, authz=None, deleted=False):
        q = EntitySetItem.all(deleted=deleted)
        if authz is not None:
            ids = authz.collections(authz.READ)
            q = q.filter(EntitySetItem.collection_id.in_(ids))
        q = q.filter(EntitySetItem.entityset_id == self.id)
        q = q.order_by(EntitySetItem.created_at.asc())
        return q

    def profile(self, judgements=None, deleted=False):
        q = self.items(deleted=deleted)
        if judgements is not None:
            q = q.filter(EntitySetItem.judgement.in_(judgements))
        return q

    def merge(self, other, merged_by_id):
        """Merge two entity sets into each other.

        The older one is retained. This tries to retain a state where there
        is only one judgement between a set and an entity.
        """
        if other.id == self.id:
            return self
        if other.created_at > self.created_at:
            return other.merge(self, merged_by_id)
        local_items = {i.entity_id: i for i in self.items()}
        for remote in other.items():
            local = local_items.get(remote.entity_id)
            if local is None:
                remote.entityset_id = self.id
                remote.updated_at = datetime.utcnow()
                db.session.add(remote)
                continue
            judgement = local.judgement + remote.judgement
            if judgement == local.judgement:
                remote.delete()
                continue
            origin = local.compared_to_entity_id or remote.compared_to_entity_id
            combined = EntitySetItem(
                entityset_id=self.id,
                entity_id=local.entity_id,
                collection_id=local.collection_id,
                added_by_id=merged_by_id,
                judgement=judgement,
                compared_to_entity_id=origin,
            )
            db.session.add(combined)
            local.delete()
            remote.delete()
        other.delete()
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()
        return self

    def update(self, data):
        self.label = data.get("label", self.label)
        self.type = data.get("type", self.type)
        self.summary = data.get("summary", self.summary)
        self.layout = data.get("layout", self.layout)
        self.updated_at = datetime.utcnow()
        self.deleted_at = None
        db.session.add(self)

    def delete(self, deleted_at=None):
        pq = db.session.query(EntitySetItem)
        pq = pq.filter(EntitySetItem.entityset_id == self.id)
        pq = pq.filter(EntitySetItem.deleted_at == None)  # noqa
        pq.update({EntitySetItem.deleted_at: deleted_at},
                  synchronize_session=False)
        for mapping in self.mappings:
            mapping.entityset_id = None
            db.session.add(mapping)
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)

    def to_dict(self):
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "type": self.type,
            "label": self.label,
            "summary": self.summary,
            "layout": self.layout,
            "role_id": stringify(self.role_id),
            "collection_id": stringify(self.collection_id),
        })
        return data

    def __repr__(self):
        return "<EntitySet(%r, %r)>" % (self.id, self.collection_id)

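A minimal sketch of creating a diagram-type entity set and iterating its items; `collection` and `authz` are placeholders for objects usually available in a request context.

# Illustrative only: create a diagram and list the entities it contains.
diagram = EntitySet.create({'label': 'My diagram', 'type': EntitySet.DIAGRAM},
                           collection, authz)
db.session.commit()
for item in diagram.items(authz=authz):
    print(item.entity_id, item.judgement)
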
class Mapping(db.Model, DatedModel):
    """A mapping to load entities from a table."""
    __tablename__ = "mapping"

    FAILED = "failed"
    SUCCESS = "success"
    PENDING = "pending"
    STATUS = {
        SUCCESS: lazy_gettext("success"),
        FAILED: lazy_gettext("failed"),
        PENDING: lazy_gettext("pending"),
    }

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column("query", JSONB)

    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), index=True)
    role = db.relationship(Role,
                           backref=db.backref("mappings", lazy="dynamic"))

    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("mappings", lazy="dynamic"))

    table_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    disabled = db.Column(db.Boolean, nullable=True)
    last_run_status = db.Column(db.Unicode, nullable=True)
    last_run_err_msg = db.Column(db.Unicode, nullable=True)

    def get_proxy_context(self):
        """Metadata to be added to each generated entity."""
        return {
            "created_at": iso_text(self.created_at),
            "updated_at": iso_text(self.updated_at),
            "role_id": self.role_id,
            "mutable": True,
        }

    def update(self, query=None, table_id=None):
        self.updated_at = datetime.utcnow()
        if query:
            self.query = query
        if table_id:
            self.table_id = table_id
        db.session.add(self)

    def set_status(self, status, error=None):
        self.last_run_status = status
        self.last_run_err_msg = error
        db.session.add(self)

    def to_dict(self):
        data = self.to_dict_dates()
        status = self.STATUS.get(self.last_run_status)
        data.update({
            "id": stringify(self.id),
            "query": dict(self.query),
            "role_id": stringify(self.role_id),
            "collection_id": stringify(self.collection_id),
            "table_id": self.table_id,
            "last_run_status": status,
            "last_run_err_msg": self.last_run_err_msg,
        })
        return data

    @classmethod
    def by_collection(cls, collection_id, table_id=None):
        q = cls.all().filter(cls.collection_id == collection_id)
        if table_id is not None:
            q = q.filter(cls.table_id == table_id)
        return q

    @classmethod
    def delete_by_collection(cls, collection_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def delete_by_table(cls, entity_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.table_id == entity_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def create(cls, query, table_id, collection, role_id):
        mapping = cls()
        mapping.role_id = role_id
        mapping.query = query
        mapping.collection_id = collection.id
        mapping.table_id = table_id
        mapping.update()
        return mapping

    def __repr__(self):
        return "<Mapping(%r, %r)>" % (self.id, self.table_id)

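A minimal sketch of registering a mapping for a table entity and recording the outcome of a load run; the query dict is a truncated sketch of a followthemoney-style mapping rather than a complete one, and `run_mapping`, `table_entity_id`, `collection`, and `role` are placeholders.

# Illustrative only: create the mapping, then record success or failure.
query = {"entities": {"person": {"schema": "Person",
                                 "keys": ["id"],
                                 "properties": {"name": {"column": "name"}}}}}
mapping = Mapping.create(query, table_entity_id, collection, role.id)
db.session.commit()
try:
    run_mapping(mapping)  # placeholder for the actual loading task
    mapping.set_status(Mapping.SUCCESS)
except Exception as exc:
    mapping.set_status(Mapping.FAILED, error=str(exc))
db.session.commit()
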
class Mapping(db.Model, SoftDeleteModel):
    """A mapping to load entities from a table."""
    __tablename__ = 'mapping'

    FAILED = 'failed'
    SUCCESS = 'success'
    STATUS = {
        SUCCESS: lazy_gettext('success'),
        FAILED: lazy_gettext('failed')
    }

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column('query', JSONB)

    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(Role,
                           backref=db.backref('mappings', lazy='dynamic'))

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('mappings', lazy='dynamic'))

    table_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    last_run_status = db.Column(db.Unicode, nullable=True)
    last_run_err_msg = db.Column(db.Unicode, nullable=True)

    def update(self, query=None, table_id=None):
        self.updated_at = datetime.utcnow()
        if query:
            self.query = query
        if table_id:
            self.table_id = table_id
        db.session.add(self)
        db.session.commit()

    def set_status(self, status, error=None):
        self.last_run_status = status
        self.last_run_err_msg = error
        db.session.add(self)
        db.session.commit()

    def delete(self, deleted_at=None):
        self.deleted_at = deleted_at or datetime.utcnow()
        db.session.add(self)
        db.session.commit()

    def to_dict(self):
        data = self.to_dict_dates()
        status = self.STATUS.get(self.last_run_status)
        data.update({
            'id': stringify(self.id),
            'query': dict(self.query),
            'role_id': stringify(self.role_id),
            'collection_id': stringify(self.collection_id),
            'table_id': self.table_id,
            'last_run_status': status,
            'last_run_err_msg': self.last_run_err_msg
        })
        return data

    @classmethod
    def by_collection(cls, collection_id, table_id=None):
        q = cls.all().filter(cls.collection_id == collection_id)
        if table_id is not None:
            q = q.filter(cls.table_id == table_id)
        return q

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        deleted_at = deleted_at or datetime.utcnow()
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    @classmethod
    def create(cls, query, table_id, collection, role_id):
        mapping = cls()
        mapping.role_id = role_id
        mapping.query = query
        mapping.collection_id = collection.id
        mapping.table_id = table_id
        mapping.update()
        return mapping

    def __repr__(self):
        return '<Mapping(%r, %r)>' % (self.id, self.table_id)

class Document(db.Model, DatedModel):
    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_TABLE = 'Table'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True, index=True)
    schema = db.Column(db.String(255), nullable=False)
    meta = db.Column(JSONB, default={})

    uploader_id = db.Column(db.Integer, db.ForeignKey('role.id'),
                            nullable=True)
    parent_id = db.Column(db.BigInteger, db.ForeignKey('document.id'),
                          nullable=True, index=True)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              nullable=False, index=True)
    collection = db.relationship(Collection,
                                 backref=db.backref('documents', lazy='dynamic'))

    def __init__(self, **kw):
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        return model.get(self.schema)

    @property
    def ancestors(self):
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if len(ancestors):
            return ancestors
        parent_key = cache.key('ancestors', self.parent_id)
        ancestors = cache.get_list(parent_key)
        if not len(ancestors):
            ancestors = []
            parent = Document.by_id(self.parent_id)
            if parent is not None:
                ancestors = parent.ancestors
        ancestors.append(self.parent_id)
        if self.model.is_a(model.get(self.SCHEMA_FOLDER)):
            cache.set_list(key, ancestors, expire=cache.EXPIRE)
        return ancestors

    def update(self, data):
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            self.meta[prop] = data.get(prop, self.meta.get(prop))
        flag_modified(self, 'meta')

    def delete(self, deleted_at=None):
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id):
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def save(cls, collection, parent=None, foreign_id=None, content_hash=None,
             meta=None, uploader_id=None):
        """Try and find a document by various criteria."""
        q = cls.all()
        q = q.filter(Document.collection_id == collection.id)
        if parent is not None:
            q = q.filter(Document.parent_id == parent.id)
        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")
        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection.id
            document.uploader_id = uploader_id
        if parent is not None:
            document.parent_id = parent.id
        if foreign_id is not None:
            document.foreign_id = foreign_id
        document.content_hash = content_hash
        if content_hash is None:
            document.schema = cls.SCHEMA_FOLDER
        if meta is not None:
            document.update(meta)
        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        try:
            id = int(id)
        except Exception:
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def cleanup_deleted(cls):
        q = db.session.query(Collection.id)
        q = q.filter(Collection.deleted_at != None)  # noqa
        collection_ids = [c for (c,) in q.all()]
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id.in_(collection_ids))
        pq.delete(synchronize_session=False)

    def to_proxy(self):
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': {}
        })
        meta = dict(self.meta)
        headers = meta.pop('headers', {}) or {}
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('crawler', meta.get('crawler'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('title', meta.get('title'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('keywords', meta.get('keywords'))
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('indexUpdatedAt', self.created_at)
        proxy.set('sourceUrl', meta.get('source_url'))
        return proxy

    def __repr__(self):
        return '<Document(%r,%r)>' % (self.id, self.schema)

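A minimal sketch of registering an uploaded file under a collection and turning it into an entity proxy for indexing; the foreign id, content hash, metadata, and `role` are placeholders, and a database session is assumed.

# Illustrative only: upsert the document record, then build its proxy.
document = Document.save(collection,
                         foreign_id='uploads/report.pdf',
                         content_hash='5eb63bbbe01eeed093cb22bb8f5acdc3',
                         meta={'title': 'Quarterly report',
                               'mime_type': 'application/pdf'},
                         uploader_id=role.id)
db.session.commit()
proxy = document.to_proxy()
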