class Reference(db.Model, IdModel, DatedModel):
    """Link between a document and an entity, with an origin and a weight."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    # Both sides get a dynamic ``references`` backref.
    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'),
    )
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'),
    )

    def to_dict(self):
        """Return the weight, the origin and a short summary of the entity."""
        entity = self.entity
        summary = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type,
        }
        return {
            'entity': summary,
            'weight': self.weight,
            'origin': self.origin,
        }

    def __repr__(self):
        key = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % key
class EntityOtherName(db.Model, EntityDetails):
    """An additional name recorded for an entity."""

    _schema = '/entity/other_name.json#'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    # The join condition skips rows whose ``deleted_at`` is set.
    entity = db.relationship(
        'Entity',
        primaryjoin=(
            "and_(Entity.id == foreign(EntityOtherName.entity_id), "
            "EntityOtherName.deleted_at == None)"
        ),
        backref=db.backref('other_names', lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    name = db.Column(db.Unicode)
    note = db.Column(db.Unicode)
    family_name = db.Column(db.Unicode)
    given_name = db.Column(db.Unicode)
    additional_name = db.Column(db.Unicode)
    honorific_prefix = db.Column(db.Unicode)
    honorific_suffix = db.Column(db.Unicode)
    patronymic_name = db.Column(db.Unicode)
    start_date = db.Column(db.DateTime)
    end_date = db.Column(db.DateTime)

    @property
    def display_name(self):
        """Return ``name``, or an empty string when no name is set."""
        return '' if self.name is None else self.name

    @property
    def terms(self):
        """Return a one-element list holding the display name."""
        return [self.display_name]

    def to_dict(self):
        """Extend the parent serialisation with ``display_name``."""
        payload = super(EntityOtherName, self).to_dict()
        payload['display_name'] = self.display_name
        return payload
class DocumentRecord(db.Model):
    """A row of a document, addressed by sheet and row id."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    @property
    def tid(self):
        """Hex SHA-1 digest over document id, sheet and row id, in order."""
        digest = sha1(str(self.document_id))
        for part in (self.sheet, self.row_id):
            digest.update(str(part))
        return digest.hexdigest()

    @property
    def text(self):
        """Return the distinct non-``None`` values of ``data`` as a list.

        An empty list is returned when ``data`` is ``None``.
        """
        if self.data is None:
            return []
        distinct = {value for value in self.data.values()
                    if value is not None}
        return list(distinct)

    def __repr__(self):
        key = (self.document_id, self.row_id)
        return '<DocumentRecord(%r,%r)>' % key
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'),
    )

    @property
    def tid(self):
        """Hex SHA-1 digest over the document id and this page's id."""
        digest = sha1(str(self.document_id))
        digest.update(str(self.id))
        return digest.hexdigest()

    def __repr__(self):
        key = (self.document_id, self.number)
        return '<DocumentPage(%r,%r)>' % key

    def text_parts(self):
        """Yield the page text if ``string_value`` accepts it.

        The raw ``self.text`` is yielded, not the value that
        ``string_value`` returns.
        """
        if string_value(self.text) is not None:
            yield self.text

    def to_dict(self):
        """Return the page's columns as a plain dictionary."""
        return dict([
            ('id', self.id),
            ('number', self.number),
            ('text', self.text),
            ('document_id', self.document_id),
        ])
class DocumentRecord(db.Model):
    """A row of a document, addressed by sheet and row id."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    # No ``nullable=False`` here, so ``data`` may be ``None``.
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        Document,
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    @property
    def tid(self):
        """Hex SHA-1 digest over document id, sheet and row id, in order."""
        tid = sha1(str(self.document_id))
        tid.update(str(self.sheet))
        tid.update(str(self.row_id))
        return tid.hexdigest()

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Yields each raw value of ``data`` for which ``string_value`` returns
        something other than ``None``. Yields nothing when ``data`` is
        ``None``.
        """
        # ``data`` is nullable; iterating ``None.values()`` would raise
        # AttributeError, so treat a missing payload as "no text".
        if self.data is None:
            return
        for value in self.data.values():
            if string_value(value) is not None:
                yield value

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
class Link(db.Model, UuidModel, SoftDeleteModel):
    """A typed connection between a source id and a target id."""

    type = db.Column(db.String(255), index=True)
    source_id = db.Column(db.String(254), index=True)
    target_id = db.Column(db.String(254), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(
        Collection,
        backref=db.backref('links', lazy='dynamic'),
    )

    @property
    def schema(self):
        """Look up this link's ``type`` in ``schemata``."""
        return schemata.get(self.type)

    def to_dict(self):
        """Extend the parent serialisation with the link's own fields.

        ``foreign_ids`` falls back to an empty list when it is falsy.
        """
        extra = {
            'schema': self.type,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id,
        }
        result = super(Link, self).to_dict()
        result.update(extra)
        return result

    def __repr__(self):
        key = (self.id, self.source_id, self.target_id)
        return '<Link(%r, %r, %r)>' % key
class Selector(db.Model):
    """A text selector for an entity, stored with a normalized form."""

    id = db.Column(db.Integer, primary_key=True)
    # Stored in the ``text`` column; exposed through the ``text`` property.
    _text = db.Column('text', db.Unicode, index=True)
    normalized = db.Column(db.Unicode, index=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)
    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(
        'Entity',
        backref=db.backref('selectors', lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    @hybrid_property
    def text(self):
        """The selector text as stored."""
        return self._text

    @text.setter
    def text(self, value):
        # Setting the text also refreshes the normalized column.
        self._text = value
        self.normalized = self.normalize(value)

    @classmethod
    def normalize(cls, text):
        """Delegate to the module-level ``normalize`` helper."""
        return normalize(text)

    def __repr__(self):
        key = (self.entity_id, self.text)
        return '<Selector(%r, %r)>' % key

    def __unicode__(self):
        return self.text
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'),
    )

    def __repr__(self):
        key = (self.document_id, self.number)
        return '<DocumentPage(%r,%r)>' % key

    def text_parts(self):
        """Yield the page text unless it is ``None`` or has zero length."""
        content = self.text
        if content is None or not len(content):
            return
        yield content

    def to_dict(self):
        """Return the page's columns as a plain dictionary."""
        return dict([
            ('id', self.id),
            ('number', self.number),
            ('text', self.text),
            ('document_id', self.document_id),
        ])
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'

    # Maps each tag type to the ``exactitude`` type used for it.
    TYPES = {
        TYPE_PERSON: exactitude.names,
        TYPE_ORGANIZATION: exactitude.names,
        TYPE_EMAIL: exactitude.emails,
        TYPE_PHONE: exactitude.phones,
        TYPE_LOCATION: exactitude.addresses,
        TYPE_IP: exactitude.ips,
        TYPE_IBAN: exactitude.ibans,
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @property
    def field(self):
        """Return the ``invert`` value paired with this tag's type.

        Returns ``None`` when no candidate matches. Raises ``KeyError`` if
        ``self.type`` is not a key of ``self.TYPES``.
        """
        type_ = self.TYPES[self.type]
        # NOTE(review): the bare ``TYPES`` below resolves at module level,
        # not to the class attribute (class attributes are not in scope
        # inside a method body). Its values are unpacked as
        # ``(candidate, invert)`` pairs, so it is presumably a different
        # mapping from ``self.TYPES`` - confirm it is imported in this module.
        for (candidate, invert) in TYPES.values():
            if candidate == type_:
                return invert

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching the given filters, then flush.

        At least one of ``document_id``, ``origin`` or ``type`` must be
        truthy. Each argument that is not ``None`` adds an equality filter.

        Raises:
            AssertionError: if none of the arguments is truthy.
        """
        # Raised explicitly rather than via ``assert``: assert statements are
        # stripped under ``python -O``, which would turn an argument-less
        # call into an unfiltered delete of every tag. The exception type is
        # kept so existing callers see the same error.
        if not (document_id or origin or type):
            raise AssertionError(
                'delete_by() needs a document_id, origin or type')
        pq = db.session.query(cls)
        if document_id is not None:
            pq = pq.filter(cls.document_id == document_id)
        if origin is not None:
            pq = pq.filter(cls.origin == origin)
        if type is not None:
            pq = pq.filter(cls.type == type)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'
    TYPE_COUNTRY = 'country'
    TYPE_LANGUAGE = 'language'

    # Maps each tag type to a field name.
    MAPPING = {
        TYPE_PERSON: 'namesMentioned',
        TYPE_ORGANIZATION: 'namesMentioned',
        TYPE_EMAIL: 'emailMentioned',
        TYPE_PHONE: 'phoneMentioned',
        TYPE_LOCATION: 'locationMentioned',
        TYPE_IP: 'ipMentioned',
        TYPE_IBAN: 'ibanMentioned',
        TYPE_COUNTRY: 'country',
        TYPE_LANGUAGE: 'language'
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @property
    def field(self):
        """Return the ``group`` of this tag's registry type.

        Returns ``None`` when ``registry.get`` finds no type or the type has
        no group.
        """
        type_ = registry.get(self.type)
        if type_ is not None and type_.group is not None:
            return type_.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching the given filters, then flush.

        At least one of ``document_id``, ``origin`` or ``type`` must be
        truthy. Each argument that is not ``None`` adds an equality filter.

        Raises:
            AssertionError: if none of the arguments is truthy.
        """
        # Raised explicitly rather than via ``assert``: assert statements are
        # stripped under ``python -O``, which would turn an argument-less
        # call into an unfiltered delete of every tag. The exception type is
        # kept so existing callers see the same error.
        if not (document_id or origin or type):
            raise AssertionError(
                'delete_by() needs a document_id, origin or type')
        pq = db.session.query(cls)
        if document_id is not None:
            pq = pq.filter(cls.document_id == document_id)
        if origin is not None:
            pq = pq.filter(cls.origin == origin)
        if type is not None:
            pq = pq.filter(cls.type == type)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
class Reference(db.Model, IdModel, DatedModel):
    """Weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    weight = db.Column(db.Integer)

    # Both sides get a dynamic ``references`` backref.
    entity = db.relationship(
        Entity,
        backref=db.backref('references', lazy='dynamic'),
    )
    document = db.relationship(
        Document,
        backref=db.backref('references', lazy='dynamic'),
    )

    @classmethod
    def delete_document(cls, document_id):
        """Bulk-delete every reference that points at ``document_id``."""
        matching = cls.all().filter_by(document_id=document_id)
        matching.delete(synchronize_session='fetch')

    def __repr__(self):
        key = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % key
class Reference(db.Model, IdModel, DatedModel):
    """Link between a document and an entity, with an origin and a weight."""

    id = db.Column(db.Integer(), primary_key=True)
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'),
    )

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'),
    )

    @classmethod
    def index_references(cls, document_id):
        """Return ``(entity_id, collection_id)`` rows for a document.

        Only references whose entity is in ``Entity.STATE_ACTIVE`` are
        included.
        """
        # cf. aleph.index.entities.generate_entities()
        from aleph.model.entity import Entity
        rows = db.session.query(Reference.entity_id, Entity.collection_id)
        rows = rows.filter(
            Reference.document_id == document_id,
            Entity.id == Reference.entity_id,
            Entity.state == Entity.STATE_ACTIVE,
        )
        return rows.all()

    def to_dict(self):
        """Return the weight, the origin and a short summary of the entity."""
        entity = self.entity
        summary = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type,
        }
        return {
            'entity': summary,
            'weight': self.weight,
            'origin': self.origin,
        }

    def __repr__(self):
        key = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % key
class EntityIdentifier(db.Model, EntityDetails):
    """An identifier, with its scheme, attached to an entity."""

    _schema = '/entity/identifier.json#'
    __tablename__ = 'entity_identifier'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    # The join condition skips rows whose ``deleted_at`` is set.
    entity = db.relationship(
        'Entity',
        primaryjoin=(
            "and_(Entity.id == foreign(EntityIdentifier.entity_id), "
            "EntityIdentifier.deleted_at == None)"
        ),
        backref=db.backref('identifiers', lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    identifier = db.Column(db.Unicode)
    scheme = db.Column(db.Unicode)
class EntityIdentity(db.Model, IdModel, DatedModel):
    """A judgement on whether ``match_id`` is the same thing as an entity."""

    CONFIRMED = 1
    REJECTED = 2
    UNDECIDED = 3
    JUDGEMENTS = [CONFIRMED, REJECTED, UNDECIDED]

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    entity = db.relationship(
        'Entity',
        backref=db.backref('identities', lazy='dynamic'),
    )
    match_id = db.Column(db.String(254), index=True, nullable=False)
    judgement = db.Column(db.Integer(), nullable=False)
    judge_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def judgements_by_entity(cls, entity_id):
        """Return a ``{match_id: judgement}`` dict for one entity."""
        rows = db.session.query(cls.match_id, cls.judgement)
        rows = rows.filter(cls.entity_id == entity_id)
        return dict((match_id, judgement)
                    for match_id, judgement in rows.all())

    @classmethod
    def entity_ids(cls, entity_id):
        """Return ``entity_id`` followed by all of its confirmed match ids."""
        rows = db.session.query(cls.match_id)
        rows = rows.filter(cls.entity_id == entity_id)
        rows = rows.filter(cls.judgement == cls.CONFIRMED)
        return [entity_id] + [mapped_id for mapped_id, in rows.all()]

    @classmethod
    def by_entity_match(cls, entity_id, match_id):
        """Return the first record for the given pair, or ``None``."""
        found = db.session.query(cls)
        found = found.filter(cls.entity_id == entity_id)
        found = found.filter(cls.match_id == match_id)
        return found.first()

    @classmethod
    def save(cls, entity_id, match_id, judgement, judge=None):
        """Create or update the judgement for a pair and add it to the session.

        The object is added to the session; this method does not flush or
        commit.
        """
        record = cls.by_entity_match(entity_id, match_id)
        if record is None:
            record = cls()
            record.entity_id = entity_id
            record.match_id = match_id
        record.judgement = judgement
        # NOTE(review): only ``judge_id`` is declared as a column in this
        # class; confirm a ``judge`` relationship is defined elsewhere,
        # otherwise this assignment is not persisted.
        record.judge = judge
        db.session.add(record)
        return record

    def __repr__(self):
        key = (self.entity_id, self.match_id, self.judgement)
        return 'EntityIdentity(%r, %r, %r)' % key
class Alert(db.Model, DatedModel):
    """A subscription to notifications on a given query."""

    __tablename__ = "alert"

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column(db.Unicode, nullable=True)
    notified_at = db.Column(db.DateTime, nullable=True)
    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), index=True)
    role = db.relationship(
        Role,
        backref=db.backref("alerts", lazy="dynamic"),
    )

    def update(self):
        """Stamp ``notified_at`` with the current UTC time and flush."""
        self.notified_at = datetime.utcnow()
        session = db.session
        session.add(self)
        session.flush()

    def to_dict(self):
        """Return the date fields plus id, query, role id and notified_at."""
        result = self.to_dict_dates()
        own_fields = {
            "id": stringify(self.id),
            "query": self.query,
            "role_id": stringify(self.role_id),
            "notified_at": self.notified_at,
        }
        result.update(own_fields)
        return result

    @classmethod
    def by_id(cls, id, role_id=None):
        """Return the alert with ``id``, optionally restricted to a role."""
        found = cls.all().filter_by(id=id)
        if role_id is None:
            return found.first()
        return found.filter(cls.role_id == role_id).first()

    @classmethod
    def by_role_id(cls, role_id):
        """Return a query of a role's alerts, newest first, then by id."""
        alerts = cls.all().filter(cls.role_id == role_id)
        for ordering in (cls.created_at.desc(), cls.id.desc()):
            alerts = alerts.order_by(ordering)
        return alerts

    @classmethod
    def create(cls, data, role_id):
        """Build an alert from ``data["query"]`` for a role and flush it."""
        created = cls()
        created.role_id = role_id
        created.query = stringify(data.get("query"))
        created.update()
        return created

    def __repr__(self):
        key = (self.id, self.query)
        return "<Alert(%r, %r)>" % key
class EntityContactDetail(db.Model, EntityDetails):
    """A contact detail attached to an entity."""

    _schema = '/entity/contact_detail.json#'

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'),
                          index=True)
    # The relationship targets ``EntityLegalPerson`` while the join condition
    # is written against ``Entity.id``; rows with ``deleted_at`` set are
    # skipped.
    entity = db.relationship(
        'EntityLegalPerson',
        primaryjoin=(
            "and_(Entity.id == foreign(EntityContactDetail.entity_id), "
            "EntityContactDetail.deleted_at == None)"
        ),
        backref=db.backref('contact_details', lazy='dynamic',
                           cascade='all, delete-orphan'),
    )

    label = db.Column(db.Unicode)
    type = db.Column(db.Unicode)
    note = db.Column(db.Unicode)
    valid_from = db.Column(db.DateTime)
    valid_until = db.Column(db.DateTime)
class DocumentRecord(db.Model):
    """A row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    def text_parts(self):
        """Yield the string form of each ``data`` value, then of ``text``.

        Anything for which ``string_value`` returns ``None`` is skipped.
        """
        cells = () if self.data is None else self.data.values()
        for cell in cells:
            snippet = string_value(cell)
            if snippet is not None:
                yield snippet

        body = string_value(self.text)
        if body is not None:
            yield body

    @classmethod
    def find_records(cls, document_id, ids):
        """Return a query for a document's records whose id is in ``ids``.

        An empty list (not a query) is returned when ``ids`` is empty.
        """
        if len(ids) == 0:
            return []
        records = db.session.query(cls)
        records = records.filter(cls.document_id == document_id)
        return records.filter(cls.id.in_(ids))

    def to_dict(self):
        """Return the record's columns as a plain dictionary."""
        return dict([
            ('id', self.id),
            ('sheet', self.sheet),
            ('index', self.index),
            ('data', self.data),
            ('text', self.text),
            ('document_id', self.document_id),
        ])

    def __repr__(self):
        key = (self.document_id, self.index)
        return '<DocumentRecord(%r,%r)>' % key
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    @property
    def texts(self):
        """Utility method to get all text snippets in a record.

        Yields every value of ``data`` (when it is not ``None``) and then
        ``self.text``. Values are yielded unfiltered, so ``None`` may appear.
        """
        if self.data is not None:
            for value in self.data.values():
                yield value
        yield self.text

    @classmethod
    def find_records(cls, ids):
        """Return a query for the records whose id is in ``ids``.

        An empty list (not a query) is returned when ``ids`` is empty.
        """
        if not len(ids):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.id.in_(ids))
        return q

    @classmethod
    def by_index(cls, document_id, index):
        """Return the first record of a document at ``index``, or ``None``."""
        # The query is built once, from ``cls``. A second, redundant
        # ``db.session.query(DocumentRecord)`` used to overwrite it and
        # hard-coded the class name.
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        q = q.filter(cls.index == index)
        return q.first()

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
class DocumentPage(db.Model):
    """One numbered page of text belonging to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'),
    )

    def __repr__(self):
        key = (self.document_id, self.number)
        return '<DocumentPage(%r,%r)>' % key

    def to_dict(self):
        """Return the page's columns as a plain dictionary."""
        return dict([
            ('id', self.id),
            ('number', self.number),
            ('text', self.text),
            ('document_id', self.document_id),
        ])
class EntityTag(db.Model):
    """Marks an entity as tagged on a package within a collection."""

    id = db.Column(db.Integer(), primary_key=True)
    collection = db.Column(db.Unicode(100))
    package_id = db.Column(db.Unicode(100))
    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(
        Entity,
        backref=db.backref('tags', lazy='dynamic'),
    )
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def delete_set(cls, collection, package_id):
        """Bulk-delete every tag of the given collection and package."""
        doomed = db.session.query(cls).filter_by(collection=collection)
        doomed = doomed.filter_by(package_id=package_id)
        doomed.delete()

    @classmethod
    def by_package(cls, collection, package_id):
        """Return one dict per entity tagged on the given package.

        Each dict has the keys ``id``, ``entity``, ``label``, ``category``
        and ``list``; ``id`` and ``entity`` both hold the entity id.
        """
        etag = aliased(cls)
        ent = aliased(Entity)
        rows = db.session.query(etag.entity_id, ent.label, ent.category,
                                ent.list_id)
        rows = rows.join(ent, ent.id == etag.entity_id)
        rows = rows.filter(etag.collection == collection)
        rows = rows.filter(etag.package_id == package_id)
        return [
            {
                'id': entity_id,
                'entity': entity_id,
                'label': label,
                'category': category,
                'list': lst,
            }
            for entity_id, label, category, lst in rows.all()
        ]

    def __repr__(self):
        key = (self.package_id, self.entity_id)
        return '<EntityTag(%r, %r)>' % key
class DocumentRecord(db.Model):
    """A row of a document, addressed by sheet and row id."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    # No ``nullable=False`` here, so ``data`` may be ``None``.
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        Document,
        backref=db.backref('records', cascade='all, delete-orphan'),
    )

    @property
    def tid(self):
        """Hex SHA-1 digest over document id, sheet and row id, in order."""
        tid = sha1(str(self.document_id))
        tid.update(str(self.sheet))
        tid.update(str(self.row_id))
        return tid.hexdigest()

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Yields each raw value of ``data`` for which ``string_value`` returns
        something other than ``None``. Yields nothing when ``data`` is
        ``None``.
        """
        # ``data`` is nullable; iterating ``None.values()`` would raise
        # AttributeError, so treat a missing payload as "no text".
        if self.data is None:
            return
        for value in self.data.values():
            if string_value(value) is not None:
                yield value

    @classmethod
    def find_rows(cls, document_id, rows):
        """Return a query for a document's records matching ``rows``.

        ``rows`` is a sequence whose items are indexed as
        ``(sheet, row_id)``. An empty list (not a query) is returned when
        ``rows`` is empty.
        """
        if not len(rows):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        clauses = [and_(cls.sheet == r[0], cls.row_id == r[1]) for r in rows]
        q = q.filter(or_(*clauses))
        return q

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    key = db.Column(db.Unicode(1024), nullable=False, index=True)
    text = db.Column(db.Unicode(1024), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'),
                            index=True)
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'),
    )

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching the given filters, then flush.

        At least one of ``document_id``, ``origin`` or ``type`` must be
        truthy. Each argument that is not ``None`` adds an equality filter.

        Raises:
            AssertionError: if none of the arguments is truthy.
        """
        # Raised explicitly rather than via ``assert``: assert statements are
        # stripped under ``python -O``, which would turn an argument-less
        # call into an unfiltered delete of every tag. The exception type is
        # kept so existing callers see the same error.
        if not (document_id or origin or type):
            raise AssertionError(
                'delete_by() needs a document_id, origin or type')
        pq = db.session.query(cls)
        if document_id is not None:
            pq = pq.filter(cls.document_id == document_id)
        if origin is not None:
            pq = pq.filter(cls.origin == origin)
        if type is not None:
            pq = pq.filter(cls.type == type)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.key)
class Mapping(db.Model, SoftDeleteModel):
    """A mapping to load entities from a table."""

    __tablename__ = 'mapping'

    FAILED = 'failed'
    SUCCESS = 'success'
    # Translatable labels for the stored run status.
    STATUS = {
        SUCCESS: lazy_gettext('success'),
        FAILED: lazy_gettext('failed'),
    }

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column('query', JSONB)

    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(
        Role,
        backref=db.backref('mappings', lazy='dynamic'),
    )

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(
        Collection,
        backref=db.backref('mappings', lazy='dynamic'),
    )

    table_id = db.Column(db.String(ENTITY_ID_LEN), index=True)
    last_run_status = db.Column(db.Unicode, nullable=True)
    last_run_err_msg = db.Column(db.Unicode, nullable=True)

    def _persist(self):
        # Shared tail of every mutator: add this object and commit.
        db.session.add(self)
        db.session.commit()

    def update(self, query=None, table_id=None):
        """Refresh ``updated_at``, apply truthy arguments, and commit."""
        self.updated_at = datetime.utcnow()
        if query:
            self.query = query
        if table_id:
            self.table_id = table_id
        self._persist()

    def set_status(self, status, error=None):
        """Record the status and error message of the last run and commit."""
        self.last_run_status = status
        self.last_run_err_msg = error
        self._persist()

    def delete(self, deleted_at=None):
        """Soft-delete by setting ``deleted_at`` (default: now, UTC)."""
        self.deleted_at = deleted_at or datetime.utcnow()
        self._persist()

    def to_dict(self):
        """Return the date fields plus the mapping's own fields.

        ``last_run_status`` is the ``STATUS`` label for the stored status,
        or ``None`` when the status is not in ``STATUS``.
        """
        result = self.to_dict_dates()
        label = self.STATUS.get(self.last_run_status)
        own_fields = {
            'id': stringify(self.id),
            'query': dict(self.query),
            'role_id': stringify(self.role_id),
            'collection_id': stringify(self.collection_id),
            'table_id': self.table_id,
            'last_run_status': label,
            'last_run_err_msg': self.last_run_err_msg,
        }
        result.update(own_fields)
        return result

    @classmethod
    def by_collection(cls, collection_id, table_id=None):
        """Return a query of a collection's mappings, optionally by table."""
        mappings = cls.all().filter(cls.collection_id == collection_id)
        if table_id is None:
            return mappings
        return mappings.filter(cls.table_id == table_id)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Soft-delete every not-yet-deleted mapping of a collection."""
        deleted_at = deleted_at or datetime.utcnow()
        pending = db.session.query(cls)
        pending = pending.filter(
            cls.collection_id == collection_id,
            cls.deleted_at == None,  # noqa
        )
        pending.update({cls.deleted_at: deleted_at},
                       synchronize_session=False)

    @classmethod
    def create(cls, query, table_id, collection, role_id):
        """Build a mapping, persist it via ``update()``, and return it."""
        created = cls()
        created.role_id = role_id
        created.query = query
        created.collection_id = collection.id
        created.table_id = table_id
        created.update()
        return created

    def __repr__(self):
        key = (self.id, self.table_id)
        return '<Mapping(%r, %r)>' % key
class Entity(db.Model, SoftDeleteModel):
    """A soft-deletable entity stored with a schema name and JSON data."""

    THING = 'Thing'
    LEGAL_ENTITY = 'LegalEntity'

    id = db.Column(db.String(ENTITY_ID_LEN), primary_key=True,
                   default=make_textid, nullable=False, unique=False)
    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_id = db.Column(db.Unicode)
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(
        Collection,
        backref=db.backref('entities', lazy='dynamic'),
    )

    @property
    def model(self):
        """Look up this entity's schema name in ``model``."""
        return model.get(self.schema)

    @property
    def signed_id(self):
        """The entity id signed by the collection's namespace."""
        return self.collection.ns.sign(self.id)

    def delete_matches(self):
        """Bulk-delete every ``Match`` that refers to this entity."""
        matches = db.session.query(Match)
        involved = or_(Match.entity_id == self.id,
                       Match.match_id == self.id)
        matches.filter(involved).delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Remove this entity's matches, then soft-delete the entity."""
        self.delete_matches()
        when = deleted_at or datetime.utcnow()
        super(Entity, self).delete(deleted_at=when)

    def update(self, entity):
        """Validate ``entity`` and store its properties on this row.

        The object is added to the session; this method does not flush or
        commit.
        """
        proxy = model.get_proxy(entity)
        proxy.schema.validate(entity)
        self.schema = proxy.schema.name
        previous = self.to_proxy()
        for prop in proxy.iterprops():
            if not (prop.type == registry.checksum):
                continue
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            kept = previous.get(prop)
            proxy.set(prop, kept, cleaned=True, quiet=True)
        self.data = proxy.properties
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def to_proxy(self):
        """Build a proxy from this row and add its name and update time."""
        payload = {
            'id': self.id,
            'schema': self.schema,
            'properties': self.data,
        }
        proxy = model.get_proxy(payload)
        proxy.add('name', self.name)
        proxy.set('indexUpdatedAt', self.updated_at)
        return proxy

    @classmethod
    def create(cls, data, collection):
        """Create an entity, or revive the one with the same foreign id.

        The lookup includes deleted rows; ``deleted_at`` is cleared on
        whichever row is used before ``data`` is applied.
        """
        foreign_id = data.get('foreign_id')
        ent = cls.by_foreign_id(foreign_id, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_id = foreign_id
            ent.data = {}
        ent.deleted_at = None
        ent.update(data)
        return ent

    @classmethod
    def by_id(cls, entity_id, collection_id=None):
        """Return the entity whose id matches the parsed ``entity_id``."""
        # NOTE(review): ``collection_id`` is accepted but not used in the
        # lookup below - confirm whether it should filter the result.
        plain_id, _ = Namespace.parse(entity_id)
        return cls.all().filter(cls.id == plain_id).first()

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        """Return the entity with ``foreign_id`` in a collection, or ``None``.

        Rows with a null ``deleted_at`` sort first.
        """
        if foreign_id is None:
            return None
        found = cls.all(deleted=deleted)
        found = found.filter(Entity.collection_id == collection_id)
        found = found.filter(cls.foreign_id == foreign_id)
        found = found.order_by(Entity.deleted_at.desc().nullsfirst())
        return found.first()

    @classmethod
    def by_collection(cls, collection_id):
        """Return a query of the entities in a collection."""
        return cls.all().filter(Entity.collection_id == collection_id)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Delete a collection's matches and soft-delete its entities."""
        deleted_at = deleted_at or datetime.utcnow()

        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        # Matches can point at these entities from either side.
        for column in (Match.entity_id, Match.match_id):
            matches = db.session.query(Match)
            matches = matches.filter(column.in_(entities))
            matches.delete(synchronize_session=False)

        pending = db.session.query(cls)
        pending = pending.filter(cls.collection_id == collection_id)
        pending = pending.filter(cls.deleted_at == None)  # noqa
        pending.update({cls.deleted_at: deleted_at},
                       synchronize_session=False)

    def __repr__(self):
        key = (self.id, self.name)
        return '<Entity(%r, %r)>' % key
class Entity(db.Model, DatedModel):
    """An entity stored with a schema name and JSON property data."""

    THING = "Thing"
    LEGAL_ENTITY = "LegalEntity"

    id = db.Column(
        db.String(ENTITY_ID_LEN),
        primary_key=True,
        default=make_textid,
        nullable=False,
        unique=False,
    )
    schema = db.Column(db.String(255), index=True)
    data = db.Column("data", JSONB)

    role_id = db.Column(db.Integer, db.ForeignKey("role.id"), nullable=True)
    collection_id = db.Column(
        db.Integer, db.ForeignKey("collection.id"), index=True
    )
    collection = db.relationship(
        Collection, backref=db.backref("entities", lazy="dynamic")
    )

    @property
    def model(self):
        """Look up this entity's schema name in ``model``."""
        return model.get(self.schema)

    def update(self, data, collection):
        """Apply ``data`` to this entity within the collection's namespace.

        The object is added to the session; this method does not flush or
        commit.
        """
        namespace = collection.ns
        proxy = model.get_proxy(data, cleaned=False)
        proxy = namespace.apply(proxy)
        self.id = namespace.sign(self.id)
        self.schema = proxy.schema.name
        self.updated_at = datetime.utcnow()
        previous = self.to_proxy()
        for prop in proxy.schema.properties.values():
            if not (prop.type == registry.checksum):
                continue
            # Do not allow the user to overwrite hashes because this could
            # lead to a user accessing random objects.
            kept = previous.get(prop)
            proxy.set(prop, kept, cleaned=True, quiet=True)
        self.data = proxy.properties
        db.session.add(self)

    def to_proxy(self):
        """Build a proxy from this row's id, schema, data and metadata."""
        payload = {
            "id": self.id,
            "schema": self.schema,
            "properties": self.data,
            "created_at": iso_text(self.created_at),
            "updated_at": iso_text(self.updated_at),
            "role_id": self.role_id,
            "mutable": True,
        }
        return model.get_proxy(payload, cleaned=False)

    @classmethod
    def create(cls, data, collection, role_id=None):
        """Create an entity in ``collection`` from ``data``.

        The id comes from ``data["id"]`` or is freshly generated, and is
        signed with the collection's namespace.

        Raises:
            InvalidData: if ``registry.entity`` rejects the entity id.
        """
        created = cls()
        raw_id = data.get("id") or make_textid()
        if not registry.entity.validate(raw_id):
            raise InvalidData(gettext("Invalid entity ID"))
        created.id = collection.ns.sign(raw_id)
        created.collection_id = collection.id
        created.role_id = role_id
        created.update(data, collection)
        return created

    @classmethod
    def by_id(cls, entity_id, collection=None):
        """Return the entity with ``entity_id``, optionally in a collection."""
        found = cls.all().filter(cls.id == entity_id)
        if collection is None:
            return found.first()
        return found.filter(cls.collection_id == collection.id).first()

    @classmethod
    def by_collection(cls, collection_id):
        """Return a query of a collection's entities with ``yield_per(5000)``."""
        entities = cls.all().filter(Entity.collection_id == collection_id)
        return entities.yield_per(5000)

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Bulk-delete every entity of a collection."""
        doomed = db.session.query(cls)
        doomed = doomed.filter(cls.collection_id == collection_id)
        doomed.delete(synchronize_session=False)

    def __repr__(self):
        key = (self.id, self.schema)
        return "<Entity(%r, %r)>" % key
class Alert(db.Model, SoftDeleteModel):
    """A subscription to notifications on a given query."""

    __tablename__ = 'alert'

    id = db.Column(db.Integer, primary_key=True)
    query = db.Column(db.Unicode, nullable=True)
    notified_at = db.Column(db.DateTime, nullable=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    role = db.relationship(
        Role,
        backref=db.backref('alerts', lazy='dynamic'),
    )

    @property
    def normalized(self):
        """The query text passed through ``normalize``."""
        return normalize(self.query)

    def _store(self):
        # Shared tail of the mutators: add this object and flush.
        db.session.add(self)
        db.session.flush()

    def delete(self, deleted_at=None):
        """Soft-delete by setting ``deleted_at`` (default: now, UTC)."""
        self.deleted_at = deleted_at or datetime.utcnow()
        self._store()

    def update(self):
        """Stamp ``notified_at`` with the current UTC time and flush."""
        self.notified_at = datetime.utcnow()
        self._store()

    def is_same(self, other):
        """Return True if both alerts share a role and a normalized query."""
        return (other.role_id == self.role_id
                and other.normalized == self.normalized)

    @classmethod
    def by_id(cls, id, role_id=None):
        """Return the alert with ``id``, optionally restricted to a role."""
        found = cls.all().filter_by(id=id)
        if role_id is None:
            return found.first()
        return found.filter(cls.role_id == role_id).first()

    @classmethod
    def by_role_id(cls, role_id):
        """Return a query of a role's alerts, newest first, then by id."""
        alerts = cls.all().filter(cls.role_id == role_id)
        for ordering in (cls.created_at.desc(), cls.id.desc()):
            alerts = alerts.order_by(ordering)
        return alerts

    @classmethod
    def create(cls, data, role_id):
        """Build an alert from ``data['query']`` for a role and flush it."""
        created = cls()
        created.role_id = role_id
        created.query = stringify(data.get('query'))
        created.update()
        return created

    @classmethod
    def dedupe(cls):
        """Soft-delete alerts that duplicate an alert with a higher id."""
        for (left, right) in permutations(cls.all(), 2):
            # Each unordered pair shows up twice; only the ordering where
            # ``left`` has the lower id is acted on.
            if left.id < right.id and left.is_same(right):
                left.delete()

    def __repr__(self):
        key = (self.id, self.query)
        return '<Alert(%r, %r)>' % key
class Alert(db.Model, SoftDeleteModel):
    """A subscription to notifications on a given query."""
    __tablename__ = 'alert'

    id = db.Column(db.Integer, primary_key=True)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), index=True)
    custom_label = db.Column(db.Unicode, nullable=True)
    query_text = db.Column(db.Unicode, nullable=True)
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'), nullable=True)  # noqa
    entity = db.relationship(Entity, backref=db.backref('alerts', lazy='dynamic'))  # noqa
    notified_at = db.Column(db.DateTime, nullable=True)

    @property
    def label(self):
        """Display label: custom label, else entity name, else query text."""
        if self.custom_label is None:
            if self.entity:
                return self.entity.name
            return self.query_text
        return self.custom_label

    def delete(self, deleted_at=None):
        """Stamp ``deleted_at`` (default: now, UTC) and flush the session."""
        when = deleted_at or datetime.utcnow()
        self.deleted_at = when
        db.session.add(self)
        db.session.flush()

    def update(self):
        """Set ``notified_at`` to the current UTC time and flush the session."""
        self.notified_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def is_same(self, other):
        """True when role, entity and query text all match those of ``other``."""
        return (
            other.role_id == self.role_id
            and other.entity_id == self.entity_id
            and other.query_text == self.query_text
        )

    @classmethod
    def by_id(cls, id, role=None):
        """Fetch one alert by primary key, optionally limited to ``role``."""
        matches = cls.all().filter_by(id=id)
        if role is None:
            return matches.first()
        return matches.filter(cls.role_id == role.id).first()

    @classmethod
    def by_role(cls, role):
        """Query over all alerts owned by ``role``."""
        owned = cls.role_id == role.id
        return cls.all().filter(owned)

    @classmethod
    def create(cls, data, role):
        """Validate ``data`` against ``alert.json#`` and create an alert.

        Blank or whitespace-only query text and empty entity ids are stored
        as ``None``.
        """
        validate(data, 'alert.json#')
        alert = cls()
        alert.role_id = role.id
        text = data.get('query_text')
        if text is not None:
            text = text.strip() or None
        alert.query_text = text
        alert.entity_id = data.get('entity_id') or None
        alert.custom_label = data.get('label')
        alert.update()
        return alert

    @classmethod
    def exists(cls, query, role):
        """Return the id of ``role``'s alert matching ``query``, if any.

        ``query`` must offer ``get`` and ``getlist``. The alert matches on
        the stripped ``q`` text and on the entity when exactly one
        ``entity`` value is given; otherwise on a null entity.
        """
        text = query.get('q')
        if text is not None:
            text = text.strip() or None
        candidates = cls.all_ids().filter(cls.role_id == role.id)
        candidates = candidates.filter(cls.query_text == text)
        entities = query.getlist('entity')
        wanted_entity = entities[0] if len(entities) == 1 else None
        candidates = candidates.filter(cls.entity_id == wanted_entity)
        return candidates.limit(1).scalar()

    @classmethod
    def dedupe(cls, entity_id):
        """Soft-delete alerts on ``entity_id`` duplicated by a higher-id alert."""
        alerts = cls.all().filter_by(entity_id=entity_id).all()
        for older in alerts:
            for newer in alerts:
                if older.id < newer.id and older.is_same(newer):
                    older.delete()

    def __repr__(self):
        return '<Alert(%r, %r)>' % (self.id, self.label)

    def to_query(self):
        """Express the alert as a ``MultiDict`` with ``q`` and ``entity`` keys."""
        params = {
            'q': self.query_text or '',
            'entity': self.entity_id
        }
        return MultiDict(params)

    def to_dict(self):
        """Plain-dict view of the alert's columns and computed label."""
        payload = {
            'id': self.id,
            'label': self.label,
            'role_id': self.role_id,
            'query_text': self.query_text,
            'entity_id': self.entity_id,
            'created_at': self.created_at,
            'notified_at': self.notified_at,
            'updated_at': self.updated_at
        }
        return payload
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """A named entity inside a collection, with a lifecycle ``state``.

    Extra attributes are kept in the JSONB ``data`` column and external
    identifiers in the ``foreign_ids`` array column.
    """

    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column(db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE, index=True)  # noqa
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    def delete_references(self, origin=None):
        """Delete references to this entity, optionally only from ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_identities(self):
        """Delete every ``EntityIdentity`` row pointing at this entity."""
        pq = db.session.query(EntityIdentity)
        pq = pq.filter(EntityIdentity.entity_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Delete the entity together with its references, identities and alerts.

        ``deleted_at`` defaults to the current UTC time and is shared by the
        entity and its alerts.
        """
        self.delete_references()
        self.delete_identities()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_dangling(cls, collection_id):
        """Delete dangling entities.

        Entities can dangle in pending state while they have no references
        pointing to them, thus making it impossible to enable them. This is
        a routine cleanup function.
        """
        q = db.session.query(cls)
        q = q.filter(cls.collection_id == collection_id)
        q = q.filter(cls.state == cls.STATE_PENDING)
        q = q.outerjoin(Reference)
        q = q.group_by(cls)
        q = q.having(func.count(Reference.id) == 0)
        for entity in q.all():
            entity.delete()

    def merge(self, other):
        """Fold ``other`` into this entity, delete ``other`` and commit.

        Data is merged, ``other``'s name becomes an alias when it differs
        (case-insensitively), foreign ids are concatenated, and alerts and
        document references are re-pointed at this entity.

        Raises ``ValueError`` when ``other`` is this entity or belongs to a
        different collection.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError(
                "Cannot merge entities from different collections.")  # noqa

        data = merge_data(self.data, other.data)
        if self.name.lower() != other.name.lower():
            data = merge_data(data, {'alias': [other.name]})

        self.data = data
        self.state = self.STATE_ACTIVE
        # Assign a new list rather than extending the loaded one in place:
        # with ``+=`` the attribute is re-set to the same (already mutated)
        # list object, so the ORM sees old == new and the merged ids are not
        # written for a plain, un-tracked ARRAY column.
        self.foreign_ids = (self.foreign_ids or []) + (other.foreign_ids or [])
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        # update document references
        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply the ``entity`` mapping to this row and add it to the session.

        ``data`` (with the name folded in) is validated by the schema; the
        ``state`` key, when present, is popped from ``entity``.
        """
        data = entity.get('data') or {}
        data['name'] = entity.get('name')
        self.data = self.schema.validate(data)
        self.name = self.data.pop('name')
        fid = [string_value(f) for f in entity.get('foreign_ids') or []]
        self.foreign_ids = list(set([f for f in fid if f is not None]))
        self.state = entity.pop('state', self.STATE_ACTIVE)
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def save(cls, data, collection, merge=False):
        """Create or update the entity identified by ``data['id']``.

        A new entity takes its type from ``data['schema']`` and gets a fresh
        id. With ``merge`` the incoming data is merged with the entity's
        current dict form first.

        Raises ``ValueError`` when a new entity has no schema or when
        ``collection`` is ``None``.
        """
        ent = cls.by_id(data.get('id'))
        if ent is None:
            ent = cls()
            ent.type = data.pop('schema', None)
            if ent.type is None:
                raise ValueError("No schema provided.")
            ent.id = make_textid()

        if merge:
            data = merge_data(data, ent.to_dict())

        if collection is None:
            raise ValueError("No collection specified.")

        ent.collection = collection
        ent.update(data)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        """Restrict ``q`` to ``collections`` (``Collection`` objects or ids)."""
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        q = q.filter(Entity.collection_id.in_(collection_ids))
        return q

    @classmethod
    def by_id_set(cls, ids, collections=None):
        """Return ``{id: entity}`` for the given ids, with collections loaded."""
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collection'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        """Find the entity in a collection whose foreign ids include ``foreign_id``.

        Rows with a null ``deleted_at`` sort first. Returns ``None`` when
        ``foreign_id`` has no string value or nothing matches.
        """
        foreign_id = string_value(foreign_id)
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def latest(cls):
        """Return the most recent ``updated_at`` among active entities."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @property
    def schema(self):
        """Schema object registered under this entity's ``type``."""
        return schemata.get(self.type)

    @property
    def terms(self):
        """Set containing the name plus every non-empty alias in ``data``."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        """Normalized terms (4-120 chars) that contain no other term."""
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def to_dict(self):
        """Extend the inherited dict form with this entity's own columns."""
        data = super(Entity, self).to_dict()
        data.update({
            'schema': self.type,
            'name': self.name,
            'state': self.state,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def to_index(self):
        """Dict form plus ``properties``: name and non-empty data values as lists."""
        entity = self.to_dict()
        entity['properties'] = {'name': [self.name]}
        for k, v in self.data.items():
            v = ensure_list(v)
            if len(v):
                entity['properties'][k] = v
        return entity

    def to_ref(self):
        """Compact reference: id, label, schema and collection id."""
        return {
            'id': self.id,
            'label': self.name,
            'schema': self.type,
            'collection_id': self.collection_id
        }

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background. The data is stored in a cloud
    storage bucket and the user is given a link to download the data. The link
    expires after a fixed duration and the exported data is deleted."""

    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)
    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role, backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(
        db.Integer, db.ForeignKey("collection.id"), index=True, nullable=True
    )
    collection = db.relationship(
        Collection, backref=db.backref("exports", lazy="dynamic")
    )
    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    status = db.Column("export_status", db.Unicode, default=Status.DEFAULT)
    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    # A callable default yields a fresh dict per row; a literal ``{}`` would
    # be one object shared by every row that relies on the default.
    meta = db.Column(JSONB, default=dict)

    def to_dict(self):
        """Serialize the export; ids are stringified and status is its label."""
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "status": Status.LABEL.get(self.status),
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "mime_type": self.mime_type,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(cls, operation, role_id, label, collection=None,
               mime_type=None, meta=None):
        """Create an export that expires ``DEFAULT_EXPIRATION`` from now.

        The new row is added to the session; ``meta`` defaults to an empty
        dict and ``collection`` is optional.
        """
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + cls.DEFAULT_EXPIRATION
        export.meta = meta or {}
        db.session.add(export)
        return export

    @property
    def namespace(self):
        """Key built from ``"role"`` and the creator id."""
        return make_key("role", self.creator_id)

    def set_status(self, status):
        """Set ``status`` and add the export to the session."""
        self.status = status
        db.session.add(self)

    def should_delete_publication(self):
        """Check whether the published export should be deleted from the archive

        Since we store exports by contenthash, there may be other non-expired
        exports that point to the same file in the archive"""
        q = (
            Export.all()
            .filter(Export.content_hash == self.content_hash)
            .filter(Export.deleted.isnot(True))
            .filter(Export.id != self.id)
        )
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Query exports whose ``expires_at`` has passed.

        Unless ``deleted`` is truthy, only exports not flagged as deleted
        are returned.
        """
        now = datetime.utcnow()
        q = cls.all()
        q = q.filter(cls.expires_at <= now)
        if not deleted:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def get_pending(cls):
        """Query non-deleted exports whose status is ``Status.PENDING``."""
        q = cls.all()
        q = q.filter(cls.status == Status.PENDING)
        q = q.filter(cls.deleted == False)  # noqa
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        """Return the export with ``id``, optionally limited to a creator."""
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        """Query a creator's exports, newest first."""
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def by_content_hash(cls, content_hash, deleted=False):
        """Query exports that share ``content_hash``."""
        q = cls.all()
        q = q.filter(cls.content_hash == content_hash)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q

    def __repr__(self):
        return "<Export(%r, %r, %r)>" % (self.id, self.creator_id, self.label)
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """A named entity inside a collection, typed by a model ``schema``.

    Extra properties are kept in the JSONB ``data`` column and external
    identifiers in the ``foreign_ids`` array column.
    """

    THING = 'Thing'

    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    @property
    def model(self):
        """Schema object registered under this entity's ``schema`` name."""
        return model.get(self.schema)

    @property
    def terms(self):
        """Set containing the name plus every non-empty alias in ``data``."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        """Match-form terms (4-120 chars) that contain no other term."""
        # This is to find the shortest possible regex for each entity.
        # If, for example, and entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([match_form(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def delete_matches(self):
        """Delete every ``Match`` row that involves this entity on either side."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.entity_id == self.id, Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Delete the entity, its matches and its alerts.

        ``deleted_at`` defaults to the current UTC time and is shared by the
        entity and its alerts.
        """
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Bulk soft-delete a collection's entities and their alerts.

        ``Match`` rows that reference those entities on either side are
        removed outright. ``deleted_at`` defaults to the current UTC time.
        """
        from aleph.model import Alert
        deleted_at = deleted_at or datetime.utcnow()

        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()

        pq = db.session.query(Alert)
        pq = pq.filter(Alert.entity_id.in_(entities))
        pq.update({Alert.deleted_at: deleted_at}, synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)

        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def merge(self, other):
        """Fold ``other`` into this entity, delete ``other`` and commit.

        The schema becomes the more precise of the two, foreign ids from
        both entities are combined, data is merged, ``other``'s name becomes
        an alias when it differs, and alerts are re-pointed at this entity.

        Raises ``ValueError`` when ``other`` is this entity or belongs to a
        different collection.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError(
                "Cannot merge entities from different collections.")  # noqa

        self.schema = model.precise_schema(self.schema, other.schema)
        # Combine both entities' ids; passing ``self.foreign_ids`` twice
        # silently dropped the ids of the entity being merged away.
        self.foreign_ids = string_set(self.foreign_ids, other.foreign_ids)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        data = merge_data(self.data, other.data)
        if self.name != other.name:
            data = merge_data(data, {'alias': [other.name]})
        self.data = data

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({Alert.entity_id: self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply the ``entity`` mapping to this row and add it to the session.

        When ``properties`` is a mapping it is validated (with the name
        folded in) and replaces ``data``; otherwise existing data is kept,
        or initialised to an empty dict. Foreign ids are left untouched.
        """
        self.schema = entity.get('schema')

        data = entity.get('properties')
        if is_mapping(data):
            data['name'] = [entity.get('name')]
            self.data = self.model.validate(data)
        elif self.data is None:
            self.data = {}

        # The name lives in its own column, not inside ``data``.
        self.data.pop('name', None)
        self.name = entity.get('name')

        # TODO: should this be mutable?
        # self.foreign_ids = string_set(entity.get('foreign_ids'))
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def create(cls, data, collection):
        """Create an entity in ``collection``, or reuse one with the same foreign ids.

        The lookup includes soft-deleted rows; a reused entity is updated
        from ``data`` and has its ``deleted_at`` cleared.
        """
        foreign_ids = string_set(data.get('foreign_ids'))
        ent = cls.by_foreign_ids(foreign_ids, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_ids = foreign_ids
        ent.update(data)
        ent.deleted_at = None
        return ent

    @classmethod
    def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
        """Find the entity in a collection whose foreign ids contain ``foreign_ids``.

        Rows with a null ``deleted_at`` sort first. Returns ``None`` when
        ``foreign_ids`` is empty or nothing matches.
        """
        if not len(foreign_ids):
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast(foreign_ids, ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def all_ids(cls, deleted=False, authz=None):
        """Query entity ids, limited by read permissions for non-admin ``authz``."""
        q = super(Entity, cls).all_ids(deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       cls.collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def latest(cls):
        """Return the most recent ``updated_at`` among non-deleted entities."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.deleted_at == None)  # noqa
        return q.scalar()

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)