class EventLog(db.Model, IdModel, DatedModel):
    """Audit trail entry recording a single user-facing action.

    Captures the action name, request path and query, the requester's IP
    and, when known, the acting role, plus a free-form JSON payload.
    """
    action = db.Column(db.Unicode(255), index=True)
    source_ip = db.Column(db.Unicode(255), nullable=True)
    path = db.Column(db.Unicode(), nullable=True)
    query = db.Column(JSONB)
    data = db.Column(JSONB)
    role_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def emit(cls, action, path, source_ip=None, query=None, data=None,
             role_id=None):
        """Stage a new log record on the session and return it.

        The caller is responsible for committing the session.
        """
        record = EventLog()
        record.action = action
        record.source_ip = source_ip
        record.path = path
        record.query = query
        record.data = data
        # Leave the column at its default (NULL) unless a role was given.
        if role_id is not None:
            record.role_id = role_id
        db.session.add(record)
        return record

    def __repr__(self):
        return '<EventLog(%r, %r)>' % (self.id, self.action)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'

    # Maps each tag type to the exactitude checker used to validate it.
    TYPES = {
        TYPE_PERSON: exactitude.names,
        TYPE_ORGANIZATION: exactitude.names,
        TYPE_EMAIL: exactitude.emails,
        TYPE_PHONE: exactitude.phones,
        TYPE_LOCATION: exactitude.addresses,
        TYPE_IP: exactitude.ips,
        TYPE_IBAN: exactitude.ibans,
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Return the index field (type group) for this tag, if any.

        Fix: the original referenced an undefined module-level ``TYPES``
        (NameError) and unpacked ``(candidate, invert)`` two-tuples from
        single checker objects (TypeError); it also indexed ``self.TYPES``
        directly, raising KeyError for unknown tag types.
        """
        # NOTE(review): assumes exactitude checker objects expose a
        # ``group`` attribute naming the search field, mirroring the
        # registry-based successor of this class — confirm against the
        # exactitude version in use.
        type_ = self.TYPES.get(self.type)
        if type_ is not None and getattr(type_, 'group', None) is not None:
            return type_.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags by document, origin and/or type.

        At least one criterion must be given to avoid wiping the table.
        """
        pq = db.session.query(cls)
        assert document_id or origin or type
        if document_id is not None:
            pq = pq.filter(cls.document_id == document_id)
        if origin is not None:
            pq = pq.filter(cls.origin == origin)
        if type is not None:
            pq = pq.filter(cls.type == type)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'
    TYPE_COUNTRY = 'country'
    TYPE_LANGUAGE = 'language'

    # Maps each tag type to the document property it is indexed under.
    MAPPING = {
        TYPE_PERSON: 'namesMentioned',
        TYPE_ORGANIZATION: 'namesMentioned',
        TYPE_EMAIL: 'emailMentioned',
        TYPE_PHONE: 'phoneMentioned',
        TYPE_LOCATION: 'locationMentioned',
        TYPE_IP: 'ipMentioned',
        TYPE_IBAN: 'ibanMentioned',
        TYPE_COUNTRY: 'country',
        TYPE_LANGUAGE: 'language'
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship("Document", backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Return the property-type group used to index this tag."""
        prop_type = registry.get(self.type)
        if prop_type is None:
            return None
        if prop_type.group is None:
            return None
        return prop_type.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags matching the given criteria.

        At least one criterion must be supplied so the whole table can
        never be wiped by accident.
        """
        assert document_id or origin or type
        query = db.session.query(cls)
        if document_id is not None:
            query = query.filter(cls.document_id == document_id)
        if origin is not None:
            query = query.filter(cls.origin == origin)
        if type is not None:
            query = query.filter(cls.type == type)
        query.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.text)
class EntityAsset(Entity):
    # JSON schema fragment identifying this entity subtype; also reused as
    # the polymorphic discriminator value for the inheritance mapping.
    _schema = 'entity/asset.json#'
    __mapper_args__ = {'polymorphic_identity': _schema}

    # Monetary valuation of the asset. Currency and date are stored as
    # free-form text, not normalized columns.
    valuation = db.Column(db.Integer, nullable=True)
    valuation_currency = db.Column(db.Unicode(100), nullable=True)
    valuation_date = db.Column(db.Unicode, nullable=True)
class Link(db.Model, UuidModel, SoftDeleteModel):
    """A typed, directed connection between two entities in a collection."""
    type = db.Column(db.String(255), index=True)
    source_id = db.Column(db.String(254), index=True)
    target_id = db.Column(db.String(254), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('links', lazy='dynamic'))  # noqa

    @property
    def schema(self):
        """Resolve the schema object for this link's type name."""
        return schemata.get(self.type)

    def to_dict(self):
        """Serialise the link, extending the base model's fields."""
        payload = super(Link, self).to_dict()
        payload['schema'] = self.type
        payload['data'] = self.data
        payload['foreign_ids'] = self.foreign_ids or []
        payload['collection_id'] = self.collection_id
        return payload

    def __repr__(self):
        return '<Link(%r, %r, %r)>' % (self.id, self.source_id, self.target_id)
class Selector(db.Model):
    """A text alias for an entity, stored raw and in normalized form."""
    id = db.Column(db.Integer, primary_key=True)
    _text = db.Column('text', db.Unicode, index=True)
    normalized = db.Column(db.Unicode, index=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship('Entity', backref=db.backref(
        'selectors', lazy='dynamic', cascade='all, delete-orphan'))  # noqa

    @hybrid_property
    def text(self):
        """Raw selector text; assigning it also refreshes ``normalized``."""
        return self._text

    @text.setter
    def text(self, value):
        self._text = value
        self.normalized = self.normalize(value)

    @classmethod
    def normalize(cls, text):
        """Delegate to the shared text normalizer."""
        return normalize(text)

    def __repr__(self):
        return '<Selector(%r, %r)>' % (self.entity_id, self.text)

    def __unicode__(self):
        return self.text
class DocumentPage(db.Model):
    """A single page of text extracted from a document."""
    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Utility method to get all text snippets in a record."""
        # Truthiness covers both None and the empty string, replacing the
        # redundant `is not None and len(...)` check.
        if self.text:
            yield self.text

    def to_dict(self):
        """Plain-dict serialisation for the API/index."""
        return {
            'id': self.id,
            'number': self.number,
            'text': self.text,
            'document_id': self.document_id
        }
class DocumentPage(db.Model):
    """A single page of text extracted from a document."""
    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Stable hex ID derived from the document and page row IDs.

        Fix: hash bytes, not str — ``sha1(str(...))`` raises TypeError on
        Python 3. Encoding the ASCII digits is byte-identical on Python 2,
        so existing hashes are unchanged.
        """
        tid = sha1(str(self.document_id).encode('utf-8'))
        tid.update(str(self.id).encode('utf-8'))
        return tid.hexdigest()

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Fix: yield the normalized value produced by ``string_value`` rather
        than the raw column, so callers never see the un-normalized variant
        that the guard above was meant to filter.
        """
        text = string_value(self.text)
        if text is not None:
            yield text

    def to_dict(self):
        """Plain-dict serialisation for the API/index."""
        return {
            'id': self.id,
            'number': self.number,
            'text': self.text,
            'document_id': self.document_id
        }
class ProcessingLog(db.Model, DatedModel):
    """Report any events or errors during processing of documents."""
    id = db.Column(db.BigInteger, primary_key=True)
    operation = db.Column(db.Unicode, nullable=True, index=True)
    component = db.Column(db.Unicode, nullable=True, index=True)
    source_location = db.Column(db.Unicode, nullable=True, index=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    source_id = db.Column(db.Integer, nullable=True)
    document_id = db.Column(db.BigInteger, nullable=True)
    meta = db.Column(JSONB)
    error_type = db.Column(db.Unicode, nullable=True)
    error_message = db.Column(db.Unicode, nullable=True)
    error_details = db.Column(db.Unicode, nullable=True)

    @classmethod
    def log(cls, operation, component=None, source_location=None,
            content_hash=None, foreign_id=None, source_id=None,
            document_id=None, meta=None, error_type=None,
            error_message=None, error_details=None):
        """Persist one processing event in its own short-lived session."""
        meta = meta or {}
        entry = ProcessingLog()
        entry.operation = operation
        entry.component = component
        entry.source_id = source_id
        entry.document_id = document_id
        entry.meta = meta
        # Fall back through several metadata fields to find a usable
        # source location for display.
        if not source_location:
            for key in ('source_path', 'source_url', 'file_name'):
                source_location = meta.get(key)
                if source_location:
                    break
        entry.source_location = source_location
        entry.content_hash = content_hash or meta.get('content_hash')
        entry.foreign_id = foreign_id or meta.get('foreign_id')
        entry.error_type = error_type
        entry.error_message = error_message
        entry.error_details = error_details
        # Use a dedicated scoped session so logging commits never
        # interfere with the caller's open transaction.
        session = db.create_scoped_session()
        session.add(entry)
        session.commit()
        session.remove()

    def __repr__(self):
        return '<ProcessingLog(%r,%r)>' % (self.id, self.content_hash)

    def __unicode__(self):
        # NOTE(review): returns the integer ID, not a string — presumably
        # never called; verify before relying on it.
        return self.id
def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
    """Find the first entity in a collection whose foreign-ID array
    contains all of the given identifiers; live rows sort first."""
    if not foreign_ids:
        return None
    query = cls.all(deleted=deleted)
    query = query.filter(Entity.collection_id == collection_id)
    # Cast to a text[] literal so the array containment operator applies.
    wanted = func.cast(foreign_ids, ARRAY(db.Unicode()))
    query = query.filter(cls.foreign_ids.contains(wanted))
    query = query.order_by(Entity.deleted_at.desc().nullsfirst())
    return query.first()
def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
    """Find the entity in a collection tagged with ``foreign_id``."""
    foreign_id = string_value(foreign_id)
    if foreign_id is None:
        return None
    query = cls.all(deleted=deleted)
    query = query.filter(Entity.collection_id == collection_id)
    # Wrap the single ID in a text[] literal for array containment.
    needle = func.cast([foreign_id], ARRAY(db.Unicode()))
    query = query.filter(cls.foreign_ids.contains(needle))
    query = query.order_by(Entity.deleted_at.desc().nullsfirst())
    return query.first()
class EntityTag(db.Model):
    """Associates an entity with a package inside a collection."""
    id = db.Column(db.Integer(), primary_key=True)
    collection = db.Column(db.Unicode(100))
    package_id = db.Column(db.Unicode(100))
    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(Entity, backref=db.backref('tags', lazy='dynamic'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def delete_set(cls, collection, package_id):
        """Drop every tag for one package within one collection."""
        query = db.session.query(cls)
        query = query.filter_by(collection=collection)
        query = query.filter_by(package_id=package_id)
        query.delete()

    @classmethod
    def by_package(cls, collection, package_id):
        """List tagged entities for a package as plain dicts."""
        etag = aliased(cls)
        ent = aliased(Entity)
        q = db.session.query(etag.entity_id, ent.label, ent.category,
                             ent.list_id)
        q = q.join(ent, ent.id == etag.entity_id)
        q = q.filter(etag.collection == collection)
        q = q.filter(etag.package_id == package_id)
        return [{
            'id': entity_id,
            'entity': entity_id,
            'label': label,
            'category': category,
            'list': lst
        } for entity_id, label, category, lst in q.all()]

    def __repr__(self):
        return '<EntityTag(%r, %r)>' % (self.package_id, self.entity_id)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    key = db.Column(db.Unicode(1024), nullable=False, index=True)
    text = db.Column(db.Unicode(1024), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Bulk-delete tags; at least one criterion must be supplied."""
        assert document_id or origin or type
        query = db.session.query(cls)
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        for column, value in criteria:
            if value is not None:
                query = query.filter(column == value)
        query.delete()
        db.session.flush()

    def __repr__(self):
        return '<DocumentTag(%r,%r)>' % (self.document_id, self.key)
class DocumentPage(db.Model):
    """A single extracted page of a document's text."""
    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def to_dict(self):
        """Serialise the page for the API/index."""
        fields = ('id', 'number', 'text', 'document_id')
        return {name: getattr(self, name) for name in fields}

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)
def find(cls, label=None, category=None, countries=None, managed=None,
         collection_id=None):
    """Build a query over live collections, filtered by the given criteria.

    Fixes: mutable default arguments (``[]``) replaced by ``None``; the
    ``cls.id.in_(collection_id)`` filter was applied unconditionally, which
    breaks when no collection filter is given — it is now guarded.
    """
    category = category or []
    countries = countries or []
    q = db.session.query(cls)
    q = q.filter(cls.deleted_at == None)  # noqa
    if label and len(label.strip()):
        label = '%%%s%%' % label.strip()
        q = q.filter(cls.label.ilike(label))
    if collection_id is not None:
        q = q.filter(cls.id.in_(collection_id))
    if len(category):
        q = q.filter(cls.category.in_(category))
    if len(countries):
        # Compare as a text[] literal for array containment.
        types = cast(countries, ARRAY(db.Unicode()))
        q = q.filter(cls.countries.contains(types))
    if managed is not None:
        q = q.filter(cls.managed == managed)
    return q
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    # Role types: individual users, groups of users, and built-in
    # system subjects (anonymous guest / any logged-in user).
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    #: Password minimum length
    PASSWORD_MIN_LENGTH = 6

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Transient plain-text password holder; never persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)

    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        """True when a password hash has been stored for this role."""
        return self.password_digest is not None

    def update(self, data):
        """Apply user-editable fields (name, password) from a dict."""
        self.name = data.get('name', self.name)
        if data.get('password'):
            self.set_password(data.get('password'))

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        # IDs of all roles that have an e-mail address on file.
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Return the single role with this foreign ID, or None."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        # NOTE(review): unlike by_foreign_id/by_api_key this returns a
        # query, not a Role instance (no .first()) — confirm callers
        # expect a query before changing.
        if email:
            return cls.all().filter_by(email=email)

    @classmethod
    def by_api_key(cls, api_key):
        """Return the single role with this API key, or None."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch a role by foreign ID, creating it when missing.

        Always refreshes the e-mail; assigns an API key on first use and
        auto-promotes configured admin e-mail addresses. Flushes (but does
        not commit) the session.
        """
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False
        if role.api_key is None:
            role.api_key = make_textid()
        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/alephdata/aleph/issues/111
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID. If type is given and no role is
        found, a new role will be created.

        Results are memoised per-application on ``current_app``.
        """
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        return set([
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        ])

    @classmethod
    def by_prefix(cls, prefix):
        """Load a list of user roles matching a name, email address, or
        foreign_id substring.

        :param str prefix: Pattern to match.
        """
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        q = q.filter(or_(cls.foreign_id.ilike('%' + prefix + '%'),
                         cls.email.ilike('%' + prefix + '%'),
                         cls.name.ilike('%' + prefix + '%')))
        return q

    @classmethod
    def all_groups(cls):
        """All non-user (group and system) roles."""
        return cls.all().filter(Role.type != Role.USER)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        # Empty-string fallback keeps this safe when no digest is stored.
        return check_password_hash(self.password_digest or '', secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
class Entity(db.Model, UuidModel, SoftDeleteModel):
    # Lifecycle states: active (searchable), pending (awaiting references),
    # deleted (soft-deleted).
    STATE_ACTIVE = 'active'
    STATE_PENDING = 'pending'
    STATE_DELETED = 'deleted'

    name = db.Column(db.Unicode)
    type = db.Column(db.String(255), index=True)
    state = db.Column(db.String(128), nullable=True, default=STATE_ACTIVE, index=True)  # noqa
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    def delete_references(self, origin=None):
        """Bulk-delete document references to this entity, optionally
        restricted to one origin."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.entity_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_identities(self):
        """Bulk-delete identity links pointing at this entity."""
        pq = db.session.query(EntityIdentity)
        pq = pq.filter(EntityIdentity.entity_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete, removing references, identities and alerts first."""
        self.delete_references()
        self.delete_identities()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        self.state = self.STATE_DELETED
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_dangling(cls, collection_id):
        """Delete dangling entities.

        Entities can dangle in pending state while they have no references
        pointing to them, thus making it impossible to enable them. This is
        a routine cleanup function.
        """
        q = db.session.query(cls)
        q = q.filter(cls.collection_id == collection_id)
        q = q.filter(cls.state == cls.STATE_PENDING)
        q = q.outerjoin(Reference)
        q = q.group_by(cls)
        q = q.having(func.count(Reference.id) == 0)
        for entity in q.all():
            entity.delete()

    def merge(self, other):
        """Fold ``other`` into this entity and delete it.

        Merges data/foreign IDs, keeps the other name as an alias, moves
        alerts and references over, then commits. Both entities must be in
        the same collection.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError("Cannot merge entities from different collections.")  # noqa
        data = merge_data(self.data, other.data)
        # NOTE(review): crashes if either name is None — confirm names are
        # always set before merge is invoked.
        if self.name.lower() != other.name.lower():
            data = merge_data(data, {'alias': [other.name]})
        self.data = data
        self.state = self.STATE_ACTIVE
        self.foreign_ids = self.foreign_ids or []
        self.foreign_ids += other.foreign_ids or []
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()

        # Re-point alerts at the surviving entity.
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({'entity_id': self.id})

        # Re-point document references at the surviving entity.
        from aleph.model.reference import Reference
        q = db.session.query(Reference).filter(Reference.entity_id == other.id)
        q.update({'entity_id': self.id})

        # Soft-delete the merged-away entity and persist everything.
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply a validated entity payload (name, data, foreign IDs,
        state) and stage the row on the session."""
        data = entity.get('data') or {}
        data['name'] = entity.get('name')
        self.data = self.schema.validate(data)
        self.name = self.data.pop('name')
        # Normalise foreign IDs and drop empties/duplicates.
        fid = [string_value(f) for f in entity.get('foreign_ids') or []]
        self.foreign_ids = list(set([f for f in fid if f is not None]))
        self.state = entity.pop('state', self.STATE_ACTIVE)
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def save(cls, data, collection, merge=False):
        """Create or update an entity from a payload dict."""
        ent = cls.by_id(data.get('id'))
        if ent is None:
            ent = cls()
            ent.type = data.pop('schema', None)
            if ent.type is None:
                raise ValueError("No schema provided.")
            ent.id = make_textid()
        if merge:
            data = merge_data(data, ent.to_dict())
        if collection is None:
            raise ValueError("No collection specified.")
        ent.collection = collection
        ent.update(data)
        return ent

    @classmethod
    def filter_collections(cls, q, collections=None):
        """Restrict a query to the given collections (objects or IDs)."""
        if collections is None:
            return q
        collection_ids = []
        for collection in collections:
            if isinstance(collection, Collection):
                collection = collection.id
            collection_ids.append(collection)
        q = q.filter(Entity.collection_id.in_(collection_ids))
        return q

    @classmethod
    def by_id_set(cls, ids, collections=None):
        """Fetch entities by ID as a {id: entity} mapping."""
        if not len(ids):
            return {}
        q = cls.all()
        q = cls.filter_collections(q, collections=collections)
        q = q.options(joinedload('collection'))
        q = q.filter(cls.id.in_(ids))
        entities = {}
        for ent in q:
            entities[ent.id] = ent
        return entities

    @classmethod
    def by_foreign_id(cls, foreign_id, collection_id, deleted=False):
        """Find the entity in a collection tagged with ``foreign_id``."""
        foreign_id = string_value(foreign_id)
        if foreign_id is None:
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        # Wrap in a text[] literal for the array containment operator.
        foreign_id = func.cast([foreign_id], ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def latest(cls):
        """Timestamp of the most recently updated active entity."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.state == cls.STATE_ACTIVE)
        return q.scalar()

    @property
    def schema(self):
        """Schema object resolved from this entity's type name."""
        return schemata.get(self.type)

    @property
    def terms(self):
        """The name plus all non-empty aliases."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([normalize_strong(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            # Skip unusably short or absurdly long terms.
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def to_dict(self):
        """Serialise the entity, extending the base model's fields."""
        data = super(Entity, self).to_dict()
        data.update({
            'schema': self.type,
            'name': self.name,
            'state': self.state,
            'data': self.data,
            'foreign_ids': self.foreign_ids or [],
            'collection_id': self.collection_id
        })
        return data

    def to_index(self):
        """Shape the entity for the search index (properties mapping)."""
        entity = self.to_dict()
        entity['properties'] = {'name': [self.name]}
        for k, v in self.data.items():
            v = ensure_list(v)
            if len(v):
                entity['properties'][k] = v
        return entity

    def to_ref(self):
        """Minimal reference representation used in cross-links."""
        return {
            'id': self.id,
            'label': self.name,
            'schema': self.type,
            'collection_id': self.collection_id
        }

    def __unicode__(self):
        return self.name

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    # JSON schema used to validate update payloads.
    _schema = 'role.json#'
    __tablename__ = 'role'

    # Role types: individual users, groups, and built-in system subjects.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]
    SYSTEM_GUEST = 'guest'
    SYSTEM_USER = '******'

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    permissions = db.relationship("Permission", backref="role")

    def update(self, data):
        """Validate a payload against the role schema and apply the
        user-editable fields."""
        validate(data, self._schema)
        self.name = data.get('name', self.name)
        self.email = data.get('email', self.email)

    def clear_roles(self):
        """Remove any existing roles from group membership."""
        self.roles = []
        db.session.add(self)

    def add_role(self, role):
        """Add an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)

    @classmethod
    def notifiable(cls):
        # IDs of all roles that have an e-mail address on file.
        return cls.all_ids().filter(cls.email != None)  # noqa

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Return the single role with this foreign ID, or None."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_api_key(cls, api_key):
        """Return the single role with this API key, or None."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch a role by foreign ID, creating it when missing.

        Always refreshes the e-mail; assigns an API key on first use and
        auto-promotes configured admin addresses. Flushes (but does not
        commit) the session.
        """
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name
            role.type = type
            role.is_admin = False
        if role.api_key is None:
            role.api_key = uuid4().hex
        role.email = email
        if is_admin is not None:
            role.is_admin = is_admin

        # see: https://github.com/pudo/aleph/issues/111
        auto_admins = get_config('AUTHZ_ADMINS') or ''
        auto_admins = [a.lower() for a in auto_admins.split(',')]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True

        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_id(cls, foreign_id, type=None, name=None):
        """Load a role and return the ID. If type is given and no role is
        found, a new role will be created.

        Results are memoised per-application on ``current_app``.
        """
        if not hasattr(current_app, '_authz_roles'):
            current_app._authz_roles = {}
        if foreign_id not in current_app._authz_roles:
            role = cls.by_foreign_id(foreign_id)
            if role is None:
                if type is None:
                    return
                name = name or foreign_id
                role = cls.load_or_create(foreign_id, type, name)
            current_app._authz_roles[foreign_id] = role.id
        return current_app._authz_roles[foreign_id]

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)

    def __unicode__(self):
        return self.name

    def to_dict(self):
        """Serialise the role, extending the base model's fields."""
        data = super(Role, self).to_dict()
        data.update({
            'api_url': url_for('roles_api.view', id=self.id),
            'foreign_id': self.foreign_id,
            'is_admin': self.is_admin,
            'email': self.email,
            'name': self.name,
            'type': self.type
        })
        return data
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background.

    The data is stored in a cloud storage bucket and the user is given a
    link to download the data. The link expires after a fixed duration and
    the exported data is deleted."""

    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)

    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role, backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"), index=True, nullable=True)
    collection = db.relationship(Collection, backref=db.backref("exports", lazy="dynamic"))

    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    status = db.Column("export_status", db.Unicode, default=Status.DEFAULT)

    # Exported file details; the archive stores files by content hash.
    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    meta = db.Column(JSONB, default={})

    def to_dict(self):
        """Serialise the export, extending the dated-model fields."""
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "status": Status.LABEL.get(self.status),
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "mime_type": self.mime_type,
            "meta": self.meta,
        })
        return data

    @classmethod
    def create(cls, operation, role_id, label, collection=None,
               mime_type=None, meta=None):
        """Stage a new export record with the default expiry window."""
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + cls.DEFAULT_EXPIRATION
        export.meta = meta or {}
        db.session.add(export)
        return export

    @property
    def namespace(self):
        """Archive namespace under which the creator's exports live."""
        return make_key("role", self.creator_id)

    def set_status(self, status):
        """Update the export status and stage the row on the session."""
        self.status = status
        db.session.add(self)

    def should_delete_publication(self):
        """Check whether the published export should be deleted from the
        archive

        Since we store exports by contenthash, there may be other
        non-expired exports that point to the same file in the archive"""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """All exports whose expiry timestamp has passed."""
        now = datetime.utcnow()
        q = cls.all()
        q = q.filter(cls.expires_at <= now)
        if not deleted:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def get_pending(cls):
        """All live exports still awaiting processing."""
        q = cls.all()
        q = q.filter(cls.status == Status.PENDING)
        q = q.filter(cls.deleted == False)  # noqa
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        """Fetch one export, optionally scoped to its creator."""
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        """All exports created by a role, newest first."""
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    @classmethod
    def by_content_hash(cls, content_hash, deleted=False):
        """All exports pointing at a given archived file."""
        q = cls.all()
        q = q.filter(cls.content_hash == content_hash)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q

    def __repr__(self):
        return "<Export(%r, %r, %r)>" % (self.id, self.creator_id, self.label)
class Document(db.Model, DatedModel):
    # Schema names used for documents; folders are documents without a
    # content hash.
    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_TABLE = 'Table'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True, index=True)
    schema = db.Column(db.String(255), nullable=False)
    meta = db.Column(JSONB, default={})

    uploader_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger, db.ForeignKey('document.id'), nullable=True, index=True)  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        # Ensure ``meta`` is always a fresh dict, never the shared default.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        """The schema model object for this document's schema name."""
        return model.get(self.schema)

    @property
    def ancestors(self):
        """IDs of all ancestor documents, root first; cached per document.

        Results for folders are cached so deep hierarchies don't recurse
        on every access.
        """
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if len(ancestors):
            return ancestors
        # Try the parent's cached chain before recursing up the tree.
        parent_key = cache.key('ancestors', self.parent_id)
        ancestors = cache.get_list(parent_key)
        if not len(ancestors):
            ancestors = []
            parent = Document.by_id(self.parent_id)
            if parent is not None:
                ancestors = parent.ancestors
        ancestors.append(self.parent_id)
        if self.model.is_a(model.get(self.SCHEMA_FOLDER)):
            cache.set_list(key, ancestors, expire=cache.EXPIRE)
        return ancestors

    def update(self, data):
        """Merge recognised metadata properties into ``meta``."""
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            self.meta[prop] = data.get(prop, self.meta.get(prop))
        # JSONB mutations are invisible to SQLAlchemy without this flag.
        flag_modified(self, 'meta')

    def delete(self, deleted_at=None):
        # Hard delete; the ``deleted_at`` argument is accepted for
        # interface compatibility but ignored.
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id):
        """Bulk-delete all documents within a collection."""
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    @classmethod
    def save(cls, collection, parent=None, foreign_id=None,
             content_hash=None, meta=None, uploader_id=None):
        """Try and find a document by various criteria."""
        q = cls.all()
        q = q.filter(Document.collection_id == collection.id)
        if parent is not None:
            q = q.filter(Document.parent_id == parent.id)
        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")
        document = q.first()
        if document is None:
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection.id
            document.uploader_id = uploader_id
            if parent is not None:
                document.parent_id = parent.id
            if foreign_id is not None:
                document.foreign_id = foreign_id
            document.content_hash = content_hash
            # Documents without content are treated as folders.
            if content_hash is None:
                document.schema = cls.SCHEMA_FOLDER
        if meta is not None:
            document.update(meta)
        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        """Fetch one document, optionally scoped to a collection."""
        try:
            id = int(id)
        except Exception:
            # Non-numeric IDs can never match; short-circuit to None.
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        """Query over all documents in a collection."""
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def cleanup_deleted(cls):
        """Purge documents belonging to soft-deleted collections."""
        q = db.session.query(Collection.id)
        q = q.filter(Collection.deleted_at != None)  # noqa
        collection_ids = [c for (c, ) in q.all()]
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id.in_(collection_ids))
        pq.delete(synchronize_session=False)

    def to_proxy(self):
        """Build a followthemoney entity proxy from this document's
        metadata for indexing."""
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': {}
        })
        meta = dict(self.meta)
        headers = meta.pop('headers', {}) or {}
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('crawler', meta.get('crawler'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('title', meta.get('title'))
        proxy.set('fileName', meta.get('file_name'))
        # Fall back to the HTTP Content-Disposition header for a file name.
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('keywords', meta.get('keywords'))
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('indexUpdatedAt', self.created_at)
        proxy.set('sourceUrl', meta.get('source_url'))
        return proxy

    def __repr__(self):
        return '<Document(%r,%r)>' % (self.id, self.schema)
class Export(db.Model, IdModel, DatedModel):
    """A data export run in the background.

    The data is stored in a cloud storage bucket and the user is given a
    link to download the data. The link expires after a fixed duration
    and the exported data is deleted.
    """

    MAX_FILE_SIZE = 10 * 1024 * 1024 * 1024  # 10 GB
    STATUS_PENDING = "pending"
    STATUS_SUCCESSFUL = "successful"
    STATUS_FAILED = "failed"
    #: Maps raw status values to their localized display labels.
    EXPORT_STATUS = {
        STATUS_PENDING: lazy_gettext("pending"),
        STATUS_SUCCESSFUL: lazy_gettext("successful"),
        STATUS_FAILED: lazy_gettext("failed"),
    }
    DEFAULT_STATUS = STATUS_PENDING
    DEFAULT_EXPIRATION = timedelta(days=30)  # After 30 days

    label = db.Column(db.Unicode)
    operation = db.Column(db.Unicode)
    creator_id = db.Column(db.Integer, db.ForeignKey("role.id"))
    creator = db.relationship(Role, backref=db.backref("exports", lazy="dynamic"))
    collection_id = db.Column(db.Integer, db.ForeignKey("collection.id"),
                              index=True, nullable=True)
    collection = db.relationship(Collection,
                                 backref=db.backref("exports", lazy="dynamic"))
    expires_at = db.Column(db.DateTime, default=None, nullable=True)
    deleted = db.Column(db.Boolean, default=False)
    export_status = db.Column(db.Unicode, default=DEFAULT_STATUS)
    content_hash = db.Column(db.Unicode(65), index=True, nullable=True)
    file_size = db.Column(db.BigInteger, nullable=True)  # In bytes
    file_name = db.Column(db.Unicode, nullable=True)
    mime_type = db.Column(db.Unicode)
    meta = db.Column(JSONB, default={})

    def to_dict(self):
        """Serialize the export for API responses."""
        data = self.to_dict_dates()
        data.update({
            "id": stringify(self.id),
            "label": self.label,
            "operation": self.operation,
            "creator_id": stringify(self.creator_id),
            "collection_id": self.collection_id,
            "expires_at": self.expires_at,
            "deleted": self.deleted,
            "export_status": self.export_status,
            "content_hash": self.content_hash,
            "file_size": self.file_size,
            "file_name": self.file_name,
            "meta": self.meta,
        })
        # BUG FIX: the localized label used to be assigned *before* the
        # update() above and was immediately clobbered by the raw value.
        # Apply it afterwards so known statuses are rendered localized.
        if self.export_status in self.EXPORT_STATUS:
            data["export_status"] = self.EXPORT_STATUS.get(self.export_status)
        return data

    @classmethod
    def create(
        cls,
        operation,
        role_id,
        label,
        file_path=None,
        expires_after=None,
        collection=None,
        mime_type=None,
    ):
        """Create (but do not commit) a new export record.

        :param operation: name of the export operation being run.
        :param role_id: ID of the role requesting the export.
        :param label: human-readable label for the export.
        :param file_path: optional path of an already-produced file.
        :param expires_after: optional timedelta overriding the default TTL.
        :param collection: optional collection the export is scoped to.
        :param mime_type: MIME type of the exported file.
        """
        export = cls()
        export.creator_id = role_id
        export.operation = operation
        export.label = label
        if file_path is not None:
            export.set_filepath(file_path)
        if collection is not None:
            export.collection_id = collection.id
        export.mime_type = mime_type
        export.expires_at = datetime.utcnow() + (expires_after or cls.DEFAULT_EXPIRATION)
        db.session.add(export)
        return export

    @property
    def namespace(self):
        """Archive namespace under which the export file is published."""
        return make_key("role", self.creator_id)

    def publish(self):
        """Move the export file into the archive and mark it successful."""
        # BUG FIX: guard with getattr so a missing file path raises the
        # intended RuntimeError instead of AttributeError; the original
        # also passed `self` as a second argument to RuntimeError, so the
        # %r placeholder was never interpolated.
        file_path = getattr(self, "_file_path", None)
        if not file_path:
            raise RuntimeError("file path not present for export: %r" % self)
        # Use contenthash as filename to make to ensure uniqueness
        path = Path(file_path.parent, self.content_hash)
        file_path.rename(path)
        try:
            archive.publish(self.namespace, path, self.mime_type)
            self.set_status(status=Export.STATUS_SUCCESSFUL)
        except Exception:
            self.set_status(status=Export.STATUS_FAILED)
            # Bare re-raise preserves the original traceback.
            raise

    def set_filepath(self, file_path):
        """Record size, name and checksum of the file to be exported."""
        file_path = ensure_path(file_path)
        file_name = safe_filename(file_path)
        file_size = file_path.stat().st_size
        self.file_name = file_name
        self.file_size = file_size
        self._file_path = file_path
        self.content_hash = checksum(file_path)

    def set_status(self, status):
        """Set the export status, ignoring unknown status values."""
        if status in self.EXPORT_STATUS:
            self.export_status = status
            db.session.add(self)

    def delete_publication(self):
        """Remove the published file (if unshared) and mark this row deleted."""
        if self._should_delete_publication():
            archive.delete_publication(self.namespace, self.content_hash)
        self.deleted = True
        db.session.add(self)

    def _should_delete_publication(self):
        """Check whether the published export should be deleted from the archive

        Since we store exports by contenthash, there may be other non-expired
        exports that point to the same file in the archive"""
        q = (Export.all().filter(
            Export.content_hash == self.content_hash).filter(
                Export.deleted.isnot(True)).filter(Export.id != self.id))
        return q.first() is None

    @classmethod
    def get_expired(cls, deleted=False):
        """Query exports whose expiry time has passed.

        :param deleted: filter on the `deleted` flag; pass None to skip.
        """
        now = datetime.utcnow()
        q = cls.all().filter(
            cls.expires_at.isnot(None)).filter(cls.expires_at <= now)
        if deleted is not None:
            q = q.filter(cls.deleted == deleted)
        return q

    @classmethod
    def by_id(cls, id, role_id=None, deleted=False):
        """Fetch an export by ID, optionally scoped to a creator role."""
        q = cls.all().filter_by(id=id)
        if role_id is not None:
            q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        return q.first()

    @classmethod
    def by_role_id(cls, role_id, deleted=False):
        """Query a role's exports, newest first."""
        q = cls.all()
        q = q.filter(cls.creator_id == role_id)
        if not deleted:
            q = q.filter(cls.deleted == False)  # noqa
        q = q.order_by(cls.created_at.desc())
        return q

    def __repr__(self):
        return "<Export(%r, %r)>" % (self.id, self.creator_id)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'watchlist': lazy_gettext('Watchlists'),
        'investigation': lazy_gettext('Personal collections'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'scrape': lazy_gettext('Scrapes'),
        'procurement': lazy_gettext('Procurement'),
        'grey': lazy_gettext('Grey literature'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'other': lazy_gettext('Other material')
    }
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)
    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply user-supplied metadata and (re-)grant the creator access.

        :param data: mapping of collection attributes to update.
        :param creator: optional Role; falls back to data['creator_id'].
        """
        self.label = data.get('label', self.label)
        # BUG FIX: the summary assignment was duplicated; apply it once.
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        # An empty or missing category falls back to the default bucket.
        self.category = data.get('category') or self.DEFAULT
        self.casefile = as_bool(data.get('casefile'), default=False)
        self.countries = data.get('countries', [])
        self.languages = data.get('languages', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        # Flush before granting so the collection has an ID for Permission.
        db.session.flush()
        if creator is not None:
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a live read permission (cached per instance)."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @property
    def kind(self):
        """Coarse type label: 'casefile' or 'source'."""
        return 'casefile' if self.casefile else 'source'

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign_id."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, restricted to what `authz` may read."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create or undelete a collection identified by its foreign_id."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data, creator=role)
        # Re-creating a soft-deleted collection revives it.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data):
        """Apply user-supplied metadata and refresh the creator grant."""
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        creator = data.get('creator') or {}
        self.update_creator(creator.get('id'))
        self.touch()

    def update_creator(self, role):
        """Set the creator (and admin) of a collection."""
        if not isinstance(role, Role):
            role = Role.by_id(role)
        # Only real users can own a collection.
        if role is None or role.type != Role.USER:
            return
        self.creator = role
        db.session.add(self)
        # Flush so the collection has an ID before granting permission.
        db.session.flush()
        Permission.grant(self, role, True, True)

    def touch(self):
        """Bump updated_at and re-add this row to the session."""
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def delete_matches(self):
        """Hard-delete cross-reference matches involving this collection."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.collection_id == self.id,
                Match.match_collection_id == self.id))
        pq.delete(synchronize_session=False)

    def delete_permissions(self, deleted_at):
        """Soft-delete all permissions on this collection."""
        pq = db.session.query(Permission)
        pq = pq.filter(Permission.collection_id == self.id)
        pq.update({Permission.deleted_at: deleted_at},
                  synchronize_session=False)

    def delete(self, deleted_at=None):
        """Soft-delete the collection, its matches and permissions."""
        self.delete_matches()
        self.delete_permissions(deleted_at=deleted_at)
        super(Collection, self).delete(deleted_at=deleted_at)

    @property
    def roles(self):
        """IDs of roles holding a live read permission (cached per instance)."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            # BUG FIX: exclude soft-deleted permissions. delete_permissions()
            # only sets deleted_at, and all_by_ids() below applies the same
            # filter; without it, revoked roles kept read access here.
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign_id."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, restricted to what `authz` may read."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create or undelete a collection identified by its foreign_id."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data)
        collection.update_creator(role)
        # Re-creating a soft-deleted collection revives it.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: add extra weight info.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': 'News archives',
        'leak': 'Leaks',
        'land': 'Land registry',
        'gazette': 'Gazettes',
        'court': 'Court archives',
        'company': 'Company registries',
        'watchlist': 'Watchlists',
        'investigation': 'Personal collections',
        'sanctions': 'Sanctions lists',
        'scrape': 'Scrapes',
        'procurement': 'Procurement',
        'grey': 'Grey literature',
        'license': 'Licenses and concessions',
        'regulatory': 'Regulatory filings',
        'other': 'Other material'
    }

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    # Managed collections are generated by API crawlers and thus UI users
    # shouldn't be allowed to add entities or documents to them. They also
    # don't use advanced entity extraction features for performance reasons.
    managed = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)
    creator = db.relationship(Role)

    def update(self, data, creator=None):
        """Apply user-supplied metadata and (re-)grant the creator access.

        :param data: mapping of collection attributes to update.
        :param creator: optional Role; falls back to data['creator_id'].
        """
        self.label = data.get('label', self.label)
        self.summary = data.get('summary', self.summary)
        self.category = data.get('category', self.category)
        self.managed = data.get('managed', False)
        self.countries = data.get('countries', [])
        if creator is None:
            creator = Role.by_id(data.get('creator_id'))
        self.creator = creator
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        # Flush so the collection has an ID before granting permission.
        db.session.flush()
        if creator is not None:
            Permission.grant(self, creator, True, True)

    @property
    def roles(self):
        """IDs of roles holding a live read permission (cached per instance)."""
        if not hasattr(self, '_roles'):
            q = db.session.query(Permission.role_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.collection_id == self.id)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            self._roles = [e.role_id for e in q.all()]
        return self._roles

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Look up a collection by its unique foreign_id."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Query collections by ID, restricted to what `authz` may read."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def create(cls, data, role=None):
        """Create or undelete a collection identified by its foreign_id."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.foreign_id = foreign_id
        collection.update(data, creator=role)
        # Re-creating a soft-deleted collection revives it.
        collection.deleted_at = None
        return collection

    def __repr__(self):
        return '<Collection(%r, %r, %r)>' % \
            (self.id, self.foreign_id, self.label)
class Document(db.Model, DatedModel, Metadata):
    """An ingested file or folder within a collection.

    The actual file content lives in the archive; this row stores the
    extracted metadata, text and hierarchy information.
    """

    # Cap on the number of extracted tags folded into the entity proxy.
    MAX_TAGS = 10000

    SCHEMA = 'Document'
    SCHEMA_FOLDER = 'Folder'
    SCHEMA_PACKAGE = 'Package'
    SCHEMA_WORKBOOK = 'Workbook'
    SCHEMA_TEXT = 'PlainText'
    SCHEMA_HTML = 'HyperText'
    SCHEMA_PDF = 'Pages'
    SCHEMA_IMAGE = 'Image'
    SCHEMA_AUDIO = 'Audio'
    SCHEMA_VIDEO = 'Video'
    SCHEMA_TABLE = 'Table'
    SCHEMA_EMAIL = 'Email'

    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=True, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True, index=True)
    schema = db.Column(db.String(255), nullable=False)
    status = db.Column(db.Unicode(10), nullable=True)
    meta = db.Column(JSONB, default={})
    error_message = db.Column(db.Unicode(), nullable=True)
    body_text = db.Column(db.Unicode(), nullable=True)
    body_raw = db.Column(db.Unicode(), nullable=True)
    uploader_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)  # noqa
    parent_id = db.Column(db.BigInteger, db.ForeignKey('document.id'), nullable=True, index=True)  # noqa
    children = db.relationship('Document', lazy='dynamic', backref=db.backref('parent', uselist=False, remote_side=[id]))  # noqa
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    def __init__(self, **kw):
        # Ensure `meta` is a fresh dict per instance rather than the
        # shared column default.
        self.meta = {}
        super(Document, self).__init__(**kw)

    @property
    def model(self):
        """followthemoney schema object for this document's schema name."""
        return model.get(self.schema)

    @property
    def name(self):
        """Best available display name: title, file name or source URL."""
        if self.title is not None:
            return self.title
        if self.file_name is not None:
            return self.file_name
        if self.source_url is not None:
            return self.source_url

    @property
    def supports_records(self):
        # Slightly unintuitive naming: this just checks the document type,
        # not if there actually are any records.
        return self.schema in [self.SCHEMA_PDF, self.SCHEMA_TABLE]

    @property
    def supports_pages(self):
        """True when the document is paginated (PDF-like)."""
        return self.schema == self.SCHEMA_PDF

    @property
    def supports_nlp(self):
        """True unless the document is a purely structural container."""
        structural = [
            Document.SCHEMA,
            Document.SCHEMA_PACKAGE,
            Document.SCHEMA_FOLDER,
            Document.SCHEMA_WORKBOOK,
            Document.SCHEMA_VIDEO,
            Document.SCHEMA_AUDIO,
        ]
        return self.schema not in structural

    @property
    def ancestors(self):
        """IDs of all parent documents, cached to avoid repeated walks."""
        if self.parent_id is None:
            return []
        key = cache.key('ancestors', self.id)
        ancestors = cache.get_list(key)
        if ancestors is not None:
            return ancestors
        # Cache miss: recurse up the parent chain, then memoize.
        ancestors = self.parent.ancestors
        ancestors.append(self.parent_id)
        cache.set_list(key, ancestors)
        return ancestors

    def update(self, data):
        """Copy known metadata properties from `data`, keeping existing
        values where `data` has no entry for a property."""
        props = ('title', 'summary', 'author', 'crawler', 'source_url',
                 'file_name', 'mime_type', 'headers', 'date', 'authored_at',
                 'modified_at', 'published_at', 'retrieved_at', 'languages',
                 'countries', 'keywords')
        for prop in props:
            value = data.get(prop, self.meta.get(prop))
            setattr(self, prop, value)
        db.session.add(self)

    def update_meta(self):
        # JSONB columns don't detect in-place mutation; mark explicitly.
        flag_modified(self, 'meta')

    def delete_records(self):
        """Hard-delete all table/page records belonging to this document."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete_tags(self):
        """Hard-delete all extracted tags belonging to this document."""
        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id == self.id)
        pq.delete()
        db.session.flush()

    def delete(self, deleted_at=None):
        """Hard-delete the document and its dependent rows.

        `deleted_at` is accepted for interface parity but unused: documents
        are not soft-deleted.
        """
        self.delete_records()
        self.delete_tags()
        db.session.delete(self)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Bulk-delete all documents in a collection, dependents first."""
        documents = db.session.query(cls.id)
        documents = documents.filter(cls.collection_id == collection_id)
        documents = documents.subquery()
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id.in_(documents))
        pq.delete(synchronize_session=False)
        pq = db.session.query(DocumentTag)
        pq = pq.filter(DocumentTag.document_id.in_(documents))
        pq.delete(synchronize_session=False)
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq.delete(synchronize_session=False)

    def raw_texts(self):
        """Yield all text fragments of the document, including records.

        May yield None values; see `texts` for the filtered variant.
        """
        yield self.title
        yield self.file_name
        yield self.source_url
        yield self.summary
        yield self.author
        # Body text is only trustworthy once ingest succeeded.
        if self.status != self.STATUS_SUCCESS:
            return
        yield self.body_text
        if self.supports_records:
            # iterate over all the associated records.
            pq = db.session.query(DocumentRecord)
            pq = pq.filter(DocumentRecord.document_id == self.id)
            pq = pq.order_by(DocumentRecord.index.asc())
            for record in pq.yield_per(10000):
                yield from record.raw_texts()

    @property
    def texts(self):
        """Yield cleaned-up text fragments of the document."""
        yield from filter_texts(self.raw_texts())

    @classmethod
    def by_keys(cls, parent_id=None, collection_id=None, foreign_id=None,
                content_hash=None):
        """Try and find a document by various criteria."""
        # Requires foreign_id or content_hash to identify the document.
        q = cls.all()
        q = q.filter(Document.collection_id == collection_id)
        if parent_id is not None:
            q = q.filter(Document.parent_id == parent_id)
        if foreign_id is not None:
            q = q.filter(Document.foreign_id == foreign_id)
        elif content_hash is not None:
            q = q.filter(Document.content_hash == content_hash)
        else:
            raise ValueError("No unique criterion for document.")
        document = q.first()
        if document is None:
            # Not found: create a fresh document shell.
            document = cls()
            document.schema = cls.SCHEMA
            document.collection_id = collection_id
        if parent_id is not None:
            document.parent_id = parent_id
        if foreign_id is not None:
            document.foreign_id = foreign_id
        if content_hash is not None:
            document.content_hash = content_hash
        db.session.add(document)
        return document

    @classmethod
    def by_id(cls, id, collection_id=None):
        """Fetch a document by ID, optionally scoped to a collection."""
        if id is None:
            return
        q = cls.all()
        q = q.filter(cls.id == id)
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return q.first()

    @classmethod
    def by_collection(cls, collection_id=None):
        """Return a query for all documents in the given collection."""
        q = cls.all()
        q = q.filter(cls.collection_id == collection_id)
        return q

    @classmethod
    def find_ids(cls, collection_id=None, failed_only=False):
        """Query document IDs, optionally restricted to a collection and
        to documents whose ingest did not succeed."""
        q = cls.all_ids()
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        if failed_only:
            q = q.filter(cls.status != cls.STATUS_SUCCESS)
        q = q.order_by(cls.id.asc())
        return q

    def to_proxy(self):
        """Build a followthemoney entity proxy from this document."""
        meta = dict(self.meta)
        # NOTE(review): unlike the sibling to_proxy above, this does not
        # guard against `headers` being stored as None (`or {}`) — a None
        # value would crash on .items(); confirm whether that can occur.
        headers = meta.pop('headers', {})
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': meta
        })
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('fileSize', meta.get('file_size'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            # Fall back to the filename advertised in the HTTP headers.
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('messageId', meta.get('message_id'), quiet=True)
        proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
        proxy.set('bodyText', self.body_text, quiet=True)
        proxy.set('bodyHtml', self.body_raw, quiet=True)
        columns = meta.get('columns')
        proxy.set('columns', registry.json.pack(columns), quiet=True)
        proxy.set('headers', registry.json.pack(headers), quiet=True)
        pdf = 'application/pdf'
        if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
            # PDFs are their own rendition; reuse the content hash.
            proxy.set('pdfHash', self.content_hash, quiet=True)
        proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)
        # Fold extracted tags into the proxy as schema properties.
        # NOTE(review): DocumentTag.MAPPING is defined in another version of
        # the tag model than the one visible here — verify it exists.
        q = db.session.query(DocumentTag)
        q = q.filter(DocumentTag.document_id == self.id)
        q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
        q = q.order_by(DocumentTag.weight.desc())
        q = q.limit(Document.MAX_TAGS)
        for tag in q.all():
            prop = DocumentTag.MAPPING.get(tag.type)
            if prop is not None:
                proxy.add(prop, tag.text)
        return proxy

    def to_dict(self):
        """Serialize the document (via its proxy) for API responses."""
        proxy = self.to_proxy()
        data = proxy.to_full_dict()
        data.update(self.to_dict_dates())
        data.update({
            'name': self.name,
            'status': self.status,
            'foreign_id': self.foreign_id,
            'document_id': self.id,
            'collection_id': self.collection_id,
            'error_message': self.error_message,
            'uploader_id': self.uploader_id,
            'bulk': False,
        })
        return data

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.schema, self.title)
class Document(db.Model, DatedModel):
    """A crawled document and its extracted metadata (legacy model).

    The raw metadata dict is stored in the `meta` JSONB column and
    surfaced through a hybrid property that syncs a few denormalised
    columns (content_hash, foreign_id, crawler, crawler_run).
    """

    # JSON schema reference used to validate incoming metadata updates.
    _schema = 'document.json#'
    SCHEMA = 'Document'

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    STATUS_PENDING = 'pending'
    STATUS_SUCCESS = 'success'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    status = db.Column(db.Unicode(10), nullable=True, index=True)
    _meta = db.Column('meta', JSONB)
    crawler = db.Column(db.Unicode(), index=True)
    crawler_run = db.Column(db.Unicode())
    error_type = db.Column(db.Unicode(), nullable=True)
    error_message = db.Column(db.Unicode(), nullable=True)
    error_details = db.Column(db.Unicode(), nullable=True)
    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=False, index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('documents', lazy='dynamic'))  # noqa

    @property
    def title(self):
        """Title taken from the metadata object."""
        return self.meta.title

    @hybrid_property
    def meta(self):
        """Return the metadata wrapped in a Metadata object, with the
        denormalised columns folded back into the raw dict."""
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        self._meta['crawler'] = self.crawler
        self._meta['crawler_run'] = self.crawler_run
        return Metadata.from_data(self._meta or {})

    @meta.setter
    def meta(self, meta):
        # Accept either a Metadata object or a plain dict; when given a
        # Metadata object, sync the denormalised columns from it first.
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            self.crawler = meta.crawler
            self.crawler_run = meta.crawler_run
            meta = meta.to_attr_dict()
        self._meta = meta
        # JSONB columns don't detect reassignment-by-mutation; mark dirty.
        flag_modified(self, '_meta')

    def update(self, data):
        """Validate `data` against the JSON schema and merge it into the
        document's metadata."""
        validate(data, self._schema)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def delete_pages(self):
        """Hard-delete all page rows belonging to this document."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Hard-delete all tabular records belonging to this document."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Hard-delete entity references, optionally only from one origin."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Hard-delete the document and its dependent rows.

        `deleted_at` is accepted for interface parity but unused.
        """
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert tabular records for one sheet in fixed-size chunks.

        :param sheet: sheet identifier stored on each record.
        :param iterable: iterable of row dicts.
        :param chunk_size: rows per bulk insert batch.
        """
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []
        # Flush the final partial chunk.
        if len(chunk):
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield text fragments from pages or records, by document type."""
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def crawler_last_run(cls, crawler_id):
        """Timestamp of the most recent update by the given crawler."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.crawler == crawler_id)
        return q.scalar()

    @classmethod
    def is_crawler_active(cls, crawler_id):
        # TODO: add a function to see if a particular crawl is still running
        # this should be defined as having "pending" documents.
        last_run_time = cls.crawler_last_run(crawler_id)
        if last_run_time is None:
            return False
        # Heuristic: "active" means updated within the last hour.
        return last_run_time > (datetime.utcnow() - timedelta(hours=1))

    @classmethod
    def crawler_stats(cls, crawler_id):
        """Summarize a crawler's last run, activity and per-status counts."""
        # Check if the crawler was active very recently, if so, don't
        # allow the user to execute a new run right now.
        stats = {
            'updated': cls.crawler_last_run(crawler_id),
            'running': cls.is_crawler_active(crawler_id)
        }
        q = db.session.query(cls.status, func.count(cls.id))
        q = q.filter(cls.crawler == crawler_id)
        q = q.group_by(cls.status)
        for (status, count) in q.all():
            stats[status] = count
        return stats

    def _add_to_dict(self, data):
        """Attach common document fields (and visibility) to `data`."""
        try:
            from flask import request
            source_id = self.collection_id
            data['public'] = request.authz.collection_public(source_id)
        # NOTE(review): bare `except:` swallows everything, including
        # KeyboardInterrupt/SystemExit; `except Exception:` would be the
        # safer best-effort guard for "no request context" here.
        except:
            data['public'] = None
        data.update({
            'id': self.id,
            'type': self.type,
            'status': self.status,
            'error_type': self.error_type,
            'error_message': self.error_message,
            'error_details': self.error_details,
            'collection_id': self.collection_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        """Serialize for API responses."""
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Serialize for the search index, including extracted text."""
        data = self.meta.to_index_dict()
        data['text'] = index_form(self.text_parts())
        data['schema'] = self.SCHEMA
        data['schemata'] = [self.SCHEMA]
        data['name_sort'] = ascii_text(data.get('title'))
        data['title_latin'] = ascii_text(data.get('title'))
        data['summary_latin'] = ascii_text(data.get('summary'))
        return self._add_to_dict(data)

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.title)
class Role(db.Model, IdModel, SoftDeleteModel):
    """A user, group or other access control subject."""
    __tablename__ = 'role'

    # NOTE(review): the literal '******' looks like a redaction artifact
    # rather than a real enum value (upstream uses 'user'); confirm before
    # relying on the stored role_type values.
    USER = '******'
    GROUP = 'group'
    SYSTEM = 'system'
    TYPES = [USER, GROUP, SYSTEM]

    SYSTEM_GUEST = 'guest'
    # NOTE(review): same redaction concern as USER above (upstream: 'system').
    SYSTEM_USER = '******'

    #: Generates URL-safe signatures for invitations.
    SIGNATURE = URLSafeTimedSerializer(settings.SECRET_KEY)

    #: Signature maximum age, defaults to 1 day
    SIGNATURE_MAX_AGE = 60 * 60 * 24

    foreign_id = db.Column(db.Unicode(2048), nullable=False, unique=True)
    name = db.Column(db.Unicode, nullable=False)
    email = db.Column(db.Unicode, nullable=True)
    type = db.Column(db.Enum(*TYPES, name='role_type'), nullable=False)
    api_key = db.Column(db.Unicode, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    is_muted = db.Column(db.Boolean, nullable=False, default=False)
    is_tester = db.Column(db.Boolean, nullable=False, default=False)
    is_blocked = db.Column(db.Boolean, nullable=False, default=False)
    password_digest = db.Column(db.Unicode, nullable=True)
    # Transient plaintext password holder; never persisted.
    password = None
    reset_token = db.Column(db.Unicode, nullable=True)
    locale = db.Column(db.Unicode, nullable=True)
    permissions = db.relationship('Permission', backref='role')

    @property
    def has_password(self):
        """True when a password hash has been set."""
        return self.password_digest is not None

    @property
    def is_public(self):
        """True when this role is one of the public system roles."""
        return self.id in self.public_roles()

    @property
    def is_alertable(self):
        """True when alert notification emails may be sent to this role."""
        if self.email is None:
            return False
        if self.is_muted is True:
            return False
        # TODO: ignore people that have not logged in for a certain time?
        return True

    @property
    def label(self):
        """Display name with the email address partially anonymized."""
        return anonymize_email(self.name, self.email)

    def update(self, data):
        """Apply user-editable profile settings from `data`."""
        self.name = data.get('name', self.name)
        self.is_muted = data.get('is_muted', self.is_muted)
        self.is_tester = data.get('is_tester', self.is_tester)
        if data.get('password'):
            self.set_password(data.get('password'))
        self.locale = stringify(data.get('locale', self.locale))
        self.updated_at = datetime.utcnow()

    def clear_roles(self):
        """Removes any existing roles from group membership."""
        self.roles = []
        self.updated_at = datetime.utcnow()
        db.session.add(self)
        db.session.flush()

    def add_role(self, role):
        """Adds an existing role as a membership of a group."""
        self.roles.append(role)
        db.session.add(role)
        db.session.add(self)
        self.updated_at = datetime.utcnow()

    def to_dict(self):
        """Serialize the role for API responses."""
        data = self.to_dict_dates()
        data.update({
            'id': stringify(self.id),
            'type': self.type,
            'name': self.name,
            'label': self.label,
            'email': self.email,
            'locale': self.locale,
            'api_key': self.api_key,
            'is_admin': self.is_admin,
            'is_muted': self.is_muted,
            'is_tester': self.is_tester,
            'has_password': self.has_password,
            # 'notified_at': self.notified_at
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id):
        """Look up a role by its unique foreign_id; None when absent."""
        if foreign_id is not None:
            return cls.all().filter_by(foreign_id=foreign_id).first()

    @classmethod
    def by_email(cls, email):
        """Look up a role by email, case-insensitively."""
        if email is None:
            return None
        q = cls.all()
        q = q.filter(func.lower(cls.email) == email.lower())
        return q.first()

    @classmethod
    def by_api_key(cls, api_key):
        """Look up a role by its API key; None when absent."""
        if api_key is not None:
            return cls.all().filter_by(api_key=api_key).first()

    @classmethod
    def load_or_create(cls, foreign_id, type, name, email=None,
                       is_admin=None):
        """Fetch the role for `foreign_id`, creating it when missing.

        Also back-fills the API key, email, and admin flag (including the
        ADMINS auto-promotion), then flushes the session.
        """
        role = cls.by_foreign_id(foreign_id)
        if role is None:
            role = cls()
            role.foreign_id = foreign_id
            role.name = name or email
            role.type = type
            role.is_admin = False
            role.is_muted = False
            role.is_tester = False
            role.is_blocked = False
            # NOTE(review): `notified_at` is not among the columns visible
            # here (commented out in to_dict) — confirm it exists upstream.
            role.notified_at = datetime.utcnow()
        if role.api_key is None:
            role.api_key = make_textid()
        if email is not None:
            role.email = email
        if is_admin is not None:
            role.is_admin = is_admin
        # see: https://github.com/alephdata/aleph/issues/111
        auto_admins = [a.lower() for a in settings.ADMINS]
        if email is not None and email.lower() in auto_admins:
            role.is_admin = True
        db.session.add(role)
        db.session.flush()
        return role

    @classmethod
    def load_cli_user(cls):
        """Return the system user under which CLI operations run."""
        return cls.load_or_create(foreign_id=settings.SYSTEM_USER,
                                  name='Aleph',
                                  type=cls.USER,
                                  is_admin=True)

    @classmethod
    def load_id(cls, foreign_id):
        """Load a role and return the ID."""
        # Process-wide memoization, stashed on the settings object.
        if not hasattr(settings, '_roles'):
            settings._roles = {}
        if foreign_id not in settings._roles:
            role_id = cls.all_ids().filter_by(foreign_id=foreign_id).first()
            if role_id is not None:
                settings._roles[foreign_id] = role_id[0]
        return settings._roles.get(foreign_id)

    @classmethod
    def public_roles(cls):
        """Roles which make a collection to be considered public."""
        return set([
            cls.load_id(cls.SYSTEM_USER),
            cls.load_id(cls.SYSTEM_GUEST),
        ])

    @classmethod
    def by_prefix(cls, prefix, exclude=[]):
        """Load a list of roles matching a name, email address, or foreign_id.

        :param str pattern: Pattern to match.
        """
        # Note: the mutable default `exclude=[]` is only read here, never
        # mutated, so the shared-default pitfall does not bite.
        # Strip SQL LIKE wildcards from the user-supplied prefix.
        query = prefix.replace('%', ' ').replace('_', ' ')
        query = '%%%s%%' % query
        q = cls.all()
        q = q.filter(Role.type == Role.USER)
        if len(exclude):
            q = q.filter(not_(Role.id.in_(exclude)))
        q = q.filter(
            or_(
                func.lower(cls.email) == prefix.lower(),
                cls.name.ilike(query)))
        q = q.order_by(Role.id.asc())
        return q

    @classmethod
    def all_groups(cls, authz):
        """Query group roles, limited to the caller's groups for non-admins."""
        q = cls.all()
        q = q.filter(Role.type == Role.GROUP)
        q = q.order_by(Role.name.asc())
        q = q.order_by(Role.foreign_id.asc())
        if not authz.is_admin:
            q = q.filter(Role.id.in_(authz.roles))
        return q

    @classmethod
    def all_users(cls):
        """Query all user roles."""
        return cls.all().filter(Role.type == Role.USER)

    @classmethod
    def all_system(cls):
        """Query all system roles."""
        return cls.all().filter(Role.type == Role.SYSTEM)

    def set_password(self, secret):
        """Hashes and sets the role password.

        :param str secret: The password to be set.
        """
        self.password_digest = generate_password_hash(secret)

    def check_password(self, secret):
        """Checks the password if it matches the role password hash.

        :param str secret: The password to be checked.
        :rtype: bool
        """
        digest = self.password_digest or ''
        return check_password_hash(digest, secret)

    def __repr__(self):
        return '<Role(%r,%r)>' % (self.id, self.foreign_id)
class Entity(db.Model, UuidModel, SoftDeleteModel):
    """An entity (person, company, asset, ...) within a collection.

    Entities carry a FollowTheMoney-style ``schema`` name, a free-form
    ``data`` JSON payload of properties, and a set of ``foreign_ids``
    used to de-duplicate imports from external sources.
    """

    THING = 'Thing'

    name = db.Column(db.Unicode)
    schema = db.Column(db.String(255), index=True)
    foreign_ids = db.Column(ARRAY(db.Unicode()))
    data = db.Column('data', JSONB)

    collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), index=True)  # noqa
    collection = db.relationship(Collection, backref=db.backref('entities', lazy='dynamic'))  # noqa

    @property
    def model(self):
        """The schema model object for this entity's schema name."""
        return model.get(self.schema)

    @property
    def terms(self):
        """All searchable names: the entity name plus non-empty aliases."""
        terms = set([self.name])
        for alias in ensure_list(self.data.get('alias')):
            if alias is not None and len(alias):
                terms.add(alias)
        return terms

    @property
    def regex_terms(self):
        # This is to find the shortest possible regex for each entity.
        # If, for example, an entity matches both "Al Qaeda" and
        # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
        # search for the latter.
        terms = set([match_form(t) for t in self.terms])
        regex_terms = set()
        for term in terms:
            # Skip unusably short or pathologically long terms.
            if term is None or len(term) < 4 or len(term) > 120:
                continue
            contained = False
            for other in terms:
                if other is None or other == term:
                    continue
                if other in term:
                    contained = True
            if not contained:
                regex_terms.add(term)
        return regex_terms

    def delete_matches(self):
        """Hard-delete all cross-reference matches involving this entity."""
        pq = db.session.query(Match)
        pq = pq.filter(
            or_(Match.entity_id == self.id, Match.match_id == self.id))
        pq.delete(synchronize_session=False)
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Soft-delete the entity, its alerts, and purge its matches."""
        self.delete_matches()
        deleted_at = deleted_at or datetime.utcnow()
        for alert in self.alerts:
            alert.delete(deleted_at=deleted_at)
        super(Entity, self).delete(deleted_at=deleted_at)

    @classmethod
    def delete_by_collection(cls, collection_id, deleted_at=None):
        """Soft-delete all entities in a collection; also soft-deletes
        the alerts pointing at them and hard-deletes their matches."""
        from aleph.model import Alert
        deleted_at = deleted_at or datetime.utcnow()
        entities = db.session.query(cls.id)
        entities = entities.filter(cls.collection_id == collection_id)
        entities = entities.subquery()
        pq = db.session.query(Alert)
        pq = pq.filter(Alert.entity_id.in_(entities))
        pq.update({Alert.deleted_at: deleted_at}, synchronize_session=False)
        pq = db.session.query(Match)
        pq = pq.filter(Match.entity_id.in_(entities))
        pq.delete(synchronize_session=False)
        pq = db.session.query(Match)
        pq = pq.filter(Match.match_id.in_(entities))
        pq.delete(synchronize_session=False)
        pq = db.session.query(cls)
        pq = pq.filter(cls.collection_id == collection_id)
        pq = pq.filter(cls.deleted_at == None)  # noqa
        pq.update({cls.deleted_at: deleted_at}, synchronize_session=False)

    def merge(self, other):
        """Merge ``other`` into this entity and delete ``other``.

        Alerts on ``other`` are re-pointed at this entity; ``other``'s
        name becomes an alias if it differs.

        :raises ValueError: when merging an entity with itself or across
            collections.
        """
        if self.id == other.id:
            raise ValueError("Cannot merge an entity with itself.")
        if self.collection_id != other.collection_id:
            raise ValueError(
                "Cannot merge entities from different collections.")  # noqa
        self.schema = model.precise_schema(self.schema, other.schema)
        # BUG FIX: previously combined self.foreign_ids with itself,
        # silently dropping other's foreign IDs during a merge.
        self.foreign_ids = string_set(self.foreign_ids, other.foreign_ids)
        self.created_at = min((self.created_at, other.created_at))
        self.updated_at = datetime.utcnow()
        data = merge_data(self.data, other.data)
        if self.name != other.name:
            data = merge_data(data, {'alias': [other.name]})
        self.data = data

        # update alerts
        from aleph.model.alert import Alert
        q = db.session.query(Alert).filter(Alert.entity_id == other.id)
        q.update({Alert.entity_id: self.id})

        # delete source entities
        other.delete()
        db.session.add(self)
        db.session.commit()
        db.session.refresh(other)

    def update(self, entity):
        """Apply a dict of entity data (schema, name, properties)."""
        self.schema = entity.get('schema')
        data = entity.get('properties')
        if is_mapping(data):
            data['name'] = [entity.get('name')]
            self.data = self.model.validate(data)
        elif self.data is None:
            self.data = {}
        # The canonical name lives on the row, not in the data payload.
        self.data.pop('name', None)
        self.name = entity.get('name')
        # TODO: should this be mutable?
        # self.foreign_ids = string_set(entity.get('foreign_ids'))
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    @classmethod
    def create(cls, data, collection):
        """Create (or un-delete and update) an entity in a collection,
        matched by foreign IDs."""
        foreign_ids = string_set(data.get('foreign_ids'))
        ent = cls.by_foreign_ids(foreign_ids, collection.id, deleted=True)
        if ent is None:
            ent = cls()
            ent.id = make_textid()
            ent.collection = collection
            ent.foreign_ids = foreign_ids
        ent.update(data)
        ent.deleted_at = None
        return ent

    @classmethod
    def by_foreign_ids(cls, foreign_ids, collection_id, deleted=False):
        """Find an entity whose foreign_ids contain all the given IDs.

        Prefers a live row over a soft-deleted one (nullsfirst on
        deleted_at). Returns None for an empty ID set.
        """
        if not len(foreign_ids):
            return None
        q = cls.all(deleted=deleted)
        q = q.filter(Entity.collection_id == collection_id)
        foreign_id = func.cast(foreign_ids, ARRAY(db.Unicode()))
        q = q.filter(cls.foreign_ids.contains(foreign_id))
        q = q.order_by(Entity.deleted_at.desc().nullsfirst())
        return q.first()

    @classmethod
    def all_ids(cls, deleted=False, authz=None):
        """All entity IDs, optionally restricted to collections readable
        under ``authz``."""
        q = super(Entity, cls).all_ids(deleted=deleted)
        if authz is not None and not authz.is_admin:
            q = q.join(Permission,
                       cls.collection_id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def latest(cls):
        """Most recent updated_at across all live entities."""
        q = db.session.query(func.max(cls.updated_at))
        q = q.filter(cls.deleted_at == None)  # noqa
        return q.scalar()

    def __repr__(self):
        return '<Entity(%r, %r)>' % (self.id, self.name)
class Collection(db.Model, IdModel, SoftDeleteModel):
    """A set of documents and entities against which access control is
    enforced."""

    # Category schema for collections.
    # TODO: should this be configurable?
    CATEGORIES = {
        'news': lazy_gettext('News archives'),
        'leak': lazy_gettext('Leaks'),
        'land': lazy_gettext('Land registry'),
        'gazette': lazy_gettext('Gazettes'),
        'court': lazy_gettext('Court archives'),
        'company': lazy_gettext('Company registries'),
        'sanctions': lazy_gettext('Sanctions lists'),
        'procurement': lazy_gettext('Procurement'),
        'finance': lazy_gettext('Financial records'),
        'grey': lazy_gettext('Grey literature'),
        'library': lazy_gettext('Document libraries'),
        'license': lazy_gettext('Licenses and concessions'),
        'regulatory': lazy_gettext('Regulatory filings'),
        'poi': lazy_gettext('Persons of interest'),
        'customs': lazy_gettext('Customs declarations'),
        'census': lazy_gettext('Population census'),
        'transport': lazy_gettext('Air and maritime registers'),
        'other': lazy_gettext('Other material')
    }
    DEFAULT = 'other'

    label = db.Column(db.Unicode)
    summary = db.Column(db.Unicode, nullable=True)
    category = db.Column(db.Unicode, nullable=True)
    countries = db.Column(ARRAY(db.Unicode()), default=[])
    languages = db.Column(ARRAY(db.Unicode()), default=[])
    foreign_id = db.Column(db.Unicode, unique=True, nullable=False)

    publisher = db.Column(db.Unicode, nullable=True)
    publisher_url = db.Column(db.Unicode, nullable=True)
    info_url = db.Column(db.Unicode, nullable=True)
    data_url = db.Column(db.Unicode, nullable=True)

    # A casefile is a type of collection which is used to manage the state
    # of an investigation. Unlike normal collections, cases do not serve
    # as source material, but as a mechanism of analysis.
    casefile = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer, db.ForeignKey('role.id'),
                           nullable=True)
    creator = db.relationship(Role)

    def touch(self):
        """Bump updated_at so the collection re-indexes."""
        # https://www.youtube.com/watch?v=wv-34w8kGPM
        self.updated_at = datetime.utcnow()
        db.session.add(self)

    def update(self, data, authz):
        """Apply user-submitted metadata, honoring admin-only fields.

        :param dict data: incoming field values; missing keys keep the
            current value.
        :param authz: authorization context of the acting user.
        """
        self.label = data.get('label', self.label)
        # BUG FIX: the summary assignment was duplicated.
        self.summary = data.get('summary', self.summary)
        self.publisher = data.get('publisher', self.publisher)
        self.publisher_url = data.get('publisher_url', self.publisher_url)
        self.info_url = data.get('info_url', self.info_url)
        self.data_url = data.get('data_url', self.data_url)
        self.countries = ensure_list(data.get('countries', self.countries))
        self.languages = ensure_list(data.get('languages', self.languages))

        # Some fields are editable only by admins in order to have
        # a strict separation between source evidence and case
        # material.
        if authz.is_admin:
            self.category = data.get('category', self.category)
            self.casefile = as_bool(data.get('casefile'),
                                    default=self.casefile)
            creator = Role.by_id(data.get('creator_id'))
            if creator is not None:
                self.creator = creator
        self.touch()
        db.session.flush()
        if self.creator is not None:
            Permission.grant(self, self.creator, True, True)

    @property
    def team_id(self):
        """IDs (as strings) of non-system roles with read access."""
        role = aliased(Role)
        perm = aliased(Permission)
        q = db.session.query(role.id)
        q = q.filter(role.type != Role.SYSTEM)
        q = q.filter(role.id == perm.role_id)
        q = q.filter(perm.collection_id == self.id)
        q = q.filter(perm.read == True)  # noqa
        q = q.filter(role.deleted_at == None)  # noqa
        q = q.filter(perm.deleted_at == None)  # noqa
        return [stringify(i) for (i, ) in q.all()]

    @property
    def secret(self):
        """True when no public (system) role has read access."""
        q = db.session.query(Permission.id)
        q = q.filter(Permission.role_id.in_(Role.public_roles()))
        q = q.filter(Permission.collection_id == self.id)
        q = q.filter(Permission.read == True)  # noqa
        q = q.filter(Permission.deleted_at == None)  # noqa
        return q.count() < 1

    @property
    def ns(self):
        """Lazily-built Namespace derived from the foreign ID."""
        if not hasattr(self, '_ns'):
            self._ns = Namespace(self.foreign_id)
        return self._ns

    def to_dict(self):
        """Serialize to a plain dict for the API/index."""
        data = self.to_dict_dates()
        # Unknown categories fall back to the default bucket.
        data['category'] = self.DEFAULT
        if self.category in self.CATEGORIES:
            data['category'] = self.category
        data['kind'] = 'casefile' if self.casefile else 'source'
        data.update({
            'id': stringify(self.id),
            'collection_id': stringify(self.id),
            'foreign_id': self.foreign_id,
            'creator_id': stringify(self.creator_id),
            'team_id': self.team_id,
            'label': self.label,
            'summary': self.summary,
            'publisher': self.publisher,
            'publisher_url': self.publisher_url,
            'info_url': self.info_url,
            'data_url': self.data_url,
            'casefile': self.casefile,
            'secret': self.secret
        })
        return data

    @classmethod
    def by_foreign_id(cls, foreign_id, deleted=False):
        """Load a collection by its unique foreign ID, or None."""
        if foreign_id is None:
            return
        q = cls.all(deleted=deleted)
        return q.filter(cls.foreign_id == foreign_id).first()

    @classmethod
    def _apply_authz(cls, q, authz):
        # Restrict a query to collections readable by the given authz;
        # admins (and authz=None) see everything.
        if authz is not None and not authz.is_admin:
            q = q.join(Permission, cls.id == Permission.collection_id)
            q = q.filter(Permission.deleted_at == None)  # noqa
            q = q.filter(Permission.read == True)  # noqa
            q = q.filter(Permission.role_id.in_(authz.roles))
        return q

    @classmethod
    def all_authz(cls, authz, deleted=False):
        """All collections visible under ``authz``."""
        q = super(Collection, cls).all(deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def all_by_ids(cls, ids, deleted=False, authz=None):
        """Load the given collection IDs, filtered by ``authz``."""
        q = super(Collection, cls).all_by_ids(ids, deleted=deleted)
        return cls._apply_authz(q, authz)

    @classmethod
    def create(cls, data, authz, created_at=None):
        """Create (or un-delete and update) a collection as a casefile
        owned by the acting user."""
        foreign_id = data.get('foreign_id') or make_textid()
        collection = cls.by_foreign_id(foreign_id, deleted=True)
        if collection is None:
            collection = cls()
            collection.created_at = created_at
            collection.foreign_id = foreign_id
            collection.category = cls.DEFAULT
            collection.casefile = True
            collection.creator_id = authz.id
        collection.update(data, authz)
        collection.deleted_at = None
        return collection

    def __repr__(self):
        fmt = '<Collection(%r, %r, %r)>'
        return fmt % (self.id, self.foreign_id, self.label)
class Document(db.Model, DatedModel):
    """An ingested source document (text, tabular, or other), with its
    metadata stored as a JSON blob."""

    _schema = 'document.json#'

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    _meta = db.Column('meta', JSONB)

    collections = db.relationship(Collection, secondary=collection_document_table,  # noqa
                                  backref=db.backref('documents', lazy='dynamic'))  # noqa
    source_collection_id = db.Column(db.Integer, db.ForeignKey('collection.id'), nullable=True)  # noqa
    source_collection = db.relationship(Collection)

    @property
    def title(self):
        return self.meta.title

    @hybrid_property
    def meta(self):
        # content_hash and foreign_id are mirrored into the JSON blob so
        # the Metadata object is self-contained.
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        return Metadata.from_data(self._meta or {})

    @meta.setter
    def meta(self, meta):
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            meta = meta.to_attr_dict()
        self._meta = meta
        # JSONB mutation is not tracked automatically; mark it dirty.
        flag_modified(self, '_meta')

    def update(self, data, writeable):
        """Validate and apply user edits; also syncs collection links.

        :param dict data: incoming document data (schema-validated).
        :param writeable: collection IDs the user may modify.
        """
        validate(data, self._schema)
        collection_id = data.pop('collection_id', [])
        self.update_collections(collection_id, writeable)
        meta = self.meta
        meta.update(data, safe=True)
        self.meta = meta
        db.session.add(self)

    def update_collections(self, collection_id, writeable):
        """Add/remove collection links to match ``collection_id``, but
        only for collections in ``writeable``; the source collection is
        never removed."""
        # BUG FIX: iterate over a copy — removing from self.collections
        # while iterating it directly skips elements.
        for coll in list(self.collections):
            if coll.id == self.source_collection_id:
                continue
            if coll.id not in collection_id and coll.id in writeable:
                self.collections.remove(coll)
        for coll_id in collection_id:
            if coll_id in writeable:
                coll = Collection.by_id(coll_id)
                if coll not in self.collections:
                    self.collections.append(coll)
        db.session.add(self)

    def delete_pages(self):
        """Hard-delete all pages belonging to this document."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Hard-delete all tabular records belonging to this document."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_references(self, origin=None):
        """Hard-delete references, optionally only those from ``origin``."""
        pq = db.session.query(Reference)
        pq = pq.filter(Reference.document_id == self.id)
        if origin is not None:
            pq = pq.filter(Reference.origin == origin)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete(self, deleted_at=None):
        """Hard-delete the document and its dependent rows.

        ``deleted_at`` is accepted for interface parity with the
        soft-delete models but is not used here.
        """
        self.delete_references()
        self.delete_records()
        self.delete_pages()
        db.session.delete(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert tabular rows for ``sheet``, flushing in chunks of
        ``chunk_size`` to bound memory use."""
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []
        if len(chunk):
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Yield all text fragments: page texts for text documents,
        record texts for tabular ones."""
        if self.type == self.TYPE_TEXT:
            for page in self.pages:
                for text in page.text_parts():
                    yield text
        elif self.type == self.TYPE_TABULAR:
            for record in self.records:
                for text in record.text_parts():
                    yield text

    @classmethod
    def get_max_id(cls):
        """Highest document ID currently in the table."""
        q = db.session.query(func.max(cls.id))
        return q.scalar()

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.type,
                                         self.meta.title)

    @property
    def collection_ids(self):
        """IDs of linked collections, always including the source."""
        collection_ids = [c.id for c in self.collections]
        if self.source_collection_id not in collection_ids:
            if self.source_collection_id is not None:
                collection_ids.append(self.source_collection_id)
        return collection_ids

    def _add_to_dict(self, data):
        # Merge core document fields into a metadata dict.
        collection_ids = self.collection_ids
        try:
            from aleph.authz import collections_public
            data['public'] = collections_public(collection_ids)
        except Exception:
            # Best-effort enrichment. Narrowed from a bare `except:` so
            # SystemExit/KeyboardInterrupt are no longer swallowed.
            pass
        data.update({
            'id': self.id,
            'type': self.type,
            'source_collection_id': self.source_collection_id,
            'collection_id': collection_ids,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        data = self.meta.to_index_dict()
        return self._add_to_dict(data)