class DocumentPage(db.Model):
    """A numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def __repr__(self):
        identity = (self.document_id, self.number)
        return '<DocumentPage(%r,%r)>' % identity

    def text_parts(self):
        """Yield the page text, skipping it when it is None or empty."""
        body = self.text
        if body is None or not len(body):
            return
        yield body

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        data = {}
        data['id'] = self.id
        data['number'] = self.number
        data['text'] = self.text
        data['document_id'] = self.document_id
        return data
class DocumentPage(db.Model):
    """A numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        Document, backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Hex SHA-1 digest over the document ID followed by the page ID."""
        # hashlib only accepts bytes on Python 3. The inputs are str()
        # renderings of integer columns (ASCII), so encoding them leaves
        # the digests produced under Python 2 unchanged.
        digest = sha1(str(self.document_id).encode('utf-8'))
        digest.update(str(self.id).encode('utf-8'))
        return digest.hexdigest()

    def __repr__(self):
        return '<DocumentPage(%r,%r)>' % (self.document_id, self.number)

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Yields the page text after it has been passed through
        ``string_value``; nothing is yielded when that returns None.
        """
        text = string_value(self.text)
        if text is not None:
            # Yield the normalized value that was just checked, not the raw
            # column content.
            yield text

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        return {
            'id': self.id,
            'number': self.number,
            'text': self.text,
            'document_id': self.document_id
        }
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        Document, backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Hex SHA-1 digest over document ID, sheet and row ID, in order."""
        # hashlib only accepts bytes on Python 3. The inputs are str()
        # renderings of integer columns (ASCII), so encoding them leaves
        # the digests produced under Python 2 unchanged.
        digest = sha1(str(self.document_id).encode('utf-8'))
        digest.update(str(self.sheet).encode('utf-8'))
        digest.update(str(self.row_id).encode('utf-8'))
        return digest.hexdigest()

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Each value in ``data`` is passed through ``string_value``; results
        that are None are skipped. Nothing is yielded when ``data`` is None.
        """
        # The data column is nullable, so guard against a missing payload.
        if self.data is None:
            return
        for value in self.data.values():
            text = string_value(value)
            if text is not None:
                # Yield the normalized text rather than the raw cell value.
                yield text

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'))
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'))

    def to_dict(self):
        """Return the weight, origin and a short summary of the entity."""
        entity = self.entity
        summary = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type
        }
        return {
            'entity': summary,
            'weight': self.weight,
            'origin': self.origin
        }

    def __repr__(self):
        identity = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % identity
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document, backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Hex SHA-1 digest over document ID, sheet and row ID, in order."""
        # hashlib only accepts bytes on Python 3. The inputs are str()
        # renderings of integer columns (ASCII), so encoding them leaves
        # the digests produced under Python 2 unchanged.
        digest = sha1(str(self.document_id).encode('utf-8'))
        digest.update(str(self.sheet).encode('utf-8'))
        digest.update(str(self.row_id).encode('utf-8'))
        return digest.hexdigest()

    @property
    def text(self):
        """Return the distinct non-None values of ``data`` as a list.

        An empty list is returned when ``data`` is None. Duplicates are
        removed through a set, so the order of the result is unspecified.
        """
        if self.data is None:
            return []
        text = [t for t in self.data.values() if t is not None]
        return list(set(text))

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
class IdModel(object):
    """Mixin that adds an integer primary key and exposes it in to_dict."""

    id = db.Column(db.Integer(), primary_key=True)

    def to_dict(self):
        """Return the next class's ``to_dict`` result (if any) plus ``id``."""
        parent = super(IdModel, self)
        # Start from the data of the next class in the MRO when it offers
        # to_dict; otherwise start from an empty mapping.
        if hasattr(parent, 'to_dict'):
            data = parent.to_dict()
        else:
            data = {}
        data['id'] = self.id
        return data
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'

    # Maps each tag type to the exactitude type used for it.
    TYPES = {
        TYPE_PERSON: exactitude.names,
        TYPE_ORGANIZATION: exactitude.names,
        TYPE_EMAIL: exactitude.emails,
        TYPE_PHONE: exactitude.phones,
        TYPE_LOCATION: exactitude.addresses,
        TYPE_IP: exactitude.ips,
        TYPE_IBAN: exactitude.ibans,
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Return the second item of the ``TYPES`` pair matching this tag.

        None is returned implicitly when no pair matches.
        """
        type_ = self.TYPES[self.type]
        # NOTE(review): the bare ``TYPES`` below is looked up at module
        # scope, not on the class; it is presumably a module-level mapping
        # whose values are 2-tuples - confirm against the file's imports.
        for (candidate, invert) in TYPES.values():
            if candidate == type_:
                return invert

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Delete tags matching every given criterion, then flush.

        At least one of the criteria must be truthy (enforced by assert).
        """
        pq = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        # Only criteria that were actually supplied narrow the query.
        for column, wanted in criteria:
            if wanted is not None:
                pq = pq.filter(column == wanted)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        identity = (self.document_id, self.text)
        return '<DocumentTag(%r,%r)>' % identity
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TEXT_LENGTH = 1024

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'
    TYPE_IP = 'ip'
    TYPE_IBAN = 'iban'
    TYPE_COUNTRY = 'country'
    TYPE_LANGUAGE = 'language'

    # Maps each tag type to a field name.
    MAPPING = {
        TYPE_PERSON: 'namesMentioned',
        TYPE_ORGANIZATION: 'namesMentioned',
        TYPE_EMAIL: 'emailMentioned',
        TYPE_PHONE: 'phoneMentioned',
        TYPE_LOCATION: 'locationMentioned',
        TYPE_IP: 'ipMentioned',
        TYPE_IBAN: 'ibanMentioned',
        TYPE_COUNTRY: 'country',
        TYPE_LANGUAGE: 'language'
    }

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    text = db.Column(db.Unicode(TEXT_LENGTH), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @property
    def field(self):
        """Return the ``group`` of the registry type named by ``type``.

        None is returned when the registry has no such type or the type
        has no group.
        """
        type_ = registry.get(self.type)
        if type_ is None or type_.group is None:
            return None
        return type_.group

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Delete tags matching every given criterion, then flush.

        At least one of the criteria must be truthy (enforced by assert).
        """
        pq = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        # Only criteria that were actually supplied narrow the query.
        for column, wanted in criteria:
            if wanted is not None:
                pq = pq.filter(column == wanted)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        identity = (self.document_id, self.text)
        return '<DocumentTag(%r,%r)>' % identity
class EntityIdentity(db.Model, IdModel, DatedModel):
    """A judgement on whether a match ID refers to the same entity."""

    CONFIRMED = 1
    REJECTED = 2
    UNDECIDED = 3
    JUDGEMENTS = [1, 2, 3]

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'), index=True)  # noqa
    entity = db.relationship(
        'Entity',
        backref=db.backref('identities', lazy='dynamic'))  # noqa
    match_id = db.Column(db.String(254), index=True, nullable=False)
    judgement = db.Column(db.Integer(), nullable=False)
    judge_id = db.Column(db.Integer, db.ForeignKey('role.id'), nullable=True)

    @classmethod
    def judgements_by_entity(cls, entity_id):
        """Return a dict mapping match ID to judgement for one entity."""
        rows = db.session.query(cls.match_id, cls.judgement)
        rows = rows.filter(cls.entity_id == entity_id)
        return dict((match, verdict) for match, verdict in rows.all())

    @classmethod
    def entity_ids(cls, entity_id):
        """Return ``entity_id`` followed by all its confirmed match IDs."""
        confirmed = db.session.query(cls.match_id)
        confirmed = confirmed.filter(cls.entity_id == entity_id)
        confirmed = confirmed.filter(cls.judgement == cls.CONFIRMED)
        return [entity_id] + [mapped_id for mapped_id, in confirmed.all()]

    @classmethod
    def by_entity_match(cls, entity_id, match_id):
        """Return the first identity for the entity/match pair, or None."""
        return db.session.query(cls) \
            .filter(cls.entity_id == entity_id) \
            .filter(cls.match_id == match_id) \
            .first()

    @classmethod
    def save(cls, entity_id, match_id, judgement, judge=None):
        """Create or update the identity for the pair and add it to the session."""
        identity = cls.by_entity_match(entity_id, match_id)
        if identity is None:
            identity = cls()
            identity.entity_id = entity_id
            identity.match_id = match_id
        identity.judgement = judgement
        # NOTE(review): only ``judge_id`` is declared on this class; the
        # ``judge`` attribute presumably comes from a relationship or
        # backref defined elsewhere - confirm it is actually persisted.
        identity.judge = judge
        db.session.add(identity)
        return identity

    def __repr__(self):
        identity = (self.entity_id, self.match_id, self.judgement)
        return 'EntityIdentity(%r, %r, %r)' % identity
class DocumentPage(db.Model):
    """A numbered page of text that belongs to a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    number = db.Column(db.Integer(), nullable=False)
    text = db.Column(db.Unicode(), nullable=False)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'))
    document = db.relationship(
        Document,
        backref=db.backref('pages', cascade='all, delete-orphan'))  # noqa

    def __repr__(self):
        identity = (self.document_id, self.number)
        return '<DocumentPage(%r,%r)>' % identity

    def to_dict(self):
        """Return the page's column values as a plain dictionary."""
        data = {}
        data['id'] = self.id
        data['number'] = self.number
        data['text'] = self.text
        data['document_id'] = self.document_id
        return data
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    def text_parts(self):
        """Yield normalized text from ``data`` values, then from ``text``.

        Every candidate goes through ``string_value``; results that are
        None are skipped.
        """
        if self.data is not None:
            for cell in self.data.values():
                snippet = string_value(cell)
                if snippet is None:
                    continue
                yield snippet
        body = string_value(self.text)
        if body is None:
            return
        yield body

    @classmethod
    def find_records(cls, document_id, ids):
        """Return a query for the document's records whose ID is in ``ids``.

        An empty list is returned instead of a query when ``ids`` is empty.
        """
        if not len(ids):
            return []
        return db.session.query(cls) \
            .filter(cls.document_id == document_id) \
            .filter(cls.id.in_(ids))

    def to_dict(self):
        """Return the record's column values as a plain dictionary."""
        data = {}
        data['id'] = self.id
        data['sheet'] = self.sheet
        data['index'] = self.index
        data['data'] = self.data
        data['text'] = self.text
        data['document_id'] = self.document_id
        return data

    def __repr__(self):
        identity = (self.document_id, self.index)
        return '<DocumentRecord(%r,%r)>' % identity
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    origin = db.Column(db.String(128))
    weight = db.Column(db.Integer)

    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        'Document',
        backref=db.backref('references', lazy='dynamic'))  # noqa

    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'), index=True)  # noqa
    entity = db.relationship(
        'Entity',
        backref=db.backref('references', lazy='dynamic'))  # noqa

    @classmethod
    def index_references(cls, document_id):
        """Helper function to get reference data for indexing.

        Returns (entity_id, collection_id) rows for the document's
        references whose entity is in the active state.
        """
        # cf. aleph.index.entities.generate_entities()
        from aleph.model.entity import Entity
        rows = db.session.query(Reference.entity_id, Entity.collection_id) \
            .filter(Reference.document_id == document_id) \
            .filter(Entity.id == Reference.entity_id) \
            .filter(Entity.state == Entity.STATE_ACTIVE)
        return rows.all()

    def to_dict(self):
        """Return the weight, origin and a short summary of the entity."""
        entity = self.entity
        summary = {
            'id': entity.id,
            'name': entity.name,
            '$schema': entity.type
        }
        return {
            'entity': summary,
            'weight': self.weight,
            'origin': self.origin
        }

    def __repr__(self):
        identity = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % identity
class Reference(db.Model, IdModel, DatedModel):
    """A weighted link between a document and an entity."""

    id = db.Column(db.Integer(), primary_key=True)
    document_id = db.Column(db.BigInteger, db.ForeignKey('document.id'))
    entity_id = db.Column(db.String(32), db.ForeignKey('entity.id'))
    weight = db.Column(db.Integer)

    entity = db.relationship(
        Entity,
        backref=db.backref('references', lazy='dynamic'))
    document = db.relationship(
        Document,
        backref=db.backref('references', lazy='dynamic'))

    @classmethod
    def delete_document(cls, document_id):
        """Delete every reference that points at the given document."""
        # ``all()`` is not defined in this class; presumably it is provided
        # by one of the mixins and returns a query - verify.
        matching = cls.all().filter_by(document_id=document_id)
        matching.delete(synchronize_session='fetch')

    def __repr__(self):
        identity = (self.document_id, self.entity_id)
        return '<Reference(%r, %r)>' % identity
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document", backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    @property
    def texts(self):
        """Utility method to get all text snippets in a record.

        Yields every value of ``data`` (when it is set) followed by
        ``text``. Values are yielded as stored, so None may be among them.
        """
        if self.data is not None:
            for value in self.data.values():
                yield value
        yield self.text

    @classmethod
    def find_records(cls, ids):
        """Return a query for the records whose ID is in ``ids``.

        An empty list is returned instead of a query when ``ids`` is empty.
        """
        if not len(ids):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.id.in_(ids))
        return q

    @classmethod
    def by_index(cls, document_id, index):
        """Return the first record of the document at ``index``, or None."""
        # Query through ``cls`` only: the original built this query and then
        # immediately replaced it with one hard-coded to DocumentRecord.
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        q = q.filter(cls.index == index)
        return q.first()

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
class EntityTag(db.Model):
    """Associates an entity with a package inside a collection."""

    id = db.Column(db.Integer(), primary_key=True)
    collection = db.Column(db.Unicode(100))
    package_id = db.Column(db.Unicode(100))
    entity_id = db.Column(db.Unicode(50), db.ForeignKey('entity.id'))
    entity = db.relationship(
        Entity,
        backref=db.backref('tags', lazy='dynamic'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def delete_set(cls, collection, package_id):
        """Delete all tags stored for the given collection and package."""
        db.session.query(cls) \
            .filter_by(collection=collection) \
            .filter_by(package_id=package_id) \
            .delete()

    @classmethod
    def by_package(cls, collection, package_id):
        """Return a list of dicts describing the entities tagged on a package.

        Each dict carries the entity ID (under both ``id`` and ``entity``),
        its label, its category and its list ID (under ``list``).
        """
        etag = aliased(cls)
        ent = aliased(Entity)
        rows = db.session.query(etag.entity_id, ent.label,
                                ent.category, ent.list_id)
        rows = rows.join(ent, ent.id == etag.entity_id)
        rows = rows.filter(etag.collection == collection)
        rows = rows.filter(etag.package_id == package_id)
        return [
            {
                'id': entity_id,
                'entity': entity_id,
                'label': label,
                'category': category,
                'list': lst
            }
            for entity_id, label, category, lst in rows.all()
        ]

    def __repr__(self):
        identity = (self.package_id, self.entity_id)
        return '<EntityTag(%r, %r)>' % identity
class DocumentRecord(db.Model):
    """A row of tabular data, addressed by sheet and row, in a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=False)
    row_id = db.Column(db.Integer, nullable=False)
    data = db.Column(JSONB)
    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        Document, backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    @property
    def tid(self):
        """Hex SHA-1 digest over document ID, sheet and row ID, in order."""
        # hashlib only accepts bytes on Python 3. The inputs are str()
        # renderings of integer columns (ASCII), so encoding them leaves
        # the digests produced under Python 2 unchanged.
        digest = sha1(str(self.document_id).encode('utf-8'))
        digest.update(str(self.sheet).encode('utf-8'))
        digest.update(str(self.row_id).encode('utf-8'))
        return digest.hexdigest()

    def text_parts(self):
        """Utility method to get all text snippets in a record.

        Each value in ``data`` is passed through ``string_value``; results
        that are None are skipped. Nothing is yielded when ``data`` is None.
        """
        # The data column is nullable, so guard against a missing payload.
        if self.data is None:
            return
        for value in self.data.values():
            text = string_value(value)
            if text is not None:
                # Yield the normalized text rather than the raw cell value.
                yield text

    @classmethod
    def find_rows(cls, document_id, rows):
        """Return a query for the document's records matching ``rows``.

        ``rows`` is a sequence whose items are indexed as
        ``(sheet, row_id)``. An empty list is returned instead of a query
        when ``rows`` is empty.
        """
        if not len(rows):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        clauses = [and_(cls.sheet == r[0], cls.row_id == r[1]) for r in rows]
        q = q.filter(or_(*clauses))
        return q

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.row_id)
class DocumentTag(db.Model, IdModel):
    """A record reflects an entity or tag extracted from a document."""

    TYPE_PHONE = 'phone'
    TYPE_EMAIL = 'email'
    TYPE_PERSON = 'person'
    TYPE_ORGANIZATION = 'organization'
    TYPE_LOCATION = 'location'

    id = db.Column(db.BigInteger, primary_key=True)
    origin = db.Column(db.Unicode(255), nullable=False, index=True)
    type = db.Column(db.Unicode(16), nullable=False)
    weight = db.Column(db.Integer, default=1)
    key = db.Column(db.Unicode(1024), nullable=False, index=True)
    text = db.Column(db.Unicode(1024), nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('tags', cascade='all, delete-orphan'))  # noqa

    @classmethod
    def delete_by(cls, document_id=None, origin=None, type=None):
        """Delete tags matching every given criterion, then flush.

        At least one of the criteria must be truthy (enforced by assert).
        """
        pq = db.session.query(cls)
        assert document_id or origin or type
        criteria = (
            (cls.document_id, document_id),
            (cls.origin, origin),
            (cls.type, type),
        )
        # Only criteria that were actually supplied narrow the query.
        for column, wanted in criteria:
            if wanted is not None:
                pq = pq.filter(column == wanted)
        pq.delete()
        db.session.flush()

    def __repr__(self):
        identity = (self.document_id, self.key)
        return '<DocumentTag(%r,%r)>' % identity
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    SCHEMA_ROW = 'Row'
    SCHEMA_PAGE = 'Page'

    id = db.Column(db.BigInteger, primary_key=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document",
        backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    def raw_texts(self):
        """Yield every value of ``data`` (when set), then ``text``, unfiltered."""
        cells = self.data
        if cells is not None:
            for cell in cells.values():
                yield cell
        yield self.text

    @property
    def texts(self):
        """Yield the raw texts after passing them through ``filter_texts``."""
        for snippet in filter_texts(self.raw_texts()):
            yield snippet

    @classmethod
    def insert_records(cls, document_id, iterable, chunk_size=1000):
        """Insert one record per item of ``iterable`` in batched statements.

        Each item becomes the ``data`` of a record whose ``index`` is the
        item's position. A statement is executed whenever ``chunk_size``
        rows are pending, plus once more for any remainder.
        """
        table = cls.__table__
        pending = []
        for position, payload in enumerate(iterable):
            row = {
                'document_id': document_id,
                'index': position,
                'data': payload
            }
            pending.append(row)
            if len(pending) < chunk_size:
                continue
            db.session.execute(table.insert().values(pending))
            pending = []
        if len(pending):
            db.session.execute(table.insert().values(pending))

    def to_proxy(self):
        """Build an entity proxy: a page when ``text`` is set, else a row."""
        if self.text is None:
            proxy = model.make_entity(self.SCHEMA_ROW)
            proxy.make_id('record', self.id)
            proxy.set('table', self.document_id)
            proxy.set('index', self.index)
            if self.data is not None:
                # Cell values are ordered by their key in ``data``.
                values = [v for (k, v) in sorted(self.data.items())]
                proxy.set('cells', registry.json.pack(values))
            return proxy
        proxy = model.make_entity(self.SCHEMA_PAGE)
        proxy.make_id('record', self.id)
        proxy.set('document', self.document_id)
        proxy.set('index', self.index)
        proxy.set('bodyText', stringify(self.text))
        return proxy

    def to_dict(self):
        """Return the proxy's full dict plus ``document_id`` and ``bulk``."""
        data = self.to_proxy().to_full_dict()
        extra = {
            'document_id': self.document_id,
            'bulk': False,
        }
        data.update(extra)
        return data

    def __repr__(self):
        identity = (self.document_id, self.index)
        return '<DocumentRecord(%r,%r)>' % identity
class Entity(db.Model):
    """A labelled, categorised entity that belongs to a list."""

    id = db.Column(db.Unicode(50), primary_key=True, default=make_textid)
    label = db.Column(db.Unicode)
    category = db.Column(db.Enum(*CATEGORIES, name='entity_categories'),
                         nullable=False)

    creator_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
    creator = db.relationship(
        User,
        backref=db.backref('entities', lazy='dynamic',
                           cascade='all, delete-orphan'))

    list_id = db.Column(db.Integer(), db.ForeignKey('list.id'))
    list = db.relationship(
        'List',
        backref=db.backref('entities', lazy='dynamic',
                           cascade='all, delete-orphan'))

    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def to_dict(self):
        """Return the entity's fields, API URL and selector texts."""
        selector_texts = [s.text for s in self.selectors]
        return {
            'id': self.id,
            'api_url': url_for('entities.view', id=self.id),
            'label': self.label,
            'category': self.category,
            'creator_id': self.creator_id,
            'selectors': selector_texts,
            'list': self.list_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        }

    def has_selector(self, text):
        """Return True if a selector's normalized form matches ``text``."""
        normalized = Selector.normalize(text)
        for candidate in self.selectors:
            if candidate.normalized == normalized:
                return True
        return False

    def delete(self):
        """Mark this entity for deletion in the session."""
        db.session.delete(self)

    @classmethod
    def create(cls, data, user):
        """Create an entity from form data, owned by ``user``."""
        ent = cls()
        ent.update(data)
        ent.creator = user
        db.session.add(ent)
        return ent

    def update(self, data):
        """Apply form data and reconcile selectors with the submitted set.

        The label is always included among the wanted selectors. Existing
        selectors that are no longer wanted are deleted, and wanted texts
        without a selector get a new one.
        """
        data = EntityForm().deserialize(data)
        self.label = data.get('label')
        self.list = data.get('list')
        self.category = data.get('category')

        wanted = set(data.get('selectors'))
        wanted.add(self.label)

        # A text is consumed by the first selector carrying it, so later
        # selectors with the same text end up among the obsolete ones.
        obsolete = []
        for sel in list(self.selectors):
            if sel.text in wanted:
                wanted.remove(sel.text)
            else:
                obsolete.append(sel)

        for sel in obsolete:
            db.session.delete(sel)

        for text in wanted:
            sel = Selector()
            sel.entity = self
            sel.text = text
            db.session.add(sel)

    @classmethod
    def by_normalized_label(cls, label, lst):
        """Return the first entity in ``lst`` whose label compares equal."""
        return db.session.query(cls) \
            .filter_by(list=lst) \
            .filter(db_compare(cls.label, label)) \
            .first()

    @classmethod
    def by_id(cls, id):
        """Return the entity with the given ID, or None."""
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def by_lists(cls, lists, prefix=None):
        """Return a label-ordered query of entities in the given lists.

        When a non-empty ``prefix`` is given, entities are additionally
        filtered on their normalized selectors.
        """
        q = db.session.query(cls)
        q = q.filter(cls.list_id.in_(lists))
        has_prefix = prefix is not None and len(prefix)
        if has_prefix:
            q = q.join(Selector, cls.id == Selector.entity_id)
            q = cls.apply_filter(q, Selector.normalized, prefix)
        return q.order_by(cls.label.asc())

    @classmethod
    def by_id_set(cls, ids):
        """Return a dict mapping ID to entity for the given IDs."""
        if not len(ids):
            return {}
        found = db.session.query(cls).filter(cls.id.in_(ids))
        return dict((ent.id, ent) for ent in found)

    @classmethod
    def apply_filter(cls, q, col, prefix):
        """Restrict ``q`` to rows where ``col`` matches the normalized prefix.

        A row matches when the column starts with the prefix or contains it
        directly after a space.
        """
        prefix = Selector.normalize(prefix)
        at_start = col.like('%s%%' % prefix)
        after_space = col.like('%% %s%%' % prefix)
        return q.filter(or_(at_start, after_space))

    @classmethod
    def suggest_prefix(cls, prefix, lists, limit=10):
        """Return up to ``limit`` suggestion dicts matching ``prefix``.

        Only entities in the given lists that have at least one tag are
        considered. An empty list is returned for a missing or empty prefix.
        """
        from aleph.model import EntityTag
        ent = aliased(Entity)
        sel = aliased(Selector)
        tag = aliased(EntityTag)
        q = db.session.query(ent.id, ent.label, ent.category)
        q = q.join(sel, ent.id == sel.entity_id)
        q = q.join(tag, ent.id == tag.entity_id)
        q = q.filter(ent.list_id.in_(lists))
        if prefix is None or not len(prefix):
            return []
        q = cls.apply_filter(q, sel.normalized, prefix)
        q = q.order_by(ent.label.asc())
        q = q.limit(limit)
        q = q.distinct()
        return [
            {
                'id': entity_id,
                'label': label,
                'category': category
            }
            for entity_id, label, category in q.all()
        ]

    @property
    def terms(self):
        """Return the set of normalized forms of this entity's selectors."""
        normalized = [s.normalized for s in self.selectors]
        return set(normalized)

    def __repr__(self):
        identity = (self.id, self.label)
        return '<Entity(%r, %r)>' % identity

    def __unicode__(self):
        return self.label
class List(db.Model):
    """A labelled list of entities, optionally public and shared with users."""

    id = db.Column(db.Integer(), primary_key=True)
    label = db.Column(db.Unicode)
    public = db.Column(db.Boolean, default=False)

    creator_id = db.Column(db.Integer(), db.ForeignKey('user.id'),
                           nullable=True)
    creator = db.relationship(User)

    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    users = db.relationship(User, secondary=list_user_table,
                            backref='lists')

    def to_dict(self):
        """Return the list's fields along with its API URLs."""
        return {
            'id': self.id,
            'api_url': url_for('lists.view', id=self.id),
            'entities_api_url': url_for('entities.index', list=self.id),
            'label': self.label,
            'public': self.public,
            'creator_id': self.creator_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        }

    @classmethod
    def create(cls, data, user):
        """Create a list from form data, owned by ``user``."""
        lst = cls()
        lst.update(data, user)
        lst.creator = user
        db.session.add(lst)
        return lst

    def update(self, data, user):
        """Apply form data; ``user`` (if given) is always kept as a member."""
        data = ListForm().deserialize(data)
        self.label = data.get('label')
        public = data.get('public')
        if public is not None:
            self.public = public
        members = set(data.get('users', []))
        if user is not None:
            members.add(user)
        self.users = list(members)

    def delete(self):
        """Mark this list for deletion in the session."""
        # for entity in self.entities:
        #     entity.delete()
        db.session.delete(self)

    @classmethod
    def by_label(cls, label):
        """Return the first list with the given label, or None."""
        return db.session.query(cls).filter_by(label=label).first()

    @classmethod
    def by_id(cls, id):
        """Return the list with the given ID, or None."""
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def user_list_ids(cls, user=None, include_public=True):
        """Return the IDs of the lists visible to ``user``.

        Public lists are included when ``include_public`` is set, and an
        authenticated user also sees lists they are a member of. For an
        authenticated admin no filter is applied, so all IDs are returned.
        """
        logged_in = user is not None and user.is_authenticated()
        q = db.session.query(cls.id)
        conds = []
        if include_public:
            conds.append(cls.public == True)  # noqa
        if logged_in:
            conds.append(cls.users.any(User.id == user.id))
        if not len(conds):
            return []
        if not logged_in or not user.is_admin:
            q = q.filter(or_(*conds))
        return [row.id for row in q.all()]

    @classmethod
    def all_by_user(cls, user):
        """Return a query of lists visible to ``user``, newest ID first."""
        visible = cls.user_list_ids(user)
        return db.session.query(cls) \
            .filter(cls.id.in_(visible)) \
            .order_by(cls.id.desc())

    @property
    def terms(self):
        """Return the distinct normalized selectors of this list's entities."""
        from aleph.model.entity import Entity
        from aleph.model.selector import Selector
        q = db.session.query(Selector.normalized)
        q = q.join(Entity, Entity.id == Selector.entity_id)
        q = q.filter(Entity.list_id == self.id)
        q = q.distinct()
        return set(row[0] for row in q)

    def __repr__(self):
        identity = (self.id, self.label)
        return '<List(%r, %r)>' % identity

    def __unicode__(self):
        return self.label
class Entity(db.Model, UuidModel, SoftDeleteModel, SchemaModel):
    """A schema-driven, soft-deletable entity that belongs to a collection."""

    _schema = '/entity/entity.json#'
    _schema_recurse = True

    name = db.Column(db.Unicode)
    type = db.Column('type', db.String(255), index=True)
    summary = db.Column(db.Unicode, nullable=True)
    description = db.Column(db.Unicode, nullable=True)
    jurisdiction_code = db.Column(db.Unicode, nullable=True)

    __mapper_args__ = {
        'polymorphic_on': type,
        'polymorphic_identity': _schema
    }

    collection_id = db.Column(db.Integer(), db.ForeignKey('collection.id'))
    collection = db.relationship(Collection, backref=db.backref(
        'entities', lazy='dynamic', cascade='all, delete-orphan'))  # noqa

    def delete(self):
        """Delete all references to this entity, then delete the entity."""
        from aleph.model import Reference
        refs = db.session.query(Reference)
        refs = refs.filter(Reference.entity_id == self.id)
        refs.delete(synchronize_session='fetch')
        super(Entity, self).delete()

    def update(self, data, merge=False):
        """Apply ``data`` through the schema update of the mixin."""
        self.schema_update(data, merge=merge)

    @classmethod
    def save(cls, data, collection_id=None, merge=False):
        """Find or create the entity described by ``data`` and update it.

        Lookup goes by ID first, then by each identifier in turn. When
        nothing is found, a new entity of the class for the data's schema
        is created.
        """
        ent = cls.by_id(data.get('id'))
        for identifier in data.get('identifiers', []):
            if ent is not None:
                break
            ent = cls.by_identifier(identifier.get('scheme'),
                                    identifier.get('identifier'),
                                    collection_id=collection_id)
        if ent is None:
            schema = data.get('$schema', cls._schema)
            cls = cls.get_schema_class(schema)
            ent = cls()
            ent.id = make_textid()
            if collection_id is not None:
                ent.collection_id = collection_id
        ent.update(data, merge=merge)
        return ent

    @property
    def terms(self):
        """Return a list holding the name, unless it is None or empty."""
        terms = set([self.name])
        # for other_name in self.other_names:
        #     terms.update(other_name.terms)
        usable = []
        for term in terms:
            if term is not None and len(term):
                usable.append(term)
        return usable

    @classmethod
    def by_identifier(cls, scheme, identifier, collection_id=None):
        """Return the first live entity carrying the given identifier.

        Soft-deleted entities and identifiers are ignored; the search is
        limited to ``collection_id`` when one is given.
        """
        ent = aliased(Entity)
        q = db.session.query(ent)
        q = q.filter(ent.deleted_at == None)  # noqa
        if collection_id is not None:
            q = q.filter(ent.collection_id == collection_id)

        ident = aliased(EntityIdentifier)
        q = q.join(ident, ent.identifiers)
        q = q.filter(ident.deleted_at == None)  # noqa
        q = q.filter(ident.scheme == scheme)
        q = q.filter(ident.identifier == identifier)
        return q.first()

    @classmethod
    def by_id_set(cls, ids, collection_id=None):
        """Return a dict mapping ID to entity for the given IDs."""
        if not len(ids):
            return {}
        q = cls.all()
        q = q.filter(cls.id.in_(ids))
        if collection_id is not None:
            q = q.filter(cls.collection_id == collection_id)
        return dict((ent.id, ent) for ent in q)

    @classmethod
    def suggest_prefix(cls, prefix, collections, limit=10):
        """Return up to ``limit`` suggestion dicts whose name matches ``prefix``.

        A name matches (case-insensitively) when it starts with the
        stripped prefix or contains it directly after a space.
        """
        if prefix is None or not len(prefix):
            return []
        prefix = prefix.strip()
        ent = aliased(Entity)
        at_start = ent.name.ilike('%s%%' % prefix)
        after_space = ent.name.ilike('%% %s%%' % prefix)
        q = db.session.query(ent.id, ent.name, ent.type)
        q = q.filter(ent.deleted_at == None)  # noqa
        q = q.filter(ent.collection_id.in_(collections))
        q = q.filter(or_(at_start, after_space))
        q = q.limit(limit)
        return [
            {
                'id': entity_id,
                'name': name,
                '$schema': schema
            }
            for entity_id, name, schema in q.all()
        ]

    def __repr__(self):
        identity = (self.id, self.name)
        return '<Entity(%r, %r)>' % identity

    def __unicode__(self):
        return self.name

    def to_dict(self):
        """Return the inherited dict representation plus ``collection_id``."""
        data = super(Entity, self).to_dict()
        data['collection_id'] = self.collection_id
        return data
class DocumentRecord(db.Model):
    """A record reflects a row or page of a document."""

    id = db.Column(db.BigInteger, primary_key=True)
    sheet = db.Column(db.Integer, nullable=True)
    index = db.Column(db.Integer, nullable=True, index=True)
    text = db.Column(db.Unicode, nullable=True)
    data = db.Column(JSONB, nullable=True)

    document_id = db.Column(db.Integer(), db.ForeignKey('document.id'), index=True)  # noqa
    document = db.relationship(
        "Document", backref=db.backref('records', cascade='all, delete-orphan'))  # noqa

    def raw_texts(self):
        """Utility method to get all text snippets in a record.

        Yields every value of ``data`` (when it is set) followed by
        ``text``, without any filtering.
        """
        if self.data is not None:
            for value in self.data.values():
                yield value
        yield self.text

    @property
    def texts(self):
        """Yield the raw texts after passing them through ``filter_texts``."""
        yield from filter_texts(self.raw_texts())

    @classmethod
    def insert_records(cls, document_id, iterable, chunk_size=1000):
        """Insert one record per item of ``iterable`` in batched statements.

        Each item becomes the ``data`` of a record whose ``index`` is the
        item's position. A statement is executed whenever ``chunk_size``
        rows are pending, plus once more for any remainder.
        """
        chunk = []
        table = cls.__table__
        for index, data in enumerate(iterable):
            chunk.append({
                'document_id': document_id,
                'index': index,
                'data': data
            })
            if len(chunk) >= chunk_size:
                q = table.insert().values(chunk)
                db.session.execute(q)
                chunk = []

        if len(chunk):
            q = table.insert().values(chunk)
            db.session.execute(q)

    @classmethod
    def find_records(cls, ids):
        """Return a query for the records whose ID is in ``ids``.

        An empty list is returned instead of a query when ``ids`` is empty.
        """
        if not len(ids):
            return []
        q = db.session.query(cls)
        q = q.filter(cls.id.in_(ids))
        return q

    @classmethod
    def by_index(cls, document_id, index):
        """Return the first record of the document at ``index``, or None."""
        # Query through ``cls`` only: the original built this query and then
        # immediately replaced it with one hard-coded to DocumentRecord.
        q = db.session.query(cls)
        q = q.filter(cls.document_id == document_id)
        q = q.filter(cls.index == index)
        return q.first()

    def __repr__(self):
        return '<DocumentRecord(%r,%r)>' % (self.document_id, self.index)
class IdModel(object):
    """Mixin that adds an integer primary key column named ``id``."""

    id = db.Column(
        db.Integer(),
        primary_key=True,
    )
class Role(db.Model, RoleMixin):
    """A uniquely named role with an optional description."""

    id = db.Column(
        db.Integer(),
        primary_key=True,
    )
    name = db.Column(
        db.String(80),
        unique=True,
    )
    description = db.Column(
        db.String(255),
    )
from aleph.model.util import make_token from aleph.model.forms import UserForm from flask.ext.security import Security, SQLAlchemyUserDatastore, \ UserMixin, RoleMixin, login_required from flask.ext.security.utils import encrypt_password, get_hmac log = logging.getLogger(__name__) @login_manager.user_loader def load_user(id): return User.query.get(int(id)) roles_users = db.Table( 'roles_users', db.Column('user_id', db.Integer(), db.ForeignKey('user.id')), db.Column('role_id', db.Integer(), db.ForeignKey('role.id'))) class Role(db.Model, RoleMixin): id = db.Column(db.Integer(), primary_key=True) name = db.Column(db.String(80), unique=True) description = db.Column(db.String(255)) class User(db.Model): id = db.Column(db.Integer, primary_key=True) email = db.Column( db.Unicode, #following attributes are for flask-user
class CrawlerState(db.Model):
    """Report the state of a file being processed."""

    # A crawler whose latest state is younger than this counts as running.
    TIMEOUT = timedelta(minutes=60)
    STATUS_OK = 'ok'
    STATUS_FAIL = 'fail'

    id = db.Column(db.BigInteger, primary_key=True)
    crawler_id = db.Column(db.Unicode(), index=True)
    crawler_run = db.Column(db.Unicode(), nullable=True)
    content_hash = db.Column(db.Unicode(65), nullable=True)
    foreign_id = db.Column(db.Unicode, nullable=True)
    status = db.Column(db.Unicode(10), nullable=False)
    error_type = db.Column(db.Unicode(), nullable=True)
    error_message = db.Column(db.Unicode(), nullable=True)
    error_details = db.Column(db.Unicode(), nullable=True)
    meta = db.Column(JSONB)
    collection_id = db.Column(db.Integer(), db.ForeignKey('collection.id'),
                              index=True)
    collection = db.relationship(Collection, backref=db.backref(
        'crawl_states', cascade='all, delete-orphan'))  # noqa
    created_at = db.Column(db.DateTime, default=datetime.utcnow)

    @classmethod
    def _from_meta(cls, meta, collection_id):
        """Create a state from ``meta``, add it to the session, return it.

        ``status`` is left unset; ``store_ok`` and ``store_fail`` fill it in.
        """
        obj = cls()
        obj.collection_id = collection_id
        obj.crawler_id = meta.crawler
        obj.crawler_run = meta.crawler_run
        obj.foreign_id = meta.foreign_id
        obj.content_hash = meta.content_hash
        obj.meta = expand_json(meta.to_attr_dict(compute=True))
        db.session.add(obj)
        return obj

    @classmethod
    def store_stub(cls, collection_id, crawler_id, crawler_run):
        """Add a placeholder OK state with ``error_type`` set to 'init'."""
        obj = cls()
        obj.collection_id = collection_id
        obj.crawler_id = crawler_id
        obj.crawler_run = crawler_run
        obj.error_type = 'init'
        obj.status = cls.STATUS_OK
        db.session.add(obj)
        return obj

    @classmethod
    def store_ok(cls, meta, collection_id):
        """Add and return a state with status ``STATUS_OK`` for ``meta``."""
        obj = cls._from_meta(meta, collection_id)
        obj.status = cls.STATUS_OK
        return obj

    @classmethod
    def store_fail(cls, meta, collection_id, error_type=None,
                   error_message=None, error_details=None):
        """Add and return a ``STATUS_FAIL`` state carrying the error info."""
        obj = cls._from_meta(meta, collection_id)
        obj.status = cls.STATUS_FAIL
        obj.error_type = error_type
        obj.error_message = error_message
        obj.error_details = error_details
        return obj

    @classmethod
    def crawler_last_run(cls, crawler_id):
        """Return ``(crawler_run, created_at)`` of the newest state.

        Returns ``(None, None)`` when the crawler has no states at all.
        """
        q = db.session.query(cls.crawler_run, cls.created_at)
        q = q.filter(cls.crawler_id == crawler_id)
        q = q.order_by(cls.created_at.desc())
        q = q.limit(1)
        res = q.first()
        if res is None:
            return None, None
        return (res.crawler_run, res.created_at)

    @classmethod
    def crawler_stats(cls, crawler_id):
        """Return ok/fail counts for the latest run and for all runs.

        The result has the keys ``running``, ``last`` and ``all``; ``last``
        additionally carries ``updated`` and ``run_id``.
        """
        stats = {}
        last_run_id, last_run_time = cls.crawler_last_run(crawler_id)

        # Check if the crawler was active very recently, if so, don't
        # allow the user to execute a new run right now.
        timeout = datetime.utcnow() - cls.TIMEOUT
        if last_run_time:
            stats['running'] = last_run_time > timeout
        else:
            stats['running'] = False

        q = db.session.query(func.count(cls.id))
        q = q.filter(cls.crawler_id == crawler_id)
        for section in ['last', 'all']:
            data = {}
            sq = q
            if section == 'last':
                sq = sq.filter(cls.crawler_run == last_run_id)
            okq = sq.filter(cls.status == cls.STATUS_OK)
            failq = sq.filter(cls.status == cls.STATUS_FAIL)
            # Without a known last run no count query is executed and both
            # counters (in both sections) are reported as zero.
            data['ok'] = okq.scalar() if last_run_id else 0
            data['fail'] = failq.scalar() if last_run_id else 0
            stats[section] = data
        stats['last']['updated'] = last_run_time
        stats['last']['run_id'] = last_run_id
        return stats

    @classmethod
    def all(cls):
        """Return an unfiltered query over this model."""
        return db.session.query(cls)

    def to_dict(self):
        """Return the column values of this state as a plain dict."""
        return {
            'id': self.id,
            'status': self.status,
            'crawler_id': self.crawler_id,
            'crawler_run': self.crawler_run,
            'content_hash': self.content_hash,
            'foreign_id': self.foreign_id,
            'error_type': self.error_type,
            'error_message': self.error_message,
            'error_details': self.error_details,
            'meta': self.meta,
            'collection_id': self.collection_id,
            'created_at': self.created_at
        }

    def __repr__(self):
        return '<CrawlerState(%r,%r)>' % (self.id, self.status)

    def __unicode__(self):
        # Must return text: handing back the integer id makes
        # ``unicode(obj)`` raise TypeError.
        return u'%s' % self.id
class Alert(db.Model):
    '''
    A saved search query belonging to a user, re-run at a fixed interval.

    Also consider adding:
     - active/inactive
     - label (short human-readable text)
    '''

    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(db.Integer(), db.ForeignKey('user.id'))
    user = db.relationship(User, backref=db.backref('alerts'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    checked_at = db.Column(
        db.DateTime,
        default=None,
    )
    query = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    # number of days between checks. None == 'never check'
    checking_interval = db.Column(db.Integer, default=None)

    def due_to_check(self):
        '''
        Return True if it is time to run this query

        NB We expect this script to run at nearly-but-not-precisely the
        same time each day, and we want to run at an intuitive 'once per
        day', rather than skipping because today's run has happened a few
        seconds earlier. Therefore we allow 2 hours of wiggle room.

        We aren't worried about sending duplicate alerts, because that will
        be handled precisely by filtering result insert dates against the
        checked_at field.
        '''
        if self.checking_interval is None:
            # query is disabled
            return False
        if self.checked_at is None:
            # query is being run for the first time
            return True
        interval = timedelta(days=self.checking_interval)
        wiggle_room = timedelta(hours=2)
        min_check_date = datetime.utcnow() - interval + wiggle_room
        return self.checked_at <= min_check_date

    def mark_as_checked(self):
        '''Set checked_at to the current UTC time and commit the session.'''
        self.checked_at = datetime.utcnow()
        db.session.add(self)
        db.session.commit()

    def to_dict(self):
        '''Return the alert's column values as a plain dict.'''
        attrs = ('id', 'label', 'query', 'checking_interval',
                 'user_id', 'created_at', 'checked_at')
        return {attr: getattr(self, attr) for attr in attrs}

    @property
    def search_url(self):
        '''
        where to go to reach the original search
        '''
        encoded_query = urllib.parse.quote_plus(self.query)
        return 'http://search.openoil.net/#/search?q=' + encoded_query

    @classmethod
    def by_id(cls, id, role=None):
        '''Return the alert with the given id, or None if there is none.'''
        q = db.session.query(cls).filter_by(id=id)
        if role is not None:
            # only applies if we are using authz roles
            # NOTE(review): no ``role_id`` column is declared on this model
            # in the code visible here (only ``user_id``), so this branch
            # looks like it would raise AttributeError - confirm the
            # intended column before relying on the ``role`` argument.
            q = q.filter(cls.role_id == role.id)
        return q.first()
class Document(db.Model, DatedModel):
    """A document, typed as text or tabular, with JSON metadata."""

    TYPE_TEXT = 'text'
    TYPE_TABULAR = 'tabular'
    TYPE_OTHER = 'other'

    id = db.Column(db.BigInteger, primary_key=True)
    content_hash = db.Column(db.Unicode(65), nullable=False, index=True)
    foreign_id = db.Column(db.Unicode, unique=False, nullable=True)
    type = db.Column(db.Unicode(10), nullable=False, index=True)
    source_id = db.Column(db.Integer(), db.ForeignKey('source.id'), index=True)
    source = db.relationship(Source, backref=db.backref(
        'documents', lazy='dynamic', cascade='all, delete-orphan'))  # noqa
    _meta = db.Column('meta', JSONB)

    @property
    def title(self):
        """Shortcut for ``self.meta.title``."""
        return self.meta.title

    @hybrid_property
    def meta(self):
        """Return a ``Metadata`` wrapper around the stored JSON.

        ``content_hash`` and ``foreign_id`` are copied from their columns
        into the stored dict on every access, so reading this property
        mutates ``_meta``.
        """
        self._meta = self._meta or {}
        self._meta['content_hash'] = self.content_hash
        self._meta['foreign_id'] = self.foreign_id
        return Metadata(data=self._meta)

    @meta.setter
    def meta(self, meta):
        """Store ``meta``; a ``Metadata`` object also updates the columns."""
        if isinstance(meta, Metadata):
            self.content_hash = meta.content_hash
            self.foreign_id = meta.foreign_id
            meta = meta.data
        self._meta = meta
        # Make sure the JSON column is written even if the dict was changed
        # in place.
        flag_modified(self, '_meta')

    def delete_pages(self):
        """Delete all pages of this document, then refresh the instance."""
        pq = db.session.query(DocumentPage)
        pq = pq.filter(DocumentPage.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def delete_records(self):
        """Delete all records of this document, then refresh the instance."""
        pq = db.session.query(DocumentRecord)
        pq = pq.filter(DocumentRecord.document_id == self.id)
        pq.delete(synchronize_session='fetch')
        db.session.refresh(self)

    def insert_records(self, sheet, iterable, chunk_size=1000):
        """Bulk-insert one ``DocumentRecord`` per item of ``iterable``.

        Each item becomes the ``data`` of a record on ``sheet`` whose
        ``row_id`` is the item's position. Inserts are issued in batches of
        ``chunk_size``.
        """
        chunk = []
        for i, data in enumerate(iterable):
            chunk.append({
                'document_id': self.id,
                'row_id': i,
                'sheet': sheet,
                'data': data
            })
            if len(chunk) >= chunk_size:
                db.session.bulk_insert_mappings(DocumentRecord, chunk)
                chunk = []

        # Insert the remainder that did not fill a whole batch.
        if chunk:
            db.session.bulk_insert_mappings(DocumentRecord, chunk)

    def text_parts(self):
        """Utility method to get all text snippets in a document.

        Yields ``(text, origin)`` pairs, where ``origin`` is the page or
        record the text came from. Empty page texts and empty or non-string
        record values are skipped.
        """
        if self.type == Document.TYPE_TEXT:
            for page in self.pages:
                if page.text is not None and len(page.text):
                    yield page.text, page

        if self.type == Document.TYPE_TABULAR:
            for record in self.records:
                # A record without data contributes nothing instead of
                # aborting the whole iteration with AttributeError.
                for value in (record.data or {}).values():
                    if isinstance(value, basestring) and len(value):
                        yield value, record

    @classmethod
    def get_max_id(cls):
        """Return the largest document id (None when the table is empty)."""
        q = db.session.query(func.max(cls.id))
        return q.scalar()

    def __repr__(self):
        return '<Document(%r,%r,%r)>' % (self.id, self.type, self.meta.title)

    def _add_to_dict(self, data):
        """Update ``data`` in place with the core columns and return it."""
        data.update({
            'id': self.id,
            'type': self.type,
            'source_id': self.source_id,
            'created_at': self.created_at,
            'updated_at': self.updated_at
        })
        return data

    def to_dict(self):
        """Return ``meta.to_dict()`` extended with the core columns."""
        data = self.meta.to_dict()
        return self._add_to_dict(data)

    def to_index_dict(self):
        """Return ``meta.to_index_dict()`` extended with the core columns."""
        data = self.meta.to_index_dict()
        return self._add_to_dict(data)