class Account(db.Model):
    """Account row holding a GitHub identity and an API key.

    ``api_key`` takes its column default from ``make_key``. The ``datasets``,
    ``uploads`` and ``entities_created`` relationships are dynamic and install
    ``owner`` / ``creator`` back-references on the related models.
    """
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    github_id = db.Column(db.Integer)
    login = db.Column(db.Unicode)
    email = db.Column(db.Unicode)
    api_key = db.Column(db.Unicode, default=make_key)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    datasets = db.relationship('Dataset', backref='owner', lazy='dynamic')
    uploads = db.relationship('Upload', backref='creator', lazy='dynamic')
    entities_created = db.relationship('Entity', backref='creator',
                                       lazy='dynamic')

    def to_dict(self):
        """Return the exported fields as a dict.

        ``email`` and ``api_key`` are not part of the result.
        """
        exported = ('id', 'github_id', 'login', 'created_at', 'updated_at')
        return dict((field, getattr(self, field)) for field in exported)

    @classmethod
    def by_id(cls, id):
        """Return the account with primary key ``id``, or ``None``."""
        return cls.query.filter(cls.id == id).first()

    @classmethod
    def by_api_key(cls, api_key):
        """Return the account owning ``api_key``, or ``None``."""
        return cls.query.filter(cls.api_key == api_key).first()

    @classmethod
    def by_github_id(cls, github_id):
        """Return the account with the given GitHub id, or ``None``."""
        return cls.query.filter(cls.github_id == github_id).first()

    @classmethod
    def create(cls, data):
        """Create, add and flush a new account from a mapping.

        ``data`` must contain ``'id'`` (stored as ``github_id``) and
        ``'login'``; ``'email'`` is optional.
        """
        github_id = data['id']
        login = data['login']
        email = data.get('email')

        fresh = cls()
        fresh.github_id = github_id
        fresh.login = login
        fresh.email = email

        session = db.session
        session.add(fresh)
        session.flush()
        return fresh

    def update(self, data):
        """Overwrite ``login`` and ``email`` from ``data`` and add to the session."""
        self.login, self.email = data['login'], data.get('email')
        db.session.add(self)
class Upload(db.Model):
    """Uploaded file attached to a dataset.

    The raw content is kept in the deferred ``data`` column. ``tab`` parses it
    as CSV into a ``TablibDataset`` on first access and caches the outcome on
    the instance.
    """
    __tablename__ = 'upload'

    id = db.Column(db.Integer, primary_key=True)
    mimetype = db.Column(db.Unicode)
    filename = db.Column(db.Unicode)
    data = db.deferred(db.Column(db.LargeBinary))
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def to_dict(self):
        """Return upload metadata plus a preview of the parsed table.

        ``headers``, ``sample`` (first five rows) and ``rows`` are filled in
        only when parsing succeeded; ``parse_error`` carries the parser's
        message otherwise (``None`` on success).
        """
        data = {
            'id': self.id,
            'mimetype': self.mimetype,
            'filename': self.filename,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
            'headers': None,
            'sample': None,
            'rows': 0
        }
        # Accessing ``tab`` also populates ``_tab_error`` used below.
        tab = self.tab
        if tab is not None:
            data['headers'] = tab.headers
            data['sample'] = tab.dict[:5]
            data['rows'] = tab.height
        data['parse_error'] = self._tab_error
        return data

    @property
    def tab(self):
        """Parsed ``TablibDataset`` for ``data``, or ``None`` if parsing failed.

        The result (and any error text, in ``_tab_error``) is computed once
        per instance.
        """
        if not hasattr(self, '_tab'):
            try:
                raw = self.data
                # Python 3 only: decode bytes to text before handing them to
                # the CSV parser. On Python 2 ``bytes is str`` and the value
                # is passed through unchanged, as before.
                if bytes is not str and isinstance(raw, bytes):
                    raw = raw.decode('utf-8')
                table = TablibDataset()
                table.csv = raw
                self._tab = table
                self._tab_error = None
            except Exception as e:
                # Best effort: any parse failure is reported via parse_error.
                self._tab = None
                # u'%s' yields ``unicode`` on Python 2 and ``str`` on Python 3.
                self._tab_error = u'%s' % (e,)
        return self._tab
class Dataset(db.Model):
    """A named collection of entities with matching options.

    The boolean columns (``ignore_case``, ``match_aliases``, ``public_edit``,
    ``normalize_text``, ``enable_invalid``) are per-dataset settings;
    ``entities`` and ``uploads`` are dynamic relationships.
    """
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    ignore_case = db.Column(db.Boolean, default=False)
    match_aliases = db.Column(db.Boolean, default=False)
    public_edit = db.Column(db.Boolean, default=False)
    normalize_text = db.Column(db.Boolean, default=True)
    enable_invalid = db.Column(db.Boolean, default=True)
    owner_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    entities = db.relationship('Entity', backref='dataset', lazy='dynamic')
    uploads = db.relationship('Upload', backref='dataset', lazy='dynamic')

    def to_dict(self):
        """Return the dataset's fields, its owner and entity statistics.

        Runs four COUNT queries against ``Entity`` (aliases, unreviewed,
        total, invalid).
        """
        # Imported locally, as in the rest of this class.
        from nomenklatura.model.entity import Entity

        # SQLAlchemy overloads ``!=`` to emit IS NOT NULL here.
        alias_count = Entity.all(self).filter(
            Entity.canonical_id != None).count()  # noqa: E711
        review_count = Entity.all(self).filter_by(reviewed=False).count()
        entity_count = Entity.all(self).count()
        invalid_count = Entity.all(self).filter_by(invalid=True).count()

        payload = {
            'id': self.id,
            'name': self.name,
            'label': self.label,
            'owner': self.owner.to_dict(),
            'stats': {
                'num_aliases': alias_count,
                'num_entities': entity_count,
                'num_review': review_count,
                'num_invalid': invalid_count
            },
        }
        plain_fields = ('ignore_case', 'match_aliases', 'public_edit',
                        'normalize_text', 'enable_invalid', 'created_at',
                        'updated_at')
        for field in plain_fields:
            payload[field] = getattr(self, field)
        return payload

    @property
    def last_modified(self):
        """Latest ``updated_at`` among the dataset, its entities and aliases."""
        stamps = [self.updated_at]

        from nomenklatura.model.entity import Entity
        newest_entity = self.entities.order_by(
            Entity.updated_at.desc()).first()
        if newest_entity is not None:
            stamps.append(newest_entity.updated_at)

        from nomenklatura.model.alias import Alias
        # NOTE(review): this class declares ``entities`` and ``uploads`` but
        # no ``aliases`` relationship in the visible code -- confirm that a
        # backref elsewhere provides it.
        newest_alias = self.aliases.order_by(Alias.updated_at.desc()).first()
        if newest_alias is not None:
            stamps.append(newest_alias.updated_at)

        return max(stamps)

    @classmethod
    def by_name(cls, name):
        """Return the dataset called ``name``, or ``None``."""
        return cls.query.filter(cls.name == name).first()

    @classmethod
    def find(cls, name):
        """Return the dataset called ``name`` or raise ``NotFound``."""
        found = cls.by_name(name)
        if found is not None:
            return found
        raise NotFound("No such dataset: %s" % name)

    @classmethod
    def from_form(cls, form_data):
        """Validate ``form_data`` with ``FormDatasetSchema`` and return its
        ``'dataset'`` entry (``None`` when absent)."""
        cleaned = FormDatasetSchema().to_python(form_data)
        return cleaned.get('dataset')

    @classmethod
    def all(cls):
        """Return the unfiltered query over all datasets."""
        return cls.query

    @classmethod
    def create(cls, data, account):
        """Validate ``data`` with ``DatasetNewSchema`` and persist a new
        dataset owned by ``account`` (added and flushed, not committed)."""
        cleaned = DatasetNewSchema().to_python(data)

        fresh = cls()
        fresh.owner = account
        fresh.name = cleaned['name']
        fresh.label = cleaned['label']

        session = db.session
        session.add(fresh)
        session.flush()
        return fresh

    def update(self, data):
        """Validate ``data`` with ``DatasetEditSchema`` and apply the editable
        settings, then add and flush."""
        cleaned = DatasetEditSchema().to_python(data)
        editable = ('label', 'normalize_text', 'ignore_case', 'public_edit',
                    'match_aliases', 'enable_invalid')
        for field in editable:
            setattr(self, field, cleaned[field])

        session = db.session
        session.add(self)
        session.flush()
class Entity(db.Model):
    """A named entry in a dataset, optionally an alias of another entity.

    An entity whose ``canonical_id`` is set points at its canonical entity via
    ``canonical``; the reverse side is the dynamic ``aliases`` backref.
    ``normalized`` stores ``normalize_text(name)``.
    """
    __tablename__ = 'entity'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    normalized = db.Column(db.Unicode)
    attributes = db.Column(HSTORE)
    reviewed = db.Column(db.Boolean, default=False)
    invalid = db.Column(db.Boolean, default=False)
    canonical_id = db.Column(db.Integer, db.ForeignKey('entity.id'),
                             nullable=True)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    canonical = db.relationship('Entity',
                                backref=backref('aliases', lazy='dynamic'),
                                remote_side='Entity.id')

    def to_dict(self, shallow=False):
        """Return the entity as a dict.

        With ``shallow=False`` the creator, attributes and alias count are
        included as well. ``'canonical'`` holds the related ``Entity`` object
        itself (or ``None``), not a serialised form.
        """
        d = {
            'id': self.id,
            'name': self.name,
            'dataset': self.dataset.name,
            'reviewed': self.reviewed,
            'invalid': self.invalid,
            'canonical': self.canonical,
            #'normalized': self.normalized,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
        }
        if not shallow:
            d['creator'] = self.creator.to_dict()
            d['attributes'] = self.attributes
            d['num_aliases'] = self.aliases.count()
        return d

    def to_row(self):
        """Return a flat row: attributes overlaid with the shallow dict.

        ``'canonical'`` is replaced by the canonical entity's name when one
        is set.
        """
        # Copy so that the stored attributes mapping is never mutated.
        row = dict(self.attributes or {})
        row.update(self.to_dict(shallow=True))
        if self.canonical is not None:
            row['canonical'] = self.canonical.name
        return row

    @property
    def display_name(self):
        """Name shown to users; currently just ``name``."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the first entity in ``dataset`` matching ``name``, or ``None``.

        Honours the dataset's ``normalize_text`` (compare against the
        ``normalized`` column using ``normalize_text(name)``) and
        ``ignore_case`` (lower-case both sides) settings.
        """
        q = cls.query.filter_by(dataset=dataset)
        attr = Entity.name
        if dataset.normalize_text:
            attr = Entity.normalized
            name = normalize_text(name)
        if dataset.ignore_case:
            attr = func.lower(attr)
            # Works on Python 2 and 3 alike: only string-like values have
            # ``lower`` (replaces the Python-2-only ``basestring`` check).
            if hasattr(name, 'lower'):
                name = name.lower()
        q = q.filter(attr == name)
        return q.first()

    @classmethod
    def by_id(cls, id):
        """Return the entity with primary key ``id``, or ``None``.

        ``id`` is coerced with ``int()``; values that cannot be converted
        (including ``None``) yield ``None`` instead of raising.
        """
        try:
            entity_id = int(id)
        except (TypeError, ValueError):
            return None
        return cls.query.filter_by(id=entity_id).first()

    @classmethod
    def id_map(cls, ids):
        """Return ``{entity.id: entity}`` for all entities whose id is in ``ids``."""
        return dict((entity.id, entity)
                    for entity in cls.query.filter(cls.id.in_(ids)))

    @classmethod
    def find(cls, dataset, id):
        """Return the entity with ``id`` or raise ``NotFound``.

        ``dataset`` is accepted but not used to restrict the lookup.
        """
        entity = cls.by_id(id)
        if entity is None:
            raise NotFound("No such value ID: %s" % id)
        return entity

    @classmethod
    def all(cls, dataset=None, query=None, eager_aliases=False, eager=False):
        """Build a query over entities.

        Args:
            dataset: restrict to this dataset when not ``None``.
            query: case-insensitive substring filter on ``name``; ignored
                when empty or whitespace-only.
            eager_aliases: join-load aliases.
            eager: join-load ``dataset`` and ``creator``.
        """
        q = cls.query
        if dataset is not None:
            q = q.filter_by(dataset=dataset)
        if query is not None and len(query.strip()):
            q = q.filter(cls.name.ilike('%%%s%%' % query.strip()))
        if eager_aliases:
            # NOTE(review): no ``aliases_static`` attribute is declared on
            # this class in the visible code -- confirm where it is defined.
            q = q.options(joinedload_all(cls.aliases_static))
        if eager:
            q = q.options(db.joinedload('dataset'))
            q = q.options(db.joinedload('creator'))
        return q

    @classmethod
    def create(cls, dataset, data, account):
        """Validate ``data`` with ``EntitySchema`` and persist a new entity.

        The entity is added and flushed (not committed) and returned.
        """
        state = EntityState(dataset, None)
        data = EntitySchema().to_python(data, state)
        entity = cls()
        entity.dataset = dataset
        entity.creator = account
        entity.name = data['name']
        entity.normalized = normalize_text(entity.name)
        entity.attributes = data.get('attributes', {})
        entity.reviewed = data['reviewed']
        entity.invalid = data['invalid']
        entity.canonical = data['canonical']
        db.session.add(entity)
        db.session.flush()
        return entity

    def update(self, data, account):
        """Validate ``data`` with ``EntitySchema`` and apply it to this entity.

        When a canonical entity is assigned, alias chains are flattened:

        * if the new canonical is itself an alias of this entity, its own
          canonical link is cleared (avoiding a two-node cycle);
        * if it is an alias of some other entity, this entity is pointed at
          that entity instead;
        * every alias of this entity is then re-pointed at the resulting
          canonical.

        The entity is added to the session but not flushed.
        """
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize_text(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                # Fixed: was ``self.canonial`` (AttributeError at runtime).
                if self.canonical.canonical_id == self.id:
                    self.canonical.canonical = None
                else:
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
class Upload(db.Model):
    """Uploaded file attached to a dataset, with lazy CSV parsing.

    ``data`` is a deferred binary column. ``tab`` decodes it as UTF-8 and
    parses it into a ``TablibDataset`` on first access, caching the result.
    """
    __tablename__ = 'upload'

    id = db.Column(db.Integer, primary_key=True)
    mimetype = db.Column(db.Unicode)
    filename = db.Column(db.Unicode)
    data = db.deferred(db.Column(db.LargeBinary))
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def to_dict(self):
        """Return upload metadata and a preview of the parsed table.

        ``headers``, ``sample`` (first five rows) and ``rows`` are filled in
        only when parsing succeeded; ``parse_error`` is the failure message
        or ``None``.
        """
        summary = {
            'id': self.id,
            'mimetype': self.mimetype,
            'filename': self.filename,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
            'headers': None,
            'sample': None,
            'rows': 0
        }
        # Reading ``tab`` also sets ``_tab_error`` for the line below.
        parsed = self.tab
        if parsed is not None:
            summary['headers'] = parsed.headers
            summary['sample'] = parsed.dict[:5]
            summary['rows'] = parsed.height
        summary['parse_error'] = self._tab_error
        return summary

    @property
    def tab(self):
        """Parsed ``TablibDataset`` for ``data``, or ``None`` if parsing failed."""
        if hasattr(self, '_tab'):
            return self._tab

        try:
            self._tab = TablibDataset()
            self._tab.csv = self.data.decode('utf-8')
            self._tab_error = None
        except Exception as exc:
            # Any failure is recorded and surfaced through ``parse_error``.
            self._tab = None
            self._tab_error = str(exc)
        return self._tab

    @classmethod
    def by_id(cls, dataset, id):
        """Return the upload ``id`` belonging to ``dataset``, or ``None``."""
        scoped = cls.query.filter_by(id=id, dataset_id=dataset.id)
        return scoped.first()

    @classmethod
    def find(cls, dataset, id):
        """Return the upload ``id`` in ``dataset`` or raise ``NotFound``."""
        found = cls.by_id(dataset, id)
        if found is not None:
            return found
        raise NotFound("No such upload: %s" % id)

    @classmethod
    def all(cls):
        """Return the unfiltered query over all uploads."""
        return cls.query

    @classmethod
    def create(cls, dataset, account, file_):
        """Store ``file_`` as a new upload for ``dataset`` created by ``account``.

        Reads ``mimetype``, ``filename`` and the full content from ``file_``;
        the row is added and flushed, not committed.
        """
        fresh = cls()
        fresh.dataset = dataset
        fresh.creator = account
        fresh.mimetype = file_.mimetype
        fresh.filename = file_.filename
        fresh.data = file_.read()

        session = db.session
        session.add(fresh)
        session.flush()
        return fresh
class Dataset(db.Model):
    """A named collection of entities and aliases with matching options.

    Creating or updating a dataset calls ``flush_cache`` for it after the
    session flush.
    """
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    ignore_case = db.Column(db.Boolean, default=False)
    match_aliases = db.Column(db.Boolean, default=False)
    public_edit = db.Column(db.Boolean, default=False)
    normalize_text = db.Column(db.Boolean, default=True)
    enable_invalid = db.Column(db.Boolean, default=True)
    algorithm = db.Column(db.Unicode)
    owner_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    entities = db.relationship('Entity', backref='dataset', lazy='dynamic')
    aliases = db.relationship('Alias', backref='dataset', lazy='dynamic')

    def as_dict(self):
        """Return the dataset's fields and its owner's ``as_dict()``."""
        payload = {
            'id': self.id,
            'name': self.name,
            'label': self.label,
            'owner': self.owner.as_dict(),
        }
        plain_fields = ('ignore_case', 'match_aliases', 'public_edit',
                        'normalize_text', 'enable_invalid', 'algorithm',
                        'created_at', 'updated_at')
        for field in plain_fields:
            payload[field] = getattr(self, field)
        return payload

    @classmethod
    def by_name(cls, name):
        """Return the dataset called ``name``, or ``None``."""
        return cls.query.filter(cls.name == name).first()

    @classmethod
    def find(cls, name):
        """Return the dataset called ``name`` or raise ``NotFound``."""
        found = cls.by_name(name)
        if found is not None:
            return found
        raise NotFound("No such dataset: %s" % name)

    @classmethod
    def all(cls):
        """Return the unfiltered query over all datasets."""
        return cls.query

    @classmethod
    def create(cls, data, account):
        """Validate ``data`` with ``DatasetNewSchema`` and persist a new
        dataset owned by ``account``; flushes the session, then the cache."""
        cleaned = DatasetNewSchema().to_python(data)

        fresh = cls()
        fresh.owner = account
        fresh.name = cleaned['name']
        fresh.label = cleaned['label']

        session = db.session
        session.add(fresh)
        session.flush()
        flush_cache(fresh)
        return fresh

    def update(self, data):
        """Validate ``data`` with ``DatasetEditSchema`` and apply the editable
        settings; flushes the session, then the cache."""
        cleaned = DatasetEditSchema().to_python(data)
        editable = ('label', 'normalize_text', 'ignore_case', 'public_edit',
                    'match_aliases', 'enable_invalid', 'algorithm')
        for field in editable:
            setattr(self, field, cleaned[field])

        session = db.session
        session.add(self)
        session.flush()
        flush_cache(self)
class Alias(db.Model):
    """A name recorded for a dataset that may be matched to an ``Entity``.

    ``is_matched`` records whether a decision has been made; ``is_invalid``
    marks names judged invalid. ``entity_id`` and ``matcher_id`` are nullable.
    """
    __tablename__ = 'alias'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    data = db.Column(JsonType, default=dict)
    is_matched = db.Column(db.Boolean, default=False)
    is_invalid = db.Column(db.Boolean, default=False)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    matcher_id = db.Column(db.Integer, db.ForeignKey('account.id'),
                           nullable=True)
    entity_id = db.Column(db.Integer, db.ForeignKey('entity.id'),
                          nullable=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def as_dict(self):
        """Return the alias as a dict, with entity (shallow), creator and
        matcher serialised through their own ``as_dict``."""
        return {
            'id': self.id,
            'name': self.name,
            'entity': self.entity.as_dict(shallow=True) if self.entity else None,
            'created_at': self.created_at,
            'creator': self.creator.as_dict(),
            'updated_at': self.updated_at,
            'is_matched': self.is_matched,
            'data': self.data,
            'matcher': self.matcher.as_dict() if self.matcher else None,
            'is_invalid': self.is_invalid,
            'dataset': self.dataset.name
        }

    def as_row(self):
        """Return a flat export row, or ``None`` for invalid aliases.

        Starts from the entity's row (or empty ``name``/``id`` when
        unmatched), fills in keys from ``data`` that the entity row lacks,
        and adds ``alias`` / ``alias_id``.
        """
        if self.is_invalid:
            return None
        if self.entity:
            row = self.entity.as_row()
        else:
            row = {'name': None, 'id': None}
        # Entity values win over alias data on key clashes.
        for k, v in self.data.items():
            row.setdefault(k, v)
        row['alias'] = self.name
        row['alias_id'] = self.id
        return row

    @property
    def display_name(self):
        """Name shown to users; currently just ``name``."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the alias called ``name`` in ``dataset``, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
            filter_by(name=name).first()

    @classmethod
    def by_id(cls, dataset, id):
        """Return the alias ``id`` in ``dataset``, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
            filter_by(id=id).first()

    @classmethod
    def all(cls, dataset, eager=False):
        """Return a query over the aliases of ``dataset``.

        With ``eager=True`` the matcher, creator, entity and dataset
        relationships are join-loaded.
        """
        q = cls.query.filter_by(dataset=dataset)
        if eager:
            q = q.options(db.joinedload('matcher'))
            q = q.options(db.joinedload('creator'))
            q = q.options(db.joinedload('entity'))
            q = q.options(db.joinedload('dataset'))
        return q

    @classmethod
    def all_matched(cls, dataset):
        """Query for aliases of ``dataset`` with ``is_matched`` set."""
        return cls.all(dataset).\
            filter_by(is_matched=True)

    @classmethod
    def all_unmatched(cls, dataset):
        """Query for aliases of ``dataset`` with ``is_matched`` unset."""
        return cls.all(dataset).\
            filter_by(is_matched=False)

    @classmethod
    def all_invalid(cls, dataset):
        """Query for aliases of ``dataset`` with ``is_invalid`` set."""
        return cls.all(dataset).\
            filter_by(is_invalid=True)

    @classmethod
    def find(cls, dataset, id):
        """Return the alias ``id`` in ``dataset`` or raise ``NotFound``."""
        link = cls.by_id(dataset, id)
        if link is None:
            raise NotFound("No such link ID: %s" % id)
        return link

    @classmethod
    def lookup(cls, dataset, data, account, match_entity=True,
               readonly=False):
        """Resolve a name against ``dataset``, creating an alias if needed.

        Steps, in order:

        1. With ``match_entity``, an entity of exactly that name is returned.
        2. An existing alias of that name is returned.
        3. ``match_op`` candidates scoring above 99.9 are collected; if there
           is exactly one, its entity becomes the match.
        4. With ``readonly``, the matched entity (or ``None``) is returned
           without writing anything.
        5. Otherwise a new alias is added and flushed, and a matched entity
           is registered via ``add_candidate_to_cache``.

        Returns:
            An ``Entity``, an ``Alias`` or ``None`` as described above.
        """
        data = AliasLookupSchema().to_python(data)
        name = data['name']

        if match_entity:
            exact = Entity.by_name(dataset, name)
            if exact is not None:
                return exact

        existing = cls.by_name(dataset, name)
        if existing is not None:
            return existing

        # A list comprehension replaces ``filter(lambda (c, v, s): ...)``:
        # tuple parameters are a syntax error on Python 3, and ``filter``
        # returns an iterator there, which has no ``len()``.
        strong = [(c, entity_id, score)
                  for (c, entity_id, score) in match_op(name, dataset)
                  if score > 99.9]
        entity = None
        if len(strong) == 1:
            entity = Entity.by_id(dataset, strong[0][1])

        if readonly:
            return entity

        alias = cls()
        alias.creator = account
        alias.dataset = dataset
        alias.entity = entity
        alias.is_matched = entity is not None
        alias.name = name
        alias.data = data['data']
        db.session.add(alias)
        db.session.flush()
        if entity is not None:
            add_candidate_to_cache(dataset, alias.name, entity.id)
        return alias

    def match(self, dataset, data, account):
        """Record a matching decision made by ``account``.

        ``data['choice']`` (after ``AliasMatchSchema`` validation) is either
        ``'INVALID'`` (mark invalid, no entity), ``'NEW'`` (create an entity
        from ``data``) or the value to assign as this alias's entity. The
        alias is added and flushed.
        """
        state = AliasMatchState(dataset)
        data = AliasMatchSchema().to_python(data, state)
        self.is_matched = True
        self.matcher = account
        choice = data['choice']
        if choice == 'INVALID':
            self.entity = None
            self.is_invalid = True
        elif choice == 'NEW':
            self.entity = Entity.create(dataset, data, account)
            self.is_invalid = False
        else:
            self.entity = choice
            self.is_invalid = False
        db.session.add(self)
        db.session.flush()
class Entity(db.Model):
    """A named entry in a dataset with free-form ``data`` and aliases.

    ``aliases`` is a dynamic relationship (also providing ``Alias.entity``);
    ``aliases_static`` is a plain relationship to the same model, used for
    eager loading in ``all``.
    """
    __tablename__ = 'entity'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    data = db.Column(JsonType, default=dict)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    aliases = db.relationship('Alias', backref='entity', lazy='dynamic')
    aliases_static = db.relationship('Alias')

    def as_dict(self, shallow=False):
        """Return the entity as a dict.

        With ``shallow=False`` the creator, dataset name and ``data`` are
        included as well.
        """
        d = {
            'id': self.id,
            'name': self.name,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
        }
        if not shallow:
            d['creator'] = self.creator.as_dict()
            # Fixed: stray trailing commas used to wrap both values in
            # 1-tuples.
            d['dataset'] = self.dataset.name
            d['data'] = self.data
        return d

    def as_row(self):
        """Return a flat row: a copy of ``data`` overlaid with the shallow dict."""
        row = self.data.copy()
        row.update(self.as_dict(shallow=True))
        return row

    @property
    def display_name(self):
        """Name shown to users; currently just ``name``."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the entity called ``name`` in ``dataset``, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
            filter_by(name=name).first()

    @classmethod
    def by_id(cls, dataset, id):
        """Return the entity ``id`` in ``dataset``, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
            filter_by(id=id).first()

    @classmethod
    def id_map(cls, dataset, ids):
        """Return ``{entity.id: entity}`` for entities of ``dataset`` whose id
        is in ``ids``."""
        matching = cls.query.filter_by(dataset=dataset).\
            filter(cls.id.in_(ids))
        return dict((entity.id, entity) for entity in matching)

    @classmethod
    def find(cls, dataset, id):
        """Return the entity ``id`` in ``dataset`` or raise ``NotFound``."""
        entity = cls.by_id(dataset, id)
        if entity is None:
            raise NotFound("No such value ID: %s" % id)
        return entity

    @classmethod
    def all(cls, dataset, query=None, eager_aliases=False, eager=False):
        """Build a query over the entities of ``dataset``.

        Args:
            dataset: dataset to restrict to.
            query: case-insensitive substring filter on ``name``; ignored
                when empty or whitespace-only.
            eager_aliases: join-load ``aliases_static``.
            eager: join-load ``dataset`` and ``creator``.
        """
        q = cls.query.filter_by(dataset=dataset)
        if query is not None and len(query.strip()):
            q = q.filter(cls.name.ilike('%%%s%%' % query.strip()))
        if eager_aliases:
            q = q.options(joinedload_all(cls.aliases_static))
        if eager:
            q = q.options(db.joinedload('dataset'))
            q = q.options(db.joinedload('creator'))
        return q

    @classmethod
    def create(cls, dataset, data, account):
        """Validate ``data`` with ``EntitySchema`` and persist a new entity.

        The entity is added and flushed, then registered through
        ``add_candidate_to_cache``.
        """
        state = EntityState(dataset, None)
        data = EntitySchema().to_python(data, state)
        entity = cls()
        entity.dataset = dataset
        entity.creator = account
        entity.name = data['name']
        entity.data = data['data']
        db.session.add(entity)
        db.session.flush()
        add_candidate_to_cache(dataset, entity.name, entity.id)
        return entity

    def update(self, data, account):
        """Validate ``data`` with ``EntitySchema``, apply name and data, and
        flush the dataset's cache. The entity is added but not flushed."""
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.data = data['data']
        flush_cache(self.dataset)
        db.session.add(self)

    def merge_into(self, data, account):
        """Merge this entity into ``data['target']`` and delete it.

        Every alias of this entity is re-pointed at the target, this entity's
        own name is kept as a new matched alias of the target, and the
        session is committed before the dataset's cache is flushed.

        Returns:
            The target as produced by ``EntityMergeSchema`` validation.
        """
        from nomenklatura.model.alias import Alias

        # Captured up front so it is not read from a deleted instance after
        # the commit.
        dataset = self.dataset
        state = EntityState(dataset, self)
        data = EntityMergeSchema().to_python(data, state)
        target = data.get('target')

        # Fixed: this used to assign ``alias.value``, which is not a mapped
        # attribute on Alias, so existing aliases were never redirected.
        # Materialised first so the result set is not affected by the
        # reassignment.
        for existing in list(self.aliases):
            existing.entity = target

        alias = Alias()
        alias.name = self.name
        alias.creator = self.creator
        alias.matcher = account
        alias.entity = target
        alias.dataset = dataset
        alias.is_matched = True
        db.session.delete(self)
        db.session.add(alias)
        db.session.commit()
        flush_cache(dataset)
        return target