Esempio n. 1
0
class Upload(db.Model):
    """An uploaded file stored in the ``upload`` table.

    The raw bytes are kept in ``data``; the ``tab`` property parses them as
    CSV on first access and remembers either the parsed table or the error
    message produced while parsing.
    """
    __tablename__ = 'upload'

    id = db.Column(db.Integer, primary_key=True)
    mimetype = db.Column(db.Unicode)
    filename = db.Column(db.Unicode)
    # Declared through ``db.deferred`` so the binary payload is a deferred
    # column rather than part of the default column load.
    data = db.deferred(db.Column(db.LargeBinary))
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime,
                           default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def to_dict(self):
        """Return a dict describing the upload and a preview of its content.

        ``headers``, ``sample`` (first five rows) and ``rows`` are filled in
        only when the payload could be parsed; otherwise they keep their
        placeholder values and ``parse_error`` carries the error text.
        """
        data = {
            'id': self.id,
            'mimetype': self.mimetype,
            'filename': self.filename,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
            'headers': None,
            'sample': None,
            'rows': 0
        }
        # Accessing ``self.tab`` also guarantees ``self._tab_error`` is set.
        tab = self.tab
        if tab is not None:
            data['headers'] = tab.headers
            data['sample'] = tab.dict[:5]
            data['rows'] = tab.height
        data['parse_error'] = self._tab_error
        return data

    @property
    def tab(self):
        """Return the payload parsed as a ``TablibDataset``, or ``None``.

        The result is cached on the instance as ``_tab``; when parsing fails
        ``_tab`` is ``None`` and ``_tab_error`` holds the error message.
        """
        if not hasattr(self, '_tab'):
            try:
                self._tab = TablibDataset()
                self._tab.csv = self.data
                self._tab_error = None
            except Exception as e:
                # Best effort: any parse failure is reported via
                # ``_tab_error`` instead of propagating to the caller.
                self._tab = None
                # ``u'%s' % (e,)`` yields the text form of the exception on
                # both Python 2 (same as ``unicode(e)``) and Python 3.
                self._tab_error = u'%s' % (e,)
        return self._tab
Esempio n. 2
0
class Dataset(db.Model):
    """A named dataset owned by an account, holding entities and uploads."""
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    ignore_case = db.Column(db.Boolean, default=False)
    match_aliases = db.Column(db.Boolean, default=False)
    public_edit = db.Column(db.Boolean, default=False)
    normalize_text = db.Column(db.Boolean, default=True)
    enable_invalid = db.Column(db.Boolean, default=True)
    owner_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow)

    entities = db.relationship('Entity', backref='dataset', lazy='dynamic')
    uploads = db.relationship('Upload', backref='dataset', lazy='dynamic')

    def to_dict(self):
        """Return the dataset's settings, owner and entity statistics."""
        from nomenklatura.model.entity import Entity

        # ``!= None`` is deliberate: SQLAlchemy overloads the operator to
        # build the SQL comparison, which ``is not None`` cannot do.
        alias_count = Entity.all(self).filter(
            Entity.canonical_id != None).count()
        unreviewed_count = Entity.all(self).filter_by(reviewed=False).count()
        entity_count = Entity.all(self).count()
        invalid_count = Entity.all(self).filter_by(invalid=True).count()

        result = {
            'id': self.id,
            'name': self.name,
            'label': self.label,
            'owner': self.owner.to_dict(),
            'stats': {
                'num_aliases': alias_count,
                'num_entities': entity_count,
                'num_review': unreviewed_count,
                'num_invalid': invalid_count
            },
        }
        # Remaining keys are plain attribute copies.
        for field in ('ignore_case', 'match_aliases', 'public_edit',
                      'normalize_text', 'enable_invalid',
                      'created_at', 'updated_at'):
            result[field] = getattr(self, field)
        return result

    @property
    def last_modified(self):
        """Return the latest ``updated_at`` among the dataset, its most
        recently updated entity and its most recently updated alias."""
        candidates = [self.updated_at]

        from nomenklatura.model.entity import Entity
        newest_entity = self.entities.order_by(
            Entity.updated_at.desc()).first()
        if newest_entity is not None:
            candidates.append(newest_entity.updated_at)

        from nomenklatura.model.alias import Alias
        # NOTE(review): this class declares no ``aliases`` relationship
        # itself; presumably it is a backref defined elsewhere - confirm.
        newest_alias = self.aliases.order_by(Alias.updated_at.desc()).first()
        if newest_alias is not None:
            candidates.append(newest_alias.updated_at)
        return max(candidates)

    @classmethod
    def by_name(cls, name):
        """Return the first dataset called ``name``, or ``None``."""
        matching = cls.query.filter_by(name=name)
        return matching.first()

    @classmethod
    def find(cls, name):
        """Return the dataset called ``name`` or raise ``NotFound``."""
        found = cls.by_name(name)
        if found is not None:
            return found
        raise NotFound("No such dataset: %s" % name)

    @classmethod
    def from_form(cls, form_data):
        """Return the ``'dataset'`` value of the validated form data."""
        validated = FormDatasetSchema().to_python(form_data)
        return validated.get('dataset')

    @classmethod
    def all(cls):
        """Return the unfiltered query over all datasets."""
        return cls.query

    @classmethod
    def create(cls, data, account):
        """Validate ``data``, create a dataset owned by ``account`` and
        flush it to the session."""
        validated = DatasetNewSchema().to_python(data)
        created = cls()
        created.owner = account
        created.name = validated['name']
        created.label = validated['label']
        db.session.add(created)
        db.session.flush()
        return created

    def update(self, data):
        """Validate ``data``, apply the editable settings and flush."""
        validated = DatasetEditSchema().to_python(data)
        for field in ('label', 'normalize_text', 'ignore_case',
                      'public_edit', 'match_aliases', 'enable_invalid'):
            setattr(self, field, validated[field])
        db.session.add(self)
        db.session.flush()
Esempio n. 3
0
class Entity(db.Model):
    """A named entity within a dataset.

    An entity whose ``canonical`` is set acts as an alias of that other
    entity; the reverse side of the link is the dynamic ``aliases`` backref.
    """
    __tablename__ = 'entity'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    normalized = db.Column(db.Unicode)
    attributes = db.Column(HSTORE)
    reviewed = db.Column(db.Boolean, default=False)
    invalid = db.Column(db.Boolean, default=False)
    canonical_id = db.Column(db.Integer,
        db.ForeignKey('entity.id'), nullable=True)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
            onupdate=datetime.utcnow)

    # Self-referential link: ``canonical`` points at the entity this one is
    # an alias of, ``aliases`` lists the entities pointing at this one.
    canonical = db.relationship('Entity', backref=backref('aliases', lazy='dynamic'),
        remote_side='Entity.id')

    def to_dict(self, shallow=False):
        """Return a dict representation of the entity.

        With ``shallow=True`` the creator, attributes and alias count are
        left out. Note that ``'canonical'`` holds the related entity object
        (or ``None``), not a plain value.
        """
        d = {
            'id': self.id,
            'name': self.name,
            'dataset': self.dataset.name,
            'reviewed': self.reviewed,
            'invalid': self.invalid,
            'canonical': self.canonical,
            #'normalized': self.normalized,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
        }
        if not shallow:
            d['creator'] = self.creator.to_dict()
            d['attributes'] = self.attributes
            d['num_aliases'] = self.aliases.count()
        return d

    def to_row(self):
        """Return a flat dict: the attributes overlaid with the shallow
        dict form, with ``'canonical'`` reduced to the canonical name."""
        row = self.attributes or {}
        # Copy so the stored attributes are not modified by the update.
        row = row.copy()
        row.update(self.to_dict(shallow=True))
        if self.canonical is not None:
            row['canonical'] = self.canonical.name
        return row

    @property
    def display_name(self):
        """Return the name used for display (the entity name)."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the first entity of ``dataset`` matching ``name``.

        The comparison honours the dataset's ``normalize_text`` and
        ``ignore_case`` settings. Returns ``None`` when nothing matches.
        """
        q = cls.query.filter_by(dataset=dataset)
        attr = Entity.name
        if dataset.normalize_text:
            # Compare normalized input against the stored normalized form.
            attr = Entity.normalized
            name = normalize_text(name)
        if dataset.ignore_case:
            attr = func.lower(attr)
            if isinstance(name, basestring):
                name = name.lower()
        q = q.filter(attr==name)
        return q.first()

    @classmethod
    def by_id(cls, id):
        """Return the entity with the given id, or ``None`` when ``id``
        cannot be converted with ``int()`` or no such row exists."""
        try:
            return cls.query.filter_by(id=int(id)).first()
        except ValueError:
            return None

    @classmethod
    def id_map(cls, ids):
        """Return a ``{id: entity}`` dict for the entities listed in ``ids``."""
        entities = {}
        for entity in cls.query.filter(cls.id.in_(ids)):
            entities[entity.id] = entity
        return entities

    @classmethod
    def find(cls, dataset, id):
        """Return the entity with the given id or raise ``NotFound``.

        ``dataset`` is accepted but not used in the lookup.
        """
        entity = cls.by_id(id)
        if entity is None:
            raise NotFound("No such value ID: %s" % id)
        return entity

    @classmethod
    def all(cls, dataset=None, query=None, eager_aliases=False, eager=False):
        """Return a query over entities.

        ``dataset`` restricts the result to one dataset, ``query`` applies a
        case-insensitive substring match on the name, and the ``eager*``
        flags add joined-load options.
        """
        q = cls.query
        if dataset is not None:
            q = q.filter_by(dataset=dataset)
        if query is not None and len(query.strip()):
            q = q.filter(cls.name.ilike('%%%s%%' % query.strip()))
        if eager_aliases:
            # NOTE(review): ``aliases_static`` is not declared on this class
            # in the visible code - confirm it is defined elsewhere.
            q = q.options(joinedload_all(cls.aliases_static))
        if eager:
            q = q.options(db.joinedload('dataset'))
            q = q.options(db.joinedload('creator'))
        return q


    @classmethod
    def create(cls, dataset, data, account):
        """Validate ``data`` and create a new entity in ``dataset``.

        The entity is added to the session and flushed before it is
        returned.
        """
        state = EntityState(dataset, None)
        data = EntitySchema().to_python(data, state)
        entity = cls()
        entity.dataset = dataset
        entity.creator = account
        entity.name = data['name']
        entity.normalized = normalize_text(entity.name)
        entity.attributes = data.get('attributes', {})
        entity.reviewed = data['reviewed']
        entity.invalid = data['invalid']
        entity.canonical = data['canonical']
        db.session.add(entity)
        db.session.flush()
        return entity


    def update(self, data, account):
        """Validate ``data`` and apply it to this entity.

        When the entity becomes an alias (``canonical`` is set), chains are
        collapsed so aliases always point at a canonical entity directly,
        and this entity's own aliases are redirected to the new canonical.
        """
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.normalized = normalize_text(self.name)
        self.attributes = data['attributes']
        self.reviewed = data['reviewed']
        self.invalid = data['invalid']
        self.canonical = data['canonical']

        # redirect all aliases of this entity
        if self.canonical:
            if self.canonical.canonical_id:
                if self.canonical.canonical_id == self.id:
                    # The chosen canonical was an alias of this entity:
                    # break the cycle by making it canonical itself.
                    self.canonical.canonical = None
                else:
                    # The chosen canonical is itself an alias: follow it.
                    self.canonical = self.canonical.canonical

            for alias in self.aliases:
                alias.canonical = self.canonical

        db.session.add(self)
Esempio n. 4
0
class Upload(db.Model):
    """An uploaded file belonging to a dataset, parsed lazily as CSV."""
    __tablename__ = 'upload'

    id = db.Column(db.Integer, primary_key=True)
    mimetype = db.Column(db.Unicode)
    filename = db.Column(db.Unicode)
    data = db.deferred(db.Column(db.LargeBinary))
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    def to_dict(self):
        """Return upload metadata plus a preview of the parsed table.

        When parsing failed, ``headers`` and ``sample`` are ``None``,
        ``rows`` is ``0`` and ``parse_error`` holds the error text.
        """
        info = {
            'id': self.id,
            'mimetype': self.mimetype,
            'filename': self.filename,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
        }
        parsed = self.tab
        if parsed is None:
            info.update(headers=None, sample=None, rows=0)
        else:
            info.update(
                headers=parsed.headers,
                sample=parsed.dict[:5],
                rows=parsed.height,
            )
        # ``_tab_error`` is populated as a side effect of reading ``tab``.
        info['parse_error'] = self._tab_error
        return info

    @property
    def tab(self):
        """Return the UTF-8 decoded payload as a ``TablibDataset``.

        The outcome is cached on the instance; on failure the cached value
        is ``None`` and ``_tab_error`` holds ``str()`` of the exception.
        """
        if hasattr(self, '_tab'):
            return self._tab
        try:
            table = TablibDataset()
            table.csv = self.data.decode('utf-8')
        except Exception as exc:
            self._tab, self._tab_error = None, str(exc)
        else:
            self._tab, self._tab_error = table, None
        return self._tab

    @classmethod
    def by_id(cls, dataset, id):
        """Return the upload with ``id`` inside ``dataset``, or ``None``."""
        scoped = cls.query.filter_by(id=id).filter_by(dataset_id=dataset.id)
        return scoped.first()

    @classmethod
    def find(cls, dataset, id):
        """Return the upload with ``id`` inside ``dataset`` or raise
        ``NotFound``."""
        found = cls.by_id(dataset, id)
        if found is not None:
            return found
        raise NotFound("No such upload: %s" % id)

    @classmethod
    def all(cls):
        """Return the unfiltered query over all uploads."""
        return cls.query

    @classmethod
    def create(cls, dataset, account, file_):
        """Store the content of ``file_`` as a new upload in ``dataset``.

        The mimetype, filename and full content are read from ``file_``;
        the new row is added to the session and flushed.
        """
        record = cls()
        record.dataset = dataset
        record.creator = account
        record.mimetype = file_.mimetype
        record.filename = file_.filename
        record.data = file_.read()
        db.session.add(record)
        db.session.flush()
        return record
Esempio n. 5
0
class Dataset(db.Model):
    """A named dataset owned by an account, holding entities and aliases."""
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    label = db.Column(db.Unicode)
    ignore_case = db.Column(db.Boolean, default=False)
    match_aliases = db.Column(db.Boolean, default=False)
    public_edit = db.Column(db.Boolean, default=False)
    normalize_text = db.Column(db.Boolean, default=True)
    enable_invalid = db.Column(db.Boolean, default=True)
    algorithm = db.Column(db.Unicode)
    owner_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow)

    entities = db.relationship('Entity', backref='dataset', lazy='dynamic')
    aliases = db.relationship('Alias', backref='dataset', lazy='dynamic')

    def as_dict(self):
        """Return the dataset's identity, owner and settings as a dict."""
        result = {
            'id': self.id,
            'name': self.name,
            'label': self.label,
            'owner': self.owner.as_dict(),
        }
        # Remaining keys are plain attribute copies.
        for field in ('ignore_case', 'match_aliases', 'public_edit',
                      'normalize_text', 'enable_invalid', 'algorithm',
                      'created_at', 'updated_at'):
            result[field] = getattr(self, field)
        return result

    @classmethod
    def by_name(cls, name):
        """Return the first dataset called ``name``, or ``None``."""
        matching = cls.query.filter_by(name=name)
        return matching.first()

    @classmethod
    def find(cls, name):
        """Return the dataset called ``name`` or raise ``NotFound``."""
        found = cls.by_name(name)
        if found is not None:
            return found
        raise NotFound("No such dataset: %s" % name)

    @classmethod
    def all(cls):
        """Return the unfiltered query over all datasets."""
        return cls.query

    @classmethod
    def create(cls, data, account):
        """Validate ``data``, create a dataset owned by ``account``, flush
        it and call ``flush_cache`` for it."""
        validated = DatasetNewSchema().to_python(data)
        created = cls()
        created.owner = account
        created.name = validated['name']
        created.label = validated['label']
        db.session.add(created)
        db.session.flush()
        flush_cache(created)
        return created

    def update(self, data):
        """Validate ``data``, apply the editable settings, flush and call
        ``flush_cache`` for this dataset."""
        validated = DatasetEditSchema().to_python(data)
        for field in ('label', 'normalize_text', 'ignore_case',
                      'public_edit', 'match_aliases', 'enable_invalid',
                      'algorithm'):
            setattr(self, field, validated[field])
        db.session.add(self)
        db.session.flush()
        flush_cache(self)
Esempio n. 6
0
class Alias(db.Model):
    """An alternative name recorded in a dataset.

    An alias may be matched to an entity (``entity_id``), marked invalid,
    or left unmatched; ``matcher`` records the account that decided.
    """
    __tablename__ = 'alias'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    data = db.Column(JsonType, default=dict)
    is_matched = db.Column(db.Boolean, default=False)
    is_invalid = db.Column(db.Boolean, default=False)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    matcher_id = db.Column(db.Integer,
                           db.ForeignKey('account.id'),
                           nullable=True)
    entity_id = db.Column(db.Integer,
                          db.ForeignKey('entity.id'),
                          nullable=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime,
                           default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    def as_dict(self):
        """Return a dict representation of the alias, with the linked
        entity (shallow) and matcher included when present."""
        return {
            'id': self.id,
            'name': self.name,
            'entity':
            self.entity.as_dict(shallow=True) if self.entity else None,
            'created_at': self.created_at,
            'creator': self.creator.as_dict(),
            'updated_at': self.updated_at,
            'is_matched': self.is_matched,
            'data': self.data,
            'matcher': self.matcher.as_dict() if self.matcher else None,
            'is_invalid': self.is_invalid,
            'dataset': self.dataset.name
        }

    def as_row(self):
        """Return a flat dict for this alias, or ``None`` when invalid.

        The row starts from the linked entity's row (or empty ``name`` and
        ``id`` placeholders), adds keys from ``data`` that are not already
        present, and finally sets ``alias`` and ``alias_id``.
        """
        if self.is_invalid:
            return None
        row = self.entity.as_row() if self.entity else {
            'name': None,
            'id': None
        }
        # Entity values win over the alias' own data on key clashes.
        for k, v in self.data.items():
            if k not in row:
                row[k] = v
        row['alias'] = self.name
        row['alias_id'] = self.id
        return row

    @property
    def display_name(self):
        """Return the name used for display (the alias name)."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the first alias in ``dataset`` called ``name``, or
        ``None``."""
        return cls.query.filter_by(dataset=dataset).\
                filter_by(name=name).first()

    @classmethod
    def by_id(cls, dataset, id):
        """Return the alias in ``dataset`` with the given id, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
                filter_by(id=id).first()

    @classmethod
    def all(cls, dataset, eager=False):
        """Return a query over the aliases of ``dataset``; ``eager`` adds
        joined-load options for the related objects."""
        q = cls.query.filter_by(dataset=dataset)
        if eager:
            q = q.options(db.joinedload('matcher'))
            q = q.options(db.joinedload('creator'))
            q = q.options(db.joinedload('entity'))
            q = q.options(db.joinedload('dataset'))
        return q

    @classmethod
    def all_matched(cls, dataset):
        """Return a query over aliases of ``dataset`` with ``is_matched``
        set."""
        return cls.all(dataset).\
                filter_by(is_matched=True)

    @classmethod
    def all_unmatched(cls, dataset):
        """Return a query over aliases of ``dataset`` with ``is_matched``
        unset."""
        return cls.all(dataset).\
                filter_by(is_matched=False)

    @classmethod
    def all_invalid(cls, dataset):
        """Return a query over aliases of ``dataset`` with ``is_invalid``
        set."""
        return cls.all(dataset).\
                filter_by(is_invalid=True)

    @classmethod
    def find(cls, dataset, id):
        """Return the alias in ``dataset`` with the given id or raise
        ``NotFound``."""
        link = cls.by_id(dataset, id)
        if link is None:
            raise NotFound("No such link ID: %s" % id)
        return link

    @classmethod
    def lookup(cls, dataset, data, account, match_entity=True, readonly=False):
        """Resolve a name within ``dataset``, creating an alias if needed.

        Resolution order:

        1. with ``match_entity``, an entity of that exact name is returned;
        2. an existing alias of that name is returned;
        3. otherwise ``match_op`` candidates are consulted and, if exactly
           one scores above 99.9, its entity is used as the match.

        With ``readonly`` the matched entity (possibly ``None``) is returned
        and nothing is written. Otherwise a new alias is created, flushed
        and returned.
        """
        data = AliasLookupSchema().to_python(data)
        entity = None
        if match_entity:
            entity = Entity.by_name(dataset, data['name'])
            if entity is not None:
                return entity
        alias = cls.by_name(dataset, data['name'])
        if alias is not None:
            return alias
        # Each candidate is a 3-tuple whose second item is an entity id and
        # whose third item is the score. A list comprehension replaces the
        # tuple-unpacking lambda, which is not valid syntax on Python 3.
        candidates = match_op(data['name'], dataset)
        strong = [entity_id for (_, entity_id, score) in candidates
                  if score > 99.9]
        # Only an unambiguous single strong candidate is accepted.
        if len(strong) == 1:
            entity = Entity.by_id(dataset, strong[0])
        if readonly:
            return entity
        alias = cls()
        alias.creator = account
        alias.dataset = dataset
        alias.entity = entity
        alias.is_matched = entity is not None
        alias.name = data['name']
        alias.data = data['data']
        db.session.add(alias)
        db.session.flush()
        if entity is not None:
            add_candidate_to_cache(dataset, alias.name, entity.id)
        return alias

    def match(self, dataset, data, account):
        """Record a matching decision made by ``account``.

        ``data['choice']`` is ``'INVALID'`` (mark the alias invalid),
        ``'NEW'`` (create a new entity from ``data``) or the value to link
        as the entity. The alias is flushed afterwards.
        """
        state = AliasMatchState(dataset)
        data = AliasMatchSchema().to_python(data, state)
        self.is_matched = True
        self.matcher = account
        if data['choice'] == 'INVALID':
            self.entity = None
            self.is_invalid = True
        elif data['choice'] == 'NEW':
            self.entity = Entity.create(dataset, data, account)
            self.is_invalid = False
        else:
            self.entity = data['choice']
            self.is_invalid = False
        db.session.add(self)
        db.session.flush()
Esempio n. 7
0
class Entity(db.Model):
    """A named entity within a dataset, with free-form ``data`` and a set
    of aliases pointing at it."""
    __tablename__ = 'entity'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)
    data = db.Column(JsonType, default=dict)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    creator_id = db.Column(db.Integer, db.ForeignKey('account.id'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime,
                           default=datetime.utcnow,
                           onupdate=datetime.utcnow)

    # ``aliases`` is a dynamic (query-returning) relationship and provides
    # the ``Alias.entity`` backref; ``aliases_static`` is the plain
    # relationship used for joined eager loading in ``all()``.
    aliases = db.relationship('Alias', backref='entity', lazy='dynamic')
    aliases_static = db.relationship('Alias')

    def as_dict(self, shallow=False):
        """Return a dict representation of the entity.

        With ``shallow=True`` the creator, dataset name and data are left
        out.
        """
        d = {
            'id': self.id,
            'name': self.name,
            'created_at': self.created_at,
            'updated_at': self.updated_at,
        }
        if not shallow:
            d['creator'] = self.creator.as_dict()
            # No trailing commas here: they would wrap the values in
            # one-element tuples.
            d['dataset'] = self.dataset.name
            d['data'] = self.data
        return d

    def as_row(self):
        """Return a flat dict: a copy of ``data`` overlaid with the shallow
        dict form."""
        row = self.data.copy()
        row.update(self.as_dict(shallow=True))
        return row

    @property
    def display_name(self):
        """Return the name used for display (the entity name)."""
        return self.name

    @classmethod
    def by_name(cls, dataset, name):
        """Return the first entity in ``dataset`` called ``name``, or
        ``None``."""
        return cls.query.filter_by(dataset=dataset).\
                filter_by(name=name).first()

    @classmethod
    def by_id(cls, dataset, id):
        """Return the entity in ``dataset`` with the given id, or ``None``."""
        return cls.query.filter_by(dataset=dataset).\
                filter_by(id=id).first()

    @classmethod
    def id_map(cls, dataset, ids):
        """Return a ``{id: entity}`` dict for the entities of ``dataset``
        listed in ``ids``."""
        entities = {}
        for entity in cls.query.filter_by(dataset=dataset).\
                filter(cls.id.in_(ids)):
            entities[entity.id] = entity
        return entities

    @classmethod
    def find(cls, dataset, id):
        """Return the entity in ``dataset`` with the given id or raise
        ``NotFound``."""
        entity = cls.by_id(dataset, id)
        if entity is None:
            raise NotFound("No such value ID: %s" % id)
        return entity

    @classmethod
    def all(cls, dataset, query=None, eager_aliases=False, eager=False):
        """Return a query over the entities of ``dataset``.

        ``query`` applies a case-insensitive substring match on the name;
        the ``eager*`` flags add joined-load options.
        """
        q = cls.query.filter_by(dataset=dataset)
        if query is not None and len(query.strip()):
            q = q.filter(cls.name.ilike('%%%s%%' % query.strip()))
        if eager_aliases:
            q = q.options(joinedload_all(cls.aliases_static))
        if eager:
            q = q.options(db.joinedload('dataset'))
            q = q.options(db.joinedload('creator'))
        return q

    @classmethod
    def create(cls, dataset, data, account):
        """Validate ``data`` and create a new entity in ``dataset``.

        The entity is flushed and registered through
        ``add_candidate_to_cache`` before it is returned.
        """
        state = EntityState(dataset, None)
        data = EntitySchema().to_python(data, state)
        entity = cls()
        entity.dataset = dataset
        entity.creator = account
        entity.name = data['name']
        entity.data = data['data']
        db.session.add(entity)
        db.session.flush()
        add_candidate_to_cache(dataset, entity.name, entity.id)
        return entity

    def update(self, data, account):
        """Validate ``data``, apply the name and data, and call
        ``flush_cache`` for the dataset."""
        state = EntityState(self.dataset, self)
        data = EntitySchema().to_python(data, state)
        self.creator = account
        self.name = data['name']
        self.data = data['data']
        flush_cache(self.dataset)
        db.session.add(self)

    def merge_into(self, data, account):
        """Merge this entity into the target named in ``data``.

        Existing aliases are re-pointed at the target, this entity's own
        name is kept as a new matched alias of the target, and the entity
        itself is deleted. The session is committed. Returns the target.
        """
        from nomenklatura.model.alias import Alias
        # Keep a reference for use after this entity has been deleted.
        dataset = self.dataset
        state = EntityState(dataset, self)
        data = EntityMergeSchema().to_python(data, state)
        target = data.get('target')
        for alias in self.aliases:
            # ``entity`` is the relationship attribute on Alias (backref of
            # ``aliases``); assigning it moves the alias to the target.
            alias.entity = target
        alias = Alias()
        alias.name = self.name
        alias.creator = self.creator
        alias.matcher = account
        alias.entity = target
        alias.dataset = dataset
        alias.is_matched = True
        db.session.delete(self)
        db.session.add(alias)
        db.session.commit()
        flush_cache(dataset)
        return target