Example 1
def Column(*args, **kwargs):
    """Wrap the standard Column to allow to add some FormAlchemy options to a
    model field. Basically label and renderer but all the values are passed to
    :meth:`~formalchemy.fields.AbstractField.set`::

        >>> from sqlalchemy import Integer
        >>> from sqlalchemy.ext.declarative import declarative_base
        >>> from formalchemy import Column
        >>> Base = declarative_base()
        >>> class MyArticle(Base):
        ...     __tablename__ = 'myarticles'
        ...     id = Column(Integer, primary_key=True, label='My id')
        >>> MyArticle.__table__.c.id.info
        {'label': 'My id'}

    """
    info = kwargs.get('info', {})
    drop = set()
    for k, v in kwargs.items():
        if k in column_options:
            info[k] = v
            drop.add(k)
    for k in drop:
        del kwargs[k]
    if info:
        kwargs['info'] = info
    return SAColumn(*args, **kwargs)
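
The wrapper above assumes a module context like the following sketch. `SAColumn` is SQLAlchemy's own Column under an alias; `column_options` names the keywords routed into `info`, and the exact set listed here is an assumption, not taken from FormAlchemy's source:

from sqlalchemy import Column as SAColumn

# Assumed set of FormAlchemy-specific keywords that the wrapper strips out
# of **kwargs and stores in the column's info dict.
column_options = ('label', 'renderer', 'help', 'validate', 'validators')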
Example 2
class User(Base):
    """Track processes and operations on database objects"""
    __tablename__ = 'users'

    id = SAColumn('pr_id', Integer, primary_key=True)

    # The original definitions for these columns were lost in extraction;
    # plain Text columns are assumed here as placeholders. (The `data`
    # column is defined further below.)
    username = SAColumn('username', Text)
    fullname = SAColumn('fullname', Text)
    email = SAColumn('email', Text)
    password = SAColumn('password', Text)

    org = SAColumn('org', Text)

    facebookid = SAColumn('facebookid', Text)
    githubid = SAColumn('githubid', Text)
    googleid = SAColumn('googleid', Text)
    linkedinid = SAColumn('linkedinid', Text)

    # Self-referential parent/child link; the original referenced the
    # 'Process' model here, apparently a copy-paste artifact.
    group = SAColumn('pr_group',
                     Integer,
                     ForeignKey('users.pr_id'),
                     nullable=True,
                     index=True)
    parent = relationship('User', remote_side=[id], backref='children')

    stage = SAColumn('pr_stage', Integer, default=0)
    phase = SAColumn('pr_phase',
                     Text,
                     doc='Process phase: such as ingest or build')

    hostname = SAColumn('pr_host', Text)
    pid = SAColumn('pr_pid', Integer)

    d_vid = SAColumn('pr_d_vid',
                     String(13),
                     ForeignKey('datasets.d_vid'),
                     nullable=False,
                     index=True)
    dataset = relationship('Dataset', backref='process_records')

    t_vid = SAColumn('pr_t_vid',
                     String(15),
                     ForeignKey('tables.t_vid'),
                     nullable=True,
                     index=True)
    table = relationship('Table', backref='process_records')

    s_vid = SAColumn('pr_s_vid',
                     String(17),
                     ForeignKey('datasources.ds_vid'),
                     nullable=True,
                     index=True)
    source = relationship('DataSource', backref='process_records')

    p_vid = SAColumn('pr_p_vid',
                     String(17),
                     ForeignKey('partitions.p_vid'),
                     nullable=True,
                     index=True)
    partition = relationship('Partition', backref='process_records')

    created = SAColumn(
        'pr_created',
        Float,
        doc='Creation date: time in seconds since the epoch as an integer.')

    modified = SAColumn(
        'pr_modified',
        Float,
        doc='Modification date: time in seconds since the epoch as an integer.')

    item_type = SAColumn('pr_type',
                         Text,
                         doc='Item type, such as table, source or partition')

    item_count = SAColumn('pr_count', Integer, doc='Number of items processed')
    item_total = SAColumn('pr_items',
                          Integer,
                          doc='Number of items to be processed')

    message = SAColumn('pr_message', Text)

    state = SAColumn('pr_state', Text)

    exception_class = SAColumn('pr_ex_class', Text)
    exception_trace = SAColumn('pr_ex_trace', Text)

    log_action = SAColumn('pr_action', Text)

    data = SAColumn('pr_data', MutationDict.as_mutable(JSONEncodedObj))

    def __repr__(self):

        return "{} {}/{} {}:{} {} {}".format(self.d_vid, self.hostname,
                                             self.pid,
                                             self.phase if self.phase else '?',
                                             self.stage, self.log_action,
                                             self.message)

    def __str__(self):
        return self.__repr__()

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """
        from collections import OrderedDict

        return OrderedDict((p.key, getattr(self, p.key))
                           for p in self.__mapper__.attrs
                           if p.key not in ('partition', 'source', 'table',
                                            'dataset', 'children', 'parent'))

    @staticmethod
    def before_insert(mapper, conn, target):
        from time import time
        target.created = time()

        User.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        from time import time
        target.modified = time()
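
The before_insert/before_update hooks above only run if they are registered with SQLAlchemy's event system. A minimal sketch of that wiring, assuming the real module does it once at import time:

from sqlalchemy import event

event.listen(User, 'before_insert', User.before_insert)
event.listen(User, 'before_update', User.before_update)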
Example 3
class Remote(Base):

    __tablename__ = 'remote'

    id = SAColumn('rm_id', Integer, primary_key=True)

    short_name = SAColumn('rm_short_name', Text, index=True, unique=True)

    service = SAColumn('rm_service', Text, index=True)  # ambry, s3 or fs

    url = SAColumn('rm_url', Text)

    d_vid = SAColumn('rm_d_vid',
                     String(20),
                     ForeignKey('datasets.d_vid'),
                     index=True)

    username = SAColumn('rm_username',
                        Text,
                        doc='Account username, the ARN for S3')

    access = SAColumn('rm_access', Text, doc='Access key or username')
    secret = SAColumn('rm_secret', Text, doc='Secret key or password')

    # These are deprecated. They are properties of a host, not a remote
    docker_url = SAColumn('rm_docker_url', Text)

    # These are deprecated, and should be removed when docker support is changed
    db_name = SAColumn('rm_db_name', Text)
    vol_name = SAColumn('rm_vol_name', Text)
    db_dsn = SAColumn('rm_db_dsn', Text)

    # Base virtual host name, applied to the docker host.
    virtual_host = SAColumn('rm_virtual_host',
                            Text,
                            doc='Virtual host name, for web proxy')

    data = SAColumn('rm_data', MutationDict.as_mutable(JSONEncodedObj))

    # Temp variables, not stored

    account_accessor = None  # Set externally to allow access to the account credentials

    tr_db_password = None

    @property
    def api_token(self):  # old name
        return self.jwt_secret

    @property
    def access_key(self):  # Synonym, to have the same name as in the account record
        return self.access

    @property
    def is_api(self):
        return self.service in ('ambry', 'docker')

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """
        from collections import OrderedDict

        d = OrderedDict([(p.key, getattr(self, p.key))
                         for p in self.__mapper__.attrs
                         if p.key not in ('data', )])

        if self.data and 'list' in self.data:
            d['bundle_count'] = len(self.data['list'])
        else:
            d['bundle_count'] = None

        if self.data:
            for k, v in self.data.items():
                d[k] = v

        return d

    @property
    def db_password(self):
        from ambry.util import parse_url_to_dict

        d = parse_url_to_dict(self.db_dsn)

        return d['password']

    @property
    def db_host(self):
        from ambry.util import parse_url_to_dict

        d = parse_url_to_dict(self.db_dsn)

        return d['hostname']

    @property
    def admin_pw(self):
        return self.data.get(
            'admin_pw')  # Set in dockr.py in the ambry_admin module

    def _api_client(self):
        from ambry_client import Client
        from ambry.util import set_url_part

        username = '******'

        # Let a missing account record raise KeyError here; the original
        # code swallowed it, which left `account` unbound below.
        account = self.account_accessor(
            set_url_part(self.url, username=username))

        c = Client(self.url, username, account['secret'])
        return c

    @property
    def api_client(self):
        return self._api_client()

    def update(self):
        """Cache the list into the data section of the record"""

        from ambry.orm.exc import NotFoundError
        from requests.exceptions import ConnectionError, HTTPError
        from boto.exception import S3ResponseError

        d = {}

        try:
            for k, v in self.list(full=True):
                if not v:
                    continue

                d[v['vid']] = {
                    'vid': v['vid'],
                    'vname': v.get('vname'),
                    'id': v.get('id'),
                    'name': v.get('name')
                }

            self.data['list'] = d
        except (NotFoundError, ConnectionError, S3ResponseError,
                HTTPError) as e:
            raise RemoteAccessError("Failed to update {}: {}".format(
                self.short_name, e))

    def list(self, full=False):
        """List all of the bundles in the remote"""

        if self.is_api:
            return self._list_api(full=full)
        else:
            return self._list_fs(full=full)

    def _list_fs(self, full=False):
        assert self.account_accessor
        from fs.errors import ResourceNotFoundError
        from os.path import join
        from json import loads

        remote = self._fs_remote(self.url)

        # HTTP can't list, so we have to use a cached collection of list entries.
        # Use 'ambry remote <remote> update-listing' to create the cache

        if self.url.startswith('http'):
            try:
                for e in loads(
                        remote.getcontents(os.path.join('_meta',
                                                        'list.json'))):
                    if full:
                        yield (e['vname'], e)
                    else:
                        yield e['vname']
            except ResourceNotFoundError:
                pass
            # HTTP remotes can't list directories, so stop here either way.
            return

        try:

            for e in remote.listdir('_meta/vname'):

                if full:
                    r = loads(remote.getcontents(join('_meta/vname', e)))
                    yield (e, r)
                else:
                    yield e
        except ResourceNotFoundError:
            # An old repo, doesn't have the meta/name values.

            for fn in remote.walkfiles(wildcard='*.db'):
                this_name = fn.strip('/').replace('/', '.').replace('.db', '')
                if full:
                    # There's no metadata to return for these old repos
                    yield (this_name, None)
                else:
                    yield this_name

    def _update_fs_list(self):
        """Cache the full list for http access. This creates a meta file that can be read all at once,
        rather than requiring a list operation like S3 access does"""
        from json import dumps

        full_list = [e[1] for e in self._list_fs(full=True)]

        remote = self._fs_remote(self.url)

        remote.setcontents(os.path.join('_meta', 'list.json'),
                           dumps(full_list, indent=4))

    def _list_api(self, full=False):

        c = self._api_client()

        for d in c.list():
            if full:
                yield (d.name, d)
            else:
                yield d.name

    def find(self, ref):

        if self.is_api:
            return self._find_api(ref)
        else:
            return self._find_fs(ref)

    def _find_fs(self, ref):
        from fs.errors import ResourceNotFoundError
        from ambry.orm.exc import NotFoundError
        import json

        remote = self._fs_remote(self.url)

        path_parts = ['vname', 'vid', 'name', 'id']

        for p in path_parts:
            path = "/_meta/{}/{}".format(p, ref)

            try:
                e = remote.getcontents(path)
                return json.loads(e)
            except ResourceNotFoundError:
                pass

        raise NotFoundError("Failed to find bundle for ref '{}' ".format(ref))

    def _find_api(self, ref):
        c = self._api_client()

        return c.dataset(ref)

    def checkin(self, package, no_partitions=False, force=False, cb=None):
        """
        Check in a bundle package to the remote.

        :param package: A Database, referencing a sqlite database holding the bundle
        :param cb: a two argument progress callback: cb(message, num_records)
        :return:
        """
        from ambry.orm.exc import NotFoundError

        if not os.path.exists(package.path):
            raise NotFoundError("Package path does not exist: '{}' ".format(
                package.path))

        if self.is_api:
            return self._checkin_api(package,
                                     no_partitions=no_partitions,
                                     force=force,
                                     cb=cb)
        else:
            return self._checkin_fs(package,
                                    no_partitions=no_partitions,
                                    force=force,
                                    cb=cb)

    def _checkin_fs(self, package, no_partitions=False, force=False, cb=None):
        from fs.errors import NoPathURLError, NoSysPathError
        from ambry.orm import Partition
        assert self.account_accessor

        remote = self._fs_remote(self.url)

        ds = package.package_dataset

        db_ck = ds.identity.cache_key + '.db'

        if cb:

            def cb_one_arg(n):
                cb('Uploading package', n)
        else:

            def cb_one_arg(n):
                logger.info('Uploading package {} bytes'.format(n))

        with open(package.path, 'rb') as f:  # binary mode, for upload
            remote.makedir(os.path.dirname(db_ck),
                           recursive=True,
                           allow_recreate=True)
            e = remote.setcontents_async(db_ck,
                                         f,
                                         progress_callback=cb_one_arg)
            e.wait()

        if package.library:
            for p in package.session.query(Partition).filter(
                    Partition.type == Partition.TYPE.UNION).all():

                self._put_partition_fs(remote,
                                       p,
                                       package.library,
                                       force=force,
                                       cb=cb)

        self._put_metadata(remote, ds)

        try:
            return remote, remote.getpathurl(db_ck)
        except NoPathURLError:
            pass

        try:
            return remote, remote.getsyspath(db_ck)
        except NoSysPathError:
            pass

        return remote, None

    def _checkin_api(self, package, no_partitions=False, force=False, cb=None):

        c = self._api_client()

        return c.library.checkin(package, force=force, cb=cb)

    @staticmethod
    def _meta_infos(ds):

        import json
        from six import text_type

        identity = ds.identity
        d = identity.dict

        d['summary'] = ds.config.metadata.about.summary
        d['title'] = ds.config.metadata.about.title

        ident = json.dumps(d)

        return (
            (os.path.join('_meta', 'vid', identity.vid), ident),
            (os.path.join('_meta', 'id', identity.id_), ident),
            (os.path.join('_meta', 'vname', text_type(identity.vname)), ident),
            (os.path.join('_meta', 'name', text_type(identity.name)), ident),
        )
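
    # Layout on the remote, as inferred from _meta_infos, _checkin_fs and
    # _update_fs_list (the same identity JSON is written under four keys):
    #
    #   <cache_key>.db          -- the bundle database itself
    #   _meta/vid/<vid>         -- identity JSON
    #   _meta/id/<id>           -- identity JSON
    #   _meta/vname/<vname>     -- identity JSON
    #   _meta/name/<name>       -- identity JSON
    #   _meta/list.json         -- cached listing, used by HTTP remotes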

    def _put_metadata(self, fs_remote, ds):
        """Store metadata on a pyfs remote"""

        from fs.errors import ResourceNotFoundError

        # _meta_infos() builds the identity records, including summary/title
        meta_stack = self._meta_infos(ds)

        def do_metadata():
            for path, ident in meta_stack:
                fs_remote.setcontents(path, ident)

        try:
            # Assume the directories already exist
            do_metadata()
        except ResourceNotFoundError:
            # Nope, make them and try again.
            parts = ['vid', 'id', 'vname', 'name']
            for p in parts:
                dirname = os.path.join('_meta', p)
                fs_remote.makedir(dirname, allow_recreate=True, recursive=True)

            do_metadata()

    def put_partition(self, cb=None):
        """Store a partition on the remote"""
        raise NotImplementedError()

    def _put_partition_fs(self, fs_remote, p, library, force=False, cb=None):

        if cb:

            def cb_one_arg(n):
                cb('Uploading partition {}'.format(p.identity.name), n)
        else:
            cb_one_arg = None

        if not library:
            return

        p = library.partition(p.vid)

        with p.datafile.open(mode='rb') as fin:
            fs_remote.makedir(os.path.dirname(p.datafile.path),
                              recursive=True,
                              allow_recreate=True)

            exists = fs_remote.exists(p.datafile.path)

            if force or not exists:
                event = fs_remote.setcontents_async(
                    p.datafile.path, fin, progress_callback=cb_one_arg)
                event.wait()
            else:
                cb('Partition {} already exists on remote'.format(p.vid), 0)

    def _put_partition_api(self, p, cb=None):
        raise NotImplementedError()

    def checkout(self, ref, cb=None):
        """Checkout a bundle from the remote. Returns a file-like object"""
        if self.is_api:
            return self._checkout_api(ref, cb=cb)
        else:
            return self._checkout_fs(ref, cb=cb)

    def _checkout_api(self, ref, cb=None):
        raise NotImplementedError()

    def _checkout_fs(self, ref, cb=None):
        remote = self._fs_remote(self.url)

        d = self._find_fs(ref)

        return remote.open(d['cache_key'] + '.db', 'rb')

    def get_partition(self):
        """Get a partition from the remote"""
        pass

    def remove(self, ref, cb=None):
        """Check in a bundle to the remote"""

        if self.is_api:
            return self._remove_api(ref, cb)
        else:
            return self._remove_fs(ref, cb)

    def _remove_fs(self, ref, cb=None):
        from fs.errors import ResourceNotFoundError
        from os.path import join

        remote = self._fs_remote(self.url)

        def safe_remove(path):
            try:
                remote.remove(path)
                if cb:
                    cb('Removed {}'.format(path))
            except ResourceNotFoundError as e:
                if cb:
                    cb("Failed to remove '{}': {}".format(path, e))

        info = self._find_fs(ref)

        db_ck = info['cache_key'] + '.db'

        if cb:
            cb('Removing {}'.format(db_ck))

        safe_remove(db_ck)

        for dir, files in remote.walk(info['cache_key']):
            for f in files:
                path = join(dir, f)

                safe_remove(path)

        for p in [
                join('_meta', 'vid', info['vid']),
                join('_meta', 'id', info['id']),
                join('_meta', 'vname', info['vname']),
                join('_meta', 'name', info['name'])
        ]:
            safe_remove(p)

        # FIXME! Doesn't remove partitions

        return info['vid']

    def _remove_api(self, ref, cb=None):

        info = self._find_api(ref)  # presumably validates that the ref exists

        c = self._api_client()

        c.library.remove(ref)

    def _fs_remote(self, url):

        from ambry.util import parse_url_to_dict

        d = parse_url_to_dict(url)

        if d['scheme'] == 's3':
            return self.s3(url, access=self.access, secret=self.secret)
        else:
            from fs.opener import fsopendir
            return fsopendir(url)

    @property
    def fs(self):
        """Return a pyfs object"""
        return self._fs_remote(self.url)

    def s3(self, url, account_accessor=None, access=None, secret=None):
        """Set up an S3 pyfs, with account credentials, fixing an ssl matching problem"""
        from ambry.util.ambrys3 import AmbryS3FS
        from ambry.util import parse_url_to_dict

        pd = parse_url_to_dict(url)

        if account_accessor:
            account = account_accessor(pd['hostname'])

            assert account['account_id'] == pd['hostname']
            aws_access_key = account['access_key']
            aws_secret_key = account['secret']
        else:
            aws_access_key = access
            aws_secret_key = secret

        assert aws_access_key, url
        assert aws_secret_key, url

        s3 = AmbryS3FS(
            bucket=pd['netloc'],
            prefix=pd['path'].strip('/') + '/',
            aws_access_key=aws_access_key,
            aws_secret_key=aws_secret_key,
        )

        return s3

    def __str__(self):
        return '{};{}'.format(self.short_name, self.url)

    @staticmethod
    def before_insert(mapper, conn, target):
        Remote.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):

        url = target.url

        if not target.service and url:
            if url.startswith('s3:'):
                target.service = 's3'
            elif url.startswith('http'):
                target.service = 'ambry'
            else:
                target.service = 'fs'
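
A hedged usage sketch for the Remote record above; the short name, URL, credentials and ref are illustrative values, not taken from the source:

r = Remote(short_name='example', service='s3', url='s3://bucket/prefix',
           access='AKIA...', secret='...')
r.account_accessor = lambda url: {'secret': '...'}  # required by _list_fs

for vname in r.list():                       # S3/fs remotes go through _list_fs()
    print(vname)

info = r.find('example.com-dataset-1.0.1')   # reads the _meta/<kind>/<ref> JSON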
Example 4
class Column(Base):
    __tablename__ = 'columns'

    _parent_col = 'c_t_vid'

    vid = SAColumn('c_vid', String(18), primary_key=True)
    id = SAColumn('c_id', String(15))  # Probably not necessary

    sequence_id = SAColumn('c_sequence_id', Integer)
    is_primary_key = SAColumn('c_is_primary_key', Boolean, default=False)

    t_vid = SAColumn('c_t_vid',
                     String(15),
                     ForeignKey('tables.t_vid'),
                     nullable=False,
                     index=True)
    d_vid = SAColumn('c_d_vid',
                     String(13),
                     ForeignKey('datasets.d_vid'),
                     nullable=False,
                     index=True)
    t_id = SAColumn('c_t_id', String(12))

    #source_name = SAColumn('c_source_name', Text, index=True)
    name = SAColumn('c_name', Text, index=True)
    altname = SAColumn('c_altname', Text)
    datatype = SAColumn('c_datatype', Text)
    valuetype = SAColumn('c_valuetype', Text)
    start = SAColumn(
        'c_start',
        Integer,
        doc='For fixed width files, the starting position of the column')
    size = SAColumn(
        'c_size',
        Integer,
        doc='For fixed width files, the ending position of the column')
    width = SAColumn('c_width',
                     Integer,
                     doc='For fixed width files, the width of the column')
    default = SAColumn('c_default', Text)
    illegal_value = SAColumn('c_illegal_value',
                             Text)  # A special value meaning N/A or nan, etc.

    summary = SAColumn('c_summary', Text)
    description = SAColumn('c_description', Text)
    keywords = SAColumn('c_keywords', Text)

    lom = SAColumn(
        'c_lom',
        String(1),
        doc=
        'Level of Measurement: n,o,i,r for Nominal, Ordinal, Interval, Ratio')
    role = SAColumn('c_role',
                    String(1),
                    doc='Role: key, dimension, measure, error, name')
    scale = SAColumn(
        'c_scale',
        Float,
        doc=
        'Number of measure units per natural units. Ie, if 1 == 1000 people, scale = 1000'
    )
    units = SAColumn('c_units', Text)
    universe = SAColumn('c_universe', Text)

    parent = SAColumn('c_parent', Text)
    derivedfrom = SAColumn('c_derivedfrom', Text)
    numerator = SAColumn('c_numerator', String(20))
    denominator = SAColumn('c_denominator', String(20))

    # New column value casters and generators
    _transform = SAColumn('c_transform', Text)

    data = SAColumn('c_data', MutationDict.as_mutable(JSONEncodedObj))

    # This column should really be called 'value labels'
    codes = relationship(Code,
                         backref='column',
                         order_by='asc(Code.key)',
                         cascade='save-update, delete, delete-orphan')

    __table_args__ = (
        UniqueConstraint('c_sequence_id', 'c_t_vid', name='_uc_c_sequence_id'),
        UniqueConstraint('c_name', 'c_t_vid', name='_uc_c_name'),
    )

    # FIXME. These types should be harmonized with SourceColumn.DATATYPE
    DATATYPE_STR = six.binary_type.__name__
    DATATYPE_UNICODE = six.text_type.__name__
    DATATYPE_INTEGER = 'int'
    DATATYPE_INTEGER64 = 'long' if six.PY2 else 'int'
    DATATYPE_FLOAT = 'float'
    DATATYPE_DATE = 'date'
    DATATYPE_TIME = 'time'
    DATATYPE_TIMESTAMP = 'timestamp'
    DATATYPE_DATETIME = 'datetime'
    DATATYPE_BLOB = 'blob'

    DATATYPE_POINT = 'point'  # Spatalite, sqlite extensions for geo
    DATATYPE_LINESTRING = 'linestring'  # Spatalite, sqlite extensions for geo
    DATATYPE_POLYGON = 'polygon'  # Spatalite, sqlite extensions for geo
    DATATYPE_MULTIPOLYGON = 'multipolygon'  # Spatalite, sqlite extensions for geo
    DATATYPE_GEOMETRY = 'geometry'  # Spatalite, sqlite extensions for geo

    types = {
        # Sqlalchemy, Python, Sql,

        # Here, 'str' means ascii, 'unicode' means not ascii.
        # FIXME: Rename to DATATYPE_ASCII / DATATYPE_NOT_ASCII; the current
        # names are confusing when porting between Python 2 and 3.
        DATATYPE_STR: (sqlalchemy.types.String, six.binary_type, 'VARCHAR'),
        DATATYPE_UNICODE: (sqlalchemy.types.String, six.text_type, 'VARCHAR'),
        DATATYPE_INTEGER: (sqlalchemy.types.Integer, int, 'INTEGER'),
        DATATYPE_INTEGER64: (BigIntegerType, int, 'INTEGER64'),
        DATATYPE_FLOAT: (sqlalchemy.types.Float, float, 'REAL'),
        DATATYPE_DATE: (sqlalchemy.types.Date, datetime.date, 'DATE'),
        DATATYPE_TIME: (sqlalchemy.types.Time, datetime.time, 'TIME'),
        DATATYPE_TIMESTAMP:
        (sqlalchemy.types.DateTime, datetime.datetime, 'TIMESTAMP'),
        DATATYPE_DATETIME:
        (sqlalchemy.types.DateTime, datetime.datetime, 'DATETIME'),
        DATATYPE_POINT: (GeometryType, six.binary_type, 'POINT'),
        DATATYPE_LINESTRING: (GeometryType, six.binary_type, 'LINESTRING'),
        DATATYPE_POLYGON: (GeometryType, six.binary_type, 'POLYGON'),
        DATATYPE_MULTIPOLYGON: (GeometryType, six.binary_type, 'MULTIPOLYGON'),
        DATATYPE_GEOMETRY: (GeometryType, six.binary_type, 'GEOMETRY'),
        DATATYPE_BLOB: (sqlalchemy.types.LargeBinary, buffer, 'BLOB')  # 'buffer' exists only on Python 2
    }

    def __init__(self, **kwargs):

        super(Column, self).__init__(**kwargs)

        assert self.sequence_id is not None

        if not self.name:
            self.name = 'column' + str(self.sequence_id)
            # raise ValueError('Column must have a name. Got: {}'.format(kwargs))

        # Don't allow these values to be the empty string
        self.transform = self.transform or None

    @classmethod
    def python_types(cls):
        return [e[1] for e in six.itervalues(cls.types)]

    def type_is_int(self):
        return self.python_type == int

    def type_is_real(self):
        return self.python_type == float

    def type_is_number(self):
        return self.type_is_real() or self.type_is_int()

    def type_is_text(self):
        return self.datatype == Column.DATATYPE_STR or self.datatype == Column.DATATYPE_UNICODE

    def type_is_geo(self):
        return self.datatype in (Column.DATATYPE_POINT,
                                 Column.DATATYPE_LINESTRING,
                                 Column.DATATYPE_POLYGON,
                                 Column.DATATYPE_MULTIPOLYGON,
                                 Column.DATATYPE_GEOMETRY)

    def type_is_gvid(self):
        return 'gvid' in self.name

    def type_is_time(self):
        return self.datatype in (Column.DATATYPE_TIME,
                                 Column.DATATYPE_TIMESTAMP)

    def type_is_date(self):
        return self.datatype in (Column.DATATYPE_TIMESTAMP,
                                 Column.DATATYPE_DATETIME,
                                 Column.DATATYPE_DATE)

    def type_is_builtin(self):
        """Return True if the datatype is one of the built-in types"""
        return self.datatype in self.types

    @property
    def sqlalchemy_type(self):
        return self.types[self.datatype][0]

    @property
    def valuetype_class(self):
        """Return the valuetype class, if one is defined, or a built-in type if it isn't"""

        from ambry.valuetype import resolve_value_type

        if self.valuetype:
            return resolve_value_type(self.valuetype)

        else:
            return resolve_value_type(self.datatype)

    @property
    def valuetype_description(self):
        """Return the description of the valuetype class"""
        return self.valuetype_class.desc

    @property
    def python_type(self):
        """Return the python type for the row, possibly getting it from a valuetype reference """

        from ambry.valuetype import resolve_value_type

        if self.valuetype and resolve_value_type(self.valuetype):
            return resolve_value_type(self.valuetype)._pythontype

        elif self.datatype:
            try:
                return self.types[self.datatype][1]
            except KeyError:
                return resolve_value_type(self.datatype)._pythontype

        else:
            from ambry.exc import ConfigurationError
            raise ConfigurationError(
                "Can't get python_type: neither datatype nor valuetype is defined"
            )

    @property
    def role(self):
        """Return the code for the role: measure, dimension, or error"""
        from ambry.valuetype.core import ROLE

        if not self.valuetype_class:
            return ''

        role = self.valuetype_class.role
        if role == ROLE.UNKNOWN:
            vt_code = self.valuetype_class.vt_code

            if len(vt_code) == 1 or vt_code[1] == '/':
                return vt_code[0]
            else:
                return ''

        return role

    @property
    def is_dimension(self):
        """Return true if the colum is a dimension"""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.DIMENSION

    @property
    def is_measure(self):
        """Return true if the colum is a dimension"""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.MEASURE

    @property
    def is_label(self):
        """Return true if the colum is a dimension"""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.LABEL

    @property
    def is_error(self):
        """Return true if the colum is a dimension"""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.ERROR

    @property
    def role_description(self):
        from ambry.valuetype.core import role_descriptions
        return role_descriptions.get(self.role, '')

    @property
    def has_nulls(self):
        """Return True if the datatype allows for null values ( it is specified with a '?' at the end ) """
        return self.valuetype.endswith('?')

    @property
    def children(self):
        """"Return the table's other column that have this column as a parent, excluding labels"""
        for c in self.table.columns:
            if c.parent == self.name and not c.valuetype_class.is_label():
                yield c

    @property
    def label(self):
        """"Return first child of the column that is marked as a label. Returns self if the column is a label"""

        if self.valuetype_class.is_label():
            return self

        for c in self.table.columns:
            if c.parent == self.name and c.valuetype_class.is_label():
                return c

        return None

    @property
    def label_or_self(self):
        """List label(), but also returns self is there is no label"""
        l = self.label

        if not l:
            return self

        return l

    @property
    def geoid(self):
        """"Return first child of the column, or self that is marked as a geographic identifier"""

        if self.valuetype_class.is_geoid():
            return self

        for c in self.table.columns:
            if c.parent == self.name and c.valuetype_class.is_geoid():
                return c

    def python_cast(self, v):
        """Cast a value to the type of the column.

        Primarily used to check that a value is valid; it will throw an
        exception otherwise

        """

        if self.type_is_time():
            dt = dateutil.parser.parse(v)

            if self.datatype == Column.DATATYPE_TIME:
                dt = dt.time()
            if not isinstance(dt, self.python_type):
                raise TypeError('{} was parsed to {}, expected {}'.format(
                    v, type(dt), self.python_type))

            return dt
        else:
            # This isn't calling the python_type property -- it's getting the
            # Python type and then instantiating it, as in int(v)
            return self.python_type(v)

    @property
    def schema_type(self):

        if not self.datatype:
            from .exc import ConfigurationError
            raise ConfigurationError("Column '{}' has no datatype".format(
                self.name))

        # let it fail with KeyError if datatype is unknown.
        pt = self.python_type.__name__
        return self.types[pt][2]

    @classmethod
    def convert_numpy_type(cls, dtype):
        """Convert a numpy dtype into a Column datatype. Only handles common
        types.

        Implemented as a function to decouple from numpy

        """

        m = {
            'int64': cls.DATATYPE_INTEGER64,
            'float64': cls.DATATYPE_FLOAT,
            # Hack: Pandas stores strings as 'object'. No DATATYPE_TEXT
            # constant exists, so the string type is assumed here.
            'object': cls.DATATYPE_STR
        }

        t = m.get(dtype.name, None)

        if not t:
            raise TypeError("Failed to convert numpy type: '{}' ".format(
                dtype.name))

        return t

    @classmethod
    def convert_python_type(cls, py_type_in, name=None):

        type_map = {six.text_type: six.binary_type}

        for col_type, (sla_type, py_type,
                       sql_type) in six.iteritems(cls.types):

            if py_type == type_map.get(py_type_in, py_type_in):
                if col_type == 'blob' and name and name.endswith('geometry'):
                    return cls.DATATYPE_GEOMETRY

                elif sla_type != GeometryType:  # Total HACK. FIXME
                    return col_type

        return None

    @property
    def foreign_key(self):
        return self.fk_vid

    @property
    def dest_header(self):
        """Allows destination tables to be used as source tables when creating schema from a 'partition' source"""
        if self.altname:
            return self.altname
        else:
            return self.name

    @property
    def has_codes(self):
        """Allows destination tables to be used as source tables when creating schema from a 'partition' source"""
        return False

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """
        d = {
            p.key: getattr(self, p.key)
            for p in self.__mapper__.attrs
            if p.key not in ('table', 'stats', '_codes', 'data')
        }

        if not d:
            raise Exception(self.__dict__)

        d['schema_type'] = self.schema_type

        if self.data:
            # Copy data fields into the top level dict, but don't overwrite existing values.
            for k, v in six.iteritems(self.data):
                if k not in d and k not in ('table', 'stats', '_codes',
                                            'data'):
                    d[k] = v

        return d

    @property
    def nonull_dict(self):
        """Like dict, but does not hold any null values.

        :return:

        """
        return {
            k: v
            for k, v in six.iteritems(self.dict) if v and k != '_codes'
        }

    @staticmethod
    def mangle_name(name):
        """Mangles a column name to a standard form, remoing illegal
        characters.

        :param name:
        :return:

        """
        import re
        try:
            return re.sub(r'_+', '_',
                          re.sub(r'[^\w_]', '_', name).lower()).rstrip('_')
        except TypeError:
            raise TypeError('Trying to mangle name with invalid type of: ' +
                            str(type(name)))

    @property
    @memoize
    def reverse_code_map(self):
        """Return a map from a code ( usually a string ) to the  shorter numeric value"""

        return {c.value: (c.ikey if c.ikey else c.key) for c in self.codes}

    @property
    @memoize
    def forward_code_map(self):
        """Return  a map from the short code to the full value """

        return {c.key: c.value for c in self.codes}

    def add_code(self, key, value, description=None, data=None, source=None):
        """

        :param key: The code value that appears in the datasets, either a string or an int
        :param value: The string value the key is mapped to
        :param description:  A more detailed description of the code
        :param data: A data dict to add to the ORM record
        :return: the code record
        """

        # Ignore codes we already have; this won't catch codes added earlier
        # for this same object, since the codes are cached

        from six import text_type

        for cd in self.codes:
            if cd.key == text_type(key):
                return cd

        def cast_to_int(s):
            try:
                return int(s)
            except (TypeError, ValueError):
                return None

        cd = Code(c_vid=self.vid,
                  t_vid=self.t_vid,
                  key=text_type(key),
                  ikey=cast_to_int(key),
                  value=value,
                  source=source,
                  description=description,
                  data=data)

        self.codes.append(cd)

        return cd

    @property
    def transform(self):
        return self._transform

    @transform.setter
    def transform(self, v):
        self._transform = self.clean_transform(v)

    @staticmethod
    def make_xform_seg(init_=None,
                       datatype=None,
                       transforms=None,
                       exception=None,
                       column=None):
        return {
            'init': init_,
            'transforms': transforms if transforms else [],
            'exception': exception,
            'datatype': datatype,
            'column': column
        }

    @staticmethod
    def _expand_transform(transform):
        from ambry.dbexceptions import ConfigurationError

        if not bool(transform):
            return []

        transform = transform.rstrip('|')

        segments = []

        for i, seg_str in enumerate(
                transform.split(';')):  # ';' separates pipeline stages
            pipes = seg_str.split('|')  # '|' separates pipes in each stage.

            d = Column.make_xform_seg()

            for pipe in pipes:

                if not pipe.strip():
                    continue

                if pipe[0] == '^':  # First, the initializer
                    if d['init']:
                        raise ConfigurationError(
                            'Can only have one initializer in a pipeline segment'
                        )
                    if i != 0:
                        raise ConfigurationError(
                            'Can only have an initializer in the first pipeline segment'
                        )
                    d['init'] = pipe[1:]
                elif pipe[0] == '!':  # Exception Handler
                    if d['exception']:
                        raise ConfigurationError(
                            'Can only have one exception handler in a pipeline segment'
                        )
                    d['exception'] = pipe[1:]
                else:  # Assume before the datatype
                    d['transforms'].append(pipe)

            segments.append(d)

        return segments

    @property
    def expanded_transform(self):
        """Expands the transform string into segments """

        segments = self._expand_transform(self.transform)

        if segments:

            segments[0]['datatype'] = self.valuetype_class

            for s in segments:
                s['column'] = self

        else:

            segments = [
                self.make_xform_seg(datatype=self.valuetype_class, column=self)
            ]

        # If we want to add the find datatype cast to a transform.
        #segments.append(self.make_xform_seg(transforms=["cast_"+self.datatype], column=self))

        return segments

    @staticmethod
    def clean_transform(transform):

        segments = Column._expand_transform(transform)

        def pipeify_seg(seg):

            o = []

            if seg['init']:
                o.append('^' + seg['init'])
            o += seg['transforms']
            if seg['exception']:
                o.append('!' + seg['exception'])

            return '|'.join(o)

        return ';'.join(pipeify_seg(seg) for seg in segments)
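
    # Illustration of the transform mini-language parsed above (the names are
    # made up): ';' separates pipeline stages, '|' separates steps within a
    # stage, a leading '^' marks the single initializer (first stage only),
    # and a leading '!' marks a stage's exception handler. For example,
    #
    #     '^initial_value|int|!handle_error;round_to_100'
    #
    # expands to two segments: the first with init='initial_value',
    # transforms=['int'] and exception='handle_error'; the second with
    # transforms=['round_to_100'].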

    @property
    def row(self):
        from collections import OrderedDict

        # Use an Ordered Dict to make it friendly to creating CSV files.

        name_map = {'name': 'column'}

        d = OrderedDict([('table', self.table.name)] +
                        [(name_map.get(p.key, p.key), getattr(self, p.key))
                         for p in self.__mapper__.attrs if p.key not in [
                             'codes', 'dataset', 'stats', 'table', 'd_vid',
                             'vid', 't_vid', 'id', 'is_primary_key', 'data'
                         ]])

        d['transform'] = d['_transform']
        del d['_transform']

        if self.name == 'id':
            t = self.table
            d['description'] = t.description
            data = t.data
        else:
            data = self.data

        for k, v in six.iteritems(data):
            d['d_' + k] = v

        assert 'data' not in d

        return d

    def __repr__(self):
        return '<column: {}, {}>'.format(self.name, self.vid)

    @staticmethod
    def update_number(target):

        ton = ObjectNumber.parse(target.t_vid)
        con = ColumnNumber(ton, target.sequence_id)
        target.vid = str(con)
        target.id = str(con.rev(None))
        target.d_vid = str(ObjectNumber.parse(target.t_vid).as_dataset)

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for Sqlalchemy to set the seqience_id for this
        object and create an ObjectNumber value for the id_"""

        # from identity import ObjectNumber
        # assert not target.fk_vid or not ObjectNumber.parse(target.fk_vid).revision

        if target.sequence_id is None:
            from ambry.orm.exc import DatabaseError
            raise DatabaseError('Must have sequence_id before insertion')

        # Check that the id column is always sequence id 1
        assert (target.name == 'id') == (target.sequence_id == 1), (
            target.name, target.sequence_id)

        Column.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        """Set the column id number based on the table number and the sequence
        id for the column."""

        assert target.datatype or target.valuetype

        target.name = Column.mangle_name(target.name)

        Column.update_number(target)
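
A small illustration of the helpers above, with made-up values:

print(Column.mangle_name('Total Population (2010)'))
# -> 'total_population_2010'

# python_cast() validates a raw value against the declared type: a column
# with datatype 'int' turns '42' into 42, while a 'time' column parses the
# string with dateutil and returns a datetime.time.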
Example 5
class Table(Base, DictableMixin):
    __tablename__ = 'tables'

    vid = SAColumn('t_vid', String(15), primary_key=True)
    id = SAColumn('t_id', String(12), primary_key=False)
    d_id = SAColumn('t_d_id', String(10))
    d_vid = SAColumn('t_d_vid',
                     String(13),
                     ForeignKey('datasets.d_vid'),
                     index=True)

    sequence_id = SAColumn('t_sequence_id', Integer, nullable=False)
    name = SAColumn('t_name', String(200), nullable=False)
    altname = SAColumn('t_altname', Text)
    summary = SAColumn('t_summary', Text)
    description = SAColumn('t_description', Text)
    universe = SAColumn('t_universe', String(200))
    keywords = SAColumn('t_keywords', Text)
    type = SAColumn('t_type', String(20))

    # Reference to a column that provides an example of how this table should be used.
    proto_vid = SAColumn('t_proto_vid', String(20), index=True)

    installed = SAColumn('t_installed', String(100))

    data = SAColumn('t_data', MutationDict.as_mutable(JSONEncodedObj))

    c_sequence_id = SAColumn('t_c_sequence_id', Integer, default=1)

    __table_args__ = (
        UniqueConstraint('t_sequence_id', 't_d_vid', name='_uc_tables_1'),
        UniqueConstraint('t_name', 't_d_vid', name='_uc_tables_2'),
    )

    columns = relationship(Column,
                           backref='table',
                           order_by='asc(Column.sequence_id)',
                           cascade='all, delete-orphan',
                           lazy='joined')

    _column_sequence = {}

    @staticmethod
    def mangle_name(name, preserve_case=False):
        import re

        assert name

        try:
            r = re.sub(r'[^\w_]', '_', name.strip())

            if not preserve_case:
                r = r.lower()

            return r
        except TypeError:
            raise TypeError('Not a valid type for name ' + str(type(name)))

    @property
    def primary_columns(self):
        """Iterate over the primary columns, columns which do not have a parent"""
        for c in self.columns:
            if not c.parent:
                yield c

    @property
    def dimensions(self):
        """Iterate over the dimension columns, regardless of parent/child status

        """
        from ambry.valuetype.core import ROLE

        for c in self.columns:

            if c.role == ROLE.DIMENSION:
                yield c

    @property
    def primary_dimensions(self):
        """Iterate over the primary dimension columns, columns which do not have a parent

        """
        from ambry.valuetype.core import ROLE

        for c in self.columns:

            if not c.parent and c.role == ROLE.DIMENSION:
                yield c

    @property
    def primary_measures(self):
        """Iterate over the primary columns, columns which do not have a parent

        Also sets the property partition_stats to the stats collection for the partition and column.
        """
        from ambry.valuetype.core import ROLE

        for c in self.columns:

            if not c.parent and c.role == ROLE.MEASURE:
                yield c

    def column(self, ref):
        # AFAIK, all of the columns in the relationship will get loaded if any one is accessed,
        # so iterating over the collection only involves one SELECT.
        from .column import Column

        column_name = Column.mangle_name(str(ref))

        for c in self.columns:
            if str(column_name) == c.name or str(ref) == c.id or str(
                    ref) == c.vid:
                return c

        raise NotFoundError(
            "Failed to find column '{}' in table '{}' for ref: '{}' ".format(
                ref, self.name, ref))

    def add_column(self, name, update_existing=False, **kwargs):
        """
        Add a column to the table, or update an existing one.
        :param name: Name of the new or existing column.
        :param update_existing: If True, alter existing column values. Defaults to False
        :param kwargs: Other arguments for the Column() constructor
        :return: a Column object
        """
        from ..identity import ColumnNumber

        try:
            c = self.column(name)
            extant = True

            if not update_existing:
                return c

        except NotFoundError:

            sequence_id = len(self.columns) + 1

            assert sequence_id

            c = Column(t_vid=self.vid,
                       sequence_id=sequence_id,
                       vid=str(
                           ColumnNumber(ObjectNumber.parse(self.vid),
                                        sequence_id)),
                       name=name,
                       datatype='str')
            extant = False

        # Update possibly existing data
        c.data = dict((list(c.data.items()) if c.data else []) +
                      list(kwargs.get('data', {}).items()))

        for key, value in list(kwargs.items()):

            if key[0] != '_' and key not in [
                    't_vid', 'name', 'sequence_id', 'data'
            ]:

                # Don't update the type if the user has specified a custom type
                if key == 'datatype' and not c.type_is_builtin():
                    continue

                # Don't change a datatype that is already set if the new value is unknown
                if key == 'datatype' and value == 'unknown' and c.datatype:
                    continue

                # Don't overwrite an existing description with an empty one
                if key == 'description' and not value:
                    continue

                try:
                    setattr(c, key, value)
                except AttributeError:
                    raise AttributeError(
                        "Column record has no attribute {}".format(key))

            if key == 'is_primary_key' and isinstance(value,
                                                      str) and len(value) == 0:
                value = False
                setattr(c, key, value)

        # If the id column has a description and the table does not, add it to
        # the table.
        if c.name == 'id' and c.is_primary_key and not self.description:
            self.description = c.description

        if not extant:
            self.columns.append(c)

        return c

    def add_id_column(self, description=None):
        from . import Column
        self.add_column(
            name='id',
            datatype=Column.DATATYPE_INTEGER,
            is_primary_key=True,
            description=self.description if not description else description)

    def is_empty(self):
        """Return True if the table has no columns or the only column is the id"""
        if len(self.columns) == 0:
            return True

        if len(self.columns) == 1 and self.columns[0].name == 'id':
            return True

        return False

    @property
    def header(self):
        """Return an array of column names in the same order as the column
        definitions, to be used zip with a row when reading a CSV file.

        >> row = dict(zip(table.header, row))

        """

        return [c.name for c in self.columns]

    @property
    def dict(self):
        INCLUDE_FIELDS = [
            'id', 'vid', 'd_id', 'd_vid', 'sequence_id', 'name', 'altname',
            'vname', 'description', 'universe', 'keywords', 'installed',
            'proto_vid', 'type', 'codes'
        ]

        d = {
            k: v
            for k, v in six.iteritems(self.__dict__) if k in INCLUDE_FIELDS
        }

        if self.data:
            for k in self.data:
                assert k not in d, "Value '{}' is a table field and should not be in data ".format(
                    k)
                d[k] = self.data[k]

        d['is_geo'] = False

        for c in self.columns:
            if c.name in ('geometry', 'wkt', 'wkb', 'lat'):
                d['is_geo'] = True

        d['foreign_indexes'] = list(
            set([
                c.data['index'].split(":")[0] for c in self.columns
                if c.data.get('index', False)
            ]))

        return d

    def update_from_stats(self, stats):
        """Update columns based on partition statistics"""

        sd = dict(stats)

        for c in self.columns:

            if c not in sd:
                continue

            stat = sd[c]

            if stat.size and stat.size > c.size:
                c.size = stat.size

            c.lom = stat.lom

    def update_id(self, sequence_id=None, force=True):
        """Alter the sequence id, and all of the names and ids derived from it. This
        often needs to be don after an IntegrityError in a multiprocessing run"""
        from ..identity import ObjectNumber

        if sequence_id:
            self.sequence_id = sequence_id

        assert self.d_vid

        if self.id is None or force:
            dataset_id = ObjectNumber.parse(self.d_vid).rev(None)
            self.d_id = str(dataset_id)
            self.id = str(TableNumber(dataset_id, self.sequence_id))

        if self.vid is None or force:
            dataset_vid = ObjectNumber.parse(self.d_vid)
            self.vid = str(TableNumber(dataset_vid, self.sequence_id))

    @property
    def transforms(self):
        """Return an array of arrays of column transforms.

        #The return value is an list of list, with each list being a segment of column transformations, and
        #each segment having one entry per column.

        """

        tr = []
        for c in self.columns:
            tr.append(c.expanded_transform)

        return six.moves.zip_longest(*tr)

    @property
    def row(self):
        from collections import OrderedDict
        import six

        # Use an Ordered Dict to make it friendly to creating CSV files.
        SKIP_KEYS = [
            'id', 'd_id', 'd_vid', 'dataset', 'columns', 'data', 'partitions',
            'sources', 'process_records'
        ]

        d = OrderedDict([(p.key, getattr(self, p.key))
                         for p in self.__mapper__.attrs
                         if p.key not in SKIP_KEYS])

        for k, v in six.iteritems(self.data):
            d['d_' + k] = v

        return d

    def __str__(self):
        from tabulate import tabulate

        headers = 'Seq Vid Name Datatype ValueType'.split()
        rows = [(c.sequence_id, c.vid, c.name, c.datatype, c.valuetype)
                for c in self.columns]

        return ('Dest Table: {}\n'.format(self.name)) + tabulate(rows, headers)

    def _repr_html_(self):
        from tabulate import tabulate
        from ambry.util import drop_empty

        def record_gen():
            for i, row in enumerate([c.row for c in self.columns]):
                if i == 0:
                    yield row.keys()
                yield row.values()

        records = list(record_gen())

        records = drop_empty(records)

        return "<h2>{}</h2>".format(self.name) + tabulate(
            records[1:], headers=records[0], tablefmt="html")

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for Sqlalchemy to set the seqience_id for this
        object and create an ObjectNumber value for the id"""
        if target.sequence_id is None:
            from ambry.orm.exc import DatabaseError
            raise DatabaseError('Must have sequence id before insertion')

        Table.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        """Set the Table ID based on the dataset number and the sequence number
        for the table."""

        target.name = Table.mangle_name(target.name)

        if isinstance(target, Column):
            raise TypeError('Got a column instead of a table')

        target.update_id(target.sequence_id, False)
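
A short illustration of the Table helpers above; the names are made up:

print(Table.mangle_name('My Table-Name'))   # -> 'my_table_name'

# add_column() returns an existing column unchanged unless
# update_existing=True is passed, and assigns new sequence ids from
# len(self.columns) + 1:
#
#   t.add_column('count', datatype='int', description='Household count')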
Example 6
class Process(Base):
    """Track processes and operations on database objects"""
    __tablename__ = 'processes'

    id = SAColumn('pr_id', Integer, primary_key=True)

    group = SAColumn('pr_group', Integer, ForeignKey('processes.pr_id'), nullable=True, index=True)
    parent = relationship('Process',  remote_side=[id], backref='children')

    stage = SAColumn('pr_stage', Integer, default=0)
    phase = SAColumn('pr_phase', Text, doc='Process phase: such as ingest or build')

    hostname = SAColumn('pr_host', Text)
    pid = SAColumn('pr_pid', Integer)

    d_vid = SAColumn('pr_d_vid', String(13), ForeignKey('datasets.d_vid'), nullable=False, index=True)
    dataset = relationship('Dataset', backref='process_records')

    t_vid = SAColumn('pr_t_vid', String(15), ForeignKey('tables.t_vid'), nullable=True, index=True)
    table = relationship('Table', backref='process_records')

    s_vid = SAColumn('pr_s_vid', String(17), ForeignKey('datasources.ds_vid'), nullable=True, index=True)
    source = relationship('DataSource', backref='process_records')

    p_vid = SAColumn('pr_p_vid', String(17), ForeignKey('partitions.p_vid'), nullable=True, index=True)
    partition = relationship('Partition', backref='process_records')

    created = SAColumn('pr_created', Float,
                        doc='Creation date: time in seconds since the epoch as an integer.')

    modified = SAColumn('pr_modified', Float,
                        doc='Modification date: time in seconds since the epoch as an integer.')

    item_type = SAColumn('pr_type', Text, doc='Item type, such as table, source or partition')

    item_count = SAColumn('pr_count', Integer, doc='Number of items processed')
    item_total = SAColumn('pr_items', Integer, doc='Number of items to be processed')

    message = SAColumn('pr_message', Text)

    state = SAColumn('pr_state', Text)

    exception_class = SAColumn('pr_ex_class', Text)
    exception_trace = SAColumn('pr_ex_trace', Text)

    log_action = SAColumn('pr_action', Text)

    data = SAColumn('pr_data', MutationDict.as_mutable(JSONEncodedObj))

    def __repr__(self):
        return "{} {}/{} {}:{} {} {}".format(
            self.d_vid, self.hostname, self.pid,
            self.phase if self.phase else '?', self.stage,
            self.log_action, self.message)

    def __str__(self):
        return self.__repr__()

    @property
    def log_str(self):
        import platform
        import os

        parts = []

        # This bit only gets executed when records stored in the database by one node or process
        # are read from another. It won't print out in normal logging.
        if self.hostname != platform.node() or self.pid != os.getpid():
            hostpid = "({}@{})".format(self.pid, self.hostname)
            parts.append(hostpid)

        am = {
            'start': ">",
            'add': '+',
            'update': '.',
            'done': "<",
            '': '?',
            None: '?'
        }

        phase_str = self.phase if self.phase else '?'

        if self.stage:
            phase_str = phase_str + ':' + str(self.stage)

        parts.append(phase_str)

        action_char = am.get(self.log_action, '')

        if self.state == 'error':
            action_char = '!'

        parts.append(action_char)

        if self.s_vid:
            parts.append(self.s_vid)

        if self.t_vid:
            parts.append(self.t_vid)

        if self.p_vid:
            parts.append(self.p_vid)

        parts.append(self.message if self.message else '')

        if self.item_count:
            ic = 'processed '+str(self.item_count)

            if self.item_total:
                ic += ' of {}'.format(self.item_total)

            if self.item_type:
                ic += ' '+self.item_type

            parts.append(ic)

        return ' '.join(parts)
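
    # Illustrative shape of an assembled line (all field values are made up):
    #
    #     build:2 + <s_vid> Built the table processed 100 of 500 table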

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """
        from collections import OrderedDict

        return OrderedDict(
            (p.key, getattr(self, p.key)) for p in self.__mapper__.attrs
            if p.key not in ('partition', 'source', 'table', 'dataset',
                             'children', 'parent'))


    @staticmethod
    def before_insert(mapper, conn, target):
        from time import time
        target.created = time()

        Process.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        from time import time
        target.modified = time()
Esempio n. 7
0
class DataSource(DataSourceBase, Base, DictableMixin):
    """A source of data, such as a remote file or bundle"""

    __tablename__ = 'datasources'

    vid = SAColumn('ds_vid', String(17), primary_key=True)
    sequence_id = SAColumn('ds_sequence_id', INTEGER)

    name = SAColumn('ds_name', Text)
    d_vid = SAColumn('ds_d_vid',
                     String(13),
                     ForeignKey('datasets.d_vid'),
                     nullable=False,
                     doc='Dataset vid')

    title = SAColumn('ds_title', Text)

    st_vid = SAColumn('ds_st_vid',
                      String(22),
                      ForeignKey('sourcetables.st_vid'),
                      nullable=True)
    source_table_name = SAColumn('ds_st_name', Text)
    _source_table = relationship(SourceTable, backref='sources')

    t_vid = SAColumn('ds_t_vid',
                     String(15),
                     ForeignKey('tables.t_vid'),
                     nullable=True,
                     doc='Table vid')
    dest_table_name = SAColumn('ds_dt_name', Text)
    _dest_table = relationship(Table, backref='sources')

    stage = SAColumn('ds_stage', INTEGER,
                     default=0)  # Order in which to process sources.
    pipeline = SAColumn('ds_pipeline', Text)

    time = SAColumn('ds_time', Text)
    space = SAColumn('ds_space', Text)
    grain = SAColumn('ds_grain', Text)
    epsg = SAColumn(
        'ds_epsg',
        INTEGER,
        doc='EPSG SRID for the reference system of a geographic dataset. ')
    segment = SAColumn('ds_segment', Text)
    start_line = SAColumn('ds_start_line', INTEGER)
    end_line = SAColumn('ds_end_line', INTEGER)
    comment_lines = SAColumn('ds_comment_lines',
                             MutationList.as_mutable(JSONEncodedObj))
    header_lines = SAColumn('ds_header_lines',
                            MutationList.as_mutable(JSONEncodedObj))
    description = SAColumn('ds_description', Text)
    file = SAColumn('ds_file', Text)

    filetype = SAColumn('ds_filetype', Text)  # tsv, csv, fixed, partition
    encoding = SAColumn('ds_encoding', Text)

    hash = SAColumn('ds_hash', Text)

    reftype = SAColumn('ds_reftype', Text)  # null, zip, ref, template
    ref = SAColumn('ds_ref', Text)

    state = SAColumn('ds_state', Text)

    account_acessor = None

    __table_args__ = (UniqueConstraint('ds_d_vid',
                                       'ds_name',
                                       name='_uc_ds_d_vid'), )
Esempio n. 8
0
class ColumnStat(Base):

    """Table for per column, per partition stats."""
    __tablename__ = 'colstats'

    p_vid = SAColumn('cs_p_vid', String(20), ForeignKey('partitions.p_vid'), primary_key=True,
                     nullable=False, index=True)
    #partition = relationship('Partition', backref='stats')

    c_vid = SAColumn('cs_c_vid', String(20), ForeignKey('columns.c_vid'), primary_key=True,
                     nullable=False, index=True)
    column = relationship('Column', backref='stats')

    d_vid = SAColumn('cs_d_vid', String(20), ForeignKey('datasets.d_vid'), nullable=False, index=True)
    dataset = relationship('Dataset', backref='stats')

    lom = SAColumn('cs_lom', String(1))
    count = SAColumn('cs_count', BigIntegerType)
    mean = SAColumn('cs_mean', Float)
    std = SAColumn('cs_std', Float)
    min = SAColumn('cs_min', Float)
    p25 = SAColumn('cs_p25', Float)
    p50 = SAColumn('cs_p50', Float)
    p75 = SAColumn('cs_p75', Float)
    max = SAColumn('cs_max', Float)
    nuniques = SAColumn('cs_nuniques', BigIntegerType)

    width = SAColumn('cs_width', Integer)

    skewness = SAColumn('cs_skewness', Float)
    kurtosis = SAColumn('cs_kurtosis', Float)

    uvalues = SAColumn('f_uvalues', MutationDict.as_mutable(JSONEncodedObj))
    hist = SAColumn('f_hist', MutationList.as_mutable(JSONEncodedObj))

    text_hist = SAColumn('cs_text_hist', String)

    __table_args__ = (
        UniqueConstraint('cs_p_vid', 'cs_c_vid', name='u_cols_stats'),
    )


    @property
    def dict(self):

        # JSON has no representation for NaN or +/-Infinity; the spec maps
        # them all to null, so convert those floats to None here.
        def nullify(k, v):
            import math

            if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
                return None
            else:
                return v

        return {p.key: nullify(p.key, getattr(self, p.key))
                for p in self.__mapper__.attrs
                if p.key not in ('data', 'column', 'table', 'partition', 'dataset')}
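
    # Illustrative behavior of nullify above (hypothetical values):
    #
    #     nullify('mean', float('nan'))  -> None
    #     nullify('std', float('inf'))   -> None
    #     nullify('mean', 1.5)           -> 1.5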
Esempio n. 9
0
class SourceColumn(Base):

    __tablename__ = 'sourcecolumns'

    _parent_col = 'sc_st_vid'

    DATATYPE = Constant()
    DATATYPE.INT = int.__name__
    DATATYPE.FLOAT = float.__name__
    DATATYPE.STRING = six.binary_type.__name__
    DATATYPE.UNICODE = six.text_type.__name__
    DATATYPE.DATE = datetime.date.__name__
    DATATYPE.TIME = datetime.time.__name__
    DATATYPE.DATETIME = datetime.datetime.__name__
    DATATYPE.UNKNOWN = unknown.__name__

    type_map = {
        DATATYPE.INT: int,
        DATATYPE.FLOAT: float,
        DATATYPE.STRING: six.binary_type,
        DATATYPE.UNICODE: six.text_type,
        DATATYPE.DATE: datetime.date,
        DATATYPE.TIME: datetime.time,
        DATATYPE.DATETIME: datetime.datetime,
        DATATYPE.UNKNOWN: unknown
    }

    column_type_map = {  # FIXME The Column types should be harmonized with these types
        DATATYPE.INT: Column.DATATYPE_INTEGER,
        DATATYPE.FLOAT: Column.DATATYPE_FLOAT,
        DATATYPE.STRING: Column.DATATYPE_STR,
        DATATYPE.UNICODE: Column.DATATYPE_STR,
        DATATYPE.DATE: Column.DATATYPE_DATE,
        DATATYPE.TIME: Column.DATATYPE_TIME,
        DATATYPE.DATETIME: Column.DATATYPE_DATETIME,
        DATATYPE.UNKNOWN: Column.DATATYPE_STR
    }

    vid = SAColumn('sc_vid', String(21), primary_key=True)

    d_vid = SAColumn('sc_d_vid',
                     String(13),
                     ForeignKey('datasets.d_vid'),
                     nullable=False)
    st_vid = SAColumn('sc_st_vid',
                      String(17),
                      ForeignKey('sourcetables.st_vid'),
                      nullable=False)

    position = SAColumn('sc_position',
                        Integer,
                        doc='Integer position of column')

    source_header = SAColumn(
        'sc_source_header',
        Text,
        doc='Column header, after coalescing but before mangling.')
    dest_header = SAColumn('sc_dest_header',
                           Text,
                           doc='The original header, after mangling')

    datatype = SAColumn('sc_datatype',
                        Text,
                        doc='Basic data type, usually intuited')
    valuetype = SAColumn(
        'sc_valuetype',
        Text,
        doc='Describes the meaning of the value: state, county, address, etc.')
    has_codes = SAColumn('sc_has_codes',
                         Boolean,
                         default=False,
                         doc='If True column also has codes of different type')

    start = SAColumn('sc_start',
                     Integer,
                     doc='For fixed width, the column starting position')
    width = SAColumn('sc_width',
                     Integer,
                     doc='For Fixed width, the field width')
    size = SAColumn(
        'sc_size',
        Integer,
        doc='Max size of the column values, after conversion to strings.')

    summary = SAColumn('sc_summary', Text, doc='Short text description')
    description = SAColumn('sc_description', Text, doc='Long text description')

    value_labels = SAColumn('sc_value_labels',
                            MutationDict.as_mutable(JSONEncodedObj))

    _next_column_number = None  # Set in next_config_number()

    __table_args__ = (UniqueConstraint('sc_st_vid',
                                       'sc_source_header',
                                       name='_uc_sourcecolumns'), )

    @property
    def name(self):
        return self.source_header

    @property
    def python_datatype(self):
        return self.type_map[self.datatype]

    @property
    def column_datatype(self):
        """Return the data type using the values defined for the schema"""
        return self.column_type_map[self.datatype]

    @staticmethod
    def mangle_name(name):
        """Mangles a column name to a standard form, removing illegal characters.

        :param name:
        :return:

        """
        import re

        try:
            return re.sub(r'_+', '_',
                          re.sub(r'[^\w_]', '_', name).lower()).rstrip('_')
        except TypeError:
            raise TypeError('Trying to mangle name with invalid type of: ' +
                            str(type(name)))
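
    # A doctest-style sketch of mangle_name (the input is illustrative):
    #
    #     >>> SourceColumn.mangle_name('Total Pop. (2010)')
    #     'total_pop_2010'
    #
    # Non-word characters become underscores, runs of underscores collapse,
    # and the result is lowercased with trailing underscores stripped.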

    @property
    def row(self):
        from collections import OrderedDict

        # Use an Ordered Dict to make it friendly to creating CSV files.

        d = OrderedDict([('table', self.table.name)] +
                        [(p.key, getattr(self, p.key))
                         for p in self.__mapper__.attrs if p.key not in [
                             'vid', 'st_vid', 'table', 'dataset', 'ds_id',
                             'd_vid', 'source', 'value_labels'
                         ]])

        return d

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """
        from collections import OrderedDict
        SKIP_KEYS = ()
        return OrderedDict((p.key, getattr(self, p.key))
                           for p in self.__mapper__.attrs
                           if p.key not in SKIP_KEYS)

    def update(self, **kwargs):

        # In source_schema.csv, 'table' is the name of the table, not the object.
        kwargs.pop('table', None)

        for k, v in list(kwargs.items()):
            if hasattr(self, k):

                if k == 'dest_header':
                    # Don't reset the dest header on updates.
                    if self.dest_header and self.dest_header != self.source_header:
                        continue

                setattr(self, k, v)
Esempio n. 10
0
class Config(Base):

    __tablename__ = 'config'
    __table_args__ = (UniqueConstraint('co_d_vid',
                                       'co_type',
                                       'co_group',
                                       'co_key',
                                       name='_type_group_key_uc'), )

    id = SAColumn('co_id', String(32), primary_key=True)
    sequence_id = SAColumn('co_sequence_id',
                           Integer,
                           nullable=False,
                           index=True)

    d_vid = SAColumn('co_d_vid',
                     String(16),
                     ForeignKey('datasets.d_vid'),
                     index=True,
                     doc='Dataset vid')
    type = SAColumn('co_type',
                    String(200),
                    doc='Type of the config: metadata, process, sync, etc...')
    group = SAColumn('co_group',
                     String(200),
                     doc='Group of the config: identity, about, etc...')
    key = SAColumn('co_key', String(200), doc='Key of the config')
    value = SAColumn('co_value',
                     JSONAlchemy(Text()),
                     doc='Value of the config key.')
    modified = SAColumn(
        'co_modified',
        Integer(),
        doc='Modification date: time in seconds since the epoch as an integer.')

    # Foreign key constraints make it hard to dump all of the configs to a new bundle database in
    # ambry.orm.database.Database#copy_dataset, so I've removed the foreign key constraint.
    # TODO: Write a test for that note.

    parent_id = SAColumn(String(32),
                         ForeignKey('config.co_id'),
                         nullable=True,
                         doc='Id of the parent config.')

    parent = relationship('Config', remote_side=[id])
    children = relationship('Config')

    def incver(self):
        """Increment all of the version numbers and return a new object"""
        from . import incver
        return incver(self, ['d_vid', 'id', 'parent_id'])

    @property
    def dict(self):
        return {p.key: getattr(self, p.key) for p in self.__mapper__.attrs}

    def __repr__(self):
        return u('<config: {} {},{},{} = {}>').format(self.id, self.d_vid,
                                                      self.group, self.key,
                                                      self.value)

    @property
    def dotted_key(self):
        return '{}.{}.{}'.format(self.type, self.group, self.key)
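
    # For example, a config with type 'metadata', group 'identity' and key
    # 'name' yields the dotted key 'metadata.identity.name'.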

    def update_sequence_id(self, session, dataset):
        assert dataset.vid == self.d_vid
        assert session
        # NOTE: This next_sequence_id uses a different algorithm than dataset.next_sequence_id
        # FIXME replace this one with dataset.next_sequence_id
        self.sequence_id = next_sequence_id(session, dataset._sequence_ids,
                                            self.d_vid, Config)
        self.id = str(GeneralNumber1('F', self.d_vid, self.sequence_id))

    @staticmethod
    def before_insert(mapper, conn, target):

        if not target.sequence_id:
            from ambry.orm.exc import DatabaseError
            assert bool(target.d_vid)
            raise DatabaseError('Must set a sequence id before inserting')

        if not target.id:
            target.id = str(
                GeneralNumber1('F', target.d_vid, target.sequence_id))

        Config.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):

        if object_session(target).is_modified(target,
                                              include_collections=False):
            target.modified = time()
Esempio n. 11
0
class SourceTable(Base):
    __tablename__ = 'sourcetables'

    vid = SAColumn('st_vid', String(22), primary_key=True)
    sequence_id = SAColumn('st_sequence_id', Integer, nullable=False)
    d_vid = SAColumn('st_d_vid',
                     String(16),
                     ForeignKey('datasets.d_vid'),
                     nullable=False)
    name = SAColumn('st_name', String(50), nullable=False)

    columns = relationship(SourceColumn,
                           backref='table',
                           order_by='asc(SourceColumn.position)',
                           cascade='all, delete-orphan',
                           lazy='joined')

    __table_args__ = (UniqueConstraint('st_d_vid',
                                       'st_name',
                                       name='_uc_sourcetables'), )

    def column(self, source_header_or_pos):
        """
        Return a column by name or position.

        :param source_header_or_pos: If a string, a source header name. If an integer, a column position.
        :return:
        """
        for c in self.columns:
            if (c.source_header == source_header_or_pos
                    or c.position == source_header_or_pos):
                assert c.st_vid == self.vid
                return c

        return None
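
    # Both lookup forms return the same SourceColumn (illustrative):
    #
    #     st.column('tot_pop') is st.column(1)   # True when 'tot_pop' is at position 1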

    def add_column(self, position, source_header, datatype, **kwargs):
        """
        Add a column to the source table.
        :param position: Integer position of the column started from 1.
        :param source_header: Name of the column, as it exists in the source file
        :param datatype: Python datatype ( str, int, float, None ) for the column
        :param kwargs:  Other source record args.
        :return:
        """
        from ..identity import GeneralNumber2

        c = self.column(source_header)
        c_by_pos = self.column(position)

        datatype = 'str' if datatype == 'unicode' else datatype

        assert not c or not c_by_pos or c.vid == c_by_pos.vid

        # Convert almost anything to True / False
        if 'has_codes' in kwargs:
            FALSE_VALUES = ['False', 'false', 'F', 'f', '', None, 0, '0']
            kwargs['has_codes'] = False if kwargs[
                'has_codes'] in FALSE_VALUES else True

        if c:

            # Changing the position can result in conflicts
            assert not c_by_pos or c_by_pos.vid == c.vid

            c.update(position=position,
                     datatype=datatype.__name__
                     if isinstance(datatype, type) else datatype,
                     **kwargs)

        elif c_by_pos:

            # FIXME This feels wrong; there probably should not be any changes to the table
            # at all, since then it won't represent the previous source. Maybe all of the sources
            # should get their own tables initially, then afterward the duplicates can be removed.

            assert not c or c_by_pos.vid == c.vid

            c_by_pos.update(source_header=source_header,
                            datatype=datatype.__name__ if isinstance(
                                datatype, type) else datatype,
                            **kwargs)

        else:

            assert not c and not c_by_pos

            # Hacking an id number, since I don't want to create a new Identity ObjectNumber type
            c = SourceColumn(vid=str(
                GeneralNumber2('C', self.d_vid, self.sequence_id,
                               int(position))),
                             position=position,
                             st_vid=self.vid,
                             d_vid=self.d_vid,
                             datatype=datatype.__name__ if isinstance(
                                 datatype, type) else datatype,
                             source_header=source_header,
                             **kwargs)

            self.columns.append(c)

        return c

    @property
    def column_map(self):
        return {c.source_header: c.dest_header for c in self.columns}

    @property
    def column_index_map(self):
        return {c.source_header: c.position for c in self.columns}
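
    # Illustrative shapes of the two maps above, for a single column with
    # source header 'Tot Pop', mangled to 'tot_pop', at position 1:
    #
    #     column_map       -> {'Tot Pop': 'tot_pop'}
    #     column_index_map -> {'Tot Pop': 1}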

    @property
    def headers(self):
        return [c.source_header for c in self.columns]

    @property
    def widths(self):
        widths = [c.width for c in self.columns]
        if not all(bool(e) for e in widths):
            from ambry.dbexceptions import ConfigurationError
            raise ConfigurationError(
                'The widths array for source table {} has zero or null entries'
                .format(self.name))

        widths = [int(w) for w in widths]

        return widths

    def update_id(self, sequence_id=None):
        """Alter the sequence id, and all of the names and ids derived from it. This
        often needs to be don after an IntegrityError in a multiprocessing run"""
        from ..identity import GeneralNumber1

        if sequence_id:
            self.sequence_id = sequence_id

        self.vid = str(GeneralNumber1('T', self.d_vid, self.sequence_id))

    def __str__(self):
        from tabulate import tabulate

        headers = 'Pos Source_Header Dest_Header Datatype '.split()
        rows = [(c.position, c.source_header, c.dest_header, c.datatype)
                for c in self.columns]

        return ('Source Table: {}\n'.format(self.name)) + tabulate(
            rows, headers)
Esempio n. 12
0
class Dataset(Base):
    __tablename__ = 'datasets'

    vid = SAColumn('d_vid', String(13), primary_key=True)
    id = SAColumn('d_id', String(10))
    name = SAColumn('d_name', String(200), nullable=False, index=True)
    vname = SAColumn('d_vname',
                     String(200),
                     unique=True,
                     nullable=False,
                     index=True)
    fqname = SAColumn('d_fqname', String(200), unique=True, nullable=False)
    cache_key = SAColumn('d_cache_key',
                         String(200),
                         unique=True,
                         nullable=False,
                         index=True)
    source = SAColumn('d_source', String(200), nullable=False)
    dataset = SAColumn('d_dataset', String(200), nullable=False)
    subset = SAColumn('d_subset', String(200))
    variation = SAColumn('d_variation', String(200))
    btime = SAColumn('d_btime', String(200))
    bspace = SAColumn('d_bspace', String(200))
    revision = SAColumn('d_revision', Integer, nullable=False)
    version = SAColumn('d_version', String(20), nullable=False)

    space_coverage = SAColumn('d_scov',
                              MutationList.as_mutable(JSONEncodedObj))
    time_coverage = SAColumn('d_tcov', MutationList.as_mutable(JSONEncodedObj))
    grain_coverage = SAColumn('d_gcov',
                              MutationList.as_mutable(JSONEncodedObj))

    # Sequence IDs for various objects. We need records of these IDs to be able to
    # construct objects in multi-process environments. The sequence numbers become part
    # of the VIDs and must be unique.
    p_sequence_id = SAColumn('d_p_sequence_id', Integer, default=1)
    t_sequence_id = SAColumn('d_t_sequence_id', Integer, default=1)
    st_sequence_id = SAColumn('d_st_sequence_id', Integer, default=1)

    state = SAColumn('d_state',
                     String(20),
                     doc='Indicates last operation on the dataset'
                     )  # Note! Different from Bundle.state!

    upstream = SAColumn('d_upstream',
                        String(200),
                        doc='The URL of the upstream source')

    data = SAColumn('d_data', MutationDict.as_mutable(JSONEncodedObj))

    # ---- Relationships

    tables = relationship('Table',
                          backref='dataset',
                          cascade='all, delete-orphan')

    partitions = relationship('Partition',
                              backref='dataset',
                              cascade='all, delete-orphan')

    configs = relationship('Config',
                           backref='dataset',
                           cascade='all, delete-orphan')

    files = relationship('File',
                         backref='dataset',
                         cascade='all, delete-orphan')

    source_tables = relationship('SourceTable',
                                 backref='dataset',
                                 cascade='all, delete-orphan')

    source_columns = relationship('SourceColumn',
                                  backref='dataset',
                                  cascade='all, delete-orphan')

    sources = relationship('DataSource',
                           backref='dataset',
                           cascade='all, delete-orphan')

    codes = relationship('Code',
                         backref='dataset',
                         cascade='all, delete-orphan')

    path = None  # Set by the Library and other queries.
    _database = None  # Reference to the database, when dataset is retrieved from a database object

    _sequence_ids = {}  # Cache of sequence numbers ( Is this still used? )

    def __init__(self, *args, **kwargs):

        super(Dataset, self).__init__(*args, **kwargs)

        if self.vid and not self.id:
            self.revision = ObjectNumber.parse(self.vid).revision
            self.id = str(ObjectNumber.parse(self.vid).rev(None))

        if not self.id:
            dn = DatasetNumber(None, self.revision)
            self.vid = str(dn)
            self.id = str(dn.rev(None))
        elif not self.vid:
            try:
                self.vid = str(ObjectNumber.parse(self.id).rev(self.revision))
            except ValueError as e:
                raise ValueError('Could not parse id value; ' + str(e))

        if not self.revision:
            self.revision = 1

        if self.cache_key is None:
            self.cache_key = self.identity.name.cache_key

        if not self.name:
            self.name = str(self.identity.name)

        if not self.vname:
            self.vname = str(self.identity.vname)

        if not self.fqname:
            self.fqname = str(self.identity.fqname)

        if not self.version:
            self.version = str(self.identity.version)

        assert self.vid[0] == 'd'

    def incver(self):
        """Increment all of the version numbers"""
        d = {}
        for p in self.__mapper__.attrs:
            if p.key in ['vid', 'vname', 'fqname', 'version', 'cache_key']:
                continue
            if p.key == 'revision':
                d[p.key] = self.revision + 1
            else:
                d[p.key] = getattr(self, p.key)

        n = Dataset(**d)

        return n

    def commit(self):
        self._database.commit()

    def rollback(self):
        self._database.rollback()

    @property
    def session(self):
        return self._database.session

    def query(self, *args, **kwargs):
        return self.session.query(*args, **kwargs)

    def close(self):
        return self._database.close()

    def close_session(self):
        return self._database.close_session()

    @property
    def identity(self):
        from ..identity import Identity
        return Identity.from_dict(self.dict)

    @property
    def config(self):
        return ConfigAccessor(self)

    def next_sequence_id(self, table_class, force_query=False):
        """Return the next sequence id for a object, identified by the vid of the parent object, and the database prefix
        for the child object. On the first call, will load the max sequence number
        from the database, but subsequence calls will run in process, so this isn't suitable for
        multi-process operation -- all of the tables in a dataset should be created by one process

        The child table must have a sequence_id value.

        """

        from . import next_sequence_id
        from sqlalchemy.orm import object_session

        # NOTE: This next_sequence_id uses a different algorithm than dataset.next_sequence_id
        # FIXME replace this one with dataset.next_sequence_id
        return next_sequence_id(object_session(self),
                                self._sequence_ids,
                                self.vid,
                                table_class,
                                force_query=force_query)

    def new_unique_object(self,
                          table_class,
                          sequence_id=None,
                          force_query=False,
                          **kwargs):
        """Use next_sequence_id to create a new child of the dataset, with a unique id"""
        from sqlalchemy.exc import IntegrityError
        from sqlalchemy.orm.exc import FlushError

        # If a sequence ID was specified, the caller is certain
        #  that there is no potential for conflicts,
        # so there is no need to commit here.
        if not sequence_id:
            commit = True
            sequence_id = self.next_sequence_id(table_class,
                                                force_query=force_query)
        else:
            commit = False

        o = table_class(d_vid=self.vid, **kwargs)

        o.update_id(sequence_id)

        if commit is False:
            return o

        self.commit()

        if self._database.driver == 'sqlite':
            # The Sqlite database can't have concurrency, so there no problem.
            self.session.add(o)
            self.commit()
            return o
        else:  # Postgres. Concurrency is a bitch.
            # NOTE: these two values are unused; they are left over from the
            # disabled retry implementation at the end of this method.
            table_name = table_class.__tablename__
            child_sequence_id = table_class.sequence_id.property.columns[
                0].name

        try:

            self.session.add(o)
            self.commit()
            return o

        except (IntegrityError, FlushError) as e:
            self.rollback()
            self.session.merge(self)
            print('Failed')
            return None

        # NOTE: everything below this return is unreachable; it is kept as a
        # disabled fallback.
        return
        # This is horrible, but it's the only thing that has worked for both
        # Sqlite and Postgres in both single processes and multiprocesses.
        d_vid = self.vid
        while True:
            try:
                self.session.add(o)
                self.commit()
                return o

            except (IntegrityError, FlushError) as e:

                self.rollback()

                self.session.expunge_all()
                ds = self._database.dataset(d_vid)
                sequence_id = ds.next_sequence_id(table_class,
                                                  force_query=True)

                o.update_id(sequence_id)

            except Exception as e:

                print(
                    'Completely failed to get a new {} sequence_id; {}'.format(
                        table_class, e))
                self.rollback()
                import traceback

                # This bit is helpful in a multiprocessing run.
                tb = traceback.format_exc()

                print(tb)
                raise

    def table(self, ref):
        from .exc import NotFoundError
        from .table import Table

        table_name = Table.mangle_name(str(ref))

        for t in self.tables:
            if table_name == t.name or str(ref) == t.id or str(ref) == t.vid:
                return t

        raise NotFoundError(
            "Failed to find table for ref '{}' in dataset '{}'".format(
                ref, self.name))

    def new_table(self, name, add_id=True, **kwargs):
        '''Add a table to the schema, or update it if it already exists.

        If updating, will only update data.
        '''
        from . import Table
        from .exc import NotFoundError

        try:
            table = self.table(name)
            extant = True
        except NotFoundError:

            extant = False

            if 'sequence_id' not in kwargs:
                kwargs['sequence_id'] = self._database.next_sequence_id(
                    Dataset, self.vid, Table)

            table = Table(name=name, d_vid=self.vid, **kwargs)

            table.update_id()

        # Update possibly extant data
        table.data = dict((list(table.data.items()) if table.data else []) +
                          list(kwargs.get('data', {}).items()))

        for key, value in list(kwargs.items()):

            if not key:
                continue
            if key[0] != '_' and key not in [
                    'vid', 'id', 'id_', 'd_id', 'name', 'sequence_id', 'table',
                    'column', 'data'
            ]:
                setattr(table, key, value)

        if add_id:
            table.add_id_column()

        if not extant:
            self.tables.append(table)

        return table
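
    # Hedged usage sketch (the dataset and names are illustrative):
    #
    #     t = ds.new_table('demographics', description='ACS demographics')
    #     t2 = ds.new_table('demographics')   # updates and returns the extant table
    #     assert t is t2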

    def new_partition(self, table, **kwargs):
        """ Creates new partition and returns it.

        Args:
            table (orm.Table):

        Returns:
            orm.Partition
        """

        from . import Partition

        # Create the basic partition record, with a sequence ID.

        if isinstance(table, string_types):
            table = self.table(table)

        if 'sequence_id' in kwargs:
            sequence_id = kwargs.pop('sequence_id')
        else:
            sequence_id = self._database.next_sequence_id(
                Dataset, self.vid, Partition)

        p = Partition(t_vid=table.vid,
                      table_name=table.name,
                      sequence_id=sequence_id,
                      dataset=self,
                      d_vid=self.vid,
                      **kwargs)

        p.update_id()

        return p

    def partition(self, ref=None, **kwargs):
        """ Returns partition by ref. """
        from .exc import NotFoundError
        from six import text_type

        if ref:

            for p in self.partitions:  # This is slow for large datasets, like Census years.
                if (text_type(ref) == text_type(p.name)
                        or text_type(ref) == text_type(p.id)
                        or text_type(ref) == text_type(p.vid)):
                    return p

            raise NotFoundError(
                "Failed to find partition for ref '{}' in dataset '{}'".format(
                    ref, self.name))

        elif kwargs:
            from ..identity import PartitionNameQuery

            pnq = PartitionNameQuery(**kwargs)
            return self._find_orm(pnq)

    def _find_orm(self, pnq):
        """Return a Partition object from the database based on a PartitionId.

        An ORM object is returned, so changes can be persisted.

        """
        # import sqlalchemy.orm.exc
        from ..identity import PartitionNameQuery, NameQuery
        from ambry.orm import Partition as OrmPartition  # , Table
        from sqlalchemy.orm import joinedload  # , joinedload_all

        assert isinstance(
            pnq,
            PartitionNameQuery), "Expected PartitionNameQuery, got {}".format(
                type(pnq))

        pnq = pnq.with_none()

        q = self.bundle.database.session.query(OrmPartition)

        if pnq.fqname is not NameQuery.ANY:
            q = q.filter(OrmPartition.fqname == pnq.fqname)
        elif pnq.vname is not NameQuery.ANY:
            q = q.filter(OrmPartition.vname == pnq.vname)
        elif pnq.name is not NameQuery.ANY:
            q = q.filter(OrmPartition.name == str(pnq.name))
        else:
            if pnq.time is not NameQuery.ANY:
                q = q.filter(OrmPartition.time == pnq.time)

            if pnq.space is not NameQuery.ANY:
                q = q.filter(OrmPartition.space == pnq.space)

            if pnq.grain is not NameQuery.ANY:
                q = q.filter(OrmPartition.grain == pnq.grain)

            if pnq.format is not NameQuery.ANY:
                q = q.filter(OrmPartition.format == pnq.format)

            if pnq.segment is not NameQuery.ANY:
                q = q.filter(OrmPartition.segment == pnq.segment)

            if pnq.table is not NameQuery.ANY:

                if pnq.table is None:
                    q = q.filter(OrmPartition.t_id.is_(None))
                else:
                    tr = self.bundle.schema.table(pnq.table)

                    if not tr:
                        raise ValueError(
                            "Didn't find table named {} in {} bundle path = {}"
                            .format(pnq.table, pnq.vname,
                                    self.bundle.database.path))

                    q = q.filter(OrmPartition.t_id == tr.id_)

        ds = self.bundle.dataset

        q = q.filter(OrmPartition.d_vid == ds.vid)

        q = q.order_by(OrmPartition.vid.asc()).order_by(
            OrmPartition.segment.asc())

        q = q.options(joinedload(OrmPartition.table))

        return q

    def delete_tables_partitions(self):
        self.t_sequence_id = 1
        self.p_sequence_id = 1
        return self._database.delete_tables_partitions(self)

    def delete_partitions(self):
        self.p_sequence_id = 1
        return self._database.delete_partitions(self)

    def new_source(self, name, **kwargs):
        from .source import DataSource
        from ..identity import GeneralNumber1

        if 'sequence_id' not in kwargs:
            kwargs['sequence_id'] = self.next_sequence_id(DataSource)

        if 'd_vid' not in kwargs:
            kwargs['d_vid'] = self.vid
        else:
            assert kwargs['d_vid'] == self.vid

        if 'vid' not in kwargs:
            kwargs['vid'] = str(
                GeneralNumber1('S', self.vid, int(kwargs['sequence_id'])))

        source = DataSource(name=name, **kwargs)

        object_session(self).add(source)

        return source

    def source_file(self, name):
        from .source import DataSource

        source = object_session(self)\
            .query(DataSource)\
            .filter(DataSource.name == name)\
            .filter(DataSource.d_vid == self.vid)\
            .first()

        if not source:  # Try as a source vid
            source = object_session(self) \
                .query(DataSource) \
                .filter(DataSource.vid == name) \
                .filter(DataSource.d_vid == self.vid) \
                .first()

        if not source:
            from .exc import NotFoundError
            raise NotFoundError(
                "Failed to find source for name : '{}' ".format(name))

        return source

    def new_source_table(self, name, sequence_id=None):
        from .source_table import SourceTable

        extant = next(iter(e for e in self.source_tables if e.name == name),
                      None)

        if extant:
            return extant

        if not sequence_id:
            sequence_id = self._database.next_sequence_id(
                Dataset, self.vid, SourceTable)

        assert sequence_id

        table = SourceTable(name=name, d_vid=self.vid, sequence_id=sequence_id)

        table.update_id()

        self.source_tables.append(table)

        assert table.sequence_id

        return table

    def source_table(self, name):

        for st in self.source_tables:
            if st.name == name:
                return st

        return None

    def bsfile(self, path):
        """Return a Build Source file record, raising NotFoundError if it does not exist."""
        from sqlalchemy.orm.exc import NoResultFound
        from ambry.orm.exc import NotFoundError

        try:

            f = object_session(self)\
                .query(File)\
                .filter(File.d_vid == self.vid)\
                .filter(File.major_type == File.MAJOR_TYPE.BUILDSOURCE)\
                .filter(File.path == path)\
                .one()

            return f

        except NoResultFound:
            raise NotFoundError(
                "Failed to find file for path '{}' ".format(path))

    def new_bsfile(self, file_const, path):
        import time

        fr = File(
            d_vid=self.vid,
            major_type=File.MAJOR_TYPE.BUILDSOURCE,
            minor_type=file_const,
            path=path,
            #modified = int(time.time()), # In case content isn't set, which is where modified is set normally
            source='fs')

        self.files.append(fr)
        object_session(self).add(fr)
        return fr

    def find_or_new_bsfile(self, file_const, path):
        from ambry.orm.exc import NotFoundError
        try:
            return self.bsfile(path)
        except NotFoundError:
            return self.new_bsfile(file_const, path)

    @property
    def dict(self):
        d = {
            'id': self.id,
            'vid': self.vid,
            'name': self.name,
            'vname': self.vname,
            'fqname': self.fqname,
            'cache_key': self.cache_key,
            'source': self.source,
            'dataset': self.dataset,
            'subset': self.subset,
            'variation': self.variation,
            'btime': self.btime,
            'bspace': self.bspace,
            'revision': self.revision,
            'version': self.version,
            'upstream': self.upstream
        }

        if self.data:
            for k in self.data:
                assert k not in d
                d[k] = self.data[k]

        return d

    def row(self, fields):
        """Return a row for fields, for CSV files, pretty printing, etc, give a set of fields to return"""

        d = self.dict

        row = [None] * len(fields)

        for i, f in enumerate(fields):
            if f in d:
                row[i] = d[f]

        return row

    def __repr__(self):
        return """<datasets: id={} vid={} name={} source={} ds={} ss={} var={} rev={}>""".format(
            self.id, self.vid, self.name, self.source, self.dataset,
            self.subset, self.variation, self.revision)
Esempio n. 13
0
class Code(Base):
    """Code entries for variables."""
    __tablename__ = 'codes'

    c_vid = SAColumn('cd_c_vid',
                     String(20),
                     ForeignKey('columns.c_vid'),
                     primary_key=True,
                     index=True,
                     nullable=False)

    d_vid = SAColumn('cd_d_vid',
                     String(20),
                     ForeignKey('datasets.d_vid'),
                     primary_key=True,
                     nullable=False,
                     index=True)

    key = SAColumn(
        'cd_skey', String(20), primary_key=True, nullable=False,
        index=True)  # String version of the key, the value in the dataset
    ikey = SAColumn('cd_ikey', Integer,
                    index=True)  # Set only if the key is actually an integer

    value = SAColumn('cd_value', Text,
                     nullable=False)  # The value the key maps to
    description = SAColumn('cd_description', Text)

    source = SAColumn('cd_source', Text)

    data = SAColumn('cd_data', MutationDict.as_mutable(JSONEncodedObj))

    def __init__(self, **kwargs):

        for p in self.__mapper__.attrs:
            if p.key in kwargs:
                setattr(self, p.key, kwargs[p.key])
                del kwargs[p.key]

        if self.data:
            self.data.update(kwargs)

    def __repr__(self):
        return "<code: {}->{} >".format(self.key, self.value)

    def update(self, f):
        """Copy another files properties into this one."""

        for p in self.__mapper__.attrs:

            if p.key == 'oid':
                continue
            try:
                setattr(self, p.key, getattr(f, p.key))

            except AttributeError:
                # The dict() method copies data property values into the main dict,
                # and these don't have associated class properties.
                continue

    @property
    def insertable_dict(self):

        d = {('cd_' + k).strip('_'): v for k, v in iteritems(self.dict)}

        # the `key` property is not named after its db column
        d['cd_skey'] = d['cd_key']
        del d['cd_key']

        return d
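
    # Illustrative remapping performed above: {'key': '01', 'value': 'Alabama'}
    # becomes {'cd_skey': '01', 'cd_value': 'Alabama'}.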

    @staticmethod
    def before_insert(mapper, conn, target):

        target.d_vid = str(ObjectNumber.parse(target.c_vid).as_dataset)
Esempio n. 14
0
class Account(Base):

    __tablename__ = 'accounts'

    id = SAColumn('ac_id', Integer, primary_key=True)
    d_vid = SAColumn('ac_d_vid',
                     String(20),
                     ForeignKey('datasets.d_vid'),
                     index=True)
    user_id = SAColumn('ac_user_id', Text, index=True, doc='Ambry User')
    organization_id = SAColumn('ac_org_id',
                               Text,
                               index=True,
                               doc='Ambry Organization')

    major_type = SAColumn(
        'ac_major_type',
        Text,
        doc='Major type, often name of service or account providing company')
    minor_type = SAColumn('ac_minor_type',
                          Text,
                          doc='Minor type, subtype of the major type')

    # Foreign account identifier, often a bucket name or domain name.
    # The key used to reference the account
    account_id = SAColumn('ac_account_id', Text, unique=True)

    url = SAColumn('ac_org', Text, doc='URL of service')

    access_key = SAColumn('ac_access', Text, doc='Access token or username')

    encrypted_secret = SAColumn('ac_secret',
                                Text,
                                doc='Symmetrically encrypted secret')

    encrypted_password = SAColumn('ac_password',
                                  Text,
                                  doc='Asymmetrically encrypted user password')

    name = SAColumn('ac_name', Text, doc='Person\'s name')
    email = SAColumn('ac_email', Text, doc='Email for foreign account')
    org = SAColumn('ac_url', Text, doc='Organization name')
    comment = SAColumn('ac_comment', Text)
    data = SAColumn('ac_data', MutationDict.as_mutable(JSONEncodedObj))

    __table_args__ = (UniqueConstraint('ac_account_id',
                                       'ac_access',
                                       name='_uc_account_1'), )

    secret_password = None  # Must be set to encrypt or decrypt secret

    def incver(self):
        """Increment all of the version numbers and return a new object"""
        from . import incver
        return incver(self, ['d_vid'])

    @staticmethod
    def sym_encrypt(password, v):
        return encrypt(password, v).encode('base64')

    @staticmethod
    def sym_decrypt(password, v):
        import binascii

        try:
            return decrypt(password, v.decode('base64'))
        except SC_DecryptionException as e:
            raise AccountDecryptionError('Wrong password')
        except binascii.Error as e:
            raise AccountDecryptionError('Bad password: {}'.format(e))

    @property
    def secret(self):
        assert self.secret_password  # The encryption password
        if self.encrypted_secret:
            return self.sym_decrypt(self.secret_password,
                                    self.encrypted_secret)
        else:
            return None

    @secret.setter
    def secret(self, v):
        assert self.secret_password  # The encryption password
        self.encrypted_secret = self.sym_encrypt(self.secret_password, v)

    def decrypt_secret(self, password=None):

        if not password:
            password = self.secret_password

        if not self.encrypted_secret:
            return None

        if self.major_type == 'user':
            return None  # These can't be decrypted, only tested.

        if password:
            try:
                return self.sym_decrypt(password, self.encrypted_secret)
            except AccountDecryptionError as e:
                raise AccountDecryptionError(
                    "Decryption error for account '{}': {}".format(
                        self.account_id, e))
        else:
            raise MissingPasswordError(
                'Must have a password to get or set the secret')

    def encrypt_secret(self, v, password=None):

        if not password:
            password = self.secret_password

        if password:
            self.encrypted_secret = self.sym_encrypt(password, v)
        else:
            raise MissingPasswordError(
                'Must have a password to get or set the secret')

        return self.encrypted_secret

    @property
    def password(self):
        raise NotImplementedError('Use test()')

    @password.setter
    def password(self, v):
        assert self.secret_password
        self.encrypted_password = self.sym_encrypt(self.secret_password, v)

    def encrypt_password(self, v):
        from passlib.hash import pbkdf2_sha512
        assert v is not None
        self.encrypted_password = pbkdf2_sha512.encrypt(v,
                                                        rounds=50000,
                                                        salt_size=16)

    def test(self, v):
        """Test the password against a value"""
        from passlib.hash import pbkdf2_sha512
        assert self.encrypted_password is not None
        return pbkdf2_sha512.verify(v, self.encrypted_password)
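
    # Hedged round-trip sketch (the password value is illustrative):
    #
    #     a = Account()
    #     a.encrypt_password('hunter2')
    #     a.test('hunter2')   # -> True
    #     a.test('wrong')     # -> False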

    @staticmethod
    def before_insert(mapper, conn, target):
        Account.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        pass

    @classmethod
    def prop_map(cls):

        prop_map = {
            'service': 'major_type',
            'host': 'url',
            'organization': 'org',
            'apikey': 'secret',
            'access': 'access_key',
            'access_key': 'access_key',
            'secret': 'secret',
            'name': 'name',
            'org': 'org',
            'url': 'url',
            'email': 'email',
        }

        for p in cls.__mapper__.attrs:
            prop_map[p.key] = p.key

        return prop_map

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """

        d = {
            p.key: getattr(self, p.key)
            for p in self.__mapper__.attrs if p.key not in ('data',)
        }

        d['secret'] = 'not available'

        if self.secret_password:
            try:
                d['secret'] = self.decrypt_secret()
            except AccountDecryptionError:
                pass

        if self.data:
            for k, v in self.data.items():
                d[k] = v

        return d
Esempio n. 15
0
class Plot(Base):
    """Records of plots, links measures, dimensions and other plot configuratoin"""
    __tablename__ = 'plots'

    id = SAColumn('f_id', Integer, primary_key=True)

    d_vid = SAColumn('pl_d_vid',
                     String(20),
                     ForeignKey('datasets.d_vid'),
                     nullable=False,
                     index=True)
    dataset = relationship('Dataset', backref='plots')

    p_vid = SAColumn('pl_p_vid',
                     String(20),
                     ForeignKey('partitions.p_vid'),
                     nullable=False,
                     index=True)
    partition = relationship('Partition', backref='plots')

    title = SAColumn('pl_title', Text)
    description = SAColumn('pl_description', Text)

    type = SAColumn('pl_type', String(20))

    measure1 = SAColumn('pl_measure1',
                        String(20),
                        ForeignKey('columns.c_vid'),
                        nullable=True)
    measure2 = SAColumn('pl_measure2',
                        String(20),
                        ForeignKey('columns.c_vid'),
                        nullable=True)

    dimension1 = SAColumn('pl_dimension1',
                          String(20),
                          ForeignKey('columns.c_vid'),
                          nullable=True)
    dimension2 = SAColumn('pl_dimension2',
                          String(20),
                          ForeignKey('columns.c_vid'),
                          nullable=True)
    dimension3 = SAColumn('pl_dimension3',
                          String(20),
                          ForeignKey('columns.c_vid'),
                          nullable=True)

    d1text = SAColumn('pl_d1text', Text)
    d2text = SAColumn('pl_d2text', Text)
    d3text = SAColumn('pl_d3text', Text)

    error1 = SAColumn('pl_error1',
                      String(20),
                      ForeignKey('columns.c_vid'),
                      nullable=True)
    error2 = SAColumn('pl_error2',
                      String(20),
                      ForeignKey('columns.c_vid'),
                      nullable=True)

    multiple1 = SAColumn('pl_multiple1',
                         String(20),
                         ForeignKey('columns.c_vid'),
                         nullable=True)
    multiple2 = SAColumn('pl_multiple2',
                         String(20),
                         ForeignKey('columns.c_vid'),
                         nullable=True)

    data = SAColumn('pl_data', MutationDict.as_mutable(JSONEncodedObj))

    def dataframe(self,
                  measure,
                  p_dim,
                  s_dim=None,
                  filtered_dims=None,
                  unstack=False,
                  df_class=None,
                  add_code=False):
        """
        Yield rows in a reduced format, with one dimension as an index, one measure column per
        secondary dimension, and all other dimensions filtered.

        :param measure: The column name of a measure.
        :param p_dim: The primary dimension. This will be the index of the dataframe.
        :param s_dim: A secondary dimension. The returned frame will be unstacked on this dimension.
        :param unstack:
        :param filtered_dims: A dict of filtered dimension column names, mapped to the dimension
        value to select.
        :param add_code: When substituting a label for a column, also add the code value.
        :return:
        """
        filtered_dims = filtered_dims or {}

        measure = self.table.column(measure)
        p_dim = self.table.column(p_dim)

        assert measure
        assert p_dim

        if s_dim:
            s_dim = self.table.column(s_dim)

        from six import text_type

        def maybe_quote(v):
            from six import string_types
            if isinstance(v, string_types):
                return '"{}"'.format(v)
            else:
                return v

        all_dims = [p_dim.name] + list(filtered_dims.keys())

        if s_dim:
            all_dims.append(s_dim.name)

        all_dims = [text_type(c) for c in all_dims]

        # "primary_dimensions" means something different here, all of the dimensions in the
        # dataset that do not have children.
        primary_dims = [text_type(c.name) for c in self.primary_dimensions]

        if set(all_dims) != set(primary_dims):
            raise ValueError(
                "The primary, secondary and filtered dimensions must cover all dimensions"
                + " {} != {}".format(sorted(all_dims), sorted(primary_dims)))

        columns = []

        p_dim_label = None
        s_dim_label = None

        if p_dim.label:

            # For geographic datasets, also need the gvid
            if p_dim.type_is_gvid:
                columns.append(p_dim.name)

            p_dim = p_dim_label = p_dim.label
            columns.append(p_dim_label.name)

        else:
            columns.append(p_dim.name)

        if s_dim:

            if s_dim.label:
                s_dim = s_dim_label = s_dim.label
                columns.append(s_dim_label.name)
            else:
                columns.append(s_dim.name)

        columns.append(measure.name)

        # Create the predicate to filter out the filtered dimensions
        if filtered_dims:
            code = ' and '.join("row.{} == {}".format(k, maybe_quote(v))
                                for k, v in filtered_dims.items())

            predicate = eval('lambda row: {}'.format(code))
        else:
            predicate = lambda row: True
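
        # For illustration, with hypothetical dimension names: filtered_dims={'sex': 'male', 'year': 2010}
        # produces the code string   row.sex == "male" and row.year == 2010
        # which eval() turns into:   lambda row: row.sex == "male" and row.year == 2010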

        df = self.analysis.dataframe(predicate,
                                     columns=columns,
                                     df_class=df_class)

        if unstack:
            # Need to set the s_dim in the index to get a hierarchical index, required for unstacking.
            # The final df will have only the p_dim as an index.

            if s_dim:
                df = df.set_index([p_dim.name, s_dim.name])

                df = df.unstack()

                # Keep only the second level (the s_dim values) as the column names.
                df.columns = df.columns.get_level_values(1)

            else:
                # Can't actually unstack without a second dimension.
                df = df.set_index(p_dim.name)

        return df
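
    # A minimal usage sketch, with hypothetical names ('pop' a measure, 'year' and
    # 'sex' dimensions on the underlying table):
    #
    #     df = plot.dataframe(measure='pop', p_dim='year', s_dim='sex',
    #                         filtered_dims={'race': 'all'}, unstack=True)
    #     # -> a frame indexed by year, with one 'pop' column per value of 'sex'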
Example n. 16
class Partition(Base):
    __tablename__ = 'partitions'

    STATES = Constant()
    STATES.SYNCED = 'synced'
    STATES.CLEANING = 'cleaning'
    STATES.CLEANED = 'cleaned'
    STATES.PREPARING = 'preparing'
    STATES.PREPARED = 'prepared'
    STATES.BUILDING = 'building'
    STATES.BUILT = 'built'
    STATES.COALESCING = 'coalescing'
    STATES.COALESCED = 'coalesced'
    STATES.ERROR = 'error'
    STATES.FINALIZING = 'finalizing'
    STATES.FINALIZED = 'finalized'
    STATES.INSTALLING = 'installing'
    STATES.INSTALLED = 'installed'

    TYPE = Constant()
    TYPE.SEGMENT = 's'
    TYPE.UNION = 'u'

    sequence_id = SAColumn('p_sequence_id', Integer)
    vid = SAColumn('p_vid', String(16), primary_key=True, nullable=False)
    id = SAColumn('p_id', String(13), nullable=False)
    d_vid = SAColumn('p_d_vid', String(13), ForeignKey('datasets.d_vid'), nullable=False, index=True)
    t_vid = SAColumn('p_t_vid', String(15), ForeignKey('tables.t_vid'), nullable=False, index=True)
    name = SAColumn('p_name', String(200), nullable=False, index=True)
    vname = SAColumn('p_vname', String(200), unique=True, nullable=False, index=True)
    fqname = SAColumn('p_fqname', String(200), unique=True, nullable=False, index=True)

    title = SAColumn('p_title', String())
    description = SAColumn('p_description', String())
    notes = SAColumn('p_notes', String())

    cache_key = SAColumn('p_cache_key', String(200), unique=True, nullable=False, index=True)
    parent_vid = SAColumn('p_p_vid', String(16), ForeignKey('partitions.p_vid'), nullable=True, index=True)
    ref = SAColumn('p_ref', String(16), index=True,
                   doc='VID reference to an earlier version to use instead of this one.')
    type = SAColumn('p_type', String(20), default=TYPE.UNION,
                    doc='u - normal partition, s - segment')
    table_name = SAColumn('p_table_name', String(50))
    time = SAColumn('p_time', String(20))  # FIXME: add helptext
    space = SAColumn('p_space', String(50))
    grain = SAColumn('p_grain', String(50))
    variant = SAColumn('p_variant', String(50))
    format = SAColumn('p_format', String(50))
    segment = SAColumn('p_segment', Integer,
                       doc='Part of a larger partition. segment_id is usually also a source ds_id')
    epsg = SAColumn('p_epsg', Integer, doc='EPSG SRID for the reference system of a geographic dataset.')

    # A partition can hold data that is effectively a dimension: if multiple datasets
    # were joined, it would appear as a dimension column, but within a single partition
    # it has only one value. That value could be part of the name, or it could be
    # declared in a table, with a single value for all of the rows in the partition.

    min_id = SAColumn('p_min_id', BigIntegerType)
    max_id = SAColumn('p_max_id', BigIntegerType)
    count = SAColumn('p_count', Integer)
    state = SAColumn('p_state', String(50))
    data = SAColumn('p_data', MutationDict.as_mutable(JSONEncodedObj))

    space_coverage = SAColumn('p_scov', MutationList.as_mutable(JSONEncodedObj))
    time_coverage = SAColumn('p_tcov', MutationList.as_mutable(JSONEncodedObj))
    grain_coverage = SAColumn('p_gcov', MutationList.as_mutable(JSONEncodedObj))

    installed = SAColumn('p_installed', String(100))
    _location = SAColumn('p_location', String(100))  # Location of the data file

    __table_args__ = (
        # ForeignKeyConstraint( [d_vid, d_location], ['datasets.d_vid','datasets.d_location']),
        UniqueConstraint('p_sequence_id', 'p_d_vid', name='_uc_partitions_1'),
    )

    # The primary table for the partition. There is one per partition, but a table
    # can be primary in multiple partitions.
    table = relationship('Table', backref='partitions', foreign_keys='Partition.t_vid')

    stats = relationship(ColumnStat, backref='partition', cascade='all, delete, delete-orphan')

    children = relationship('Partition', backref=backref('parent', remote_side=[vid]), cascade='all')

    _bundle = None  # Set when returned from a bundle.
    _datafile = None  # TODO: Unused variable.
    _datafile_writer = None  # TODO: Unused variable.
    _stats_dict = None

    @property
    def identity(self):
        """Return this partition information as a PartitionId."""

        if self.dataset is None:
            # The relationship will be null until the object is committed
            s = object_session(self)

            ds = s.query(Dataset).filter(Dataset.id_ == self.d_id).one()
        else:
            ds = self.dataset

        d = {
            'id': self.id,
            'vid': self.vid,
            'name': self.name,
            'vname': self.vname,
            'ref': self.ref,
            'space': self.space,
            'time': self.time,
            'table': self.table_name,
            'grain': self.grain,
            'variant': self.variant,
            'segment': self.segment,
            'format': self.format if self.format else 'db'
        }
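        # Note: dataset identity fields come first and partition fields second in the
        # merged dict, so partition values override the dataset's on key collisions.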

        return PartitionIdentity.from_dict(dict(list(ds.dict.items()) + list(d.items())))

    @property
    def display(self):
        """Return an acessor object to get display titles and descriptions"""
        return PartitionDisplay(self)

    @property
    def bundle(self):
        return self._bundle  # Set externally, e.g. by Bundle.wrap_partition

    @property
    def is_segment(self):
        return self.type == self.TYPE.SEGMENT

    @property
    def headers(self):
        return [c.name for c in self.table.columns]

    def __repr__(self):
        return '<partition: {} {}>'.format(self.vid, self.vname)

    def set_stats(self, stats):

        self.stats[:] = []  # Delete existing stats

        for c in self.table.columns:

            if c.name not in stats:
                continue

            d = stats[c.name].dict

            del d['name']
            del d['flags']
            cs = ColumnStat(p_vid=self.vid, d_vid=self.d_vid, c_vid=c.vid, **d)
            self.stats.append(cs)

    def parse_gvid_or_place(self, gvid_or_place):
        try:
            return parse_to_gvid(gvid_or_place)
        except KeyError:

            places = list(self._bundle._library.search.search_identifiers(gvid_or_place))

            if not places:
                err_msg = "Failed to find space identifier '{}' in full " \
                          "text identifier search  for partition '{}'" \
                    .format(gvid_or_place, str(self.identity))
                self._bundle.error(err_msg)
                return None

            return parse_to_gvid(places[0].vid)
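
    # Behavior sketch: a well-formed gvid string parses directly with parse_to_gvid;
    # anything else (e.g. a place name) falls back to the bundle library's full-text
    # identifier search, and the first hit's vid is parsed instead.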

    def set_coverage(self, stats):
        """"Extract time space and grain coverage from the stats and store them in the partition"""
        from ambry.util.datestimes import expand_to_years

        scov = set()
        tcov = set()
        grains = set()

        def summarize_maybe(gvid):
            try:
                return parse_to_gvid(gvid).summarize()
            except Exception:
                return None

        def simplify_maybe(values, column):

            parsed = []

            for gvid in values:
                # The gvid may be None or the string 'None'; skip those values.
                if gvid is None or gvid == 'None':
                    continue
                try:
                    parsed.append(parse_to_gvid(gvid))
                except ValueError as e:
                    if self._bundle:
                        self._bundle.warn("While analyzing geo coverage in final partition stage, " +
                                          "failed to parse gvid '{}' in {}.{}: {}"
                                          .format(str(gvid), column.table.name, column.name, e))

            try:
                return isimplify(parsed)
            except Exception:
                return None

        def int_maybe(year):
            try:
                return int(year)
            except (TypeError, ValueError):
                return None

        for c in self.table.columns:

            if c.name not in stats:
                continue

            try:
                if stats[c.name].is_gvid or stats[c.name].is_geoid:
                    scov |= set(simplify_maybe(stats[c.name].uniques, c) or [])
                    grains |= set(summarize_maybe(gvid) for gvid in stats[c.name].uniques)

                elif stats[c.name].is_year:
                    tcov |= set(int_maybe(x) for x in stats[c.name].uniques)

                elif stats[c.name].is_date:
                    # The fuzzy=True argument allows ignoring the '-' char in dates produced by .isoformat()
                    try:
                        tcov |= set(parser.parse(x, fuzzy=True).year if isinstance(x, string_types) else x.year for x in
                                    stats[c.name].uniques)
                    except ValueError:
                        pass

            except Exception as e:
                self._bundle.error("Failed to set coverage for column '{}', partition '{}': {}"
                                   .format(c.name, self.identity.vname, e))
                raise

        # Space Coverage

        if 'source_data' in self.data:

            for source_name, source in list(self.data['source_data'].items()):
                scov.add(self.parse_gvid_or_place(source['space']))

        if self.identity.space:  # And from the partition name
            try:
                scov.add(self.parse_gvid_or_place(self.identity.space))
            except ValueError:
                # Couldn't parse the space as a GVid
                pass

        # For space coverage, include only the higher-level summary levels, such as
        # states, counties, places and urban areas.
        self.space_coverage = sorted([str(x) for x in scov if bool(x) and x.sl
                                      in (10, 40, 50, 60, 160, 400)])

        #
        # Time Coverage

        # From the source
        # If there was a time value in the source that this partition was created from, then
        # add it to the years.
        if 'source_data' in self.data:
            for source_name, source in list(self.data['source_data'].items()):
                if 'time' in source:
                    for year in expand_to_years(source['time']):
                        if year:
                            tcov.add(year)

        # From the partition name
        if self.identity.name.time:
            for year in expand_to_years(self.identity.name.time):
                if year:
                    tcov.add(year)

        self.time_coverage = [t for t in tcov if t]

        #
        # Grains

        if 'source_data' in self.data:
            for source_name, source in list(self.data['source_data'].items()):
                if 'grain' in source:
                    grains.add(source['grain'])

        self.grain_coverage = sorted(str(g) for g in grains if g)
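
    # Illustrative post-conditions (values hypothetical): after set_coverage runs,
    #   self.space_coverage -> sorted gvid strings for states, counties, places, etc.
    #   self.time_coverage  -> a list of years, e.g. [2009, 2010]
    #   self.grain_coverage -> sorted grain names, e.g. ['county']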

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the
        object.

        :return:

        """

        d = {p.key: getattr(self, p.key) for p in self.__mapper__.attrs
             if p.key not in ('table', 'dataset', '_codes', 'stats', 'data', 'process_records')}

        if self.data:
            # Copy data fields into the top-level dict, but don't overwrite existing values.
            for k, v in six.iteritems(self.data):
                if k not in d and k not in ('table', 'stats', '_codes', 'data'):
                    d[k] = v

        return d

    @property
    def detail_dict(self):
        """A more detailed dict that includes the descriptions, sub descriptions, table
        and columns."""

        d = self.dict

        def aug_col(c):
            d = c.dict
            d['stats'] = [s.dict for s in c.stats]
            return d

        d['table'] = self.table.dict
        d['table']['columns'] = [aug_col(c) for c in self.table.columns]

        return d

    @property
    def stats_dict(self):

        class Bunch(object):
            """Dict and object access to properties"""

            def __init__(self, o):
                self.__dict__.update(o)

            def __str__(self):
                return str(self.__dict__)

            def __repr__(self):
                return repr(self.__dict__)

            def keys(self):
                return list(self.__dict__.keys())

            def items(self):
                return list(self.__dict__.items())

            def iteritems(self):
                return iter(self.__dict__.items())

            def __getitem__(self, k):
                if k in self.__dict__:
                    return self.__dict__[k]
                else:
                    from . import ColumnStat
                    return ColumnStat(hist=[])

        if not self._stats_dict:
            cols = {s.column.name: Bunch(s.dict) for s in self.stats}

            self._stats_dict = Bunch(cols)

        return self._stats_dict
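
    # Usage sketch (hypothetical column name 'age'):
    #
    #     p.stats_dict.age.mean       # attribute access
    #     p.stats_dict['age']['mean'] # item access
    #
    # A column missing from the stats falls back to an empty ColumnStat via __getitem__.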

    def build_sample(self):

        name = self.table.name

        count = int(
            self.database.connection.execute('SELECT count(*) FROM "{}"'.format(name)).fetchone()[0])

        skip = count // 20  # Integer stride, so the modulo test below is valid SQL

        if count > 100:
            sql = 'SELECT * FROM "{}" WHERE id % {} = 0 LIMIT 20'.format(name, skip)
        else:
            sql = 'SELECT * FROM "{}" LIMIT 20'.format(name)

        sample = []

        for row in self.database.connection.execute(sql):
            sample.append(list(row.values()))

        self.record.data['sample'] = sample

        s = self.bundle.database.session
        s.merge(self.record)
        s.commit()

    @property
    def row(self):
        # Use an OrderedDict to make it friendly for creating CSV files.
        SKIP_KEYS = [
            'sequence_id', 'vid', 'id', 'd_vid', 't_vid', 'min_key', 'max_key',
            'installed', 'ref', 'count', 'state', 'data', 'space_coverage',
            'time_coverage', 'grain_coverage', 'name', 'vname', 'fqname', 'cache_key'
        ]

        d = OrderedDict([('table', self.table.name)] +
                        [(p.key, getattr(self, p.key)) for p in self.__mapper__.attrs
                         if p.key not in SKIP_KEYS])
        return d

    def update(self, **kwargs):

        if 'table' in kwargs:
            del kwargs['table']  # In source_schema.csv, this is the name of the table, not the object

        for k, v in list(kwargs.items()):
            if hasattr(self, k):
                setattr(self, k, v)
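
    # e.g. p.update(time='2010', space='CA', notes='refreshed') sets only attributes
    # that already exist on the partition; unknown keys are silently ignored.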

    def finalize(self, ps=None):

        self.state = self.STATES.FINALIZING

        # Write the stats for this partition back into the partition

        with self.datafile.writer as w:
            for i, c in enumerate(self.table.columns, 1):
                wc = w.column(i)
                assert wc.pos == c.sequence_id, (c.name, wc.pos, c.sequence_id)
                wc.name = c.name
                wc.description = c.description
                wc.type = c.python_type.__name__
                self.count = w.n_rows
            w.finalize()

        if self.type == self.TYPE.UNION:
            if ps:
                ps.update('Running stats ', state='running')
            stats = self.datafile.run_stats()

            self.set_stats(stats)
            self.set_coverage(stats)

        self._location = 'build'

        display = PartitionDisplay(self)
        self.title = display.title
        self.description = display.description

        self.state = self.STATES.FINALIZED

    # =============
    # These methods are a bit non-cohesive, since they require the _bundle value to be set, which
    # happens externally, when the object is returned from a bundle.

    def clean(self):
        """Remove all built files and return the partition to a newly-created state"""
        if self.datafile:
            self.datafile.remove()

    @property
    def location(self):

        base_location = self._location

        if not base_location:
            return None

        if self._bundle.build_fs.exists(base_location):
            if self._bundle.build_fs.hassyspath(base_location):
                return self._bundle.build_fs.getsyspath(base_location)

        return base_location

    @location.setter
    def location(self, v):
        self._location = v

    @property
    def datafile(self):
        from ambry.exc import NotFoundError

        if self.is_local:
            # Use the local version, if it exists
            logger.debug('datafile: Using local datafile {}'.format(self.vname))
            return self.local_datafile
        else:
            # If it doesn't exist, try to get the remote.
            try:
                logger.debug('datafile: Using remote datafile {}'.format(self.vname))
                return self.remote_datafile
            except NotFoundError:
                # If the remote doesn't exist, return the local, so the caller can call exists() on it,
                # get its path, etc.
                return self.local_datafile

    @property
    def local_datafile(self):
        """Return the datafile for this partition, from the build directory, the remote, or the warehouse"""
        from ambry_sources import MPRowsFile
        from fs.errors import ResourceNotFoundError
        from ambry.orm.exc import NotFoundError

        try:
            return MPRowsFile(self._bundle.build_fs, self.cache_key)

        except ResourceNotFoundError:
            raise NotFoundError(
                'Could not locate data file for partition {} (local)'.format(self.identity.fqname))

    @property
    def remote(self):
        """
        Return the remote for this partition

        :return:

        """
        from ambry.exc import NotFoundError

        ds = self.dataset

        if 'remote_name' not in ds.data:
            raise NotFoundError('Could not determine remote for partition: {}'.format(self.identity.fqname))

        return self._bundle.library.remote(ds.data['remote_name'])

    @property
    def remote_datafile(self):

        from fs.errors import ResourceNotFoundError
        from ambry.exc import AccessError, NotFoundError
        from boto.exception import S3ResponseError

        try:

            from ambry_sources import MPRowsFile

            remote = self.remote

            datafile = MPRowsFile(remote.fs, self.cache_key)

            if not datafile.exists:
                raise NotFoundError(
                    'Could not locate data file for partition {} from remote {} : file does not exist'
                        .format(self.identity.fqname, remote))

        except ResourceNotFoundError as e:
            raise NotFoundError('Could not locate data file for partition {} (remote): {}'
                                .format(self.identity.fqname, e))
        except S3ResponseError as e:
            # HACK: It looks like we get a response error indicating an access problem
            # even when we have access to S3 but the file doesn't exist.
            raise NotFoundError("Can't access MPR file for {} in remote {}".format(self.cache_key, remote.fs))

        return datafile

    @property
    def is_local(self):
        """Return true is the partition file is local"""
        from ambry.orm.exc import NotFoundError
        try:
            if self.local_datafile.exists:
                return True
        except NotFoundError:
            pass

        return False

    def localize(self, ps=None):
        """Copy a non-local partition file to the local build directory"""
        from filelock import FileLock
        from ambry.util import ensure_dir_exists
        from ambry_sources import MPRowsFile
        from fs.errors import ResourceNotFoundError

        if self.is_local:
            return

        local = self._bundle.build_fs

        b = self._bundle.library.bundle(self.identity.as_dataset().vid)

        remote = self._bundle.library.remote(b)

        lock_path = local.getsyspath(self.cache_key + '.lock')

        ensure_dir_exists(lock_path)

        lock = FileLock(lock_path)

        if ps:
            ps.add_update(message='Localizing {}'.format(self.identity.name),
                          partition=self,
                          item_type='bytes',
                          state='downloading')

        if ps:
            def progress(bts):
                if ps.rec.item_total is None:
                    ps.rec.item_count = 0

                if not ps.rec.data:
                    ps.rec.data = {}  # Should not need to do this.
                    return self

                item_count = ps.rec.item_count + bts
                ps.rec.data['updates'] = ps.rec.data.get('updates', 0) + 1

                if ps.rec.data['updates'] % 32 == 1:
                    ps.update(message='Localizing {}'.format(self.identity.name),
                              item_count=item_count)
        else:
            from ambry.bundle.process import call_interval
            @call_interval(5)
            def progress(bts):
                self._bundle.log("Localizing {}. {} bytes downloaded".format(self.vname, bts))

        def exception_cb(e):
            raise e

        with lock:
            # FIXME! This won't work with a remote (http) API, only FS (s3:, file:)

            if self.is_local:
                return self

            try:
                with remote.fs.open(self.cache_key + MPRowsFile.EXTENSION, 'rb') as f:
                    event = local.setcontents_async(self.cache_key + MPRowsFile.EXTENSION,
                                                    f,
                                                    progress_callback=progress,
                                                    error_callback=exception_cb)
                    event.wait()
                    if ps:
                        ps.update_done()
            except ResourceNotFoundError as e:
                from ambry.orm.exc import NotFoundError
                raise NotFoundError("Failed to get MPRfile '{}' from {}: {} "
                                    .format(self.cache_key, remote.fs, e))

        return self
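
    # Typical call pattern (a sketch): ensure the data file is local, then read from it.
    #
    #     p.localize()
    #     with p.reader as r:
    #         for row in r:
    #             ...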

    @property
    def reader(self):
        """The reader for the datafile."""
        from ambry.orm.exc import NotFoundError
        from fs.errors import ResourceNotFoundError

        try:
            return self.datafile.reader
        except ResourceNotFoundError:
            raise NotFoundError("Failed to find partition file, '{}' "
                                .format(self.datafile.path))

    def select(self, predicate=None, headers=None):
        """
        Select rows from the reader using a predicate to select rows and and itemgetter to return a
        subset of elements
        :param predicate: If defined, a callable that is called for each row, and if it returns true, the
        row is included in the output.
        :param headers: If defined, a list or tuple of header names to return from each row
        :return: iterable of results

        WARNING: This routine works from the reader iterator, which returns RowProxy objects. RowProxy objects
        are reused, so if you construct a list directly from the output from this method, the list will have
        multiple copies of a single RowProxy, which will have as an inner row the last result row. If you will
        be directly constructing a list, use a getter that extracts the inner row, or which converts the RowProxy
        to a dict:

            list(s.datafile.select(lambda r: r.stusab == 'CA', lambda r: r.dict ))

        """

        # FIXME: in Python 3, use 'yield from'
        with self.reader as r:
            for row in r.select(predicate, headers):
                yield row

    def __iter__(self):
        """ Iterator over the partition, returning RowProxy objects.
        :return: a generator
        """

        with self.reader as r:
            for row in r:
                yield row

    @property
    def analysis(self):
        """Return an AnalysisPartition proxy, which wraps this partition to provide acess to
        dataframes, shapely shapes and other analysis services"""
        if isinstance(self, PartitionProxy):
            return AnalysisPartition(self._obj)
        else:
            return AnalysisPartition(self)

    @property
    def measuredim(self):
        """Return a MeasureDimension proxy, which wraps the partition to provide access to
        columns in terms of measures and dimensions"""

        if isinstance(self, PartitionProxy):
            return MeasureDimensionPartition(self._obj)
        else:
            return MeasureDimensionPartition(self)

    # ============================

    def update_id(self, sequence_id=None):
        """Alter the sequence id, and all of the names and ids derived from it. This
        often needs to be done after an IntegrityError in a multiprocessing run"""

        if sequence_id:
            self.sequence_id = sequence_id

        self._set_ids(force=True)

        if self.dataset:
            self._update_names()

    def _set_ids(self, force=False):

        if not self.sequence_id:
            from .exc import DatabaseError

            raise DatabaseError('Sequence ID must be set before insertion')

        if not self.vid or force:
            assert bool(self.d_vid)
            assert bool(self.sequence_id)
            don = ObjectNumber.parse(self.d_vid)
            assert don.revision
            on = don.as_partition(self.sequence_id)
            self.vid = str(on.rev(don.revision))
            self.id = str(on.rev(None))

        if not self.data:
            self.data = {}

    def _update_names(self):
        """Update the derived names"""

        d = dict(
            table=self.table_name,
            time=self.time,
            space=self.space,
            grain=self.grain,
            variant=self.variant,
            segment=self.segment
        )

        assert self.dataset

        name = PartialPartitionName(**d).promote(self.dataset.identity.name)

        self.name = str(name.name)
        self.vname = str(name.vname)
        self.cache_key = name.cache_key
        self.fqname = str(self.identity.fqname)

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for Sqlalchemy to set the sequence for this
        object and create an ObjectNumber value for the id_"""

        target._set_ids()

        if target.name and target.vname and target.cache_key and target.fqname and not target.dataset:
            return

        Partition.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        target._update_names()

    @staticmethod
    def before_delete(mapper, conn, target):
        pass
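
    # These listeners are wired up with SQLAlchemy's event API, typically elsewhere in
    # the package (a sketch, not the package's actual registration code):
    #
    #     from sqlalchemy import event
    #     event.listen(Partition, 'before_insert', Partition.before_insert)
    #     event.listen(Partition, 'before_update', Partition.before_update)
    #     event.listen(Partition, 'before_delete', Partition.before_delete)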