Example #1
    def new_table(self, ds, n):
        # Build a table with five integer columns; each column gets five codes.
        t = Table(ds, sequence_id=n, name='table{}'.format(n))

        for i in range(5):
            c = Column(sequence_id=i, name='column_{}_{}'.format(n, i), datatype='Integer')
            t.columns.append(c)

            # Use a distinct loop variable so the outer index is not shadowed.
            for j in range(5):
                c.add_code(str(j), str(j))

        return t
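For reference, the format strings above produce a fixed naming pattern; this small standalone snippet (not part of the source) shows the column names that new_table(ds, 2) would create:

    # Derived directly from the 'column_{}_{}' format string used above.
    n = 2
    print(['column_{}_{}'.format(n, i) for i in range(5)])
    # ['column_2_0', 'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4']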
Example #2
    def mangle_column_name(self, i, n):
        """
        Override this method to change how column names from the source are altered to
        become column names in the schema.

        :param i: column number
        :param n: original column name
        :return: the column name to use in the schema
        """
        from ambry.orm import Column

        if not n:
            return 'column{}'.format(i)

        mn = Column.mangle_name(n.strip())

        # Prefer an explicit replacement from the column map, if one is set.
        if mn in self.col_map:
            col = self.col_map[mn]['col']
            return col if col else mn

        return mn
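The implementation of Column.mangle_name itself is not shown in these examples. As a rough, hedged illustration of what such mangling typically does (lower-casing and collapsing non-word characters to underscores), here is a standalone sketch; ambry's actual implementation may differ in detail:

    import re

    def mangle_name_sketch(name):
        """Illustrative only; not ambry's Column.mangle_name."""
        # Lower-case, replace runs of non-word characters with '_', trim stray underscores.
        return re.sub(r'[^\w]+', '_', name.strip().lower()).strip('_')

    print(mangle_name_sketch('Median Household Income ($)'))  # median_household_income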
Example #4
    def meta_find_int(self):
        """Look for columns that have non-integer values and convert their schema entries to real."""
        import sqlite3
        from ambry.orm import Column

        conn = sqlite3.connect(self.filesystem.build_path('chis12d.db'))
        conn.row_factory = sqlite3.Row

        notint = set()

        lr = self.init_log_rate(500)
        for row in conn.execute('SELECT * FROM chis12d LIMIT 10000'):
            row = dict(zip(row.keys(), row))

            # Values are assumed to be numeric; any non-integral value marks the column as real.
            for k, v in row.items():
                if int(v) != v:
                    notint.add(Column.mangle_name(k))
            lr()

        with self.session:
            t = self.schema.table('chis_puf')
            for c in t.columns:

                if c.name in notint:
                    self.log(c.name)
                    c.datatype = c.DATATYPE_REAL
                else:
                    c.datatype = c.DATATYPE_INTEGER

        self.schema.write_schema()
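The same scan can be run outside the bundle machinery. A minimal standalone sketch using only the standard library; the database path and table name are placeholders, and sampled values are assumed to be numeric or NULL:

    import sqlite3

    def find_non_integer_columns(db_path, table, limit=10000):
        """Return names of columns whose sampled values are not all integral."""
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        not_int = set()
        for row in conn.execute('SELECT * FROM {} LIMIT {}'.format(table, limit)):
            for key in row.keys():
                value = row[key]
                try:
                    if int(value) != value:
                        not_int.add(key)
                except (TypeError, ValueError):
                    not_int.add(key)  # NULLs and non-numeric strings are not integer columns
        conn.close()
        return not_int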
Example #6
    def column(self, ref):
        # AFAIK, all of the columns in the relationship will get loaded if any one is accessed,
        # so iterating over the collection only involves one SELECT.
        from .column import Column

        column_name = Column.mangle_name(str(ref))

        for c in self.columns:
            if str(column_name) == c.name or str(ref) == c.id or str(ref) == c.vid:
                return c

        raise NotFoundError(
            "Failed to find column '{}' in table '{}' for ref: '{}' ".format(ref, self.name, ref))
Example #8
    def load_shapefile(self, path, logger=None):
        """Load a shapefile into the partition. Loads the features and inserts
        them using an inserter.

        :param path:
        :return:

        """

        from osgeo import ogr  # , osr
        from ..geo.sfschema import ogr_inv_type_map, mangle_name
        from ..orm import Column, Geometry
        from ..geo.util import get_type_from_geometry

        if path.startswith('http'):
            shape_url = path
            path = self.bundle.filesystem.download_shapefile(shape_url)

        driver = ogr.GetDriverByName("ESRI Shapefile")

        dataSource = driver.Open(path, 0)

        layer = dataSource.GetLayer()

        to_srs = ogr.osr.SpatialReference()
        to_srs.ImportFromEPSG(Geometry.DEFAULT_SRS)

        dfn = layer.GetLayerDefn()

        col_defs = []

        for i in range(0, dfn.GetFieldCount()):
            field = dfn.GetFieldDefn(i)

            col_defs.append((
                Column.mangle_name(mangle_name(field.GetName())),
                Column.types[ogr_inv_type_map[field.GetType()]][1],
            ))

        col_type = None
        for c in self.table.columns:
            if c.name == 'geometry':
                col_type = c.datatype.upper()
                break

        assert col_type is not None

        with self.inserter() as ins:
            for feature in layer:
                d = {}
                for i in range(0, dfn.GetFieldCount()):
                    name, type_ = col_defs[i]
                    try:
                        d[name] = feature.GetFieldAsString(i)
                    except TypeError as e:
                        self.bundle.logger.error(
                            "Type error for column '{}', type={}: {}".format(name, type_, e))
                        raise

                g = feature.GetGeometryRef()
                g.TransformTo(to_srs)

                type_ = get_type_from_geometry(g)

                if type_ != col_type:
                    if type_ == 'POLYGON' and col_type == 'MULTIPOLYGON':
                        g = ogr.ForceToMultiPolygon(g)
                    else:
                        raise Exception(
                            "Don't know how to handle this conversion case : {} -> {}".format(type_, col_type))

                d['geometry'] = g.ExportToWkt()

                ins.insert(d)

                if logger:
                    logger("Importing shapefile to '{}'".format(self.identity.name))
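The OGR calls used above (GetDriverByName, Open, GetLayer, GetLayerDefn, GetFieldDefn) can be exercised on their own. A minimal sketch that only lists a shapefile's fields; the path is a placeholder:

    from osgeo import ogr

    driver = ogr.GetDriverByName('ESRI Shapefile')
    ds = driver.Open('path/to/shapes.shp', 0)  # placeholder path; 0 = read-only
    layer = ds.GetLayer()
    defn = layer.GetLayerDefn()

    for i in range(defn.GetFieldCount()):
        field = defn.GetFieldDefn(i)
        print(field.GetName(), field.GetTypeName())

    print('feature count:', layer.GetFeatureCount())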
Example #9
    def add_column(self, name, update_existing=False, **kwargs):
        """
        Add a column to the table, or update an existing one.
        :param name: Name of the new or existing column.
        :param update_existing: If True, alter existing column values. Defaults to False
        :param kwargs: Other arguments for the the Column() constructor
        :return: a Column object
        """
        from ..identity import ColumnNumber

        try:
            c = self.column(name)
            extant = True

            if not update_existing:
                return c

        except NotFoundError:

            sequence_id = len(self.columns) + 1

            assert sequence_id

            c = Column(t_vid=self.vid,
                       sequence_id=sequence_id,
                       vid=str(ColumnNumber(ObjectNumber.parse(self.vid), sequence_id)),
                       name=name,
                       datatype='str')
            extant = False

        # Update possibly existing data
        c.data = dict((list(c.data.items()) if c.data else []) +
                      list(kwargs.get('data', {}).items()))

        for key, value in list(kwargs.items()):

            if key[0] != '_' and key not in ['t_vid', 'name', 'sequence_id', 'data']:

                # Don't update the type if the user has specified a custom type
                if key == 'datatype' and not c.type_is_builtin():
                    continue

                # Don't change a datatype if the value is set and the new value is unknown
                if key == 'datatype' and value == 'unknown' and c.datatype:
                    continue

                # Don't overwrite the description with an empty value
                if key == 'description' and not value:
                    continue

                try:
                    setattr(c, key, value)
                except AttributeError:
                    raise AttributeError("Column record has no attribute {}".format(key))

            if key == 'is_primary_key' and isinstance(value, str) and len(value) == 0:
                value = False
                setattr(c, key, value)

        # If the id column has a description and the table does not, add it to
        # the table.
        if c.name == 'id' and c.is_primary_key and not self.description:
            self.description = c.description

        if not extant:
            self.columns.append(c)

        return c
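A hypothetical call pattern for add_column; the column name, datatype, and descriptions are illustrative, not from the source. The first call creates the column, and a later call with update_existing=True updates it instead of returning it unchanged:

    # Illustrative only; 'table' is assumed to be a Table instance.
    c = table.add_column('geoid', datatype='str', description='Geographic identifier')

    # Without update_existing=True, a second call would return the existing column as-is.
    c = table.add_column('geoid', update_existing=True, description='Census GEOID')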
Example #12
    def __iter__(self):

        from ambry_sources import get_source
        from itertools import izip, chain  # Python 2; under Python 3 use the builtin zip
        from ambry.etl import Slice
        from ambry.orm import Column
        from ambry.bundle.process import CallInterval

        table = self.source.dest_table

        if isinstance(table, str):
            table = self.table(table)

        start = int(table.data["start"])
        length = int(table.data["length"])

        slca_str = ",".join(str(e[4]) for e in self.header_cols)
        slcb_str = "{}:{}".format(start - 1, start + length - 1)

        # Slice for the stusab, logrecno, etc.
        slca, slc_code = Slice.make_slicer(slca_str)
        # Slice for the data columns
        slcb, slc_code = Slice.make_slicer(slcb_str)

        columns = [c.name for c in table.columns]

        # Columns before the first data column, found by removing the data
        # columns, which are presumed to all be at the end.
        preamble_cols = columns[:-2 * len(slcb(range(1, 300)))]
        data_columns = columns[len(preamble_cols):]

        header_cols = [e[0] for e in self.header_cols]

        # A few sanity checks
        assert preamble_cols[-1] == "jam_flags"
        assert data_columns[0][-3:] == "001"
        assert data_columns[1][-3:] == "m90"

        all_cols = [Column.mangle_name(c) for c in header_cols + data_columns]

        yield all_cols

        def progress(read_len, total_len, source_name):
            self.bundle.log("Downloading {}; {} bytes".format(source_name, total_len))

        cache = self.library.download_cache

        row_n = 0
        for spec1, spec2 in self.generate_source_specs():

            s1 = get_source(spec1, cache, callback=CallInterval(progress, 10, source_name=spec1.url))
            s2 = get_source(spec2, cache, callback=CallInterval(progress, 10, source_name=spec2.url))

            for i, (row1, row2) in enumerate(izip(s1, s2)):
                # Interleave the slices of the two data rows, prepending
                # the stusab, logrecno, etc.

                row_n += 1
                if self.limited_run and row_n > 10000:
                    return

                yield slca(row1) + tuple(chain(*zip(slcb(row1), slcb(row2))))
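The final yield interleaves the data slice of row1 with the matching slice of row2 (the m90 margin columns, per the sanity checks above) and prepends the header slice. The interleaving pattern in isolation, with made-up values:

    from itertools import chain

    estimates = (100, 200, 300)  # slcb(row1)
    margins = (10, 20, 30)       # slcb(row2)

    interleaved = tuple(chain(*zip(estimates, margins)))
    print(interleaved)  # (100, 10, 200, 20, 300, 30)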