def new_table(self, ds, n):
    """Build a Table fixture with five integer columns.

    :param ds: dataset the new table belongs to
    :param n: sequence number, used in the table and column names
    :return: the new Table
    """
    table = Table(ds, sequence_id=n, name='table{}'.format(n))

    for col_n in range(5):
        col = Column(sequence_id=col_n,
                     name='column_{}_{}'.format(n, col_n),
                     datatype='Integer')
        table.columns.append(col)

    # NOTE(review): as written, these codes attach only to the LAST column
    # created above -- confirm this is intended and not a loop that should
    # be nested per-column.
    for code_n in range(5):
        col.add_code(str(code_n), str(code_n))

    return table
def mangle_column_name(self, i, n):
    """ Override this method to change the way that column names from the source are altered to
    become column names in the schema

    :param i: column number
    :param n: original column name
    :return:
    """
    from ambry.orm import Column

    # A blank source name gets a positional fallback.
    if not n:
        return 'column{}'.format(i)

    mangled = Column.mangle_name(n.strip())

    # Prefer an explicit, non-empty mapping entry; otherwise keep the
    # mangled source name.
    if mangled in self.col_map:
        mapped = self.col_map[mangled]['col']
        return mapped if mapped else mangled

    return mangled
def meta_find_int(self):
    """Look for columns that have non-int values and convert the schema
    entry to real.

    Samples up to 10,000 rows of the chis12d sqlite database. Any column
    holding a value that is not an exact integer -- including values that
    cannot be parsed as numbers at all -- is switched to the REAL datatype;
    every other column is set to INTEGER. The schema is then rewritten.
    """
    import sqlite3
    from ambry.orm import Column

    conn = sqlite3.connect(self.filesystem.build_path('chis12d.db'))
    conn.row_factory = sqlite3.Row

    notint = set()

    lr = self.init_log_rate(500)

    for row in conn.execute('SELECT * FROm chis12d limit 10000'):
        row = dict(zip(row.keys(), row))
        for k, v in row.items():
            # Robustness fix: the original `int(v) != v` raised on NULLs and
            # non-numeric strings, and misclassified numeric strings. Parse
            # through float first; anything unparseable (or NaN/inf) is
            # treated as non-integer.
            try:
                fv = float(v)
                is_int = fv == int(fv)
            except (TypeError, ValueError, OverflowError):
                is_int = False

            if not is_int:
                notint.add(Column.mangle_name(k))
        lr()

    with self.session:
        t = self.schema.table('chis_puf')

        for c in t.columns:
            if c.name in notint:
                self.log(c.name)
                c.datatype = c.DATATYPE_REAL
            else:
                c.datatype = c.DATATYPE_INTEGER

        self.schema.write_schema()
def meta_find_int(self):
    """Look for columns that have non-int values and convert the schema
    entry to real.

    Samples up to 10,000 rows of the chis12d sqlite database. Any column
    holding a value that is not an exact integer -- including values that
    cannot be parsed as numbers at all -- is switched to the REAL datatype;
    every other column is set to INTEGER. The schema is then rewritten.
    """
    import sqlite3
    from ambry.orm import Column

    conn = sqlite3.connect(self.filesystem.build_path('chis12d.db'))
    conn.row_factory = sqlite3.Row

    notint = set()

    lr = self.init_log_rate(500)

    for row in conn.execute('SELECT * FROm chis12d limit 10000'):
        row = dict(zip(row.keys(), row))
        for k, v in row.items():
            # Robustness fix: the original `int(v) != v` raised on NULLs and
            # non-numeric strings, and misclassified numeric strings. Parse
            # through float first; anything unparseable (or NaN/inf) is
            # treated as non-integer.
            try:
                fv = float(v)
                is_int = fv == int(fv)
            except (TypeError, ValueError, OverflowError):
                is_int = False

            if not is_int:
                notint.add(Column.mangle_name(k))
        lr()

    with self.session:
        t = self.schema.table('chis_puf')

        for c in t.columns:
            if c.name in notint:
                self.log(c.name)
                c.datatype = c.DATATYPE_REAL
            else:
                c.datatype = c.DATATYPE_INTEGER

        self.schema.write_schema()
def column(self, ref):
    """Return the column whose name, id, or vid matches *ref*.

    :param ref: a column name (mangled before comparison), id, or vid
    :return: the matching Column
    :raises NotFoundError: if no column matches
    """
    # AFAIK, all of the columns in the relationship will get loaded if any one is accessed,
    # so iterating over the collection only involves one SELECT.
    from .column import Column

    column_name = Column.mangle_name(str(ref))

    for c in self.columns:
        if str(column_name) == c.name or str(ref) == c.id or str(ref) == c.vid:
            return c

    # Bug fix: the first placeholder previously repeated the raw ref; report
    # the mangled name that was actually compared against column names.
    raise NotFoundError(
        "Failed to find column '{}' in table '{}' for ref: '{}' ".format(
            column_name, self.name, ref))
def column(self, ref):
    """Return the column whose name, id, or vid matches *ref*.

    :param ref: a column name (mangled before comparison), id, or vid
    :return: the matching Column
    :raises NotFoundError: if no column matches
    """
    # AFAIK, all of the columns in the relationship will get loaded if any one is accessed,
    # so iterating over the collection only involves one SELECT.
    from .column import Column

    column_name = Column.mangle_name(str(ref))

    for c in self.columns:
        if str(column_name) == c.name or str(ref) == c.id or str(ref) == c.vid:
            return c

    # Bug fix: the first placeholder previously repeated the raw ref; report
    # the mangled name that was actually compared against column names.
    raise NotFoundError(
        "Failed to find column '{}' in table '{}' for ref: '{}' ".format(
            column_name, self.name, ref))
def load_shapefile(self, path, logger=None):
    """Load a shapefile into the partition.

    Loads the features and inserts them using an inserter.

    :param path: filesystem path or http(s) URL of an ESRI shapefile
    :param logger: optional callable; invoked once with a status message
    :return:
    """
    from osgeo import ogr  # , osr
    from ..geo.sfschema import ogr_inv_type_map, mangle_name
    from ..orm import Column, Geometry
    from ..geo.util import get_type_from_geometry

    # Remote shapefiles are downloaded locally before opening.
    if path.startswith('http'):
        shape_url = path
        path = self.bundle.filesystem.download_shapefile(shape_url)

    driver = ogr.GetDriverByName("ESRI Shapefile")
    dataSource = driver.Open(path, 0)
    layer = dataSource.GetLayer()

    # Target spatial reference; every feature geometry is reprojected to it.
    to_srs = ogr.osr.SpatialReference()
    to_srs.ImportFromEPSG(Geometry.DEFAULT_SRS)

    dfn = layer.GetLayerDefn()

    # One (mangled column name, schema datatype) pair per shapefile field.
    col_defs = []
    for i in range(0, dfn.GetFieldCount()):
        field = dfn.GetFieldDefn(i)
        col_defs.append(
            (Column.mangle_name(mangle_name(field.GetName())),
             Column.types[ogr_inv_type_map[field.GetType()]][1]))

    # Geometry type declared by the destination schema, e.g. 'MULTIPOLYGON'.
    col_type = None
    for c in self.table.columns:
        if c.name == 'geometry':
            col_type = c.datatype.upper()
            break

    assert col_type is not None

    with self.inserter() as ins:
        for feature in layer:
            d = {}
            # All attribute fields are inserted as strings.
            for i in range(0, dfn.GetFieldCount()):
                name, type_ = col_defs[i]
                try:
                    d[name] = feature.GetFieldAsString(i)
                except TypeError as e:
                    self.bundle.logger.error(
                        "Type error for column '{}', type={}: {}".format(name, type_, e))
                    raise

            g = feature.GetGeometryRef()
            g.TransformTo(to_srs)

            type_ = get_type_from_geometry(g)

            # Promote single polygons when the schema expects multipolygons;
            # any other type mismatch is fatal.
            if type_ != col_type:
                if type_ == 'POLYGON' and col_type == 'MULTIPOLYGON':
                    g = ogr.ForceToMultiPolygon(g)
                else:
                    raise Exception(
                        "Don't know how to handle this conversion case : {} -> {}".format(
                            type_, col_type))

            d['geometry'] = g.ExportToWkt()

            ins.insert(d)

    # NOTE(review): this status message fires after the import completes,
    # despite the "Importing" wording -- confirm placement is intentional.
    if logger:
        logger("Importing shapefile to '{}'".format(self.identity.name))
def add_column(self, name, update_existing=False, **kwargs):
    """ Add a column to the table, or update an existing one.

    :param name: Name of the new or existing column.
    :param update_existing: If True, alter existing column values. Defaults to False
    :param kwargs: Other arguments for the Column() constructor
    :return: a Column object
    """
    from ..identity import ColumnNumber

    try:
        c = self.column(name)
        extant = True

        # Existing column and no update requested: return it untouched.
        if not update_existing:
            return c

    except NotFoundError:

        # New column: sequence ids are 1-based positions within the table.
        sequence_id = len(self.columns) + 1

        assert sequence_id

        c = Column(t_vid=self.vid, sequence_id=sequence_id,
                   vid=str(ColumnNumber(ObjectNumber.parse(self.vid), sequence_id)),
                   name=name, datatype='str')
        extant = False

    # Update possibly existing data: merge the column's current data dict
    # with any 'data' kwarg, the kwarg values winning on key collisions.
    c.data = dict((list(c.data.items()) if c.data else []) +
                  list(kwargs.get('data', {}).items()))

    for key, value in list(kwargs.items()):
        # Skip private keys and the fields handled above / in the constructor.
        if key[0] != '_' and key not in ['t_vid', 'name', 'sequence_id', 'data']:

            # Don't update the type if the user has specified a custom type
            if key == 'datatype' and not c.type_is_builtin():
                continue

            # Don't change a datatype if the value is set and the new value is unknown
            if key == 'datatype' and value == 'unknown' and c.datatype:
                continue

            # Don't overwrite an existing description with an empty one
            if key == 'description' and not value:
                continue

            try:
                setattr(c, key, value)
            except AttributeError:
                raise AttributeError("Column record has no attribute {}".format(key))

            # NOTE(review): an empty-string is_primary_key is first set
            # verbatim, then immediately overwritten with False -- presumably
            # normalizing blank form/CSV input; confirm the double setattr
            # is intentional.
            if key == 'is_primary_key' and isinstance(value, str) and len(value) == 0:
                value = False
                setattr(c, key, value)

    # If the id column has a description and the table does not, add it to
    # the table.
    if c.name == 'id' and c.is_primary_key and not self.description:
        self.description = c.description

    if not extant:
        self.columns.append(c)

    return c
def load_shapefile(self, path, logger=None):
    """Load a shapefile into the partition.

    Loads the features and inserts them using an inserter.

    :param path: filesystem path or http(s) URL of an ESRI shapefile
    :param logger: optional callable; invoked once with a status message
    :return:
    """
    from osgeo import ogr, osr
    from ..geo.sfschema import ogr_inv_type_map, mangle_name
    from ..orm import Column, Geometry
    from ..geo.util import get_type_from_geometry

    # Remote shapefiles are downloaded locally before opening.
    if path.startswith('http'):
        shape_url = path
        path = self.bundle.filesystem.download_shapefile(shape_url)

    driver = ogr.GetDriverByName("ESRI Shapefile")
    dataSource = driver.Open(path, 0)
    layer = dataSource.GetLayer()

    # Target spatial reference; every feature geometry is reprojected to it.
    to_srs = ogr.osr.SpatialReference()
    to_srs.ImportFromEPSG(Geometry.DEFAULT_SRS)

    dfn = layer.GetLayerDefn()

    # One (mangled column name, schema datatype) pair per shapefile field.
    col_defs = []
    for i in range(0, dfn.GetFieldCount()):
        field = dfn.GetFieldDefn(i)
        col_defs.append(
            (Column.mangle_name(mangle_name(field.GetName())),
             Column.types[ogr_inv_type_map[field.GetType()]][1]))

    # Geometry type declared by the destination schema, e.g. 'MULTIPOLYGON'.
    col_type = None
    for c in self.table.columns:
        if c.name == 'geometry':
            col_type = c.datatype.upper()
            break

    assert col_type is not None

    with self.inserter() as ins:
        for feature in layer:
            d = {}
            # All attribute fields are inserted as strings.
            for i in range(0, dfn.GetFieldCount()):
                name, type_ = col_defs[i]
                try:
                    d[name] = feature.GetFieldAsString(i)
                except TypeError as e:
                    self.bundle.logger.error(
                        "Type error for column '{}', type={}: {}".format(name, type_, e))
                    raise

            g = feature.GetGeometryRef()
            g.TransformTo(to_srs)

            type_ = get_type_from_geometry(g)

            # Promote single polygons when the schema expects multipolygons;
            # any other type mismatch is fatal.
            if type_ != col_type:
                if type_ == 'POLYGON' and col_type == 'MULTIPOLYGON':
                    g = ogr.ForceToMultiPolygon(g)
                else:
                    raise Exception(
                        "Don't know how to handle this conversion case : {} -> {}".format(
                            type_, col_type))

            d['geometry'] = g.ExportToWkt()

            ins.insert(d)

    # NOTE(review): this status message fires after the import completes,
    # despite the "Importing" wording -- confirm placement is intentional.
    if logger:
        logger("Importing shapefile to '{}'".format(self.identity.name))
def add_column(self, name, update_existing=False, **kwargs):
    """ Add a column to the table, or update an existing one.

    :param name: Name of the new or existing column.
    :param update_existing: If True, alter existing column values. Defaults to False
    :param kwargs: Other arguments for the Column() constructor
    :return: a Column object
    """
    from ..identity import ColumnNumber

    try:
        c = self.column(name)
        extant = True

        # Existing column and no update requested: return it untouched.
        if not update_existing:
            return c

    except NotFoundError:

        # New column: sequence ids are 1-based positions within the table.
        sequence_id = len(self.columns) + 1

        assert sequence_id

        c = Column(t_vid=self.vid, sequence_id=sequence_id,
                   vid=str(ColumnNumber(ObjectNumber.parse(self.vid), sequence_id)),
                   name=name, datatype='str')
        extant = False

    # Update possibly existing data: merge the column's current data dict
    # with any 'data' kwarg, the kwarg values winning on key collisions.
    c.data = dict((list(c.data.items()) if c.data else []) +
                  list(kwargs.get('data', {}).items()))

    for key, value in list(kwargs.items()):
        # Skip private keys and the fields handled above / in the constructor.
        if key[0] != '_' and key not in ['t_vid', 'name', 'sequence_id', 'data']:

            # Don't update the type if the user has specified a custom type
            if key == 'datatype' and not c.type_is_builtin():
                continue

            # Don't change a datatype if the value is set and the new value is unknown
            if key == 'datatype' and value == 'unknown' and c.datatype:
                continue

            # Don't overwrite an existing description with an empty one
            if key == 'description' and not value:
                continue

            try:
                setattr(c, key, value)
            except AttributeError:
                raise AttributeError("Column record has no attribute {}".format(key))

            # NOTE(review): an empty-string is_primary_key is first set
            # verbatim, then immediately overwritten with False -- presumably
            # normalizing blank form/CSV input; confirm the double setattr
            # is intentional.
            if key == 'is_primary_key' and isinstance(value, str) and len(value) == 0:
                value = False
                setattr(c, key, value)

    # If the id column has a description and the table does not, add it to
    # the table.
    if c.name == 'id' and c.is_primary_key and not self.description:
        self.description = c.description

    if not extant:
        self.columns.append(c)

    return c
def __iter__(self):
    """Yield a header row, then interleaved estimate / margin-of-error rows.

    Pairs each estimate source with its matching margin source and yields
    rows of: the header (stusab, logrecno, etc.) slice followed by
    alternating (estimate, m90) values. Python 2 only (uses itertools.izip).
    """
    from ambry_sources import get_source
    from itertools import izip, chain

    from ambry.etl import Slice
    from ambry.orm import Column
    from ambry.bundle.process import CallInterval

    table = self.source.dest_table

    if isinstance(table, str):
        table = self.table(table)

    # Position and width of this table's data segment in the source row.
    start = int(table.data["start"])
    length = int(table.data["length"])

    slca_str = ",".join(str(e[4]) for e in self.header_cols)
    slcb_str = "{}:{}".format(start - 1, start + length - 1)

    # Slice for the stusab, logrecno, etc.
    slca, slc_code = Slice.make_slicer(slca_str)
    # Slice for the data columns
    slcb, slc_code = Slice.make_slicer(slcb_str)

    columns = [c.name for c in table.columns]

    # Columns before the first data column, by removing the
    # data columns, which are presumed to all be at the end.
    # (Each data column contributes two output columns: value and m90.)
    preamble_cols = columns[:-2 * len(slcb(range(1, 300)))]
    data_columns = columns[len(preamble_cols):]

    header_cols = [e[0] for e in self.header_cols]

    # A few sanity checks
    assert preamble_cols[-1] == "jam_flags"
    assert data_columns[0][-3:] == "001"
    assert data_columns[1][-3:] == "m90"

    all_cols = [Column.mangle_name(c) for c in header_cols + data_columns]

    # First yielded row is the header.
    yield all_cols

    def progress(read_len, total_len, source_name):
        # Download progress callback; rate-limited by CallInterval below.
        self.bundle.log("Downloading {}; {} bytes".format(source_name, total_len))

    cache = self.library.download_cache

    row_n = 0

    for spec1, spec2 in self.generate_source_specs():
        # NOTE(review): s2's callback also reports spec1.url -- possibly a
        # copy-paste slip for spec2.url; confirm before relying on the logs.
        s1 = get_source(spec1, cache, callback=CallInterval(progress, 10, source_name=spec1.url))
        s2 = get_source(spec2, cache, callback=CallInterval(progress, 10, source_name=spec1.url))

        for i, (row1, row2) in enumerate(izip(s1, s2)):
            # Interleave the slices of the data rows, prepend
            # the stusab, logrecno, etc.

            row_n += 1

            # Cap output during limited (test) runs.
            if self.limited_run and row_n > 10000:
                return

            yield slca(row1) + tuple(chain(*zip(slcb(row1), slcb(row2))))