from datetime import datetime, timedelta
from contextlib import closing

import pytz
from nose.tools import eq_, ok_

# Partitioning and create_granularity come from the package under test.


def test_partition_index():
    partition_size = 345600

    partitioning = Partitioning(partition_size)

    timestamp = pytz.utc.localize(datetime(1970, 1, 5, 0, 0))
    index = partitioning.index(timestamp)

    eq_(index, 0)

    timestamp = pytz.utc.localize(datetime(1970, 1, 5, 3, 0))
    index = partitioning.index(timestamp)

    eq_(index, 1)
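# The assertions above pin down the partitioning contract: a timestamp that
# falls exactly on a partition boundary belongs to the *earlier* partition,
# i.e. each partition covers a left-open interval (start, end]. A minimal
# sketch of a class consistent with these tests follows; the real
# Partitioning implementation may differ, and the epoch handling and the
# "minus one second" step are assumptions inferred from the expected values,
# not taken from the source.


class _PartitioningSketch(object):
    """Illustrative stand-in for Partitioning (not the real implementation)."""

    EPOCH = pytz.utc.localize(datetime(1970, 1, 1))

    def __init__(self, partition_size):
        # partition_size is in seconds; 345600 = 4 days.
        self.partition_size = partition_size

    def index(self, timestamp):
        # Subtracting one second before the floor division places a
        # timestamp that lands exactly on a boundary in the preceding
        # partition, matching eq_(index, 0) for 1970-01-05T00:00Z above.
        unix_seconds = int((timestamp - self.EPOCH).total_seconds())
        return (unix_seconds - 1) // self.partition_size

    def index_to_interval(self, index):
        # Partition i covers (i * size, (i + 1) * size] after the epoch.
        start = self.EPOCH + timedelta(seconds=index * self.partition_size)
        return start, start + timedelta(seconds=self.partition_size)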
def test_generate_index(conn, dataset):
    partition_size = 86400

    start = dataset.datasource.tzinfo.localize(
        datetime(2013, 4, 25, 9, 45))
    end = dataset.datasource.tzinfo.localize(
        datetime(2013, 4, 27, 9, 45))

    partitioning = Partitioning(partition_size)

    for timestamp in dataset.granularity.range(start, end):
        partition_index = partitioning.index(timestamp)

        args = partition_size, timestamp

        with closing(conn.cursor()) as cursor:
            cursor.callproc("trend.timestamp_to_index", args)

            postgresql_partition_index, = cursor.fetchone()

        eq_(postgresql_partition_index, partition_index)
def test_index_to_interval():
    partition_size = 3600

    partitioning = Partitioning(partition_size)

    # 0 = '1970-01-01T00:00:00+00:00'
    # (0, 0) = divmod(0, 3600)
    start, end = partitioning.index_to_interval(0)

    expected_start = pytz.utc.localize(datetime(1970, 1, 1, 0, 0))
    expected_end = pytz.utc.localize(datetime(1970, 1, 1, 1, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    # 1365022800 = '2013-04-03T21:00:00+00:00'
    # (379173, 0) = divmod(1365022800, 3600)
    start, end = partitioning.index_to_interval(379173)

    expected_start = pytz.utc.localize(datetime(2013, 4, 3, 21, 0))
    expected_end = pytz.utc.localize(datetime(2013, 4, 3, 22, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    partition_size = 4 * 86400

    partitioning = Partitioning(partition_size)

    start, end = partitioning.index_to_interval(0)

    expected_start = pytz.utc.localize(datetime(1970, 1, 1, 0, 0))
    expected_end = pytz.utc.localize(datetime(1970, 1, 5, 0, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    start, end = partitioning.index_to_interval(3963)

    expected_start = pytz.utc.localize(datetime(2013, 5, 27, 0, 0))
    expected_end = pytz.utc.localize(datetime(2013, 5, 31, 0, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    granularity = create_granularity("86400")

    # Check that every timestamp in between falls within the interval.
    for t in granularity.range(expected_start, expected_end):
        ok_(expected_start <= t)
        ok_(t <= expected_end)
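# Sanity check on the hand-picked expected values in the last case above:
# with a four-day partition size, index 3963 starts 3963 * 345600 =
# 1369612800 seconds (exactly 15852 days) after the epoch, which is
# 2013-05-27T00:00Z, and ends four days later at 2013-05-31T00:00Z. This is
# plain arithmetic, shown here only to make the expected values traceable.
def check_index_3963_arithmetic():
    epoch = pytz.utc.localize(datetime(1970, 1, 1))
    partition_size = 4 * 86400  # 345600 seconds

    start = epoch + timedelta(seconds=3963 * partition_size)
    end = start + timedelta(seconds=partition_size)

    eq_(start, pytz.utc.localize(datetime(2013, 5, 27)))
    eq_(end, pytz.utc.localize(datetime(2013, 5, 31)))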
class TrendStore(object):

    """
    All data belonging to a specific datasource, entitytype and granularity.
    """

    def __init__(self, datasource, entitytype, granularity, partition_size,
                 type):
        self.id = None
        self.datasource = datasource
        self.entitytype = entitytype
        self.granularity = granularity
        self.partition_size = partition_size
        self.type = type
        self.version = 4
        self.partitioning = Partitioning(partition_size)

    def __str__(self):
        return self.make_table_basename()

    def make_table_basename(self):
        granularity_name = DATA_TABLE_POSTFIXES.get(
            self.granularity.name, self.granularity.name)

        return "{}_{}_{}".format(self.datasource.name, self.entitytype.name,
                                 granularity_name)

    def make_table_name(self, timestamp):
        table_basename = self.make_table_basename()

        if self.type == "view":
            return table_basename
        else:
            index = self.partitioning.index(timestamp)

            return "{}_{}".format(table_basename, index)

    def base_table(self):
        return Table("trend", self.make_table_basename())

    def partition(self, timestamp):
        if self.type == "view":
            index = None
            start = None
            end = None
        else:
            index = self.partitioning.index(timestamp)
            start, end = self.partitioning.index_to_interval(index)

        name = self.make_table_name(timestamp)

        return Partition(index, name, self, start, end, 4)

    def index_to_interval(self, partition_index):
        return self.partitioning.index_to_interval(partition_index)

    def check_columns_exist(self, column_names, data_types):
        def f(cursor):
            base_table = self.base_table()

            for column_name, data_type in zip(column_names, data_types):
                if not column_exists(cursor, base_table, column_name):
                    create_column(cursor, base_table, column_name, data_type)
                    assure_trendstore_trend_link(cursor, self, column_name)

        return f

    def check_column_types(self, column_names, data_types):
        """
        Check if the database column types match the trend data types and
        correct them if necessary.
        """
        def f(cursor):
            table = self.base_table()
            current_data_types = get_data_types(cursor, table, column_names)

            changes = []

            for column_name, current_data_type, data_type in \
                    zip(column_names, current_data_types, data_types):
                required_data_type = datatype.max_datatype(
                    current_data_type, data_type)

                if required_data_type != current_data_type:
                    changes.append((column_name, required_data_type))

                    logging.info(
                        "Column {0:s} requires change from type {1} to {2}".format(
                            column_name, current_data_type,
                            required_data_type))

            query = (
                "SELECT trend.modify_trendstore_columns("
                "%s, "
                "%s::trend.column_info[]"
                ")")
            args = self.id, changes

            cursor.execute(query, args)

        return f

    def table_names(self, start, end):
        timestamps = self.granularity.range(start, end)

        table_names = list(map(self.make_table_name, timestamps))

        # HACK for dealing with intervals that are small but span two tables
        # (e.g. 2012-1-5 0:00 - 2012-1-5 1:00 for qtr tables)
        end_table = self.make_table_name(end)

        table_names.append(end_table)

        return list(set(table_names))

    def tables(self, start, end):
        make_table = partial(Table, "trend")
        table_names = self.table_names(start, end)

        return map(make_table, table_names)

    def get_trend(self, cursor, trend_name):
        query = (
            "SELECT t.id, t.name "
            "FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s AND t.name = %s")

        args = self.id, trend_name

        cursor.execute(query, args)

        if cursor.rowcount > 0:
            return cursor.fetchone()

    def get_trends(self, cursor):
        query = (
            "SELECT t.id, t.name FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s")

        args = (self.id, )

        cursor.execute(query, args)

        return cursor.fetchall()

    def create(self, cursor):
        column_names = ["datasource_id", "entitytype_id", "granularity",
                        "partition_size", "type", "version"]
        columns = map(Column, column_names)
        args = (self.datasource.id, self.entitytype.id,
                self.granularity.name, self.partition_size, self.type,
                self.version)

        query = schema.trendstore.insert(columns).returning("id")

        query.execute(cursor, args)

        trendstore_id, = cursor.fetchone()

        self.id = trendstore_id

        return self

    def save(self, cursor):
        if self.id is None:
            return self.create(cursor)
        else:
            args = (self.datasource.id, self.entitytype.id,
                    self.granularity.name, self.partition_size, self.type,
                    self.version, self.id)

            query = (
                "UPDATE trend.trendstore SET "
                "datasource_id = %s, "
                "entitytype_id = %s, "
                "granularity = %s, "
                "partition_size = %s, "
                "type = %s, "
                "version = %s "
                "WHERE id = %s")

            cursor.execute(query, args)

            return self

    column_names = ["id", "datasource_id", "entitytype_id", "granularity",
                    "partition_size", "type", "version"]

    columns = map(Column, column_names)

    get_query = schema.trendstore.select(columns).where_(ands([
        Eq(Column("datasource_id")),
        Eq(Column("entitytype_id")),
        Eq(Column("granularity"))]))

    get_by_id_query = schema.trendstore.select(columns).where_(
        Eq(Column("id")))

    @classmethod
    def get(cls, cursor, datasource, entitytype, granularity):
        args = datasource.id, entitytype.id, granularity.name

        cls.get_query.execute(cursor, args)

        if cursor.rowcount > 1:
            raise Exception("more than 1 ({}) trendstore matches".format(
                cursor.rowcount))
        elif cursor.rowcount == 1:
            (trendstore_id, datasource_id, entitytype_id, granularity_str,
             partition_size, type, version) = cursor.fetchone()

            trendstore = TrendStore(datasource, entitytype, granularity,
                                    partition_size, type)

            trendstore.id = trendstore_id

            return trendstore

    @classmethod
    def get_by_id(cls, cursor, id):
        args = (id,)

        cls.get_by_id_query.execute(cursor, args)

        if cursor.rowcount == 1:
            (trendstore_id, datasource_id, entitytype_id, granularity_str,
             partition_size, type, version) = cursor.fetchone()

            datasource = get_datasource_by_id(cursor, datasource_id)
            entitytype = get_entitytype_by_id(cursor, entitytype_id)
            granularity = create_granularity(granularity_str)

            trendstore = TrendStore(datasource, entitytype, granularity,
                                    partition_size, type)

            trendstore.id = trendstore_id

            return trendstore

    def has_trend(self, cursor, trend_name):
        query = (
            "SELECT 1 FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s AND t.name = %s")

        args = self.id, trend_name

        cursor.execute(query, args)

        return cursor.rowcount > 0

    def store(self, datapackage):
        if datapackage.is_empty():
            return DbTransaction()
        else:
            partition = self.partition(datapackage.timestamp)

            return store(partition, datapackage)

    def store_raw(self, raw_datapackage):
        if raw_datapackage.is_empty():
            return DbTransaction()

        if len(raw_datapackage.rows) <= LARGE_BATCH_THRESHOLD:
            insert_action = BatchInsert
        else:
            insert_action = CopyFrom

        return DbTransaction(
            RefineRawDataPackage(k(raw_datapackage)),
            SetTimestamp(read("datapackage")),
            SetPartition(self),
            GetTimestamp(),
            insert_action(read("partition"), read("datapackage")),
            MarkModified(read("partition"), read("timestamp"))
        )

    def clear_timestamp(self, timestamp):
        def f(cursor):
            query = (
                "DELETE FROM {} "
                "WHERE timestamp = %s").format(self.base_table().render())

            args = timestamp,

            cursor.execute(query, args)

        return f
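# A hypothetical usage sketch tying the pieces together. None of the names
# below beyond TrendStore itself come from the source: `cursor`,
# `datasource`, `entitytype`, `granularity` and `datapackage` are assumed to
# be supplied by the surrounding application, and how the returned
# DbTransaction is executed is not shown here, so this sketch stops at
# building it.
def example_store(cursor, datasource, entitytype, granularity, datapackage):
    trendstore = TrendStore.get(cursor, datasource, entitytype, granularity)

    # Partition tables embed the partition index in their name, e.g.
    # "<datasource>_<entitytype>_<granularity>_3963" (assuming Partition
    # exposes the name it was constructed with).
    partition = trendstore.partition(datapackage.timestamp)
    logging.info("storing into %s", partition.name)

    # store() returns a DbTransaction describing the insert; the caller is
    # responsible for running it against a connection.
    return trendstore.store(datapackage)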