from datetime import datetime, timedelta
from contextlib import closing

import pytz
from nose.tools import eq_, ok_

# Partitioning and create_granularity come from the package under test.


def test_partition_index():
    partition_size = 345600

    partitioning = Partitioning(partition_size)

    timestamp = pytz.utc.localize(datetime(1970, 1, 5, 0, 0))
    index = partitioning.index(timestamp)

    eq_(index, 0)

    timestamp = pytz.utc.localize(datetime(1970, 1, 5, 3, 0))
    index = partitioning.index(timestamp)

    eq_(index, 1)
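# The assertions above pin down the partitioning contract: a timestamp that
# falls exactly on a partition boundary belongs to the *earlier* partition,
# i.e. each partition covers a left-open interval (start, end]. A minimal
# sketch of a class consistent with these tests follows; the real
# Partitioning implementation may differ, and the epoch handling and the
# "minus one second" step are assumptions inferred from the expected values,
# not taken from the source.


class _PartitioningSketch(object):
    """Illustrative stand-in for Partitioning (not the real implementation)."""

    EPOCH = pytz.utc.localize(datetime(1970, 1, 1))

    def __init__(self, partition_size):
        # partition_size is in seconds; 345600 = 4 days.
        self.partition_size = partition_size

    def index(self, timestamp):
        # Subtracting one second before the floor division places a
        # timestamp that lands exactly on a boundary in the preceding
        # partition, matching eq_(index, 0) for 1970-01-05T00:00Z above.
        unix_seconds = int((timestamp - self.EPOCH).total_seconds())
        return (unix_seconds - 1) // self.partition_size

    def index_to_interval(self, index):
        # Partition i covers (i * size, (i + 1) * size] after the epoch.
        start = self.EPOCH + timedelta(seconds=index * self.partition_size)
        return start, start + timedelta(seconds=self.partition_size)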
def test_generate_index(conn, dataset):
    partition_size = 86400

    start = dataset.datasource.tzinfo.localize(
        datetime(2013, 4, 25, 9, 45))
    end = dataset.datasource.tzinfo.localize(
        datetime(2013, 4, 27, 9, 45))

    partitioning = Partitioning(partition_size)

    for timestamp in dataset.granularity.range(start, end):
        partition_index = partitioning.index(timestamp)

        args = partition_size, timestamp

        with closing(conn.cursor()) as cursor:
            cursor.callproc("trend.timestamp_to_index", args)

            postgresql_partition_index, = cursor.fetchone()

        eq_(postgresql_partition_index, partition_index)
def test_index_to_interval():
    partition_size = 3600

    partitioning = Partitioning(partition_size)

    # 0 = '1970-01-01T00:00:00+00:00'
    # (0, 0) = divmod(0, 3600)
    start, end = partitioning.index_to_interval(0)

    expected_start = pytz.utc.localize(datetime(1970, 1, 1, 0, 0))
    expected_end = pytz.utc.localize(datetime(1970, 1, 1, 1, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    # 1365022800 = '2013-04-03T21:00:00+00:00'
    # (379173, 0) = divmod(1365022800, 3600)
    start, end = partitioning.index_to_interval(379173)

    expected_start = pytz.utc.localize(datetime(2013, 4, 3, 21, 0))
    expected_end = pytz.utc.localize(datetime(2013, 4, 3, 22, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    partition_size = 4 * 86400

    partitioning = Partitioning(partition_size)

    start, end = partitioning.index_to_interval(0)

    expected_start = pytz.utc.localize(datetime(1970, 1, 1, 0, 0))
    expected_end = pytz.utc.localize(datetime(1970, 1, 5, 0, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    start, end = partitioning.index_to_interval(3963)

    expected_start = pytz.utc.localize(datetime(2013, 5, 27, 0, 0))
    expected_end = pytz.utc.localize(datetime(2013, 5, 31, 0, 0))

    eq_(start, expected_start)
    eq_(end, expected_end)

    granularity = create_granularity("86400")

    # Check that every timestamp in between falls within the interval.
    for t in granularity.range(expected_start, expected_end):
        ok_(expected_start <= t)
        ok_(t <= expected_end)
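# Sanity check on the hand-picked expected values in the last case above:
# with a four-day partition size, index 3963 starts 3963 * 345600 =
# 1369612800 seconds (exactly 15852 days) after the epoch, which is
# 2013-05-27T00:00Z, and ends four days later at 2013-05-31T00:00Z. This is
# plain arithmetic, shown here only to make the expected values traceable.
def check_index_3963_arithmetic():
    epoch = pytz.utc.localize(datetime(1970, 1, 1))
    partition_size = 4 * 86400  # 345600 seconds

    start = epoch + timedelta(seconds=3963 * partition_size)
    end = start + timedelta(seconds=partition_size)

    eq_(start, pytz.utc.localize(datetime(2013, 5, 27)))
    eq_(end, pytz.utc.localize(datetime(2013, 5, 31)))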
class TrendStore(object):

    """
    All data belonging to a specific datasource, entitytype and granularity.
    """

    def __init__(self, datasource, entitytype, granularity, partition_size,
                 type):
        self.id = None
        self.datasource = datasource
        self.entitytype = entitytype
        self.granularity = granularity
        self.partition_size = partition_size
        self.type = type
        self.version = 4
        self.partitioning = Partitioning(partition_size)

    def __str__(self):
        return self.make_table_basename()

    def make_table_basename(self):
        granularity_name = DATA_TABLE_POSTFIXES.get(
            self.granularity.name, self.granularity.name)

        return "{}_{}_{}".format(self.datasource.name, self.entitytype.name,
                                 granularity_name)

    def make_table_name(self, timestamp):
        table_basename = self.make_table_basename()

        if self.type == "view":
            return table_basename
        else:
            index = self.partitioning.index(timestamp)

            return "{}_{}".format(table_basename, index)

    def base_table(self):
        return Table("trend", self.make_table_basename())

    def partition(self, timestamp):
        if self.type == "view":
            index = None
            start = None
            end = None
        else:
            index = self.partitioning.index(timestamp)
            start, end = self.partitioning.index_to_interval(index)

        name = self.make_table_name(timestamp)

        return Partition(index, name, self, start, end, 4)

    def index_to_interval(self, partition_index):
        return self.partitioning.index_to_interval(partition_index)

    def check_columns_exist(self, column_names, data_types):
        def f(cursor):
            base_table = self.base_table()

            for column_name, data_type in zip(column_names, data_types):
                if not column_exists(cursor, base_table, column_name):
                    create_column(cursor, base_table, column_name, data_type)
                    assure_trendstore_trend_link(cursor, self, column_name)

        return f

    def check_column_types(self, column_names, data_types):
        """
        Check if the database column types match the trend data types and
        correct them if necessary.
        """
        def f(cursor):
            table = self.base_table()
            current_data_types = get_data_types(cursor, table, column_names)

            changes = []

            for column_name, current_data_type, data_type in \
                    zip(column_names, current_data_types, data_types):
                required_data_type = datatype.max_datatype(
                    current_data_type, data_type)

                if required_data_type != current_data_type:
                    changes.append((column_name, required_data_type))

                    logging.info(
                        "Column {0:s} requires change from type {1} to {2}".format(
                            column_name, current_data_type,
                            required_data_type))

            query = (
                "SELECT trend.modify_trendstore_columns("
                "%s, "
                "%s::trend.column_info[]"
                ")")
            args = self.id, changes

            cursor.execute(query, args)

        return f

    def table_names(self, start, end):
        timestamps = self.granularity.range(start, end)

        table_names = list(map(self.make_table_name, timestamps))

        # HACK for dealing with intervals that are small but span two tables
        # (e.g. 2012-1-5 0:00 - 2012-1-5 1:00 for qtr tables)
        end_table = self.make_table_name(end)

        table_names.append(end_table)

        return list(set(table_names))

    def tables(self, start, end):
        make_table = partial(Table, "trend")
        table_names = self.table_names(start, end)

        return map(make_table, table_names)

    def get_trend(self, cursor, trend_name):
        query = (
            "SELECT t.id, t.name "
            "FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s AND t.name = %s")

        args = self.id, trend_name

        cursor.execute(query, args)

        if cursor.rowcount > 0:
            return cursor.fetchone()

    def get_trends(self, cursor):
        query = (
            "SELECT t.id, t.name FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s")

        args = (self.id, )

        cursor.execute(query, args)

        return cursor.fetchall()

    def create(self, cursor):
        column_names = ["datasource_id", "entitytype_id", "granularity",
                        "partition_size", "type", "version"]
        columns = map(Column, column_names)
        args = (self.datasource.id, self.entitytype.id,
                self.granularity.name, self.partition_size, self.type,
                self.version)

        query = schema.trendstore.insert(columns).returning("id")

        query.execute(cursor, args)

        trendstore_id, = cursor.fetchone()

        self.id = trendstore_id

        return self

    def save(self, cursor):
        if self.id is None:
            return self.create(cursor)
        else:
            args = (self.datasource.id, self.entitytype.id,
                    self.granularity.name, self.partition_size, self.type,
                    self.version, self.id)

            query = (
                "UPDATE trend.trendstore SET "
                "datasource_id = %s, "
                "entitytype_id = %s, "
                "granularity = %s, "
                "partition_size = %s, "
                "type = %s, "
                "version = %s "
                "WHERE id = %s")

            cursor.execute(query, args)

            return self

    column_names = ["id", "datasource_id", "entitytype_id", "granularity",
                    "partition_size", "type", "version"]

    columns = map(Column, column_names)

    get_query = schema.trendstore.select(columns).where_(ands([
        Eq(Column("datasource_id")),
        Eq(Column("entitytype_id")),
        Eq(Column("granularity"))]))

    get_by_id_query = schema.trendstore.select(columns).where_(
        Eq(Column("id")))

    @classmethod
    def get(cls, cursor, datasource, entitytype, granularity):
        args = datasource.id, entitytype.id, granularity.name

        cls.get_query.execute(cursor, args)

        if cursor.rowcount > 1:
            raise Exception("more than 1 ({}) trendstore matches".format(
                cursor.rowcount))
        elif cursor.rowcount == 1:
            (trendstore_id, datasource_id, entitytype_id, granularity_str,
             partition_size, type, version) = cursor.fetchone()

            trendstore = TrendStore(datasource, entitytype, granularity,
                                    partition_size, type)

            trendstore.id = trendstore_id

            return trendstore

    @classmethod
    def get_by_id(cls, cursor, id):
        args = (id,)

        cls.get_by_id_query.execute(cursor, args)

        if cursor.rowcount == 1:
            (trendstore_id, datasource_id, entitytype_id, granularity_str,
             partition_size, type, version) = cursor.fetchone()

            datasource = get_datasource_by_id(cursor, datasource_id)
            entitytype = get_entitytype_by_id(cursor, entitytype_id)
            granularity = create_granularity(granularity_str)

            trendstore = TrendStore(datasource, entitytype, granularity,
                                    partition_size, type)

            trendstore.id = trendstore_id

            return trendstore

    def has_trend(self, cursor, trend_name):
        query = (
            "SELECT 1 FROM trend.trendstore_trend_link ttl "
            "JOIN trend.trend t ON t.id = ttl.trend_id "
            "WHERE ttl.trendstore_id = %s AND t.name = %s")

        args = self.id, trend_name

        cursor.execute(query, args)

        return cursor.rowcount > 0

    def store(self, datapackage):
        if datapackage.is_empty():
            return DbTransaction()
        else:
            partition = self.partition(datapackage.timestamp)

            return store(partition, datapackage)

    def store_raw(self, raw_datapackage):
        if raw_datapackage.is_empty():
            return DbTransaction()

        if len(raw_datapackage.rows) <= LARGE_BATCH_THRESHOLD:
            insert_action = BatchInsert
        else:
            insert_action = CopyFrom

        return DbTransaction(
            RefineRawDataPackage(k(raw_datapackage)),
            SetTimestamp(read("datapackage")),
            SetPartition(self),
            GetTimestamp(),
            insert_action(read("partition"), read("datapackage")),
            MarkModified(read("partition"), read("timestamp"))
        )

    def clear_timestamp(self, timestamp):
        def f(cursor):
            query = (
                "DELETE FROM {} "
                "WHERE timestamp = %s").format(self.base_table().render())

            args = timestamp,

            cursor.execute(query, args)

        return f
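# A hypothetical usage sketch tying the pieces together. None of the names
# below beyond TrendStore itself come from the source: `cursor`,
# `datasource`, `entitytype`, `granularity` and `datapackage` are assumed to
# be supplied by the surrounding application, and how the returned
# DbTransaction is executed is not shown here, so this sketch stops at
# building it.
def example_store(cursor, datasource, entitytype, granularity, datapackage):
    trendstore = TrendStore.get(cursor, datasource, entitytype, granularity)

    # Partition tables embed the partition index in their name, e.g.
    # "<datasource>_<entitytype>_<granularity>_3963" (assuming Partition
    # exposes the name it was constructed with).
    partition = trendstore.partition(datapackage.timestamp)
    logging.info("storing into %s", partition.name)

    # store() returns a DbTransaction describing the insert; the caller is
    # responsible for running it against a connection.
    return trendstore.store(datapackage)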