Example #1
    def is_complete(self, interval, trendstore, filter=None, ratio=1):
        """
        Return True when trend data is considered complete for the
        given interval, False otherwise.

        Trend data is considered complete if:

            Two row counts are taken: one for the interval (start, end) and
            one for the same interval a week earlier.

            Both row counts are non-zero and their ratio is at least the
            specified ratio.
        """
        complete = False
        row_count = partial(self.count, trendstore, filter=filter)

        count = row_count(interval)
        # Reference count: the same interval shifted back one week
        ref_count = row_count([get_previous_timestamp(ts, 7 * 86400)
                for ts in interval])

        try:
            if count / ref_count >= ratio:
                complete = True
        except (ZeroDivisionError, TypeError):
            # A zero or missing reference count leaves completeness
            # undetermined, so it stays False.
            pass

        return complete
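
The decision rule above can be exercised in isolation. Below is a minimal standalone sketch of the same ratio check, assuming Python 3 true division; the row counts are hypothetical stand-ins for real self.count() results.

def ratio_complete(count, ref_count, ratio=1):
    # Same rule as is_complete: complete when the current row count is at
    # least `ratio` times the reference row count.
    try:
        return count / ref_count >= ratio
    except (ZeroDivisionError, TypeError):
        # A reference count of 0 or None gives nothing to compare against.
        return False

assert ratio_complete(950, 1000, ratio=0.9)       # 95% of last week's rows
assert not ratio_complete(10, 1000, ratio=0.9)    # far fewer rows than last week
assert not ratio_complete(10, 0)                  # no reference data at all

Example #2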
def test_previous_timestamp():
    """
    Test previous timestamp retrieval
    """
    tz = pytz.timezone("Europe/Amsterdam")

    # One hour granularity: the previous timestamp is exactly one hour back.
    granularity = 3600
    ts = tz.localize(datetime(2013, 4, 2, 10, 0, 0))
    previous_timestamp = tz.localize(datetime(2013, 4, 2, 9, 0, 0))
    assert_equal(get_previous_timestamp(ts, granularity), previous_timestamp)

    # One day granularity: snap to the most recent day boundary first,
    # then step back one full day.
    granularity = 86400
    ts = get_most_recent_timestamp(
        tz.localize(datetime(2013, 4, 2, 10, 13, 0)), granularity)
    previous_timestamp = tz.localize(datetime(2013, 4, 1, 0, 0, 0))
    assert_equal(get_previous_timestamp(ts, granularity), previous_timestamp)
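
The helpers get_previous_timestamp and get_most_recent_timestamp are imported from the module under test and are not shown here. Below is a minimal sketch of what the previous-timestamp lookup could look like, assuming pytz-localized input already aligned to a granularity boundary and ignoring DST transitions around day boundaries:

from datetime import timedelta

def previous_timestamp_sketch(ts, granularity):
    # Step back one granularity period; pytz needs normalize() after
    # datetime arithmetic so the UTC offset stays consistent.
    return ts.tzinfo.normalize(ts - timedelta(seconds=granularity))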
Example #3
    def is_complete(self, interval, datasource, gp, entitytype_name,
        filter=None, ratio=1):
        """
        Return True when trend data is considered complete for the
        given interval, False otherwise.

        Trend data is considered complete if:

            Two row counts are taken: one for the interval (start, end) and
            one for the same interval a week earlier.

            Both row counts are non-zero and their ratio is at least the
            specified ratio.

            If the reference row count is zero, the check is repeated
            against the interval one day earlier (instead of a week
            earlier).
        """
        def _ratio(n, d):
            # None signals an unusable reference count (zero or missing).
            try:
                return n / d
            except (ZeroDivisionError, TypeError):
                return None

        row_count = partial(self.count, datasource, gp, entitytype_name,
            filter=filter)

        count = row_count(interval)
        ref_count = row_count([get_previous_timestamp(ts, 7 * 86400)
                for ts in interval])

        r = _ratio(count, ref_count)
        complete = r is not None and r >= ratio

        # Plan B: when there is no data a week earlier, compare with the
        # interval one day earlier instead.
        if ref_count == 0:
            ref_count = row_count([get_previous_timestamp(ts, 1 * 86400)
                    for ts in interval])
            r = _ratio(count, ref_count)
            complete = r is not None and r >= ratio

        return complete
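
The difference from Example #1 is the day-earlier fallback. A small standalone sketch with hypothetical row counts shows how the fallback decides completeness when the week-earlier reference is empty:

def is_complete_sketch(count, week_ref, day_ref, ratio=1):
    # Same decision rule as above, with the row counts passed in directly
    # instead of being queried.
    def _ratio(n, d):
        try:
            return n / d
        except (ZeroDivisionError, TypeError):
            return None

    r = _ratio(count, week_ref)
    complete = r is not None and r >= ratio

    if week_ref == 0:
        # Plan B: fall back to the day-earlier reference count.
        r = _ratio(count, day_ref)
        complete = r is not None and r >= ratio

    return complete

assert is_complete_sketch(1000, 0, 1000)        # week ref empty, day ref matches
assert not is_complete_sketch(1000, 0, 0)       # no reference data at all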
Example #4
def aggregate(conn, schema, source, target, trend_names, timestamp):
    """
    Basic aggregation of trend data

    :param conn: psycopg2 database connection
    :param schema: schema where source and target data is located
    :param source: tuple (datasource, gp, entitytype_name) specifying source
    :param target: tuple (datasource, gp, entitytype_name) specifying target
    :param trend_names: trends to aggregate
    :param timestamp: timezone-aware timestamp specifying the end of the
        interval to aggregate
    """
    # The target granularity period determines the length of the
    # aggregation interval that ends at `timestamp`.
    target_gp = target[1]
    interval = (get_previous_timestamp(timestamp, target_gp), timestamp)

    (ds, gp, et_name) = source
    source_table_names = get_table_names(
        [ds], gp, et_name, interval[0], interval[1])

    target_table_name = make_table_name(*(target + (timestamp,)))

    # If the source already stores per-row sample counts, sum them;
    # otherwise every source row counts as one sample.
    if column_exists(conn, schema, source_table_names[-1], "samples"):
        select_samples_part = "SUM(samples)"
        select_samples_column = "samples,"
    else:
        select_samples_part = "COUNT(*)"
        select_samples_column = ""

    select_parts = []

    # One SELECT per source partition table; the parts are combined with
    # UNION ALL in the final INSERT query below.
    for source_table_name in source_table_names:
        select_parts.append(
            "SELECT "
            "entity_id, '{1}', {2} {3} "
            "FROM \"{0}\".\"{4}\" "
            "WHERE timestamp > %s AND timestamp <= %s ".format(
                schema,
                timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                select_samples_column,
                ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
                source_table_name))

    query = (
        "INSERT INTO \"{0}\".\"{1}\" (entity_id, timestamp, samples, {2}) "
        "SELECT entity_id, '{4}', {5}, {6} FROM "
        "( {3} ) AS sources "
        "GROUP BY entity_id".format(
            schema,
            target_table_name,
            ",".join(["\"{0}\"".format(tn) for tn in trend_names]),
            " UNION ALL ".join(select_parts),
            timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            select_samples_part,
            ",".join(["SUM(\"{0}\")".format(tn) for tn in trend_names])))

    retry = True
    attempt = 0

    # The retry strategy below follows the same pattern as the
    # trend_storage.store() function.
    while retry is True:
        retry = False
        attempt += 1

        if attempt > MAX_RETRIES:
            raise MaxRetriesError("Max retries ({0}) reached".format(MAX_RETRIES))
        try:
            with closing(conn.cursor()) as cursor:
                # Each UNION ALL part consumes one (start, end) parameter pair.
                cursor.execute(query, len(source_table_names) * interval)
        except psycopg2.DatabaseError as exc:
            conn.rollback()
            columns = [("samples", "integer")]
            columns.extend(zip(trend_names,
                get_data_types(conn, schema, source_table_names[-1], trend_names)))

            if exc.pgcode == psycopg2.errorcodes.NUMERIC_VALUE_OUT_OF_RANGE:
                # The aggregated values no longer fit the target column
                # types; derive suitable types from the actual maxima and
                # retry.
                max_values = []
                for source_table_name in source_table_names:
                    query_max_values = (
                        "SELECT {0} FROM "
                        "(SELECT "
                        " {1} "
                        "FROM \"{2}\".\"{3}\" "
                        "WHERE timestamp > %s AND timestamp <= %s "
                        "GROUP BY entity_id) AS sums"
                    ).format(
                            ",".join(["MAX(\"{0}\")".format(tn) for tn in trend_names]),
                            ",".join(["SUM(\"{0}\") AS \"{0}\"".format(tn) for tn in trend_names]),
                            schema,
                            source_table_name)

                    with closing(conn.cursor()) as cursor:
                        cursor.execute(query_max_values, interval)
                        max_values.append(cursor.fetchone())

                data_types = [datatype.extract_from_value(v)
                        for v in map(max, zip(*max_values))]
                check_column_types(conn, schema, target_table_name, trend_names,
                        data_types)

                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNIQUE_VIOLATION:
                # TODO: remove the unique-violating record from the target
                # and set retry = True instead of raising.
                raise NonRecoverableError("{0}, {1!s} in query '{2}'".format(
                    exc.pgcode, exc, query))
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_COLUMN:
                column_names, data_types = zip(*columns)
                add_missing_columns(conn, schema, target_table_name,
                        zip(column_names, data_types))
                retry = True
            elif exc.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE:
                column_names, data_types = zip(*columns)
                create_trend_table(conn, schema, target_table_name, column_names,
                        data_types)
                retry = True
            else:
                raise NonRecoverableError("{0}, {1!s} in query '{2}'".format(
                    exc.pgcode, exc, query))
        else:
            conn.commit()
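
A hedged usage sketch: everything below is hypothetical (connection DSN, schema name, source/target tuples, and trend names); it assumes hourly source data being rolled up into a daily target table.

import psycopg2
import pytz
from datetime import datetime

conn = psycopg2.connect("dbname=minerva")          # hypothetical DSN
tz = pytz.timezone("Europe/Amsterdam")

source = ("pm", 3600, "Cell")                      # hypothetical (datasource, gp, entitytype_name)
target = ("pm", 86400, "Cell")                     # aggregate hourly data into daily rows
trend_names = ["rx_bytes", "tx_bytes"]             # hypothetical trend columns
timestamp = tz.localize(datetime(2013, 4, 2))      # end of the day being aggregated

aggregate(conn, "trend", source, target, trend_names, timestamp)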