Example 1
    def search_unique_datasets_query(expressions, select_fields, limit):
        """
        'unique' here refer to that the query results do not contain datasets
        having the same 'id' more than once.

        We are not dealing with dataset_source table here and we are not joining
        dataset table with dataset_location table. We are aggregating stuff
        in dataset_location per dataset basis if required. It returns the construted
        query.
        """

        # expressions involving DATASET_SOURCE cannot be done for now
        for expression in expressions:
            assert expression.field.required_alchemy_table != DATASET_SOURCE, \
                'Joins with dataset_source cannot be done for this query'

        # expressions involving 'uri' and 'uris' are handled separately below
        expressions = [
            expression for expression in expressions
            if expression.field.required_alchemy_table != DATASET_LOCATION
        ]

        if select_fields:
            select_columns = []
            for field in select_fields:
                if field.name in {'uri', 'uris'}:
                    # All active URIs, from newest to oldest
                    uris_field = func.array(
                        select([
                            _dataset_uri_field(SELECTED_DATASET_LOCATION)
                        ]).where(
                            and_(
                                SELECTED_DATASET_LOCATION.c.dataset_ref == DATASET.c.id,
                                SELECTED_DATASET_LOCATION.c.archived == None
                            )
                        ).order_by(
                            SELECTED_DATASET_LOCATION.c.added.desc(),
                            SELECTED_DATASET_LOCATION.c.id.desc()
                        ).label('uris')
                    ).label('uris')
                    select_columns.append(uris_field)
                else:
                    select_columns.append(
                        field.alchemy_expression.label(field.name))
        else:
            select_columns = _DATASET_SELECT_FIELDS

        raw_expressions = PostgresDbAPI._alchemify_expressions(expressions)

        # We don't need the DATASET_LOCATION table in the FROM expression
        select_fields_ = [
            field for field in (select_fields or [])
            if field.name not in {'uri', 'uris'}
        ]

        from_expression = PostgresDbAPI._from_expression(
            DATASET, expressions, select_fields_)
        where_expr = and_(DATASET.c.archived == None, *raw_expressions)

        return (select(select_columns).select_from(from_expression).where(
            where_expr).limit(limit))
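
The core pattern in this example is func.array() wrapped around a correlated
scalar subquery, which yields one PostgreSQL ARRAY value per outer row. Below
is a minimal, self-contained sketch of just that pattern; the parent/child
tables and column names are assumptions for illustration, not part of the
example above.

from sqlalchemy import (MetaData, Table, Column, Integer, String, ForeignKey,
                        select, func)

metadata = MetaData()
parent = Table('parent', metadata, Column('id', Integer, primary_key=True))
child = Table(
    'child', metadata,
    Column('id', Integer, primary_key=True),
    Column('parent_ref', Integer, ForeignKey('parent.id')),
    Column('uri', String),
)

# One array of child URIs per parent row, newest child first.
uris = func.array(
    select([child.c.uri])
    .where(child.c.parent_ref == parent.c.id)
    .order_by(child.c.id.desc())
    .as_scalar()
).label('uris')

query = select([parent.c.id, uris])
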
Example 2
def ts_locs_array(
    config: ColumnElement,
    text: ColumnElement,
    tsquery: ColumnElement,
) -> ColumnElement:
    """Return an array of [position, length] pairs, one per tsquery match."""
    options = f"HighlightAll = TRUE, StartSel = {TS_START}, StopSel = {TS_STOP}"
    delimited = func.ts_headline(config, text, tsquery, options)
    # Split the highlighted text on the start marker: every part after the
    # first one begins with a match, terminated by the stop marker.
    parts = func.unnest(func.string_to_array(delimited, TS_START)).alias()
    part = column(parts.name)
    # A running sum of the preceding parts' lengths (stop markers excluded)
    # gives the offset of each match within the original text.
    part_len = func.length(part) - len(TS_STOP)
    match_pos = func.sum(part_len).over(rows=(None, -1)) + len(TS_STOP)
    match_len = func.strpos(part, TS_STOP) - 1
    # Skip the first part (the text before the first match).
    return func.array(
        select([postgresql.array([match_pos, match_len])])
        .select_from(parts)
        .offset(1)
        .as_scalar(),
    )
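
A hedged usage sketch: TS_START and TS_STOP are module-level marker constants
that the snippet does not show, and the documents table below is assumed for
illustration only.

from sqlalchemy import MetaData, Table, Column, Integer, Text, func, literal, select

metadata = MetaData()
documents = Table('documents', metadata,
                  Column('id', Integer, primary_key=True),
                  Column('body', Text))

# One [position, length] pair per match of 'needle' in each document body.
query = select([
    documents.c.id,
    ts_locs_array(literal('english'),
                  documents.c.body,
                  func.to_tsquery('english', 'needle')).label('locs'),
])
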
Example 3
def invalid_query(self, session, exclude=u'Unclassified'):
    """Return ids of active families whose extent of member languages is
    identical to that of another family."""
    member = orm.aliased(Languoid, flat=True)
    # Ordered array of the pks of all active language-level descendants of
    # the family (correlated on Languoid.pk via the closure table).
    extent = func.array(
        session.query(member.pk)
        .filter_by(active=True, level=LanguoidLevel.language)
        .join(TreeClosureTable, TreeClosureTable.child_pk == member.pk)
        .filter_by(parent_pk=Languoid.pk)
        .order_by(member.pk).as_scalar())
    cte = session.query(Languoid.id, extent.label('extent'))\
        .filter_by(active=True, level=LanguoidLevel.family)\
        .filter(~Languoid.name.startswith(exclude)).cte()
    dup = orm.aliased(cte)
    # A family is invalid if some other family has exactly the same extent.
    return session.query(cte.c.id)\
        .filter(session.query(dup).filter(
            dup.c.id != cte.c.id, dup.c.extent == cte.c.extent).exists())\
        .order_by(cte.c.extent, cte.c.id)
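
A brief, hypothetical usage note: invalid_query is written as a method, so it
would be called on its owning object with an active ORM session (both names
below are assumptions).

dupes = checker.invalid_query(session).all()
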
Example 4
def _dataset_uri_field(table):
    return table.c.uri_scheme + ':' + table.c.uri_body


# Fields for selecting datasets with uris
# Need to alias the table, as queries may join the location table for filtering.
SELECTED_DATASET_LOCATION = DATASET_LOCATION.alias('selected_dataset_location')
_DATASET_SELECT_FIELDS = (
    DATASET,
    # All active URIs, from newest to oldest
    func.array(
        select([
            _dataset_uri_field(SELECTED_DATASET_LOCATION)
        ]).where(
            and_(
                SELECTED_DATASET_LOCATION.c.dataset_ref == DATASET.c.id,
                SELECTED_DATASET_LOCATION.c.archived == None
            )
        ).order_by(
            SELECTED_DATASET_LOCATION.c.added.desc(),
            SELECTED_DATASET_LOCATION.c.id.desc()
        ).label('uris')
    ).label('uris')
)
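
As Example 1 shows, this tuple is used directly as the column list of a
select(): the DATASET table expands to all of its columns, followed by the
aggregated 'uris' array. A one-line sketch of that usage:

query = select(_DATASET_SELECT_FIELDS).where(DATASET.c.archived == None)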

PGCODE_UNIQUE_CONSTRAINT = '23505'
PGCODE_FOREIGN_KEY_VIOLATION = '23503'

_LOG = logging.getLogger(__name__)


def _split_uri(uri):
    # Split a URI into (scheme, body) at the first colon.
    scheme, body = uri.split(':', 1)
    return scheme, body
Example 5
def traffic_history_query():
    timestamptz = TIMESTAMP(timezone=True)

    events = union_all(
        select([TrafficCredit.amount,
                TrafficCredit.timestamp,
                literal("Credit").label('type')]
               ).where(TrafficCredit.user_id == literal_column('arg_user_id')),

        select([(-TrafficVolume.amount).label('amount'),
                TrafficVolume.timestamp,
                cast(TrafficVolume.type, TEXT).label('type')]
               ).where(TrafficVolume.user_id == literal_column('arg_user_id'))
    ).cte('traffic_events')

    def round_time(time_expr, ceil=False):
        # Round a timestamp up (ceil) or down (trunc) to a multiple of arg_step.
        round_func = func.ceil if ceil else func.trunc
        step_epoch = func.extract('epoch', literal_column('arg_step'))
        return cast(
            func.to_timestamp(
                round_func(func.extract('epoch', time_expr) / step_epoch)
                * step_epoch),
            timestamptz)

    balance = select([TrafficBalance.amount, TrafficBalance.timestamp])\
        .select_from(User.__table__.outerjoin(TrafficBalance))\
        .where(User.id == literal_column('arg_user_id'))\
        .cte('balance')

    balance_amount = select([balance.c.amount]).as_scalar()
    balance_timestamp = select([balance.c.timestamp]).as_scalar()

    # Bucket layout
    # n = interval / step
    # 0: Aggregates all prior traffic_events so that the balance value can be calculated
    # 1 - n: Traffic history entry
    # n+1: Aggregates all data after the last point in time, will be discarded
    buckets = select([literal_column('bucket'),
            (func.row_number().over(order_by=literal_column('bucket')) - 1).label('index')]
    ).select_from(
        func.generate_series(
            round_time(cast(literal_column('arg_start'), timestamptz)) - literal_column('arg_step'),
            round_time(cast(literal_column('arg_start'), timestamptz) + literal_column('arg_interval')),
            literal_column('arg_step')
        ).alias('bucket')
    ).order_by(
        literal_column('bucket')
    ).cte('buckets')

    def cond_sum(condition, label, invert=False):
        return func.sum(case(
            [(condition, events.c.amount if not invert else -events.c.amount)],
            else_=None)).label(label)

    hist = select([
        buckets.c.bucket,
        cond_sum(events.c.type == 'Credit', 'credit'),
        cond_sum(events.c.type == 'Ingress', 'ingress', invert=True),
        cond_sum(events.c.type == 'Egress', 'egress', invert=True),
        func.sum(events.c.amount).label('amount'),
        cond_sum(and_(balance_timestamp != None,
                      events.c.timestamp < balance_timestamp),
                 'before_balance'),
        cond_sum(or_(balance_timestamp == None,
                     events.c.timestamp >= balance_timestamp),
                 'after_balance'),
    ]).select_from(buckets.outerjoin(
        events,
        # Assign each event to a bucket: width_bucket() compares the event's
        # timestamp against the sorted array of bucket boundaries (bucket 0
        # is excluded from the boundary array).
        func.width_bucket(
            events.c.timestamp,
            select([func.array(
                select([buckets.c.bucket])
                .select_from(buckets)
                .where(buckets.c.index != 0)
                .label('dummy')
            )])
        ) == buckets.c.index
    )).where(
        # Discard bucket n+1
        buckets.c.index < select([func.max(buckets.c.index)])
    ).group_by(
        buckets.c.bucket
    ).order_by(
        buckets.c.bucket
    ).cte('traffic_hist')

    # Bucket is located before the balance and no traffic_events exist before it
    first_event_timestamp = select([func.min(events.c.timestamp)]).as_scalar()
    case_before_balance_no_data = (
        and_(balance_timestamp != None,
             hist.c.bucket < balance_timestamp,
             or_(first_event_timestamp == None,
                 hist.c.bucket < first_event_timestamp)),
        None
    )

    # Bucket is located after the balance
    case_after_balance = (
        or_(balance_timestamp == None, hist.c.bucket >= balance_timestamp),
        func.coalesce(balance_amount, 0) + func.coalesce(
            func.sum(hist.c.after_balance).over(
                order_by=hist.c.bucket.asc(), rows=(None, 0)),
            0)
    )

    # Bucket is located before the balance, but there still exist traffic_events before it
    else_before_balance = (
        func.coalesce(balance_amount, 0) +
        func.coalesce(hist.c.after_balance, 0) -
        func.coalesce(
            func.sum(hist.c.before_balance).over(
                order_by=hist.c.bucket.desc(), rows=(None, -1)),
            0)
    )

    agg_hist = select([
        hist.c.bucket, hist.c.credit, hist.c.ingress, hist.c.egress,
        case([case_before_balance_no_data, case_after_balance],
             else_=else_before_balance).label('balance')
    ]).alias('agg_hist')

    # Remove bucket 0
    result = select([agg_hist]).order_by(agg_hist.c.bucket).offset(1)

    return result
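
The arg_user_id / arg_step / arg_start / arg_interval literal_columns suggest
this SELECT is meant to be embedded somewhere those names are defined, e.g.
as the body of a SQL function. A small sketch of rendering the statement for
inspection (the dialect choice here is an assumption):

from sqlalchemy.dialects import postgresql

stmt = traffic_history_query()
print(stmt.compile(dialect=postgresql.dialect()))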