def search_unique_datasets_query(expressions, select_fields, limit):
    """
    'unique' here refers to the fact that the query results do not contain
    datasets having the same 'id' more than once.

    We are not dealing with the dataset_source table here, and we are not
    joining the dataset table with the dataset_location table. Instead, we
    aggregate dataset_location rows on a per-dataset basis where required.

    Returns the constructed query.
    """
    # Expressions involving DATASET_SOURCE cannot be done for now.
    for expression in expressions:
        assert expression.field.required_alchemy_table != DATASET_SOURCE, \
            'Joins with dataset_source cannot be done for this query'

    # Expressions involving 'uri' and 'uris' are handled differently.
    expressions = [
        expression for expression in expressions
        if expression.field.required_alchemy_table != DATASET_LOCATION
    ]

    if select_fields:
        select_columns = []
        for field in select_fields:
            if field.name in {'uri', 'uris'}:
                # All active URIs, from newest to oldest
                uris_field = func.array(
                    select([
                        _dataset_uri_field(SELECTED_DATASET_LOCATION)
                    ]).where(
                        and_(
                            SELECTED_DATASET_LOCATION.c.dataset_ref == DATASET.c.id,
                            SELECTED_DATASET_LOCATION.c.archived == None
                        )
                    ).order_by(
                        SELECTED_DATASET_LOCATION.c.added.desc(),
                        SELECTED_DATASET_LOCATION.c.id.desc()
                    ).label('uris')
                ).label('uris')
                select_columns.append(uris_field)
            else:
                select_columns.append(field.alchemy_expression.label(field.name))
    else:
        select_columns = _DATASET_SELECT_FIELDS

    raw_expressions = PostgresDbAPI._alchemify_expressions(expressions)

    # We don't need the DATASET_LOCATION table in the from expression.
    select_fields_ = [
        field for field in select_fields
        if field.name not in {'uri', 'uris'}
    ]

    from_expression = PostgresDbAPI._from_expression(DATASET, expressions, select_fields_)
    where_expr = and_(DATASET.c.archived == None, *raw_expressions)

    return (
        select(select_columns)
        .select_from(from_expression)
        .where(where_expr)
        .limit(limit)
    )
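# A minimal, self-contained sketch of the correlated-array pattern used above:
# func.array(select(...)) renders as array(SELECT ...), folding all related
# rows into a single PostgreSQL array column, so the outer query never repeats
# a dataset id. The `parent`/`child` tables below are illustrative assumptions,
# not part of the snippet above.
from sqlalchemy import MetaData, Table, Column, Integer, String, select, func
from sqlalchemy.dialects import postgresql

_md = MetaData()
_parent = Table('parent', _md, Column('id', Integer, primary_key=True))
_child = Table('child', _md,
               Column('id', Integer, primary_key=True),
               Column('parent_ref', Integer),
               Column('name', String))

# One array of child names per parent row, newest first.
_names = func.array(
    select([_child.c.name])
    .where(_child.c.parent_ref == _parent.c.id)
    .order_by(_child.c.id.desc())
    .as_scalar()
).label('names')

print(select([_parent.c.id, _names]).compile(dialect=postgresql.dialect()))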
def ts_locs_array(
    config: ColumnElement,
    text: ColumnElement,
    tsquery: ColumnElement,
) -> ColumnElement:
    """Build an expression yielding an array of [position, length] pairs,
    one per full-text match of ``tsquery`` in ``text``."""
    # Have ts_headline wrap every match in the TS_START/TS_STOP sentinels.
    options = f"HighlightAll = TRUE, StartSel = {TS_START}, StopSel = {TS_STOP}"
    delimited = func.ts_headline(config, text, tsquery, options)
    # Split the marked-up text at each match start: one row per fragment, where
    # every fragment after the first begins with a match terminated by TS_STOP.
    parts = func.unnest(func.string_to_array(delimited, TS_START)).alias()
    part = column(parts.name)
    part_len = func.length(part) - len(TS_STOP)
    # Running sum over the preceding fragments' lengths gives the match position.
    match_pos = func.sum(part_len).over(rows=(None, -1)) + len(TS_STOP)
    match_len = func.strpos(part, TS_STOP) - 1
    # offset(1) skips the leading fragment, which precedes the first match.
    return func.array(
        select([postgresql.array([match_pos, match_len])])
        .select_from(parts)
        .offset(1)
        .as_scalar(),
    )
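# Usage sketch for ts_locs_array (the `document` table, the search term, and
# the concrete TS_START/TS_STOP sentinel values are assumptions for
# illustration, not part of the snippet above):
#
#   locs = ts_locs_array(literal('english'), document.c.body,
#                        func.to_tsquery('english', 'needle')).label('locs')
#   q = select([document.c.id, locs])
#
# Each resulting 'locs' value is a PostgreSQL array of [position, length]
# pairs, one per match, which lets a client highlight matches without
# re-running the text search.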
def invalid_query(self, session, exclude=u'Unclassified'):
    member = orm.aliased(Languoid, flat=True)
    # The 'extent' of a family: the ordered array of pks of its active member
    # languages, built as a correlated scalar subquery.
    extent = func.array(
        session.query(member.pk)
        .filter_by(active=True, level=LanguoidLevel.language)
        .join(TreeClosureTable, TreeClosureTable.child_pk == member.pk)
        .filter_by(parent_pk=Languoid.pk)
        .order_by(member.pk)
        .as_scalar())
    cte = session.query(Languoid.id, extent.label('extent'))\
        .filter_by(active=True, level=LanguoidLevel.family)\
        .filter(~Languoid.name.startswith(exclude)).cte()
    dup = orm.aliased(cte)
    # A family is invalid if another family has exactly the same extent.
    return session.query(cte.c.id)\
        .filter(session.query(dup)
                .filter(dup.c.id != cte.c.id,
                        dup.c.extent == cte.c.extent)
                .exists())\
        .order_by(cte.c.extent, cte.c.id)
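# A self-contained Core sketch of the duplicate-detection idiom above: two rows
# are duplicates when the arrays built by their correlated subqueries compare
# equal (ordering the array makes the comparison independent of row order).
# The `family`/`membership` tables are illustrative assumptions.
from sqlalchemy import MetaData, Table, Column, Integer, select, func, and_, exists
from sqlalchemy.dialects import postgresql

_md2 = MetaData()
_family = Table('family', _md2, Column('pk', Integer, primary_key=True))
_membership = Table('membership', _md2,
                    Column('family_pk', Integer),
                    Column('language_pk', Integer))

_extent = func.array(
    select([_membership.c.language_pk])
    .where(_membership.c.family_pk == _family.c.pk)
    .order_by(_membership.c.language_pk)
    .as_scalar())

_fam = select([_family.c.pk, _extent.label('extent')]).cte('fam')
_dup = _fam.alias('dup')

_q = select([_fam.c.pk]).where(
    exists(select([_dup.c.pk]).where(
        and_(_dup.c.pk != _fam.c.pk, _dup.c.extent == _fam.c.extent))))
print(_q.compile(dialect=postgresql.dialect()))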
def _dataset_uri_field(table):
    return table.c.uri_scheme + ':' + table.c.uri_body


# Fields for selecting dataset with uris
# Need to alias the table, as queries may join the location table for filtering.
SELECTED_DATASET_LOCATION = DATASET_LOCATION.alias('selected_dataset_location')

_DATASET_SELECT_FIELDS = (
    DATASET,
    # All active URIs, from newest to oldest
    func.array(
        select([
            _dataset_uri_field(SELECTED_DATASET_LOCATION)
        ]).where(
            and_(
                SELECTED_DATASET_LOCATION.c.dataset_ref == DATASET.c.id,
                SELECTED_DATASET_LOCATION.c.archived == None
            )
        ).order_by(
            SELECTED_DATASET_LOCATION.c.added.desc(),
            SELECTED_DATASET_LOCATION.c.id.desc()
        ).label('uris')
    ).label('uris')
)

PGCODE_UNIQUE_CONSTRAINT = '23505'
PGCODE_FOREIGN_KEY_VIOLATION = '23503'

_LOG = logging.getLogger(__name__)


def _split_uri(uri):
    # Body reconstructed from context (the inverse of _dataset_uri_field's
    # concatenation): split on the first colon only.
    scheme, body = uri.split(':', 1)
    return scheme, body
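# Round-trip sketch: _dataset_uri_field concatenates scheme and body with a
# single colon, and _split_uri undoes it on the first colon only, so any later
# colons stay in the body.
assert _split_uri('file:///data/ls8/scene.yaml') == ('file', '///data/ls8/scene.yaml')
assert _split_uri('s3://bucket/path/key.yaml') == ('s3', '//bucket/path/key.yaml')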
def traffic_history_query():
    timestamptz = TIMESTAMP(timezone=True)

    events = union_all(
        select([TrafficCredit.amount,
                TrafficCredit.timestamp,
                literal("Credit").label('type')]
               ).where(TrafficCredit.user_id == literal_column('arg_user_id')),
        select([(-TrafficVolume.amount).label('amount'),
                TrafficVolume.timestamp,
                cast(TrafficVolume.type, TEXT).label('type')]
               ).where(TrafficVolume.user_id == literal_column('arg_user_id'))
    ).cte('traffic_events')

    def round_time(time_expr, ceil=False):
        round_func = func.ceil if ceil else func.trunc
        step_epoch = func.extract('epoch', literal_column('arg_step'))
        return cast(
            func.to_timestamp(
                round_func(func.extract('epoch', time_expr) / step_epoch) * step_epoch),
            timestamptz)

    balance = select([TrafficBalance.amount, TrafficBalance.timestamp])\
        .select_from(User.__table__.outerjoin(TrafficBalance))\
        .where(User.id == literal_column('arg_user_id'))\
        .cte('balance')

    balance_amount = select([balance.c.amount]).as_scalar()
    balance_timestamp = select([balance.c.timestamp]).as_scalar()

    # Bucket layout
    # n = interval / step
    # 0: Aggregates all prior traffic_events so that the balance value can be calculated
    # 1 - n: Traffic history entry
    # n+1: Aggregates all data after the last point in time, will be discarded
    buckets = select([
        literal_column('bucket'),
        (func.row_number().over(order_by=literal_column('bucket')) - 1).label('index')
    ]).select_from(
        func.generate_series(
            round_time(cast(literal_column('arg_start'), timestamptz)) - literal_column('arg_step'),
            round_time(cast(literal_column('arg_start'), timestamptz) + literal_column('arg_interval')),
            literal_column('arg_step')
        ).alias('bucket')
    ).order_by(
        literal_column('bucket')
    ).cte('buckets')

    def cond_sum(condition, label, invert=False):
        return func.sum(case(
            [(condition, events.c.amount if not invert else -events.c.amount)],
            else_=None)).label(label)

    hist = select([
        buckets.c.bucket,
        cond_sum(events.c.type == 'Credit', 'credit'),
        cond_sum(events.c.type == 'Ingress', 'ingress', invert=True),
        cond_sum(events.c.type == 'Egress', 'egress', invert=True),
        func.sum(events.c.amount).label('amount'),
        cond_sum(and_(balance_timestamp != None,
                      events.c.timestamp < balance_timestamp),
                 'before_balance'),
        cond_sum(or_(balance_timestamp == None,
                     events.c.timestamp >= balance_timestamp),
                 'after_balance')
    ]).select_from(
        buckets.outerjoin(
            events,
            func.width_bucket(
                events.c.timestamp,
                select([func.array(
                    select([buckets.c.bucket])
                    .select_from(buckets)
                    .where(buckets.c.index != 0)
                    .label('dummy'))])
            ) == buckets.c.index)
    ).where(
        # Discard bucket n+1
        buckets.c.index < select([func.max(buckets.c.index)])
    ).group_by(
        buckets.c.bucket
    ).order_by(
        buckets.c.bucket
    ).cte('traffic_hist')

    # Bucket is located before the balance and no traffic_events exist before it
    first_event_timestamp = select([func.min(events.c.timestamp)]).as_scalar()
    case_before_balance_no_data = (
        and_(balance_timestamp != None,
             hist.c.bucket < balance_timestamp,
             or_(first_event_timestamp == None,
                 hist.c.bucket < first_event_timestamp)),
        None
    )

    # Bucket is located after the balance
    case_after_balance = (
        or_(balance_timestamp == None,
            hist.c.bucket >= balance_timestamp),
        func.coalesce(balance_amount, 0) +
        func.coalesce(
            func.sum(hist.c.after_balance).over(
                order_by=hist.c.bucket.asc(), rows=(None, 0)),
            0)
    )

    # Bucket is located before the balance, but there still exist traffic_events before it
    else_before_balance = (
        func.coalesce(balance_amount, 0) +
        func.coalesce(hist.c.after_balance, 0) -
        func.coalesce(
            func.sum(hist.c.before_balance).over(
                order_by=hist.c.bucket.desc(), rows=(None, -1)),
            0)
    )

    agg_hist = select([
        hist.c.bucket, hist.c.credit, hist.c.ingress, hist.c.egress,
        case([case_before_balance_no_data, case_after_balance],
             else_=else_before_balance).label('balance')
    ]).alias('agg_hist')

    # Remove bucket 0
    result = select([agg_hist]).order_by(agg_hist.c.bucket).offset(1)

    return result
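# The literal_column('arg_user_id') / 'arg_start' / 'arg_step' / 'arg_interval'
# references suggest the selectable above is meant to be rendered into the body
# of a server-side SQL function whose parameters carry those names. A hedged
# sketch of that wiring (the function name and DDL below are assumptions, not
# part of the snippet):
#
#   from sqlalchemy.dialects import postgresql
#   query_sql = str(traffic_history_query().compile(
#       dialect=postgresql.dialect(),
#       compile_kwargs={'literal_binds': True}))
#   ddl = f"""
#       CREATE OR REPLACE FUNCTION traffic_history(
#           arg_user_id integer, arg_start timestamptz,
#           arg_interval interval, arg_step interval)
#       RETURNS TABLE (bucket timestamptz, credit numeric, ingress numeric,
#                      egress numeric, balance numeric)
#       STABLE LANGUAGE sql AS $$ {query_sql} $$
#   """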