def _validate_join_predicates(self, predicates):
    """Reject join predicates this backend cannot compile.

    Only equality predicates comparing identically named columns on
    both sides are accepted; anything else raises
    ``com.TranslationError``.
    """
    for predicate in predicates:
        predicate_op = predicate.op()

        # Only equijoins are expressible in this backend.
        if not isinstance(predicate_op, ops.Equals):
            raise com.TranslationError(
                'Non-equality join predicates are not supported'
            )

        lhs, rhs = predicate_op.args
        # Both sides must refer to a column of the same name.
        if lhs.get_name() != rhs.get_name():
            raise com.TranslationError(
                'Joining on different column names is not supported'
            )
def selection(op, expr):
    """Compile an ibis ``Selection`` into a DataFusion plan.

    Parameters
    ----------
    op
        The ``Selection`` operation node being compiled.
    expr
        The ibis expression wrapping ``op``; unused here but part of the
        translation-rule signature, so it must stay.

    Returns
    -------
    A DataFusion logical plan with projection, filter and sort applied.
    """
    plan = translate(op.table)

    selections = []
    # BUGFIX(readability): the loop variable was previously also named
    # ``expr``, silently shadowing the parameter above; renamed so the
    # two can no longer be confused.
    for selection_expr in op.selections or [op.table]:
        # TODO(kszucs) it would be nice if we wouldn't need to handle the
        # specific cases in the backend implementations, we could add a
        # new operator which retrieves all of the TableExpr columns
        # (e.g. Asterisk) so the translate() would handle this
        # automatically
        if isinstance(selection_expr, ir.TableExpr):
            # A whole-table selection expands to one field per column.
            for name in selection_expr.columns:
                column = selection_expr.get_column(name)
                selections.append(translate(column))
        elif isinstance(selection_expr, ir.ValueExpr):
            selections.append(translate(selection_expr))
        else:
            raise com.TranslationError(
                "DataFusion backend is unable to compile selection with "
                f"expression type of {type(selection_expr)}"
            )

    plan = plan.select(*selections)

    if op.predicates:
        # All predicates are AND-ed together into a single filter.
        predicates = map(translate, op.predicates)
        predicate = functools.reduce(operator.and_, predicates)
        plan = plan.filter(predicate)

    if op.sort_keys:
        sort_keys = map(translate, op.sort_keys)
        plan = plan.sort(*sort_keys)

    return plan
def filter_by_time_context(
    df: DataFrame, timecontext: Optional[TimeContext] = None
) -> DataFrame:
    """Filter a DataFrame by the given time context.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext : TimeContext
        A (begin, end) pair of pandas Timestamps; ``None`` (or any falsy
        value) disables filtering.

    Returns
    -------
    filtered Spark DataFrame

    Raises
    ------
    com.TranslationError
        If a time context is given but ``df`` has no 'time' column.
    """
    if not timecontext:
        return df

    if TIME_COL in df.columns:
        # For py3.8, underlying spark type converter calls utctimetuple()
        # and will throw an exception for Timestamp type if tz is set.
        # See https://github.com/pandas-dev/pandas/issues/32174
        # Dropping tz will cause spark to interpret begin, end with session
        # timezone & os env TZ. We convert Timestamp to pydatetime to
        # work around this.
        begin, end = timecontext
        return df.filter(
            (F.col(TIME_COL) >= begin.to_pydatetime())
            & (F.col(TIME_COL) < end.to_pydatetime())
        )
    else:
        # BUGFIX: the original string fragments concatenated without
        # spaces, yielding "...Dataframe {}.To use..." and "mustpresent".
        raise com.TranslationError(
            "'time' column missing in Dataframe {}. "
            "To use time context, a Timestamp column named 'time' must be "
            "present in the table.".format(df)
        )
def _validate_join_predicates(self, predicates):
    """Ensure every join predicate is an equijoin.

    Non-equality predicates are permitted only when the backend opts in
    via ``self._non_equijoin_supported``; otherwise a
    ``com.TranslationError`` is raised.
    """
    for predicate in predicates:
        operation = predicate.op()
        # Equality predicates are always fine.
        if isinstance(operation, ops.Equals):
            continue
        # Non-equijoins pass only when the backend supports them.
        if self._non_equijoin_supported:
            continue
        raise com.TranslationError(
            'Non-equality join predicates, i.e. non-equijoins, are not '
            'supported'
        )
def compile_variance(t, expr, scope, context=None, **kwargs):
    """Compile an ibis variance reduction to a PySpark aggregation.

    Dispatches on the operation's ``how`` attribute: ``'sample'`` maps
    to ``F.var_samp`` and ``'pop'`` to ``F.var_pop``; anything else is a
    translation error.
    """
    how = expr.op().how

    variance_fns = {
        'sample': F.var_samp,
        'pop': F.var_pop,
    }
    fn = variance_fns.get(how)
    if fn is None:
        raise com.TranslationError(
            "Unexpected 'how' in translation: {}".format(how)
        )

    return compile_aggregator(t, expr, scope, fn, context)
def _adapt_expr(expr):
    """Adapt a non-table expression to a well-formed table expression.

    Returns a ``(table_expr, result_handler)`` pair: the table
    expression to execute, plus a callable that reshapes the tabular
    results back to the original expression's arity (scalar, column,
    or table).

    Raises
    ------
    com.TranslationError
        If the expression kind has no known adaptation.
    """
    # Tables already have the right shape; results pass through as-is.
    if isinstance(expr, ir.Table):
        return expr, toolz.identity

    if isinstance(expr, ir.Scalar):
        # Unnamed scalars get a placeholder name so results are addressable.
        named = expr if expr.has_name() else expr.name('tmp')
        if L.is_scalar_reduction(named):
            # Reductions execute as a one-row aggregation table.
            aggregation = L.reduction_to_aggregation(named)
            return aggregation, _get_scalar(named.get_name())
        return named, _get_scalar(named.get_name())

    if isinstance(expr, ir.Analytic):
        # Analytic expressions know how to become aggregations themselves.
        return expr.to_aggregation(), toolz.identity

    if isinstance(expr, ir.Column):
        node = expr.op()
        if isinstance(node, ops.TableColumn):
            # A bare column reference: project just that column.
            return node.table[[node.name]], _get_column(node.name)
        # Something more complicated: name it if needed, then project.
        named = expr if expr.has_name() else expr.name('tmp')
        return named.to_projection(), _get_column(named.get_name())

    raise com.TranslationError(f'Do not know how to execute: {type(expr)}')
def filter_by_time_context(
    df: DataFrame, timecontext: Optional[TimeContext] = None
) -> DataFrame:
    """Filter a DataFrame by the given time context.

    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext : TimeContext
        A (begin, end) pair of pandas Timestamps; ``None`` (or any falsy
        value) disables filtering.

    Returns
    -------
    filtered Spark DataFrame

    Raises
    ------
    com.TranslationError
        If a time context is given but ``df`` has no 'time' column.
    """
    if not timecontext:
        return df

    begin, end = timecontext
    if TIME_COL in df.columns:
        # BUGFIX: for py3.8 Spark's type converter calls utctimetuple()
        # and raises for tz-aware pandas Timestamps
        # (https://github.com/pandas-dev/pandas/issues/32174). Converting
        # to pydatetime drops the tz so Spark interprets begin/end with
        # the session timezone & os env TZ, matching the sibling
        # implementation of this function.
        return df.filter(
            (F.col(TIME_COL) >= begin.to_pydatetime())
            & (F.col(TIME_COL) < end.to_pydatetime())
        )
    else:
        # BUGFIX: the original string fragments concatenated without
        # spaces, yielding "...Dataframe {}.To use..." and "mustpresent".
        raise com.TranslationError(
            "'time' column missing in Dataframe {}. "
            "To use time context, a Timestamp column named 'time' must be "
            "present in the table.".format(df)
        )
def _adapt_expr(expr):
    """Adapt a non-table expression into a ``(table_expr, result_handler)``
    pair: a well-formed table expression to execute, plus a callable that
    reshapes the tabular results back to the expression's arity.
    """
    # Non-table expressions need to be adapted to some well-formed table
    # expression, along with a way to adapt the results to the desired
    # arity (whether array-like or scalar, for example)
    #
    # Canonical case is scalar values or arrays produced by some reductions
    # (simple reductions, or distinct, say)
    if isinstance(expr, ir.TableExpr):
        # Tables need no adaptation; results pass through unchanged.
        return expr, toolz.identity

    def _get_scalar(field):
        # Handler extracting the single value of column ``field`` from a
        # one-row result frame.
        def scalar_handler(results):
            return results[field][0]

        return scalar_handler

    if isinstance(expr, ir.ScalarExpr):
        if L.is_scalar_reduction(expr):
            # Reductions execute as a one-row aggregation table.
            table_expr, name = L.reduction_to_aggregation(
                expr, default_name='tmp'
            )
            return table_expr, _get_scalar(name)
        else:
            base_table = ir.find_base_table(expr)
            if base_table is None:
                # exprs with no table refs
                # TODO(phillipc): remove ScalarParameter hack
                if isinstance(expr.op(), ops.ScalarParameter):
                    name = expr.get_name()
                    assert (
                        name is not None
                    ), f'scalar parameter {expr} has no name'
                    return expr, _get_scalar(name)
                # Fall back to a placeholder name for anonymous scalars.
                return expr.name('tmp'), _get_scalar('tmp')
            # A scalar with table references that is not a reduction has
            # no known adaptation.
            raise NotImplementedError(repr(expr))
    elif isinstance(expr, ir.AnalyticExpr):
        # Analytic expressions turn themselves into aggregations.
        return expr.to_aggregation(), toolz.identity
    elif isinstance(expr, ir.ColumnExpr):
        op = expr.op()

        def _get_column(name):
            # Handler selecting column ``name`` from the result frame.
            def column_handler(results):
                return results[name]

            return column_handler

        if isinstance(op, ops.TableColumn):
            # A plain column reference: project just that column.
            table_expr = op.table[[op.name]]
            result_handler = _get_column(op.name)
        else:
            # Something more complicated.
            base_table = L.find_source_table(expr)
            if isinstance(op, ops.DistinctColumn):
                expr = op.arg
                # Fall back to 'tmp' when the underlying column has no
                # usable name.
                try:
                    name = op.arg.get_name()
                except Exception:
                    name = 'tmp'
                table_expr = base_table.projection(
                    [expr.name(name)]
                ).distinct()
                result_handler = _get_column(name)
            else:
                table_expr = base_table.projection([expr.name('tmp')])
                result_handler = _get_column('tmp')
        return table_expr, result_handler
    else:
        raise com.TranslationError(
            f'Do not know how to execute: {type(expr)}'
        )