def canonicalize_context(
    timecontext: Optional[TimeContext],
) -> Optional[TimeContext]:
    """Convert a timecontext to a canonical one, with begin and end times of
    type pandas.Timestamp.

    Raises an exception for illegal inputs.
    """
    SUPPORTS_TIMESTAMP_TYPE = pd.Timestamp
    if not isinstance(timecontext, tuple) or len(timecontext) != 2:
        raise com.IbisError(
            f'Timecontext {timecontext} should specify (begin, end)'
        )

    begin, end = timecontext

    if not isinstance(begin, SUPPORTS_TIMESTAMP_TYPE):
        raise com.IbisError(
            f'begin time value {begin} of type {type(begin)} is not'
            ' of type pd.Timestamp'
        )
    if not isinstance(end, SUPPORTS_TIMESTAMP_TYPE):
        raise com.IbisError(
            f'end time value {end} of type {type(end)} is not'
            ' of type pd.Timestamp'
        )
    if begin > end:
        raise com.IbisError(
            f'begin time {begin} must be before or equal'
            f' to end time {end}'
        )

    return begin, end
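# --- Added illustrative sketch (not part of the original source) ---
# canonicalize_context above requires both ends of a time context to already
# be pd.Timestamp objects. A hypothetical caller would therefore convert raw
# inputs first; the assertions mirror the three checks performed above.
import pandas as pd

raw_begin, raw_end = '2020-01-01', '2020-03-01'
timecontext = (pd.Timestamp(raw_begin), pd.Timestamp(raw_end))

# a 2-tuple, both elements pd.Timestamp, and begin <= end
assert isinstance(timecontext, tuple) and len(timecontext) == 2
assert all(isinstance(t, pd.Timestamp) for t in timecontext)
assert timecontext[0] <= timecontext[1]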
def to_aggregation(
    self, metric_name=None, parent_table=None, backup_metric_name=None
):
    """Convert the TopK operation to a table aggregation."""
    op = self.op()

    arg_table = find_base_table(op.arg)

    by = op.by
    if not isinstance(by, Expr):
        by = by(arg_table)
        by_table = arg_table
    else:
        by_table = find_base_table(op.by)

    if metric_name is None:
        if by.get_name() == op.arg.get_name():
            by = by.name(backup_metric_name)
    else:
        by = by.name(metric_name)

    if arg_table.equals(by_table):
        agg = arg_table.aggregate(by, by=[op.arg])
    elif parent_table is not None:
        agg = parent_table.aggregate(by, by=[op.arg])
    else:
        raise com.IbisError(
            'Cross-table TopK; must provide a parent joined table'
        )

    return agg.sort_by([(by.get_name(), False)]).limit(op.k)
def execute(
    self,
    expr: ir.Expr,
    params: Mapping[ir.Expr, object] = None,
    limit: str = 'default',
    **kwargs: Any,
):
    if isinstance(expr, ir.TableExpr):
        frame = self.compile(expr, params, **kwargs)
        table = _to_pyarrow_table(frame)
        return table.to_pandas()
    elif isinstance(expr, ir.ColumnExpr):
        # expression must be named for the projection
        expr = expr.name('tmp').to_projection()
        frame = self.compile(expr, params, **kwargs)
        table = _to_pyarrow_table(frame)
        return table['tmp'].to_pandas()
    elif isinstance(expr, ir.ScalarExpr):
        if expr.op().root_tables():
            # there are associated datafusion tables so convert the expr
            # to a selection which we can directly convert to a datafusion
            # plan
            expr = expr.name('tmp').to_projection()
            frame = self.compile(expr, params, **kwargs)
        else:
            # doesn't have any tables associated so create a plan from a
            # dummy datafusion table
            compiled = self.compile(expr, params, **kwargs)
            frame = self._context.empty_table().select(compiled)
        table = _to_pyarrow_table(frame)
        return table[0][0].as_py()
    else:
        raise com.IbisError(
            f"Cannot execute expression of type: {type(expr)}"
        )
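# --- Added illustrative sketch (not part of the original source) ---
# The scalar branch above reads the value out of a one-cell pyarrow Table:
# table[0] selects the first column, [0] the first element, and .as_py()
# converts the Arrow scalar to a plain Python value. A standalone example
# with a hypothetical single-column table:
import pyarrow as pa

table = pa.table({'tmp': [42]})
assert table['tmp'].to_pandas().iloc[0] == 42  # column branch
assert table[0][0].as_py() == 42               # scalar branch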
def validate_backends(backends) -> list:
    """Validate backends.

    Parameters
    ----------
    backends
        Backends that the expression depends on.

    Returns
    -------
    list
        A list containing a specified backend or the default backend.

    Raises
    ------
    ibis.common.exceptions.IbisError
        If no backend is specified and there is no default backend
        configured.
    ValueError
        If more than one backend is specified.
    """
    if not backends:
        default = options.default_backend
        if default is None:
            raise com.IbisError(
                'Expression depends on no backends, and found no default'
            )
        return [default]

    if len(backends) > 1:
        raise ValueError('Multiple backends found')

    return backends
def execute(
    self,
    expr: ir.Expr,
    timecontext: Mapping | None = None,
    params: Mapping[ir.Scalar, Any] | None = None,
    limit: str = 'default',
    **kwargs: Any,
) -> Any:
    """Execute an expression."""
    if isinstance(expr, types.Table):
        return self.compile(expr, timecontext, params, **kwargs).toPandas()
    elif isinstance(expr, types.Column):
        # expression must be named for the projection
        if not expr.has_name():
            expr = expr.name("tmp")
        return self.compile(
            expr.to_projection(), timecontext, params, **kwargs
        ).toPandas()[expr.get_name()]
    elif isinstance(expr, types.Scalar):
        compiled = self.compile(expr, timecontext, params, **kwargs)
        if isinstance(compiled, Column):
            # attach result column to a fake DataFrame and
            # select the result
            compiled = self._session.range(0, 1).select(compiled)
        return compiled.toPandas().iloc[0, 0]
    else:
        raise com.IbisError(
            f"Cannot execute expression of type: {type(expr)}"
        )
def _get_hdfs(self):
    if self._hdfs is None:
        raise com.IbisError(
            'No HDFS connection; must pass connection '
            'using the hdfs_client argument to '
            'ibis.impala.connect'
        )
    return self._hdfs
def _match_name(self):
    m = ddl.fully_qualified_re.match(self._qualified_name)
    if not m:
        raise com.IbisError(
            'Cannot determine database name from {0}'.format(
                self._qualified_name
            )
        )
    db, quoted, unquoted = m.groups()
    return db, quoted or unquoted
def validate_backends(backends):
    if not backends:
        default = options.default_backend
        if default is None:
            raise com.IbisError(
                'Expression depends on no backends, and found no default'
            )
        return [default]

    if len(backends) > 1:
        raise ValueError('Multiple backends found')

    return backends
def schema(self):
    """Get the schema for this table (if one is known).

    Returns
    -------
    schema : Schema
    """
    if not self._is_materialized():
        raise com.IbisError('Table operation is not yet materialized')

    return self.op().schema
def create_table(
    self, table_name, obj=None, schema=None, database=None, max_rows=None
):
    """
    Create a new table in OmniSciDB using an Ibis table expression.

    Parameters
    ----------
    table_name : string
    obj : TableExpr or pandas.DataFrame, optional
        If passed, creates table from select statement results
    schema : ibis.Schema, optional
        Mutually exclusive with obj, creates an empty table with a
        particular schema
    database : string, default None (optional)
    max_rows : int, default None
        Set the maximum number of rows allowed in a table to create a
        capped collection. When this limit is reached, the oldest fragment
        is removed. Default = 2^62.

    Examples
    --------
    >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
    """
    _database = self.db_name
    self.set_database(database)

    if obj is not None:
        if isinstance(obj, pd.DataFrame):
            raise NotImplementedError(
                'Pandas Data Frame input not implemented.'
            )
        else:
            to_insert = obj
        ast = self._build_ast(to_insert, OmniSciDBDialect.make_context())
        select = ast.queries[0]

        statement = ddl.CTAS(table_name, select, database=database)
    elif schema is not None:
        statement = ddl.CreateTableWithSchema(
            table_name, schema, database=database, max_rows=max_rows
        )
    else:
        raise com.IbisError('Must pass expr or schema')

    result = self._execute(statement, False)
    self.set_database(_database)
    return result
def _get_concrete_table_path(self, name, database, persist=False):
    if not persist:
        if name is None:
            name = f'__ibis_tmp_{util.guid()}'

        if database is None:
            self._ensure_temp_db_exists()
            database = options.impala.temp_db
        return name, database
    else:
        if name is None:
            raise com.IbisError('Must pass table name if persist=True')
        return name, database
def create_table(self, table_name, obj=None, schema=None):
    """Create a table."""
    if obj is None and schema is None:
        raise com.IbisError('Must pass expr or schema')

    if obj is not None:
        df = pd.DataFrame(obj)
    else:
        dtypes = ibis_schema_to_pandas(schema)
        df = schema.apply_to(
            pd.DataFrame(columns=list(map(toolz.first, dtypes)))
        )

    self.dictionary[table_name] = df
def execute_database_table_client(
    op, client, timecontext: Optional[TimeContext], **kwargs
):
    df = client.dictionary[op.name]
    if timecontext:
        begin, end = timecontext
        if TIME_COL not in df:
            raise com.IbisError(
                f'Table {op.name} must have a time column named {TIME_COL}'
                ' to execute with time context.'
            )
        # filter with time context
        mask = df[TIME_COL].between(begin, end)
        return df.loc[mask].reset_index(drop=True)
    return df
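# --- Added illustrative sketch (not part of the original source) ---
# The time-context filter above boils down to pandas' Series.between, which
# is inclusive on both ends. A standalone equivalent, assuming the time
# column is literally named 'time':
import pandas as pd

df = pd.DataFrame(
    {
        'time': pd.date_range('2017-01-01', periods=5),
        'value': [1.0, 2.0, 3.0, 4.0, 5.0],
    }
)
begin, end = pd.Timestamp('2017-01-02'), pd.Timestamp('2017-01-04')
mask = df['time'].between(begin, end)
filtered = df.loc[mask].reset_index(drop=True)  # keeps 2017-01-02..2017-01-04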
def _find_any_file(self, hdfs_dir):
    contents = self.ls(hdfs_dir, status=True)

    def valid_filename(name):
        head, tail = posixpath.split(name)

        tail = tail.lower()
        return (
            not tail.endswith('.tmp')
            and not tail.endswith('.copying')
            and not tail.startswith('_')
            and not tail.startswith('.')
        )

    for filename, meta in contents:
        if meta['type'].lower() == 'file' and valid_filename(filename):
            return filename

    raise com.IbisError('No files found in the passed directory')
def _find_backend(self) -> BaseBackend:
    backends = self._find_backends()

    if not backends:
        default = options.default_backend
        if default is None:
            raise com.IbisError(
                'Expression depends on no backends, and found no default'
            )
        return default

    if len(backends) > 1:
        raise ValueError('Multiple backends found')

    return backends[0]
def adjust_context(op: Any, timecontext: TimeContext) -> TimeContext:
    """
    Parameters
    ----------
    op : ibis.expr.operations.Node
    timecontext : TimeContext
        time context associated with the node

    Returns
    -------
    TimeContext
        Adjusted time context

    For any `op` that is not of type Node, we raise an error to avoid failing
    silently, since the default behavior is to return the input timecontext
    itself.
    """
    raise com.IbisError(f'Unsupported input type for adjust context for {op}')
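# --- Added illustrative sketch (not part of the original source) ---
# adjust_context above is the fallback of a dispatched function: concrete
# node types register their own adjustment, and anything unregistered lands
# here and raises. A minimal stand-in for the pattern, using
# functools.singledispatch instead of the project's dispatcher; the
# registered type (str) is only a placeholder for a concrete node class.
from functools import singledispatch


@singledispatch
def _adjust_context_sketch(op, timecontext):
    # default: refuse rather than silently returning the input context
    raise TypeError(f'Unsupported input type for adjust context for {op}')


@_adjust_context_sketch.register(str)  # placeholder for a concrete node type
def _(op, timecontext):
    # a registered type may shift or widen the context here
    return timecontext


_adjust_context_sketch('some-node', ('2020-01-01', '2020-03-01'))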
def execute(self, expr, params=None, limit='default', **kwargs):
    if isinstance(expr, types.TableExpr):
        return self.compile(expr).toPandas()
    elif isinstance(expr, types.ColumnExpr):
        # expression must be named for the projection
        expr = expr.name('tmp')
        return self.compile(expr.to_projection()).toPandas()['tmp']
    elif isinstance(expr, types.ScalarExpr):
        compiled = self.compile(expr)
        if isinstance(compiled, Column):
            # attach result column to a fake DataFrame and
            # select the result
            compiled = self._session.range(0, 1).select(compiled)
        return compiled.toPandas().iloc[0, 0]
    else:
        raise com.IbisError(
            "Cannot execute expression of type: {}".format(type(expr))
        )
def attach(self, name, path, create=False):
    """Connect another SQLite database file to the current connection.

    Parameters
    ----------
    name : string
        Database name within SQLite
    path : string
        Path to sqlite3 file
    create : boolean, optional
        If the file does not exist, create it when True, otherwise raise an
        Exception
    """
    if not os.path.exists(path) and not create:
        raise com.IbisError('File {!r} does not exist'.format(path))

    self.raw_sql(
        "ATTACH DATABASE {path!r} AS {name}".format(
            path=path,
            name=self.con.dialect.identifier_preparer.quote(name),
        )
    )
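# --- Added illustrative sketch (not part of the original source) ---
# The method above ultimately issues SQLite's ATTACH DATABASE statement. A
# standalone illustration of what that statement does, using only the
# sqlite3 standard library and in-memory databases (hypothetical names):
import sqlite3

con = sqlite3.connect(':memory:')
con.execute("ATTACH DATABASE ':memory:' AS other")   # second db, aliased 'other'
con.execute("CREATE TABLE other.t (x INTEGER)")      # tables are addressed as alias.table
con.execute("INSERT INTO other.t VALUES (1)")
print(con.execute("SELECT * FROM other.t").fetchall())  # [(1,)]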
def create_table(
    self,
    table_name: str,
    obj: dd.DataFrame = None,
    schema: sch.Schema = None,
):
    """Create a table."""
    if obj is not None:
        df = obj
    elif schema is not None:
        dtypes = ibis_schema_to_dask(schema)
        df = schema.apply_to(
            dd.from_pandas(
                pd.DataFrame(columns=list(map(toolz.first, dtypes))),
                npartitions=1,
            )
        )
    else:
        raise com.IbisError('Must pass expr or schema')

    self.dictionary[table_name] = df
def create_table(self, table_name, obj=None, schema=None):
    """Create a table."""
    if obj is None and schema is None:
        raise com.IbisError('Must pass expr or schema')

    if obj is not None:
        if not self._supports_conversion(obj):
            raise com.BackendConversionError(
                f"Unable to convert {obj.__class__} object "
                f"to backend type: {self.__class__.backend_table_type}"
            )
        df = self._convert_object(obj)
    else:
        pandas_schema = self._convert_schema(schema)
        dtypes = dict(pandas_schema)
        df = self._from_pandas(
            pd.DataFrame(columns=dtypes.keys()).astype(dtypes)
        )

    self.dictionary[table_name] = df

    if schema is not None:
        self.schemas[table_name] = schema
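# --- Added illustrative sketch (not part of the original source) ---
# The schema-only branch above builds an empty, correctly typed DataFrame
# from a {column: dtype} mapping. The same idea with plain pandas and a
# hypothetical schema:
import pandas as pd

dtypes = {'id': 'int64', 'name': 'object', 'when': 'datetime64[ns]'}
empty = pd.DataFrame(columns=list(dtypes.keys())).astype(dtypes)
print(empty.dtypes)  # id: int64, name: object, when: datetime64[ns]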
def parquet_file(
    self,
    hdfs_dir,
    schema=None,
    name=None,
    database=None,
    external=True,
    like_file=None,
    like_table=None,
    persist=False,
):
    """Make indicated parquet file in HDFS available as an Ibis table.

    The table created can be optionally named and persisted, otherwise a
    unique name will be generated. Temporarily, for any non-persistent
    external table created by Ibis we will attempt to drop it when the
    underlying object is garbage collected (or the Python interpreter shuts
    down normally).

    Parameters
    ----------
    hdfs_dir
        Path in HDFS
    schema
        If no schema is provided, and neither of the like_* arguments is
        passed, one will be inferred from one of the parquet files in the
        directory.
    like_file
        Absolute path to Parquet file in HDFS to use for schema
        definitions. An alternative to having to supply an explicit schema
    like_table
        Fully scoped and escaped string to an Impala table whose schema we
        will use for the newly created table.
    name
        Random unique name generated otherwise
    database
        Database to create the (possibly temporary) table in
    external
        If a table is external, the referenced data will not be deleted
        when the table is dropped in Impala. Otherwise (external=False)
        Impala takes ownership of the Parquet file.
    persist
        Do not drop the table during garbage collection

    Returns
    -------
    ImpalaTable
        Impala table expression
    """
    name, database = self._get_concrete_table_path(
        name, database, persist=persist
    )

    # If no schema provided, need to find some absolute path to a file in
    # the HDFS directory
    if like_file is None and like_table is None and schema is None:
        try:
            file_name = next(
                fn
                for fn in (
                    os.path.basename(f["name"])
                    for f in self.hdfs.ls(hdfs_dir, detail=True)
                    if f["type"].lower() == "file"
                )
                if not fn.startswith(("_", "."))
                if not fn.endswith((".tmp", ".copying"))
            )
        except StopIteration:
            raise com.IbisError("No files found in the passed directory")
        else:
            like_file = pjoin(hdfs_dir, file_name)

    stmt = ddl.CreateTableParquet(
        name,
        hdfs_dir,
        schema=schema,
        database=database,
        example_file=like_file,
        example_table=like_table,
        external=external,
        can_exist=False,
    )
    self.raw_sql(stmt)
    return self._wrap_new_table(name, database, persist)
def formatter(t, expr):
    if arity != len(expr.op().args):
        raise com.IbisError('incorrect number of args')
    return _varargs_call(sa_func, t, expr)
def execute_until_in_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    post_execute_=None,
    **kwargs,
) -> Scope:
    """Execute until our op is in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
    scope : Scope
    timecontext : Optional[TimeContext]
    aggcontext : Optional[AggregationContext]
    clients : List[ibis.client.Client]
    kwargs : Mapping
    """
    # these should never be None
    assert aggcontext is not None, 'aggcontext is None'
    assert clients is not None, 'clients is None'
    assert post_execute_ is not None, 'post_execute_ is None'

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    op = expr.op()
    if scope.get_value(op, timecontext) is not None:
        return scope
    if isinstance(op, ops.Literal):
        # special case literals to avoid the overhead of dispatching
        # execute_node
        return Scope(
            {
                op: execute_literal(
                    op, op.value, expr.type(), aggcontext=aggcontext, **kwargs
                )
            },
            timecontext,
        )

    # figure out what arguments we're able to compute on based on the
    # expression's inputs. things like expressions, None, and scalar types
    # are computable whereas ``list``s are not
    computable_args = [arg for arg in op.inputs if is_computable_input(arg)]

    # arg_timecontexts is a list of time contexts with the same length as
    # computable_args; these contexts are passed to each arg
    if timecontext:
        arg_timecontexts = compute_time_context(
            op,
            num_args=len(computable_args),
            timecontext=timecontext,
            clients=clients,
        )
    else:
        arg_timecontexts = [None] * len(computable_args)

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )

    new_scope = scope.merge_scope(pre_executed_scope)

    # Short circuit: if pre_execute puts op in scope, then we don't need to
    # execute its computable_args
    if new_scope.get_value(op, timecontext) is not None:
        return new_scope

    # recursively compute each node's arguments until we've changed type.
    # compute_time_context should return a list with the same length as
    # computable_args; the two lists will be zipped together for further
    # execution
    if len(arg_timecontexts) != len(computable_args):
        raise com.IbisError(
            'arg_timecontexts differ with computable_arg in length '
            f'for type:\n{type(op).__name__}.'
        )

    scopes = [
        execute_until_in_scope(
            arg,
            new_scope,
            timecontext=timecontext,
            aggcontext=aggcontext,
            post_execute_=post_execute_,
            clients=clients,
            **kwargs,
        )
        if hasattr(arg, 'op')
        else Scope({arg: arg}, timecontext)
        for (arg, timecontext) in zip(computable_args, arg_timecontexts)
    ]

    # if we're unable to find data then raise an exception
    if not scopes and computable_args:
        raise com.UnboundExpressionError(
            'Unable to find data for expression:\n{}'.format(repr(expr))
        )

    # there should be exactly one dictionary per computable argument
    assert len(computable_args) == len(scopes)

    new_scope = new_scope.merge_scopes(scopes)
    # pass our computed arguments to this node's execute_node implementation
    data = [
        new_scope.get_value(arg.op(), timecontext)
        if hasattr(arg, 'op')
        else arg
        for arg in computable_args
    ]

    result = execute_node(
        op,
        *data,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    computed = post_execute_(op, result, timecontext=timecontext)
    return Scope({op: computed}, timecontext)
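# --- Added illustrative sketch (not part of the original source) ---
# The function above is essentially a memoized post-order walk: a node found
# in scope is returned immediately, otherwise its computable arguments are
# resolved first, the node is evaluated on those results, and the result is
# merged back into the scope. A stripped-down analogue over a plain tuple
# tree (hypothetical, not the project's Scope machinery):
import operator


def _evaluate(node, scope):
    if id(node) in scope:                       # base case: already computed
        return scope[id(node)]
    if isinstance(node, tuple):                 # recurse into computable args
        op, *args = node
        data = [_evaluate(arg, scope) for arg in args]
        result = op(*data)
    else:                                       # leaf data node
        result = node
    scope[id(node)] = result                    # merge result into scope
    return result


tree = (operator.add, (operator.mul, 2, 3), 4)
assert _evaluate(tree, {}) == 10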
def create_table(
    self,
    table_name: str,
    obj: Optional[Union[ir.TableExpr, pd.DataFrame]] = None,
    schema: Optional[sch.Schema] = None,
    database: Optional[str] = None,
    max_rows: Optional[int] = None,
    fragment_size: Optional[int] = None,
    is_temporary: bool = False,
    **kwargs,
):
    """
    Create a new table from an Ibis table expression.

    Parameters
    ----------
    table_name : str
    obj : ibis.expr.types.TableExpr or pandas.DataFrame, optional, default None
        If passed, creates table from select statement results.
    schema : ibis.Schema, optional, default None
        Mutually exclusive with obj, creates an empty table with a
        particular schema
    database : str, optional, default None
    max_rows : int, optional, default None
        Set the maximum number of rows allowed in a table to create a
        capped collection. When this limit is reached, the oldest fragment
        is removed.
    fragment_size : int, optional, default 32000000 if gpu_device is enabled otherwise 5000000
        Number of rows per fragment that is a unit of the table for query
        processing, which is not expected to be changed.
    is_temporary : bool, default False
        If True, the table will be created as temporary.

    Examples
    --------
    >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
    """
    _database = self.db_name
    self.set_database(database)

    if fragment_size is None:
        fragment_size = 32000000 if self.gpu_device else 5000000

    if obj is not None:
        if isinstance(obj, pd.DataFrame):
            raise NotImplementedError(
                'Pandas Data Frame input not implemented.'
            )
        else:
            to_insert = obj
        ast = self._build_ast(to_insert, OmniSciDBDialect.make_context())
        select = ast.queries[0]

        statement = ddl.CTAS(table_name, select, database=database)
    elif schema is not None:
        statement = ddl.CreateTableWithSchema(
            table_name,
            schema,
            database=database,
            max_rows=max_rows,
            fragment_size=fragment_size,
            is_temporary=is_temporary,
        )
    else:
        raise com.IbisError('Must pass expr or schema')

    self._execute(statement, False)
    self.set_database(_database)
def create_table(
    self,
    table_name,
    obj=None,
    schema=None,
    database=None,
    external=False,
    force=False,
    # HDFS options
    format='parquet',
    location=None,
    partition=None,
    like_parquet=None,
):
    """
    Create a new table in Impala using an Ibis table expression.

    This is currently designed for tables whose data is stored in HDFS (or
    eventually other filesystems).

    Parameters
    ----------
    table_name : string
    obj : TableExpr or pandas.DataFrame, optional
        If passed, creates table from select statement results
    schema : ibis.Schema, optional
        Mutually exclusive with obj, creates an empty table with a
        particular schema
    database : string, default None (optional)
    force : boolean, default False
        Do not create table if table with indicated name already exists
    external : boolean, default False
        Create an external table; Impala will not delete the underlying
        data when the table is dropped
    format : {'parquet'}
    location : string, default None
        Specify the directory location where Impala reads and writes files
        for the table
    partition : list of strings
        Must pass a schema to use this. Cannot partition from an
        expression (create-table-as-select)
    like_parquet : string (HDFS path), optional
        Can specify in lieu of a schema

    Examples
    --------
    >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
    """
    if like_parquet is not None:
        raise NotImplementedError

    if obj is not None:
        with self._setup_insert(obj) as to_insert:
            ast = self.compiler.to_ast(to_insert)
            select = ast.queries[0]

            self.raw_sql(
                CTAS(
                    table_name,
                    select,
                    database=database,
                    can_exist=force,
                    format=format,
                    external=external,
                    partition=partition,
                    path=location,
                )
            )
    elif schema is not None:
        self.raw_sql(
            CreateTableWithSchema(
                table_name,
                schema,
                database=database,
                format=format,
                can_exist=force,
                external=external,
                path=location,
                partition=partition,
            )
        )
    else:
        raise com.IbisError('Must pass obj or schema')
def create_table(
    self,
    table_name,
    obj=None,
    schema=None,
    database=None,
    force=False,
    # HDFS options
    format='parquet',
):
    """
    Create a new table in Spark using an Ibis table expression.

    Parameters
    ----------
    table_name : string
    obj : TableExpr or pandas.DataFrame, optional
        If passed, creates table from select statement results
    schema : ibis.Schema, optional
        Mutually exclusive with obj, creates an empty table with a
        particular schema
    database : string, default None (optional)
    force : boolean, default False
        If true, create table if table with indicated name already exists
    format : {'parquet'}

    Examples
    --------
    >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
    """
    if obj is not None:
        if isinstance(obj, pd.DataFrame):
            spark_df = self._session.createDataFrame(obj)
            mode = 'error'
            if force:
                mode = 'overwrite'
            spark_df.write.saveAsTable(
                table_name, format=format, mode=mode
            )
            return

        ast = self._build_ast(obj, SparkDialect.make_context())
        select = ast.queries[0]

        statement = ddl.CTAS(
            table_name,
            select,
            database=database,
            can_exist=force,
            format=format,
        )
    elif schema is not None:
        statement = ddl.CreateTableWithSchema(
            table_name,
            schema,
            database=database,
            format=format,
            can_exist=force,
        )
    else:
        raise com.IbisError('Must pass expr or schema')

    return self._execute(statement.compile())
def hdfs_connect(
    host='localhost',
    port=50070,
    protocol='webhdfs',
    use_https='default',
    auth_mechanism='NOSASL',
    verify=True,
    session=None,
    **kwds,
):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS
    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify
    if auth_mechanism in ('GSSAPI', 'LDAP'):
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos  # noqa: F401
        except ImportError:
            raise com.IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`."
            )
        from hdfs.ext.kerberos import KerberosClient

        # note SSL
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
def formatter(translator, expr):
    op = expr.op()

    if min_arity > len(op.args) or max_arity < len(op.args):
        raise com.IbisError("incorrect number of args")

    return _format_call(translator, func_name, *op.args)
def formatter(translator, expr):
    op = expr.op()

    if arity != len(op.args):
        raise com.IbisError('incorrect number of args')

    return _format_call(translator, func_name, *op.args)
def construct_time_context_aware_series(
    series: pd.Series, frame: pd.DataFrame
) -> pd.Series:
    """Construct a Series by adding 'time' to its MultiIndex.

    In window execution, the result Series of a udf may need to be trimmed by
    timecontext. In order to do so, 'time' must be added as an index to the
    Series. We extract the time column from the parent DataFrame `frame`. See
    `trim_window_result` in execution/window.py for the trimming
    implementation.

    Parameters
    ----------
    series : pd.Series
        the result series of a udf execution
    frame : pd.DataFrame
        the parent DataFrame of `series`

    Returns
    -------
    pd.Series

    Examples
    --------
    >>> import pandas as pd
    >>> from ibis.expr.timecontext import construct_time_context_aware_series
    >>> df = pd.DataFrame(
    ...     {
    ...         'time': pd.Series(
    ...             pd.date_range(
    ...                 start='2017-01-02', periods=3
    ...             ).values
    ...         ),
    ...         'id': [1, 2, 3],
    ...         'value': [1.1, 2.2, 3.3],
    ...     }
    ... )
    >>> df
            time  id  value
    0 2017-01-02   1    1.1
    1 2017-01-03   2    2.2
    2 2017-01-04   3    3.3
    >>> series = df['value']
    >>> series
    0    1.1
    1    2.2
    2    3.3
    Name: value, dtype: float64
    >>> construct_time_context_aware_series(series, df)
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    The index will be a MultiIndex of the original RangeIndex
    and a DateTimeIndex.

    >>> timed_series = construct_time_context_aware_series(series, df)
    >>> timed_series
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    >>> construct_time_context_aware_series(timed_series, df)
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    The result is unchanged for a series that already has 'time' in its
    index.
    """
    time_col = get_time_col()
    if time_col == frame.index.name:
        time_index = frame.index
    elif time_col in frame:
        time_index = pd.Index(frame[time_col])
    else:
        raise com.IbisError(f'"time" column not present in DataFrame {frame}')
    if time_col not in series.index.names:
        series.index = pd.MultiIndex.from_arrays(
            list(
                map(
                    series.index.get_level_values,
                    range(series.index.nlevels),
                )
            )
            + [time_index],
            names=series.index.names + [time_col],
        )
    return series
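# --- Added illustrative sketch (not part of the original source) ---
# The index surgery above appends a 'time' level to whatever index the
# series already has, via pd.MultiIndex.from_arrays. The same operation
# standalone, on hypothetical data:
import pandas as pd

s = pd.Series([1.1, 2.2, 3.3], name='value')
time_index = pd.Index(pd.date_range('2017-01-02', periods=3), name='time')
s.index = pd.MultiIndex.from_arrays(
    [s.index, time_index], names=[s.index.name, 'time']
)
print(s)  # original RangeIndex plus a 'time' level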