Example No. 1
def canonicalize_context(
    timecontext: Optional[TimeContext],
) -> Optional[TimeContext]:
    """Convert a timecontext to a canonical one whose begin and end times
    are of type pandas.Timestamp. Raise IbisError for illegal inputs.
    """
    SUPPORTS_TIMESTAMP_TYPE = pd.Timestamp
    if not isinstance(timecontext, tuple) or len(timecontext) != 2:
        raise com.IbisError(
            f'Timecontext {timecontext} should specify (begin, end)'
        )

    begin, end = timecontext

    if not isinstance(begin, SUPPORTS_TIMESTAMP_TYPE):
        raise com.IbisError(
            f'begin time value {begin} of type {type(begin)} is not'
            ' of type pd.Timestamp'
        )
    if not isinstance(end, SUPPORTS_TIMESTAMP_TYPE):
        raise com.IbisError(
            f'end time value {end} of type {type(end)} is not'
            ' of type pd.Timestamp'
        )
    if begin > end:
        raise com.IbisError(
            f'begin time {begin} must be before or equal to end time {end}'
        )
    return begin, end
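
A minimal usage sketch (the import path is an assumption; canonicalize_context sits alongside the TimeContext helpers in the Ibis codebase):

import pandas as pd

from ibis.expr.timecontext import canonicalize_context  # assumed path

begin = pd.Timestamp('2020-01-01')
end = pd.Timestamp('2020-12-31')
context = canonicalize_context((begin, end))  # returns the validated tuple

# each of these raises com.IbisError:
# canonicalize_context(None)          # not a (begin, end) tuple
# canonicalize_context(('a', end))    # begin is not a pd.Timestamp
# canonicalize_context((end, begin))  # begin after end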
Example No. 2
    def to_aggregation(self,
                       metric_name=None,
                       parent_table=None,
                       backup_metric_name=None):
        """
        Convert the TopK operation to a table aggregation
        """
        op = self.op()

        arg_table = find_base_table(op.arg)

        # if the metric is given as a callable, resolve it against the base
        # table of the TopK argument
        by = op.by
        if not isinstance(by, Expr):
            by = by(arg_table)
            by_table = arg_table
        else:
            by_table = find_base_table(op.by)

        # avoid a name collision between the metric and the grouping key
        if metric_name is None:
            if by.get_name() == op.arg.get_name():
                by = by.name(backup_metric_name)
        else:
            by = by.name(metric_name)

        if arg_table.equals(by_table):
            agg = arg_table.aggregate(by, by=[op.arg])
        elif parent_table is not None:
            agg = parent_table.aggregate(by, by=[op.arg])
        else:
            raise com.IbisError('Cross-table TopK; must provide a parent '
                                'joined table')

        return agg.sort_by([(by.get_name(), False)]).limit(op.k)
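
A rough usage sketch (table and column names are hypothetical, and it assumes an Ibis version where the TopK expression exposes to_aggregation):

import ibis

t = ibis.table([('city', 'string'), ('population', 'int64')], name='t')

expr = t.city.topk(3).to_aggregation(metric_name='n')
# roughly equivalent to:
#   t.aggregate(t.city.count().name('n'), by=[t.city])
#    .sort_by(('n', False))
#    .limit(3)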
Example No. 3
 def execute(
     self,
     expr: ir.Expr,
     params: Mapping[ir.Expr, object] = None,
     limit: str = 'default',
     **kwargs: Any,
 ):
     if isinstance(expr, ir.TableExpr):
         frame = self.compile(expr, params, **kwargs)
         table = _to_pyarrow_table(frame)
         return table.to_pandas()
     elif isinstance(expr, ir.ColumnExpr):
         # expression must be named for the projection
         expr = expr.name('tmp').to_projection()
         frame = self.compile(expr, params, **kwargs)
         table = _to_pyarrow_table(frame)
         return table['tmp'].to_pandas()
     elif isinstance(expr, ir.ScalarExpr):
         if expr.op().root_tables():
             # there are associated datafusion tables so convert the expr
             # to a selection which we can directly convert to a datafusion
             # plan
             expr = expr.name('tmp').to_projection()
             frame = self.compile(expr, params, **kwargs)
         else:
             # doesn't have any tables associated so create a plan from a
             # dummy datafusion table
             compiled = self.compile(expr, params, **kwargs)
             frame = self._context.empty_table().select(compiled)
         table = _to_pyarrow_table(frame)
         return table[0][0].as_py()
     else:
         raise com.IbisError(
             f"Cannot execute expression of type: {type(expr)}"
         )
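
The return type tracks the expression type. A comment-only sketch of the dispatch (connection details are hypothetical):

# con = ibis.datafusion.connect({'t': 'data/t.parquet'})  # hypothetical
# t = con.table('t')
# con.execute(t)          # TableExpr  -> pandas.DataFrame
# con.execute(t.a)        # ColumnExpr -> pandas.Series (projected as 'tmp')
# con.execute(t.a.sum())  # ScalarExpr -> plain Python scalar via .as_py()
# con.execute(ibis.literal(1) + 1)    # no tables: planned on an empty table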
Example No. 4
def validate_backends(backends) -> list:
    """Validate backends.

    Parameters
    ----------
    backends
        A list of backends inferred from an expression.

    Returns
    -------
    list
        A list containing the specified backend or the default backend.

    Raises
    ------
    ibis.common.exceptions.IbisError
        If no backend is specified and no default backend is configured.
    ValueError
        If more than one backend is specified.
    """
    if not backends:
        default = options.default_backend
        if default is None:
            raise com.IbisError(
                'Expression depends on no backends, and found no default')
        return [default]

    if len(backends) > 1:
        raise ValueError('Multiple backends found')
    return backends
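
A minimal sketch of the contract with a stand-in backend object (assumes validate_backends is in scope):

class FakeBackend:  # stand-in for a real backend instance
    pass

backend = FakeBackend()
assert validate_backends([backend]) == [backend]

# validate_backends([backend, FakeBackend()])  # ValueError: Multiple backends found
# validate_backends([])  # IbisError unless options.default_backend is set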
Example No. 5
 def execute(
     self,
     expr: ir.Expr,
     timecontext: Mapping | None = None,
     params: Mapping[ir.Scalar, Any] | None = None,
     limit: str = 'default',
     **kwargs: Any,
 ) -> Any:
     """Execute an expression."""
     if isinstance(expr, types.Table):
         return self.compile(expr, timecontext, params, **kwargs).toPandas()
     elif isinstance(expr, types.Column):
         # expression must be named for the projection
         if not expr.has_name():
             expr = expr.name("tmp")
         return self.compile(
             expr.to_projection(), timecontext, params, **kwargs
         ).toPandas()[expr.get_name()]
     elif isinstance(expr, types.Scalar):
         compiled = self.compile(expr, timecontext, params, **kwargs)
         if isinstance(compiled, Column):
             # attach result column to a fake DataFrame and
             # select the result
             compiled = self._session.range(0, 1).select(compiled)
         return compiled.toPandas().iloc[0, 0]
     else:
         raise com.IbisError(
             f"Cannot execute expression of type: {type(expr)}"
         )
Example No. 6
 def _get_hdfs(self):
     if self._hdfs is None:
         raise com.IbisError(
             'No HDFS connection; must pass connection '
             'using the hdfs_client argument to '
             'ibis.impala.connect'
         )
     return self._hdfs
Example No. 7
 def _match_name(self):
     m = ddl.fully_qualified_re.match(self._qualified_name)
     if not m:
         raise com.IbisError(
             'Cannot determine database name from {0}'.format(
                 self._qualified_name))
     db, quoted, unquoted = m.groups()
     return db, quoted or unquoted
Example No. 8
def validate_backends(backends):
    if not backends:
        default = options.default_backend
        if default is None:
            raise com.IbisError(
                'Expression depends on no backends, and found no default')
        return [default]

    if len(backends) > 1:
        raise ValueError('Multiple backends found')
    return backends
Example No. 9
    def schema(self):
        """
        Get the schema for this table (if one is known)

        Returns
        -------
        schema : Schema
        """
        if not self._is_materialized():
            raise com.IbisError('Table operation is not yet materialized')
        return self.op().schema
Example No. 10
    def create_table(self,
                     table_name,
                     obj=None,
                     schema=None,
                     database=None,
                     max_rows=None):
        """
        Create a new table in OmniSciDB using an Ibis table expression.

        Parameters
        ----------
        table_name : string
        obj : TableExpr or pandas.DataFrame, optional
          If passed, creates table from select statement results
        schema : ibis.Schema, optional
          Mutually exclusive with obj, creates an empty table with a
          particular schema
        database : string, default None (optional)
        max_rows : int, Default None
          Set the maximum number of rows allowed in a table to create a capped
          collection. When this limit is reached, the oldest fragment is
          removed. Default = 2^62.

        Examples
        --------
        >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
        """
        _database = self.db_name
        self.set_database(database)

        if obj is not None:
            if isinstance(obj, pd.DataFrame):
                raise NotImplementedError(
                    'Pandas Data Frame input not implemented.')
            else:
                to_insert = obj
            ast = self._build_ast(to_insert, OmniSciDBDialect.make_context())
            select = ast.queries[0]

            statement = ddl.CTAS(table_name, select, database=database)
        elif schema is not None:
            statement = ddl.CreateTableWithSchema(table_name,
                                                  schema,
                                                  database=database,
                                                  max_rows=max_rows)
        else:
            raise com.IbisError('Must pass expr or schema')

        result = self._execute(statement, False)

        self.set_database(_database)
        return result
Example No. 11
    def _get_concrete_table_path(self, name, database, persist=False):
        if not persist:
            if name is None:
                name = f'__ibis_tmp_{util.guid()}'

            if database is None:
                self._ensure_temp_db_exists()
                database = options.impala.temp_db
            return name, database
        else:
            if name is None:
                raise com.IbisError('Must pass table name if persist=True')
            return name, database
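
A comment-only summary of the four cases (the generated temp name is illustrative):

# persist=False, name=None  -> ('__ibis_tmp_<guid>', options.impala.temp_db)
# persist=False, name='t'   -> ('t', options.impala.temp_db) when database is None
# persist=True,  name=None  -> IbisError('Must pass table name if persist=True')
# persist=True,  name='t'   -> ('t', database) unchanged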
Example No. 12
    def create_table(self, table_name, obj=None, schema=None):
        """Create a table."""
        if obj is None and schema is None:
            raise com.IbisError('Must pass expr or schema')

        if obj is not None:
            df = pd.DataFrame(obj)
        else:
            dtypes = ibis_schema_to_pandas(schema)
            df = schema.apply_to(
                pd.DataFrame(columns=list(map(toolz.first, dtypes))))

        self.dictionary[table_name] = df
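
A minimal usage sketch against the in-memory pandas backend (table names and data are made up):

import ibis
import pandas as pd

con = ibis.pandas.connect({})  # empty in-memory catalog

# create from data; anything pd.DataFrame accepts works for `obj`
con.create_table('events', obj=pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']}))

# or create an empty, typed table from a schema
con.create_table(
    'empty_events', schema=ibis.schema([('a', 'int64'), ('b', 'string')])
)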
Example No. 13
def execute_database_table_client(op, client,
                                  timecontext: Optional[TimeContext],
                                  **kwargs):
    df = client.dictionary[op.name]
    if timecontext:
        begin, end = timecontext
        if TIME_COL not in df:
            raise com.IbisError(
                f'Table {op.name} must have a time column named {TIME_COL}'
                ' to execute with time context.')
        # filter with time context
        mask = df[TIME_COL].between(begin, end)
        return df.loc[mask].reset_index(drop=True)
    return df
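
The trimming itself is plain pandas; a standalone sketch of the mask logic (data is made up):

import pandas as pd

df = pd.DataFrame({
    'time': pd.date_range('2020-01-01', periods=5),
    'value': range(5),
})
begin, end = pd.Timestamp('2020-01-02'), pd.Timestamp('2020-01-04')

mask = df['time'].between(begin, end)          # inclusive on both endpoints
trimmed = df.loc[mask].reset_index(drop=True)  # keeps the Jan 2-4 rows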
Example No. 14
    def _find_any_file(self, hdfs_dir):
        contents = self.ls(hdfs_dir, status=True)

        def valid_filename(name):
            # skip temporary, in-flight, and hidden/metadata files
            tail = posixpath.split(name)[1].lower()
            return not tail.endswith(('.tmp', '.copying')) and not tail.startswith(
                ('_', '.')
            )

        for filename, meta in contents:
            if meta['type'].lower() == 'file' and valid_filename(filename):
                return filename
        raise com.IbisError('No files found in the passed directory')
Example No. 15
File: core.py Project: jelitox/ibis
    def _find_backend(self) -> BaseBackend:
        backends = self._find_backends()

        if not backends:
            default = options.default_backend
            if default is None:
                raise com.IbisError(
                    'Expression depends on no backends, and found no default')
            return default

        if len(backends) > 1:
            raise ValueError('Multiple backends found')

        return backends[0]
Example No. 16
def adjust_context(op: Any, timecontext: TimeContext) -> TimeContext:
    """Adjust the time context for an operation.

    Parameters
    ----------
    op : ibis.expr.operations.Node
    timecontext : TimeContext
        time context associated with the node

    Returns
    -------
    TimeContext
        The adjusted time context.

    Raises
    ------
    com.IbisError
        For an `op` that is not of type Node we raise an error to avoid
        failing silently, since the default behavior would otherwise be to
        return the input timecontext itself.
    """
    raise com.IbisError(f'Unsupported input type for adjust context for {op}')
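
This is the fallback implementation; concrete node types are expected to register their own adjustments. A hypothetical sketch of the fallback-plus-override pattern using functools.singledispatch (the registration mechanism actually used by Ibis may differ):

from functools import singledispatch


class Node:            # stand-in for ibis.expr.operations.Node
    pass


class WindowOp(Node):  # hypothetical node that needs an adjusted context
    pass


@singledispatch
def adjust_context(op, timecontext):
    # TypeError as a stand-in for com.IbisError
    raise TypeError(f'Unsupported input type for adjust context for {op}')


@adjust_context.register
def _(op: WindowOp, timecontext):
    begin, end = timecontext
    # e.g. widen `begin` so lookback rows are available; shown unchanged here
    return begin, end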
Example No. 17
 def execute(self, expr, params=None, limit='default', **kwargs):
     if isinstance(expr, types.TableExpr):
         return self.compile(expr).toPandas()
     elif isinstance(expr, types.ColumnExpr):
         # expression must be named for the projection
         expr = expr.name('tmp')
         return self.compile(expr.to_projection()).toPandas()['tmp']
     elif isinstance(expr, types.ScalarExpr):
         compiled = self.compile(expr)
         if isinstance(compiled, Column):
             # attach result column to a fake DataFrame and
             # select the result
             compiled = self._session.range(0, 1).select(compiled)
         return compiled.toPandas().iloc[0, 0]
     else:
         raise com.IbisError("Cannot execute expression of type: {}".format(
             type(expr)))
Example No. 18
    def attach(self, name, path, create=False):
        """Connect another SQLite database file

        Parameters
        ----------
        name : string
            Database name within SQLite
        path : string
            Path to sqlite3 file
        create : boolean, optional
            If True, create the file when it does not exist; otherwise raise
            an exception for a missing file
        """
        if not os.path.exists(path) and not create:
            raise com.IbisError('File {!r} does not exist'.format(path))

        self.raw_sql("ATTACH DATABASE {path!r} AS {name}".format(
            path=path,
            name=self.con.dialect.identifier_preparer.quote(name),
        ))
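
A comment-only usage sketch, assuming an existing SQLite connection con (paths are hypothetical):

# con.attach('cache', '/tmp/cache.db', create=True)  # creates the file if missing
# con.attach('static', '/data/static.db')            # IbisError if the file is absent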
Example No. 19
    def create_table(
        self,
        table_name: str,
        obj: dd.DataFrame = None,
        schema: sch.Schema = None,
    ):
        """Create a table."""
        if obj is not None:
            df = obj
        elif schema is not None:
            dtypes = ibis_schema_to_dask(schema)
            df = schema.apply_to(
                dd.from_pandas(
                    pd.DataFrame(columns=list(map(toolz.first, dtypes))),
                    npartitions=1,
                ))
        else:
            raise com.IbisError('Must pass expr or schema')

        self.dictionary[table_name] = df
Example No. 20
    def create_table(self, table_name, obj=None, schema=None):
        """Create a table."""
        if obj is None and schema is None:
            raise com.IbisError('Must pass expr or schema')

        if obj is not None:
            if not self._supports_conversion(obj):
                raise com.BackendConversionError(
                    f"Unable to convert {obj.__class__} object "
                    f"to backend type: {self.__class__.backend_table_type}")
            df = self._convert_object(obj)
        else:
            pandas_schema = self._convert_schema(schema)
            dtypes = dict(pandas_schema)
            df = self._from_pandas(
                pd.DataFrame(columns=dtypes.keys()).astype(dtypes))

        self.dictionary[table_name] = df

        if schema is not None:
            self.schemas[table_name] = schema
Example No. 21
    def parquet_file(
        self,
        hdfs_dir,
        schema=None,
        name=None,
        database=None,
        external=True,
        like_file=None,
        like_table=None,
        persist=False,
    ):
        """Make indicated parquet file in HDFS available as an Ibis table.

        The table created can be optionally named and persisted; otherwise a
        unique name will be generated. For any non-persistent external table
        created by Ibis, we will attempt to drop it when the underlying
        object is garbage collected (or the Python interpreter shuts down
        normally).

        Parameters
        ----------
        hdfs_dir
            Path in HDFS
        schema
            If no schema is provided, and neither of the like_* arguments is
            passed, one will be inferred from one of the parquet files in the
            directory.
        like_file
            Absolute path to Parquet file in HDFS to use for schema
            definitions. An alternative to having to supply an explicit schema
        like_table
            Fully scoped and escaped string to an Impala table whose schema we
            will use for the newly created table.
        name
            Name for the new table; a random unique name is generated
            otherwise
        database
            Database to create the (possibly temporary) table in
        external
            If a table is external, the referenced data will not be deleted
            when the table is dropped in Impala. Otherwise (external=False)
            Impala takes ownership of the Parquet file.
        persist
            Do not drop the table during garbage collection

        Returns
        -------
        ImpalaTable
            Impala table expression
        """
        name, database = self._get_concrete_table_path(
            name, database, persist=persist
        )

        # If no schema provided, need to find some absolute path to a file in
        # the HDFS directory
        if like_file is None and like_table is None and schema is None:
            try:
                file_name = next(
                    fn
                    for fn in (
                        os.path.basename(f["name"])
                        for f in self.hdfs.ls(hdfs_dir, detail=True)
                        if f["type"].lower() == "file"
                    )
                    if not fn.startswith(("_", "."))
                    if not fn.endswith((".tmp", ".copying"))
                )
            except StopIteration:
                raise com.IbisError("No files found in the passed directory")
            else:
                like_file = pjoin(hdfs_dir, file_name)

        stmt = ddl.CreateTableParquet(
            name,
            hdfs_dir,
            schema=schema,
            database=database,
            example_file=like_file,
            example_table=like_table,
            external=external,
            can_exist=False,
        )
        self.raw_sql(stmt)
        return self._wrap_new_table(name, database, persist)
Example No. 22
    def formatter(t, expr):
        if arity != len(expr.op().args):
            raise com.IbisError('incorrect number of args')

        return _varargs_call(sa_func, t, expr)
Example No. 23
def execute_until_in_scope(
    expr,
    scope: Scope,
    timecontext: Optional[TimeContext] = None,
    aggcontext=None,
    clients=None,
    post_execute_=None,
    **kwargs,
) -> Scope:
    """Execute until our op is in `scope`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
    scope : Scope
    timecontext : Optional[TimeContext]
    aggcontext : Optional[AggregationContext]
    clients : List[ibis.client.Client]
    post_execute_ : Callable
    kwargs : Mapping
    """
    # these should never be None
    assert aggcontext is not None, 'aggcontext is None'
    assert clients is not None, 'clients is None'
    assert post_execute_ is not None, 'post_execute_ is None'

    # base case: our op has been computed (or is a leaf data node), so
    # return the corresponding value
    op = expr.op()
    if scope.get_value(op, timecontext) is not None:
        return scope
    if isinstance(op, ops.Literal):
        # special case literals to avoid the overhead of dispatching
        # execute_node
        return Scope(
            {
                op:
                execute_literal(
                    op, op.value, expr.type(), aggcontext=aggcontext, **kwargs)
            },
            timecontext,
        )

    # figure out what arguments we're able to compute on based on the
    # expressions inputs. things like expressions, None, and scalar types are
    # computable whereas ``list``s are not
    computable_args = [arg for arg in op.inputs if is_computable_input(arg)]

    # pre_executed_states is a list of states with the same length as
    # computable_args; these states are passed to each arg
    if timecontext:
        arg_timecontexts = compute_time_context(
            op,
            num_args=len(computable_args),
            timecontext=timecontext,
            clients=clients,
        )
    else:
        arg_timecontexts = [None] * len(computable_args)

    pre_executed_scope = pre_execute(
        op,
        *clients,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        **kwargs,
    )

    new_scope = scope.merge_scope(pre_executed_scope)

    # Short circuit: if pre_execute puts op in scope, then we don't need to
    # execute its computable_args
    if new_scope.get_value(op, timecontext) is not None:
        return new_scope

    # recursively compute each node's arguments until we've changed type.
    # compute_time_context should return a list with the same length as
    # computable_args; the two lists will be zipped together for
    # further execution
    if len(arg_timecontexts) != len(computable_args):
        raise com.IbisError(
            'arg_timecontexts and computable_args differ in length '
            f'for type:\n{type(op).__name__}.')

    scopes = [
        execute_until_in_scope(
            arg,
            new_scope,
            timecontext=timecontext,
            aggcontext=aggcontext,
            post_execute_=post_execute_,
            clients=clients,
            **kwargs,
        ) if hasattr(arg, 'op') else Scope({arg: arg}, timecontext)
        for (arg, timecontext) in zip(computable_args, arg_timecontexts)
    ]

    # if we're unable to find data then raise an exception
    if not scopes and computable_args:
        raise com.UnboundExpressionError(
            'Unable to find data for expression:\n{}'.format(repr(expr)))

    # there should be exactly one dictionary per computable argument
    assert len(computable_args) == len(scopes)

    new_scope = new_scope.merge_scopes(scopes)
    # pass our computed arguments to this node's execute_node implementation
    data = [
        new_scope.get_value(arg.op(), timecontext)
        if hasattr(arg, 'op') else arg for arg in computable_args
    ]

    result = execute_node(
        op,
        *data,
        scope=scope,
        timecontext=timecontext,
        aggcontext=aggcontext,
        clients=clients,
        **kwargs,
    )
    computed = post_execute_(op, result, timecontext=timecontext)
    return Scope({op: computed}, timecontext)
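
A comment-only restatement of the control flow above (no new behavior):

# 1. base case: op already has a value in scope -> return the scope as-is
# 2. literals are evaluated directly, skipping execute_node dispatch
# 3. pre_execute may place op in scope and short-circuit the recursion
# 4. otherwise, recurse into each computable arg with its own timecontext,
#    merge the resulting scopes, and call execute_node(op, *data)
# 5. post_execute_ sees the result and a fresh Scope {op: computed} is returned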
Example No. 24
    def create_table(
        self,
        table_name: str,
        obj: Optional[Union[ir.TableExpr, pd.DataFrame]] = None,
        schema: Optional[sch.Schema] = None,
        database: Optional[str] = None,
        max_rows: Optional[int] = None,
        fragment_size: Optional[int] = None,
        is_temporary: bool = False,
        **kwargs,
    ):
        """
        Create a new table from an Ibis table expression.

        Parameters
        ----------
        table_name : str
        obj : TableExpr or pandas.DataFrame, optional, default None
          If passed, creates table from select statement results.
        schema : ibis.Schema, optional, default None
          Mutually exclusive with obj, creates an empty table with a
          particular schema
        database : str, optional, default None
        max_rows : int, optional, default None
          Set the maximum number of rows allowed in a table to create a capped
          collection. When this limit is reached, the oldest fragment is
          removed.
        fragment_size : int, optional
          Number of rows per fragment, the unit of the table used for query
          processing; not expected to be changed. Defaults to 32000000 if
          gpu_device is enabled, otherwise 5000000.
        is_temporary : bool, default False
          If True, the table will be created as temporary.

        Examples
        --------
        >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
        """
        _database = self.db_name
        self.set_database(database)

        if fragment_size is None:
            fragment_size = 32000000 if self.gpu_device else 5000000

        if obj is not None:
            if isinstance(obj, pd.DataFrame):
                raise NotImplementedError(
                    'Pandas Data Frame input not implemented.')
            else:
                to_insert = obj
            ast = self._build_ast(to_insert, OmniSciDBDialect.make_context())
            select = ast.queries[0]

            statement = ddl.CTAS(table_name, select, database=database)
        elif schema is not None:
            statement = ddl.CreateTableWithSchema(
                table_name,
                schema,
                database=database,
                max_rows=max_rows,
                fragment_size=fragment_size,
                is_temporary=is_temporary,
            )
        else:
            raise com.IbisError('Must pass expr or schema')

        self._execute(statement, False)
        self.set_database(_database)
Example No. 25
    def create_table(
        self,
        table_name,
        obj=None,
        schema=None,
        database=None,
        external=False,
        force=False,
        # HDFS options
        format='parquet',
        location=None,
        partition=None,
        like_parquet=None,
    ):
        """
        Create a new table in Impala using an Ibis table expression. This is
        currently designed for tables whose data is stored in HDFS (or
        eventually other filesystems).

        Parameters
        ----------
        table_name : string
        obj : TableExpr or pandas.DataFrame, optional
          If passed, creates table from select statement results
        schema : ibis.Schema, optional
          Mutually exclusive with obj, creates an empty table with a
          particular schema
        database : string, default None (optional)
        force : boolean, default False
          If True, do not raise an error when a table with the indicated name
          already exists
        external : boolean, default False
          Create an external table; Impala will not delete the underlying data
          when the table is dropped
        format : {'parquet'}
        location : string, default None
          Specify the directory location where Impala reads and writes files
          for the table
        partition : list of strings
          Must pass a schema to use this. Cannot partition from an expression
          (create-table-as-select)
        like_parquet : string (HDFS path), optional
          Can specify in lieu of a schema

        Examples
        --------
        >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
        """
        if like_parquet is not None:
            raise NotImplementedError

        if obj is not None:
            with self._setup_insert(obj) as to_insert:
                ast = self.compiler.to_ast(to_insert)
                select = ast.queries[0]

                self.raw_sql(
                    CTAS(
                        table_name,
                        select,
                        database=database,
                        can_exist=force,
                        format=format,
                        external=external,
                        partition=partition,
                        path=location,
                    )
                )
        elif schema is not None:
            self.raw_sql(
                CreateTableWithSchema(
                    table_name,
                    schema,
                    database=database,
                    format=format,
                    can_exist=force,
                    external=external,
                    path=location,
                    partition=partition,
                )
            )
        else:
            raise com.IbisError('Must pass obj or schema')
Example No. 26
    def create_table(
        self,
        table_name,
        obj=None,
        schema=None,
        database=None,
        force=False,
        # HDFS options
        format='parquet',
    ):
        """
        Create a new table in Spark using an Ibis table expression.

        Parameters
        ----------
        table_name : string
        obj : TableExpr or pandas.DataFrame, optional
          If passed, creates table from select statement results
        schema : ibis.Schema, optional
          Mutually exclusive with obj, creates an empty table with a
          particular schema
        database : string, default None (optional)
        force : boolean, default False
          If True, do not fail when a table with the indicated name already
          exists: pandas.DataFrame inputs are written with mode='overwrite',
          and CTAS statements are created with can_exist=True
        format : {'parquet'}

        Examples
        --------
        >>> con.create_table('new_table_name', table_expr)  # doctest: +SKIP
        """
        if obj is not None:
            if isinstance(obj, pd.DataFrame):
                spark_df = self._session.createDataFrame(obj)
                mode = 'error'
                if force:
                    mode = 'overwrite'
                spark_df.write.saveAsTable(
                    table_name, format=format, mode=mode
                )
                return

            ast = self._build_ast(obj, SparkDialect.make_context())
            select = ast.queries[0]

            statement = ddl.CTAS(
                table_name,
                select,
                database=database,
                can_exist=force,
                format=format,
            )
        elif schema is not None:
            statement = ddl.CreateTableWithSchema(
                table_name,
                schema,
                database=database,
                format=format,
                can_exist=force,
            )
        else:
            raise com.IbisError('Must pass expr or schema')

        return self._execute(statement.compile())
Example No. 27
def hdfs_connect(
    host='localhost',
    port=50070,
    protocol='webhdfs',
    use_https='default',
    auth_mechanism='NOSASL',
    verify=True,
    session=None,
    **kwds,
):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. For secure
        authentication, the default for this is True, otherwise False.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS

    """
    import requests

    if session is None:
        session = requests.Session()
    session.verify = verify
    if auth_mechanism in ('GSSAPI', 'LDAP'):
        if use_https == 'default':
            prefix = 'https'
        else:
            prefix = 'https' if use_https else 'http'
        try:
            import requests_kerberos  # noqa: F401
        except ImportError:
            raise com.IbisError(
                "Unable to import requests-kerberos, which is required for "
                "Kerberos HDFS support. Install it by executing `pip install "
                "requests-kerberos` or `pip install hdfs[kerberos]`.")
        from hdfs.ext.kerberos import KerberosClient

        # note: secure clusters default to HTTPS
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        kwds.setdefault('mutual_auth', 'OPTIONAL')
        hdfs_client = KerberosClient(url, session=session, **kwds)
    else:
        if use_https == 'default':
            prefix = 'http'
        else:
            prefix = 'https' if use_https else 'http'
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        hdfs_client = InsecureClient(url, session=session, **kwds)
    return WebHDFS(hdfs_client)
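
A comment-only sketch of connecting to a Kerberos-secured cluster (host names are hypothetical); the resulting client is what Example No. 6 expects as hdfs_client:

# hdfs = hdfs_connect(
#     host='namenode.example.com',
#     port=50070,
#     auth_mechanism='GSSAPI',  # secure cluster, so HTTPS is used by default
# )
# con = ibis.impala.connect(host='impalad.example.com', hdfs_client=hdfs)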
Example No. 28
 def formatter(translator, expr):
     op = expr.op()
     if min_arity > len(op.args) or max_arity < len(op.args):
         raise com.IbisError("incorrect number of args")
     return _format_call(translator, func_name, *op.args)
Example No. 29
 def formatter(translator, expr):
     op = expr.op()
     if arity != len(op.args):
         raise com.IbisError('incorrect number of args')
     return _format_call(translator, func_name, *op.args)
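
Examples No. 22, 28, and 29 share the same factory shape. A standalone sketch of the pattern with a stubbed renderer (the real _format_call / _varargs_call helpers live in each backend):

def fixed_arity(func_name, arity):
    """Return a formatter that renders func_name(arg1, ..., argN)."""
    def formatter(translator, expr):
        op = expr.op()
        if arity != len(op.args):
            raise ValueError('incorrect number of args')
        # stand-in for _format_call: translate each arg and join
        rendered = ', '.join(translator.translate(arg) for arg in op.args)
        return f'{func_name}({rendered})'
    return formatter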
Example No. 30
def construct_time_context_aware_series(series: pd.Series,
                                        frame: pd.DataFrame) -> pd.Series:
    """Construct a Series by adding 'time' to its MultiIndex

    In window execution, the result Series of a udf may need
    to be trimmed by timecontext. In order to do so, 'time'
    must be added as an index to the Series. We extract the
    time column from the parent DataFrame `frame`.
    See `trim_window_result` in execution/window.py for the
    trimming implementation.

    Parameters
    ----------
    series: pd.Series, the result series of a udf execution
    frame: pd.DataFrame, the parent DataFrame of `series`

    Returns
    -------
    pd.Series

    Examples
    --------
    >>> import pandas as pd
    >>> from ibis.expr.timecontext import construct_time_context_aware_series
    >>> df = pd.DataFrame(
    ...     {
    ...         'time': pd.Series(
    ...             pd.date_range(
    ...                 start='2017-01-02', periods=3
    ...             ).values
    ...         ),
    ...         'id': [1,2,3],
    ...         'value': [1.1, 2.2, 3.3],
    ...     }
    ... )
    >>> df
            time  id  value
    0 2017-01-02   1    1.1
    1 2017-01-03   2    2.2
    2 2017-01-04   3    3.3
    >>> series = df['value']
    >>> series
    0    1.1
    1    2.2
    2    3.3
    Name: value, dtype: float64
    >>> construct_time_context_aware_series(series, df)
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    The index will be a MultiIndex of the original RangeIndex
    and a DateTimeIndex.

    >>> timed_series = construct_time_context_aware_series(series, df)
    >>> timed_series
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    >>> construct_time_context_aware_series(timed_series, df)
       time
    0  2017-01-02    1.1
    1  2017-01-03    2.2
    2  2017-01-04    3.3
    Name: value, dtype: float64

    The result is unchanged for a series that already has 'time' in its
    index.
    """
    time_col = get_time_col()
    if time_col == frame.index.name:
        time_index = frame.index
    elif time_col in frame:
        time_index = pd.Index(frame[time_col])
    else:
        raise com.IbisError(f'"time" column not present in DataFrame {frame}')
    if time_col not in series.index.names:
        # keep the existing index levels and append the time column
        levels = [
            series.index.get_level_values(i)
            for i in range(series.index.nlevels)
        ]
        series.index = pd.MultiIndex.from_arrays(
            levels + [time_index],
            names=series.index.names + [time_col],
        )
    return series