Beispiel #1
0
 def keytab(self, arg):
     """
     Renew the Kerberos ticket described by the magic's argument string.

     The ``principal`` and ``keytab`` values parsed from ``arg`` are handed
     to ``utils.renew_kerberos_ticket``.

     Raises:
         Exception: if ``utils.renew_kerberos_ticket`` returns a falsy value.
     """
     # NOTE(review): the argument spec is taken from ``self.url`` rather than
     # ``self.keytab`` -- confirm this is intentional.
     parsed = ParameterArgs(parse_argstring(self.url, arg))
     principal = parsed.get("principal")
     keytab_file = parsed.get("keytab")
     renewed = utils.renew_kerberos_ticket(principal, keytab_file)
     if renewed:
         return
     raise Exception("Unable to renew kerberos ticket")
Beispiel #2
0
    def sts(self, arg, line='', cell='', local_ns=None):
        """Connect to a Spark Thrift Server and execute the query.

        Example:
            # To initialize spark thrift server connection.
            %%sts -h 127.0.0.1 -p 1000
            select * from dim_cust limit 10

        Args:
            arg: Option string of the magic. When ``line`` and ``cell`` are
                both empty and ``arg`` does not start with "-", it is treated
                as the query itself.
            line: Query text, used when ``cell`` is empty.
            cell: Query text.
            local_ns: Accepted for signature compatibility; not used here.

        Returns:
            The value of ``self._process_results_`` for the query's result set.
        """
        # A bare invocation carries the query in ``arg`` instead of options.
        if not (line or cell) and not arg.startswith("-"):
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.sts, arg))

        query = cell or line

        connection = self._get_connection_(
            ConnectionType.STS,
            cluster=args.get("cluster_name"),
            host=args.get("host"),
            port=args.get("port"),
            auth=args.get("auth"))
        result_set = connection.execute(
            query, self.autolimit, self.displaylimit, self.progress_bar)

        return self._process_results_(
            result_set, args.get('tableau'), args.get('publish'),
            args.get('tde_name'), args.get('project_name'))
Beispiel #3
0
    def csv(self, arg, line='', cell='', local_ns=None):
        """CSV Magic
        Accepted Query formats: All select sql statements: select * from filename.csv/tsv
        return: Dataframe

        Example Queries:
        1. select * from test.csv
        2. select col1 from test.csv where col1=1
        3. select * from test.tsv

        Note: Currently csv magic supports only select sqls

        Args:
            arg: Option string of the magic. When ``line`` and ``cell`` are
                both empty and ``arg`` does not start with "-", it is treated
                as the query itself.
            line: Query text, used when ``cell`` is empty.
            cell: Query text.
            local_ns: Accepted for signature compatibility; not used here.

        Returns:
            The value of ``self._process_results_`` for the query's result set.
        """
        # A bare invocation carries the query in ``arg`` instead of options.
        if not (line or cell) and not arg.startswith("-"):
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.csv, arg))

        query = cell or line

        result_set = self._get_connection_(ConnectionType.CSV, '').execute(query)
        return self._process_results_(
            result_set, args.get('tableau'), args.get('publish'),
            args.get('tde_name'), args.get('project_name'))
Beispiel #4
0
    def presto(self, arg, line='', cell='', local_ns=None):
        """Connect to the presto execution engine and execute the query.

        Example:
            %presto select * from cluster.default.dim_cust limit 10

            # To download data
            %%presto -d True
            select * from cluster.default.dim_cust limit 10

        Args:
            arg: Option string of the magic. When ``line`` and ``cell`` are
                both empty and ``arg`` does not start with "-", it is treated
                as the query itself.
            line: Query text, used when ``cell`` is empty.
            cell: Query text.
            local_ns: Accepted for signature compatibility; not used here.

        Returns:
            The value of ``self._process_results_`` for the query's result set.
        """
        # A bare invocation carries the query in ``arg`` instead of options.
        if not (line or cell) and not arg.startswith("-"):
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.presto, arg))

        query = cell or line

        connection = self._get_connection_(
            ConnectionType.PRESTO, args.get("cluster_name"), args.get("host"),
            args.get("port"), args.get("auth"))
        result_set = connection.execute(
            query, self.autolimit, self.displaylimit, self.progress_bar)

        return self._process_results_(
            result_set, args.get('tableau'), args.get('publish'),
            args.get('tde_name'), args.get('project_name'))
    def publish(self, arg, line='', cell='', local_ns=None):
        """
            Publish to Tableau.

            The text to publish may be one of:
              * a nested cell magic (starts with ``%%``): its first line is
                "<magic> [options]" and the remaining lines are the body;
              * a nested line magic (starts with ``%``): "<magic> <rest>";
              * the name of a variable found in the user namespace
                (``local_ns`` entries take precedence).

            The magic result, or the variable's value, is handed to the
            module-level ``publish`` together with the parsed ``tde_name``
            and ``project_name`` options, and that call's value is returned.

            Raises:
                KeyError: if the text names a variable that is not defined.
        """
        # A bare invocation carries the payload in ``arg`` instead of options.
        if not (line or cell) and not arg.startswith("-"):
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.publish, arg))
        tde_name = args.get('tde_name')
        project_name = args.get('project_name')

        if not cell:
            cell = line
            if cell.startswith('%%'):
                # Split the header line off first: a header without options
                # ("%%hive\nselect ...") has no space before the newline, so
                # splitting on the first space would glue part of the query
                # onto the magic name.
                header, _, query = cell.partition("\n")
                magic_token, _, magic_arg = header.partition(" ")
                result = get_ipython().run_cell_magic(
                    magic_token.lstrip('%'), magic_arg, query)
                return publish(result, tde_name, project_name)
            if cell.startswith('%'):
                magic_token, _, query = cell.partition(" ")
                result = get_ipython().run_line_magic(
                    magic_token.lstrip('%'), query)
                return publish(result, tde_name, project_name)

        user_ns = self.shell.user_ns.copy()
        if local_ns:
            user_ns.update(local_ns)

        # Identifiers never contain whitespace, so surrounding blanks or a
        # trailing newline from a cell body must not break the lookup.
        return publish(user_ns[cell.strip()], tde_name, project_name)
Beispiel #6
0
    def teradata(self, arg, line='', cell='', local_ns=None):
        """Run a query against teradata, or load csv/dataframe data into a table.

        Examples:

            # To download data
            %%teradata --host
            select * from database.table_name sample 10

            # To insert csv data to a table
            %teradata -f dim_cust.csv -t pp_scratch.dim_cust

        When a ``table`` option is given together with a ``csv`` or
        ``dataframe`` option, the data is loaded through the connection's
        ``insert_csv`` and that call's value is returned. Otherwise the query
        text is executed and the result set is passed to
        ``self._process_results_``.
        """
        nothing_but_arg = not (line or cell)
        if nothing_but_arg and not arg.startswith("-"):
            # Bare invocation: the argument string is really the query.
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.teradata, arg))

        # Shell namespace overlaid with any caller-supplied locals.
        namespace = self.shell.user_ns.copy()
        if local_ns:
            namespace.update(local_ns)

        query = cell if cell else line

        wants_insert = args.get("table") and (
            args.get("csv") or args.get("dataframe"))
        if wants_insert:
            frame = utils.csv_to_df(namespace, args)
            loader = self._get_connection_(
                ConnectionType.TERADATA, args.get("cluster_name"),
                args.get("host"))
            return loader.insert_csv(
                args.get("table"), frame, self.autolimit, self.displaylimit)

        connection = self._get_connection_(
            ConnectionType.TERADATA, args.get("cluster_name"),
            args.get("host"))
        result_set = connection.execute(
            query, self.autolimit, self.displaylimit, self.progress_bar)

        return self._process_results_(
            result_set,
            args.get('tableau'),
            args.get('publish'),
            args.get('tde_name'),
            args.get('project_name'))
    def run_pipeline(self, arg, line='', cell='', local_ns=None):
        """Run notebooks sequentially in a pipeline.
           A dictionary called _pipeline_workspace is created by the magic that will be shared by all the notebooks in
           the pipeline. The state can contain DataFrames, Lists, Dictionaries and objects. Notebook parameterization
           can be used to load and read from the shared state.

           The pipeline supports execution of parameterized notebooks. If parameters are used, the first code cell will
           be treated to contain only parameter assignments. Parameters can be a string, number, list or dictionary.

           To save a notebook's execution in the pipeline, the save name should be specified along with the
           execution notebook separated with a colon.

           Run parameters will only change their equivalent parameters from the first code cell. Unknown parameters will
           be ignored. Adding parameters on an execution is optional.

                # simple pipeline
                Example1:
                    %%run_pipeline
                    first notebook in pipeline;
                    second notebook in pipeline;
                    third notebook in pipeline

                # pipeline with parameterized notebooks
                Example2:
                    %%run_pipeline
                    first notebook in pipeline  key01=int key01=string key02={'key01': param01};
                    second notebook in pipeline;
                    third notebook in pipeline:your save name key01=int key02=string key03=[param01, param02]

           All notebooks run in one shared 'python3' kernel. Between notebooks every user variable except
           _pipeline_workspace is deleted from the kernel. The kernel is shut down when the pipeline finishes,
           whether it succeeded or failed; any failure is re-raised to the caller.

           ``local_ns`` is accepted for signature compatibility and is not used.
        """
        if not (line or cell) and not arg.startswith("-"):
            # Bare invocation: the argument string is really the pipeline text.
            line, arg = arg, ''

        # NOTE(review): options are parsed against ``self.run``'s argument
        # spec rather than ``self.run_pipeline`` -- confirm this is intended.
        args = ParameterArgs(parse_argstring(self.run, arg))

        if not cell:
            cell = line

        # One command per ';'-separated entry; blank entries (for example
        # after a trailing ';') are skipped.
        notebook_run_cmds = [cmd.strip() for cmd in cell.split(';')]
        notebook_run_cmds = [cmd for cmd in notebook_run_cmds if cmd]

        # Executed after each notebook: drops every user variable except the
        # shared workspace so notebooks do not leak state into each other.
        clear_namespace_cell = nbformat.v4.new_code_cell(
            source="from IPython import get_ipython\n" +
            "_ip = get_ipython()\n" + "_user_vars = %who_ls\n" +
            "for _var in _user_vars:\n" +
            "    if _var != '_pipeline_workspace':\n" +
            "        del _ip.user_ns[_var]\n" + "import gc\n" + "gc.collect()")
        # Executed once up front: creates the shared workspace.
        pipeline_state_cell = nbformat.v4.new_code_cell(
            source="_pipeline_workspace = {'frames': list()}")

        execute_preprocessor = ExecutePreprocessor(
            kernel_name='python3', timeout=args.get('cell_timeout'))

        kernel_manager, kernel_comm = start_new_kernel(kernel_name='python3')

        execute_preprocessor.km = kernel_manager
        execute_preprocessor.kc = kernel_comm

        def execute_notebook(notebook_filename, notebook_save_filename,
                             params):
            """Execute one notebook cell by cell in the shared kernel."""
            # Read and close the source file before executing anything.
            with open(notebook_filename, encoding='utf-8') as file_handler:
                notebook = nbformat.read(file_handler, as_version=4)

            if params:
                # Only the first code cell carries parameter assignments.
                for nb_cell in notebook.cells:
                    if nb_cell.cell_type == 'code':
                        nb_cell.source = utils.substitute_params(
                            nb_cell.source, params)
                        break

            execute_preprocessor.nb = notebook

            progress_bar = widgets.IntProgress(
                value=0,
                min=0,
                max=len(notebook.cells),
                step=1,
                bar_style='info',  # 'success', 'info', 'warning', 'danger' or ''
                orientation='horizontal')

            display_label = notebook_filename
            if notebook_save_filename:
                display_label = display_label + ' : ' + notebook_save_filename
            display(
                widgets.HBox([widgets.Label(display_label), progress_bar]))

            try:
                for idx, nb_cell in enumerate(notebook.cells):
                    execute_preprocessor.preprocess_cell(
                        nb_cell,
                        resources={'metadata': {}},
                        cell_index=idx)
                    progress_bar.value = idx + 1
            except Exception:
                # Any failure, not only a cell error, marks the bar as failed.
                progress_bar.bar_style = 'danger'
                raise
            finally:
                # Save whatever was executed, even after a failure.
                if notebook_save_filename:
                    with open(notebook_save_filename, mode='wt',
                              encoding='utf-8') as save_handler:
                        nbformat.write(notebook, save_handler)

            # Only reached when execution and saving both succeeded.
            progress_bar.bar_style = 'success'

        try:
            execute_preprocessor.run_cell(pipeline_state_cell)
            for notebook_run_cmd in notebook_run_cmds:
                run_notebook_name, notebook_save_name, nb_params = utils.parse_run_str(
                    notebook_run_cmd)

                execute_notebook(run_notebook_name, notebook_save_name,
                                 nb_params)
                execute_preprocessor.run_cell(clear_namespace_cell)
        finally:
            # Single shutdown point: runs on success and on every failure, so
            # the kernel is neither leaked nor shut down twice.
            kernel_comm.stop_channels()
            kernel_manager.shutdown_kernel()
    def run(self, arg, line='', cell='', local_ns=None):
        """Runs a notebook from another notebook. Allows for running parameterized notebooks. If using parameters
           the first codecell will be treated to contain only parameter assignments. Parameters can be strings, numbers,
           lists or dictionaries. The magic can enable sequential or parallel execution of notebooks.

           To save a notebook's execution, the save name should be specified along with the execution notebook
           separated with a colon.

           Run parameters will only change their equivalent parameters from the first code cell. Unknown parameters will
           be ignored. Adding parameters on an execution is optional.

                # simple run
                Example:
                    %run your notebook

                # simple sequential run
                Example:
                    %%run
                    your notebook 01;
                    your notebook 02

                # simple run allow errors
                Example:
                    %%run -e True
                    your notebook

                # simple run show progress bar
                Example:
                    %%run -pbar True
                    your notebook

                # simple run show progress bar and save execution
                Example:
                    %%run -pbar True
                    your notebook:your save notebook

                # simple run in parallel with progressbar
                Example:
                    %%run -pbar True -p True
                    your notebook 01;
                    your notebook 02

                # simple run in parallel with progressbar and disabling cell timeout
                Example:
                    %%run -pbar True -t -1
                    your notebook 01;
                    your notebook 02

                # parameterized run in parallel with progressbar
                Example:
                    %%run -pbar True -p True
                    your notebook 01  key01=int key01=string key02={'key01': param01};
                    your notebook 02:your save name key01=int key02=string key03=[param01, param02]

           Any exception raised while executing a notebook is re-raised to the caller after the notebook has been
           saved (when a save name was given) and its kernel has been shut down.

           ``local_ns`` is accepted for signature compatibility and is not used.
        """
        if not (line or cell) and not arg.startswith("-"):
            # Bare invocation: the argument string is really the run text.
            line, arg = arg, ''

        args = ParameterArgs(parse_argstring(self.run, arg))

        if not cell:
            cell = line

        # One command per ';'-separated entry; blank entries (for example
        # after a trailing ';') are skipped.
        notebook_run_cmds = [cmd.strip() for cmd in cell.split(';')]
        notebook_run_cmds = [cmd for cmd in notebook_run_cmds if cmd]

        def execute_notebook(notebook_filename, notebook_save_filename,
                             params):
            """Execute one notebook, reporting via progress bar or log."""
            log = UserMessages()
            show_progress = args.get('enable_progress_bar')

            # Read and close the source file before executing anything.
            with open(notebook_filename, encoding='utf-8') as file_handler:
                notebook = nbformat.read(file_handler, as_version=4)

            execute_preprocessor = ExecutePreprocessor(
                timeout=args.get('cell_timeout'),
                allow_errors=args.get('allow_errors'))
            kernel_manager = None
            kernel_comm = None
            # Stays None unless the widget is actually created, so failure
            # reporting never touches a widget that does not exist.
            progress_bar = None

            if params:
                # Only the first code cell carries parameter assignments.
                for nb_cell in notebook.cells:
                    if nb_cell.cell_type == 'code':
                        nb_cell.source = utils.substitute_params(
                            nb_cell.source, params)
                        break

            try:
                if show_progress:
                    progress_bar = widgets.IntProgress(
                        value=0,
                        min=0,
                        max=len(notebook.cells),
                        step=1,
                        bar_style='info',  # 'success', 'info', 'warning', 'danger' or ''
                        orientation='horizontal')

                    # Cell-by-cell execution needs a kernel we manage here.
                    kernel_manager, kernel_comm = start_new_kernel(
                        kernel_name=notebook['metadata']['kernelspec']['name'])
                    execute_preprocessor.km = kernel_manager
                    execute_preprocessor.kc = kernel_comm
                    execute_preprocessor.nb = notebook

                    display_label = notebook_filename
                    if notebook_save_filename:
                        display_label = display_label + ' : ' + notebook_save_filename
                    display(
                        widgets.HBox(
                            [widgets.Label(display_label), progress_bar]))

                    for idx, nb_cell in enumerate(notebook.cells):
                        execute_preprocessor.preprocess_cell(
                            nb_cell,
                            resources={'metadata': {}},
                            cell_index=idx)
                        progress_bar.value = idx + 1
                else:
                    log.info("Running Notebook: " + notebook_filename)
                    execute_preprocessor.preprocess(
                        notebook, {'metadata': {}})
            except Exception:
                # Every failure is reported as a failure, whatever its type.
                if progress_bar is not None:
                    progress_bar.bar_style = 'danger'
                else:
                    log.error(notebook_filename + " execution failed.")
                raise
            finally:
                try:
                    # Save whatever was executed, even after a failure.
                    if notebook_save_filename:
                        with open(notebook_save_filename, mode='wt',
                                  encoding='utf-8') as save_handler:
                            nbformat.write(notebook, save_handler)
                finally:
                    # The kernel is released even if saving fails.
                    if kernel_comm is not None:
                        kernel_comm.stop_channels()
                    if kernel_manager is not None:
                        kernel_manager.shutdown_kernel()

            # Only reached when execution and saving both succeeded.
            if progress_bar is not None:
                progress_bar.bar_style = 'success'
            else:
                log.info(notebook_filename + " was executed successfully.")

        if args.get('parallel'):
            futures = []
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=20) as executor:

                for notebook_run_cmd in notebook_run_cmds:
                    run_notebook_name, notebook_save_name, nb_params = utils.parse_run_str(
                        notebook_run_cmd)
                    futures.append(
                        executor.submit(execute_notebook, run_notebook_name,
                                        notebook_save_name, nb_params))

                # The first failure is re-raised; leaving the ``with`` block
                # still waits for the remaining notebook runs to finish.
                for future in concurrent.futures.as_completed(futures):
                    future.result()
        else:
            for notebook_run_cmd in notebook_run_cmds:
                run_notebook_name, notebook_save_name, nb_params = utils.parse_run_str(
                    notebook_run_cmd)
                execute_notebook(run_notebook_name, notebook_save_name,
                                 nb_params)
    def hive(self, arg, line='', cell='', local_ns=None):
        """Run a query on hive, or load csv/dataframe data into a hive table.

        Examples:
            %%hive --hive_server hive.server.com --port 10000 --auth gssapi
            select * from database.table_name limit 10

            # To query data from hive
            %%hive
            select * from database.table_name limit 10

            # To insert csv data to a table
            %hive -f file.csv -t database.table_name

        When a ``table`` option is given together with a ``csv`` or
        ``dataframe`` option, the data is loaded through the connection's
        ``insert_csv`` and that call's value is returned. Otherwise the query
        text is executed and the result set is passed to
        ``self._process_results_``.
        """
        nothing_but_arg = not (line or cell)
        if nothing_but_arg and not arg.startswith("-"):
            # Bare invocation: the argument string is really the query.
            line, arg = arg, ''
        args = ParameterArgs(parse_argstring(self.hive, arg))

        # Shell namespace overlaid with any caller-supplied locals.
        namespace = self.shell.user_ns.copy()
        if local_ns:
            namespace.update(local_ns)

        query = cell if cell else line

        wants_insert = args.get("table") and (
            args.get("csv") or args.get("dataframe"))
        if wants_insert:
            csv_payload = utils.df_to_csv(namespace, args)
            from_dataframe = True if args.get("dataframe") else False

            loader = self._get_connection_(
                ConnectionType.HIVE,
                cluster=args.get("cluster_name"),
                host=args.get("hive_server"),
                port=args.get("port"),
                auth=args.get("auth"))
            return loader.insert_csv(
                args.get("table"),
                args.get("name_node_url"),
                args.get("name_node_options"),
                csv_payload,
                from_dataframe,
                self.autolimit,
                self.displaylimit)

        # Unlike the insert path, the query path also passes the resource
        # manager URL to the connection.
        connection = self._get_connection_(
            ConnectionType.HIVE,
            cluster=args.get("cluster_name"),
            host=args.get("hive_server"),
            port=args.get("port"),
            auth=args.get("auth"),
            resource_manager=args.get("resource_manager_url"))
        result_set = connection.execute(
            query, self.autolimit, self.displaylimit, self.progress_bar)

        return self._process_results_(
            result_set,
            args.get('tableau'),
            args.get('publish'),
            args.get('tde_name'),
            args.get('project_name'))