Beispiel #1
0
    def store_record(self, metadata: Dict[str, str]) -> None:
        """Turn the buffered cell executions into an activity record and persist it.

        When no cells have been collected nothing is processed or committed; the
        tracking attributes are cleared in either case.

        Args:
            metadata(dict): Monitor metadata; its "path" entry is forwarded to the processors

        Returns:
            None
        """
        if self.cell_data:
            started = time.time()

            # Hand the collected cells to the processors in reverse collection order
            newest_first = self.cell_data[::-1]
            record = self.process(ActivityType.CODE, newest_first,
                                  {"path": metadata["path"]})

            # Commit first, then attach the activity record to the resulting commit
            labbook_commit = self.commit_labbook()
            activity_commit = self.store_activity_record(labbook_commit, record)

            logger.info(
                f"Created auto-generated activity record {activity_commit} in {time.time() - started} seconds"
            )

        # Clear all tracking state so the next execution starts fresh
        self.current_cell = ExecutionData()
        self.cell_data = []
        self.can_store_activity_record = False
Beispiel #2
0
    def __init__(self,
                 user: str,
                 owner: str,
                 labbook_name: str,
                 monitor_key: str,
                 config_file: Optional[str] = None,
                 author_name: Optional[str] = None,
                 author_email: Optional[str] = None) -> None:
        """Constructor requires info to load the lab book

        Args:
            user(str): current logged in user
            owner(str): owner of the lab book
            labbook_name(str): name of the lab book
            monitor_key(str): Unique key for the activity monitor in redis
            config_file(str): Optional value passed through unchanged to ActivityMonitor.__init__
            author_name(str): Name of the user starting this activity monitor
            author_email(str): Email of the user starting this activity monitor
        """
        # Call super constructor
        ActivityMonitor.__init__(self,
                                 user,
                                 owner,
                                 labbook_name,
                                 monitor_key,
                                 config_file,
                                 author_name=author_name,
                                 author_email=author_email)

        # For now, register processors by default
        self.register_processors()

        # Let's call them cells as if they were Jupyter:
        #   current_cell accumulates the execution in progress,
        #   cell_data holds the executions waiting to be stored
        self.current_cell = ExecutionData()
        self.cell_data: List[ExecutionData] = []

        # variables that track the context of messages in the log
        #   am I in the console, or the notebook?
        #   what chunk is being executed?
        #   in what notebook?
        self.is_console = False
        self.is_notebook = False
        self.chunk_id = None
        self.nbname = None
Beispiel #3
0
    def test_is_not_empty(self):
        """is_empty() must report False once code, a result, or a tag has been added"""
        samples = (
            ("code", {"this": "thing"}),
            ("result", {"this": "thing"}),
            ("tags", "tag"),
        )

        for attribute, item in samples:
            # A fresh instance per attribute so each case is checked in isolation
            populated = ExecutionData()
            getattr(populated, attribute).append(item)

            assert populated.is_empty() is False
Beispiel #4
0
    def store_record(self) -> None:
        """Persist collected R input/output/code as an ActivityRecord / git commit

        Callers should move anything they want kept from self.current_cell into
        self.cell_data before calling; whatever is left in self.current_cell is
        discarded when the state is reset at the end.

        Args:
            None
        """
        if self.cell_data:
            started = time.time()

            # The "path" reported to the processors is the console or the notebook name
            if self.is_console:
                codepath = "console"
            else:
                codepath = self.nbname or "Unknown notebook"

            # Hand the collected cells to the processors in reverse collection order
            record = self.process(ActivityType.CODE, self.cell_data[::-1],
                                  {'path': codepath})

            # Commit first, then attach the activity record to the resulting commit
            labbook_commit = self.commit_labbook()
            activity_commit = self.store_activity_record(labbook_commit, record)

            logger.info(
                f"Created auto-generated activity record {activity_commit} in {time.time() - started} seconds"
            )

        # Forget the console/notebook context and drop all buffered cells
        self.is_console = False
        self.is_notebook = False
        self.cell_data = []
        self.current_cell = ExecutionData()
Beispiel #5
0
    def __init__(self,
                 user: str,
                 owner: str,
                 labbook_name: str,
                 monitor_key: str,
                 config_file: Optional[str] = None,
                 author_name: Optional[str] = None,
                 author_email: Optional[str] = None) -> None:
        """Constructor requires info to load the lab book

        Args:
            user(str): current logged in user
            owner(str): owner of the lab book
            labbook_name(str): name of the lab book
            monitor_key(str): Unique key for the activity monitor in redis
            config_file(str): Optional value passed through unchanged to ActivityMonitor.__init__
            author_name(str): Name of the user starting this activity monitor
            author_email(str): Email of the user starting this activity monitor
        """
        # Call super constructor
        ActivityMonitor.__init__(self,
                                 user,
                                 owner,
                                 labbook_name,
                                 monitor_key,
                                 config_file,
                                 author_name=author_name,
                                 author_email=author_email)

        # For now, register processors by default
        self.register_processors()

        # Tracking variables during message processing:
        #   kernel_status   - last execution state seen
        #   current_cell    - execution currently being assembled
        #   cell_data       - executions waiting to be stored
        #   execution_count - counter tracked alongside the executions
        self.kernel_status = 'idle'
        self.current_cell = ExecutionData()
        self.cell_data: List[ExecutionData] = []
        self.execution_count = 0
Beispiel #6
0
    def handle_message(self, msg: Dict[str, Dict]):
        """Update the tracked kernel and cell state from one JupyterLab IOPub message

        Status messages drive the busy/idle bookkeeping. execute_input,
        execute_result, stream and display_data messages are accumulated on the
        current cell. An error message flags the current cell. Every other
        message type is logged and ignored.

        Args:
            msg(dict): An IOPub message


        Returns:
            None
        """
        msg_type = msg['msg_type']

        if msg_type == 'status':
            previous_state = self.kernel_status
            new_state = msg['content']['execution_state']

            if previous_state == 'busy' and new_state == 'idle':
                # Execution just finished
                self.set_busy_state(False)

                finished_cell = self.current_cell
                if finished_cell.cell_error is False and finished_cell.is_empty() is False:
                    # Keep only cells that did not error and actually hold content
                    self.cell_data.append(finished_cell)

                # Begin a fresh cell for whatever runs next
                self.current_cell = ExecutionData()

                # A record COULD now be stored if no further messages arrive
                self.can_store_activity_record = True

            elif previous_state == 'idle' and new_state == 'busy':
                # A new execution is starting, so hold off on storing
                self.set_busy_state(True)
                self.can_store_activity_record = False

            # Remember the state for the next transition check
            self.kernel_status = new_state
            return

        if msg_type == 'execute_input':
            # The code sent to the kernel
            content = msg['content']
            self.current_cell.code.append({'code': content['code']})
            self.execution_count = content['execution_count']
            self.current_cell.tags.append(f"ex:{content['execution_count']}")
            return

        if msg_type == 'execute_result':
            # The result of the execution; a count mismatch is logged but the result is still kept
            content = msg['content']
            if self.execution_count != content['execution_count']:
                logger.error("Execution count mismatch detected {},{}".format(
                    self.execution_count, content['execution_count']))

            self.current_cell.result.append({
                'data': content['data'],
                'metadata': content['metadata']
            })
            return

        if msg_type == 'stream':
            # Plaintext output, stored as a text/plain result
            plaintext = {"text/plain": msg['content']['text']}
            self.current_cell.result.append({
                'data': plaintext,
                'metadata': {'source': 'stream'}
            })
            return

        if msg_type == 'display_data':
            # Rich output; only the data payload of the message is kept
            self.current_cell.result.append({
                'data': msg['content']['data'],
                'metadata': {'source': 'display_data'}
            })
            return

        if msg_type == 'error':
            # Flag the cell so it is skipped when the kernel returns to idle
            self.current_cell.cell_error = True
            return

        logger.info("Received and ignored IOPUB Message of type {}".format(
            msg['msg_type']))
Beispiel #7
0
class JupyterLabNotebookMonitor(ActivityMonitor):
    """Class to monitor a notebook kernel for activity to be processed."""
    def __init__(self,
                 user: str,
                 owner: str,
                 labbook_name: str,
                 monitor_key: str,
                 config_file: Optional[str] = None,
                 author_name: Optional[str] = None,
                 author_email: Optional[str] = None) -> None:
        """Constructor requires info to load the lab book

        Args:
            user(str): current logged in user
            owner(str): owner of the lab book
            labbook_name(str): name of the lab book
            monitor_key(str): Unique key for the activity monitor in redis
            config_file(str): Optional value passed through unchanged to ActivityMonitor.__init__
            author_name(str): Name of the user starting this activity monitor
            author_email(str): Email of the user starting this activity monitor
        """
        # Call super constructor
        ActivityMonitor.__init__(self,
                                 user,
                                 owner,
                                 labbook_name,
                                 monitor_key,
                                 config_file,
                                 author_name=author_name,
                                 author_email=author_email)

        # For now, register processors by default
        self.register_processors()

        # Tracking variables during message processing
        self.kernel_status = 'idle'
        self.current_cell = ExecutionData()
        self.cell_data: List[ExecutionData] = []
        self.execution_count = 0

    def register_processors(self) -> None:
        """Method to register processors

        Processors are added in the order listed here.

        Returns:
            None
        """
        self.add_processor(JupyterLabCodeProcessor())
        self.add_processor(GenericFileChangeProcessor())
        self.add_processor(JupyterLabPlaintextProcessor())
        self.add_processor(JupyterLabImageExtractorProcessor())
        self.add_processor(JupyterLabCellVisibilityProcessor())
        self.add_processor(ActivityDetailLimitProcessor())
        self.add_processor(ActivityShowBasicProcessor())

    def handle_message(self, msg: Dict[str, Dict]) -> None:
        """Method to handle processing an IOPub Message from a JupyterLab kernel

        Status messages drive the busy/idle bookkeeping. execute_input,
        execute_result, stream and display_data messages are accumulated on the
        current cell. An error message flags the current cell. Every other
        message type is logged and ignored.

        Args:
            msg(dict): An IOPub message

        Returns:
            None
        """
        msg_type = msg['msg_type']

        if msg_type == 'status':
            previous_state = self.kernel_status
            new_state = msg['content']['execution_state']

            if previous_state == 'busy' and new_state == 'idle':
                # Status was busy and transitioned to idle: execution has completed
                self.set_busy_state(False)

                finished_cell = self.current_cell
                if finished_cell.cell_error is False and finished_cell.is_empty() is False:
                    # Current cell did not error and has content
                    # Add current cell to collection of cells ready to process
                    self.cell_data.append(finished_cell)

                # Reset current_cell attribute for next execution
                self.current_cell = ExecutionData()

                # Indicate record COULD be processed if timeout occurs
                self.can_store_activity_record = True

            elif previous_state == 'idle' and new_state == 'busy':
                # Starting to process new cell execution
                self.set_busy_state(True)
                self.can_store_activity_record = False

            # Update status
            self.kernel_status = new_state

        elif msg_type == 'execute_input':
            # A message containing the input to kernel has been received
            content = msg['content']
            self.current_cell.code.append({'code': content['code']})
            self.execution_count = content['execution_count']
            self.current_cell.tags.append(f"ex:{content['execution_count']}")

        elif msg_type == 'execute_result':
            # A message containing the output of a cell execution has been received.
            # A count mismatch is logged, but the result is still recorded.
            content = msg['content']
            if self.execution_count != content['execution_count']:
                logger.error("Execution count mismatch detected {},{}".format(
                    self.execution_count, content['execution_count']))

            self.current_cell.result.append({
                'data': content['data'],
                'metadata': content['metadata']
            })

        elif msg_type == 'stream':
            # A message containing plaintext output of a cell execution has been received
            self.current_cell.result.append({
                'data': {
                    "text/plain": msg['content']['text']
                },
                'metadata': {
                    'source': 'stream'
                }
            })

        elif msg_type == 'display_data':
            # A message containing rich output of a cell execution has been received
            self.current_cell.result.append({
                'data': msg['content']['data'],
                'metadata': {
                    'source': 'display_data'
                }
            })

        elif msg_type == 'error':
            # An error occurred: flag the cell so it is skipped when the kernel goes idle
            self.current_cell.cell_error = True

        else:
            logger.info("Received and ignored IOPUB Message of type {}".format(
                msg_type))

    def store_record(self, metadata: Dict[str, str]) -> None:
        """Method to create and store an activity record

        When no cells have been collected nothing is processed or committed; the
        tracking attributes are reset in either case.

        Args:
            metadata(dict): A dictionary of data to start the activity monitor; its
                            "path" entry is forwarded to the processors

        Returns:
            None
        """
        if self.cell_data:
            t_start = time.time()

            # Process collected data (in reverse collection order) and create an activity record
            activity_record = self.process(ActivityType.CODE,
                                           list(reversed(self.cell_data)),
                                           {"path": metadata["path"]})

            # Commit changes to the related Notebook file
            commit = self.commit_labbook()

            # Create note record
            activity_commit = self.store_activity_record(
                commit, activity_record)

            logger.info(
                f"Created auto-generated activity record {activity_commit} in {time.time() - t_start} seconds"
            )

        # Reset for next execution
        self.can_store_activity_record = False
        self.cell_data = []
        self.current_cell = ExecutionData()

    def start(self, metadata: Dict[str, str], database: int = 1) -> None:
        """Connect to the kernel named in metadata and process its IOPub messages until told to stop

        The loop ends when the "run" field stored under this monitor's redis key reads
        "False", when that field is missing altogether, or when an unexpected error occurs.
        In every case the monitor key is deleted on the way out.

        Args:
            metadata(dict): A dictionary of data to start the activity monitor
                            ("kernel_id" is read here, "path" when storing records)
            database(int): The database ID to use

        Returns:
            None

        Raises:
            ValueError: if the lab book container IP address cannot be found
        """
        # Connect to the kernel
        cf = jupyter_client.find_connection_file(
            metadata["kernel_id"], path=os.environ['JUPYTER_RUNTIME_DIR'])
        km = jupyter_client.BlockingKernelClient()

        with open(cf, 'rt') as cf_file:
            cf_data = json.load(cf_file)

        # Get IP address of lab book container on the bridge network
        container_ip = self.get_container_ip()

        if not container_ip:
            raise ValueError("Failed to find LabBook container IP address.")
        cf_data['ip'] = container_ip

        km.load_connection_info(cf_data)

        # Get connection to the DB
        redis_conn = redis.Redis(db=database)

        try:
            while True:
                try:
                    # Check for messages, waiting up to 1 second. This is the rate that records will be merged
                    msg = km.get_iopub_msg(timeout=1)
                    self.handle_message(msg)

                except queue.Empty:
                    # if queue is empty and the record is ready to store, save it!
                    if self.can_store_activity_record is True:
                        self.store_record(metadata)

                # Check if you should exit
                run_flag = redis_conn.hget(self.monitor_key, "run")
                if run_flag is None:
                    # The key or its "run" field is gone, so there is nothing left to
                    # tell this monitor to keep going. Stop with an explicit message
                    # instead of failing on None.decode().
                    logger.error(
                        "Activity Monitor run flag missing for {}; shutting down".
                        format(metadata["kernel_id"]))
                    break

                if run_flag.decode() == "False":
                    logger.info(
                        "Received Activity Monitor Shutdown Message for {}".
                        format(metadata["kernel_id"]))
                    break

        except Exception as err:
            # Boundary of the monitor loop: log and fall through to the cleanup below
            logger.error(
                "Error in JupyterLab Activity Monitor: {}".format(err))
        finally:
            # Delete the kernel monitor key so the dev env monitor will spin up a new process
            # You may lose some activity if this happens, but the next action will sweep up changes
            redis_conn.delete(self.monitor_key)
Beispiel #8
0
    def process_activity(self, mitmlog):
        """Collect tail of the activity log and turn into an activity record.

        Every flow read from the log is inspected: gzip-encoded PNG responses go to
        _parse_image() and JSON responses go to _parse_json(). Flows that carry no
        response are skipped. Reading stops at the end of the log or at the first
        corrupted flow; whatever has been collected by then is stored.

        Args:
            mitmlog(file): open file object

        Returns:
            None
        """
        # get an fstream generator object
        fstream = mitmio.FlowReader(mitmlog).stream()

        # no context yet.  not a notebook or console
        self.is_console = False
        self.is_notebook = False

        while True:
            try:
                f = next(fstream)
            except StopIteration:
                break
            except FlowReadException as e:
                logger.info("MITM Flow file corrupted: {}. Exiting.".format(e))
                break

            st: Dict = f.get_state()

            # A flow without a response has no headers or body to inspect
            response = st.get('response')
            if not response:
                continue

            is_png = False
            is_gzip = False
            is_json = False

            # Check response types for images and json.
            # Header names are compared case-insensitively, as HTTP defines them.
            for header in response['headers']:
                header_name = header[0].lower()
                header_value = header[1]

                if header_name == b'content-type':
                    if header_value == b'image/png':
                        is_png = True
                    elif header_value == b'application/json':
                        is_json = True

                elif header_name == b'content-encoding':
                    if header_value == b'gzip':
                        is_gzip = True

            # process images
            if is_png:
                if is_gzip:
                    self._parse_image(st)
                else:
                    logger.error(
                        "RSERVER Found image/png that was not gzip encoded.")

            if is_json:
                self._parse_json(st, is_gzip)

        # Flush cell data IFF anything happened
        if self.current_cell.code:
            self.cell_data.append(self.current_cell)
        self.store_record()

        # Leave a clean state behind regardless of what store_record() reset
        self.current_cell = ExecutionData()
        self.cell_data = []
        self.chunk_id = None
Beispiel #9
0
    def _parse_json_record(self, json_record: Dict) -> None:
        """Extract code and data from the record.

        When context switches between console <-> notebook, we store a record for
        the previous execution and start a new record.

        Args:
            json_record: dictionary parsed from mitmlog
        """
        result = json_record.get('result')
        # No result ignore
        if not result:
            return

        # execution of new notebook cell
        if result[0]['type'] == 'chunk_exec_state_changed':
            if self.is_console:
                # switch from console to notebook. store record
                if self.current_cell.code:
                    self.cell_data.append(self.current_cell)
                    self.store_record()
            elif self.is_notebook and self.current_cell.code:
                self.cell_data.append(self.current_cell)
                self.current_cell = ExecutionData()
                self.chunk_id = None

            self.is_notebook = True
            self.is_console = False
            # Reset current_cell attribute for next execution
            self.current_cell.tags.append('notebook')

        # execution of console code.  you get a message for every line.
        if result[0]['type'] == 'console_write_prompt':
            if self.is_notebook:
                # switch to console. store record
                if self.current_cell.code:
                    self.cell_data.append(self.current_cell)
                    self.store_record()
                self.current_cell.tags.append('console')

            # add a tag is this is first line of console code
            elif not self.is_console:
                self.current_cell.tags.append('console')

            self.is_console = True
            self.is_notebook = False

        # parse the entries in this message
        for edata, etype in [(entry.get('data'), entry.get('type'))
                             for entry in result]:
            if etype == 'chunk_output':
                outputs = edata.get('chunk_outputs')
                if outputs:
                    for oput in outputs:
                        result = format_output(oput)
                        if result:
                            self.current_cell.result.append(result)

                oput = edata.get('chunk_output')
                if oput:
                    result = format_output(oput)
                    if result:
                        self.current_cell.result.append(result)

            # get notebook code
            if self.is_notebook and etype == 'notebook_range_executed':

                # new cell advance cell
                if self.chunk_id is None or self.chunk_id != edata['chunk_id']:
                    if self.current_cell.code:
                        self.cell_data.append(self.current_cell)
                    self.current_cell = ExecutionData()
                    self.chunk_id = edata['chunk_id']
                    self.current_cell.tags.append('notebook')

                # take code in current cell
                self.current_cell.code.append({'code': edata['code']})

            # console code
            if self.is_console and etype == 'console_write_input':
                # remove trailing whitespace -- specificially \n
                if edata['text'] != "\n" or self.current_cell.code != []:
                    self.current_cell.code.append(
                        {'code': edata['text'].rstrip()})

            # this happens in both notebooks and console
            #   ignore if no context (that's the R origination message
            if etype == 'console_output' and (self.is_console
                                              or self.is_notebook):
                self.current_cell.result.append(
                    {'data': {
                        'text/plain': edata['text']
                    }})
Beispiel #10
0
class RStudioServerMonitor(ActivityMonitor):
    """Activity monitor that turns an RStudio Server traffic log into activity records.

    The monitor tails a mitmproxy flow dump (``<key>.rserver.dump``), groups the
    code and output it finds into "cells" (borrowing the Jupyter term) and stores
    an activity record when execution goes idle or when the context switches
    between the console and a notebook.
    """
    def __init__(self,
                 user: str,
                 owner: str,
                 labbook_name: str,
                 monitor_key: str,
                 config_file: Optional[str] = None,
                 author_name: Optional[str] = None,
                 author_email: Optional[str] = None) -> None:
        """Constructor requires info to load the lab book

        Args:
            user(str): current logged in user
            owner(str): owner of the lab book
            labbook_name(str): name of the lab book
            monitor_key(str): Unique key for the activity monitor in redis
            config_file(str): passed through unchanged to the ActivityMonitor constructor
            author_name(str): Name of the user starting this activity monitor
            author_email(str): Email of the user starting this activity monitor
        """
        # Call super constructor
        ActivityMonitor.__init__(self,
                                 user,
                                 owner,
                                 labbook_name,
                                 monitor_key,
                                 config_file,
                                 author_name=author_name,
                                 author_email=author_email)

        # For now, register processors by default
        self.register_processors()

        # Let's call them cells as if they were Jupyter
        self.current_cell = ExecutionData()
        self.cell_data: List[ExecutionData] = list()

        # variables that track the context of messages in the log
        #   am I in the console, or the notebook?
        #   what chunk is being executed?
        #   in what notebook?
        self.is_console = False
        self.is_notebook = False
        self.chunk_id = None
        self.nbname: Optional[str] = None

    def register_processors(self) -> None:
        """Register the processors applied to every activity record, in order.

        Returns:
            None
        """
        self.add_processor(RStudioServerCodeProcessor())
        self.add_processor(GenericFileChangeProcessor())
        self.add_processor(RStudioServerPlaintextProcessor())
        self.add_processor(RStudioServerImageExtractorProcessor())
        self.add_processor(ActivityDetailLimitProcessor())
        self.add_processor(ActivityShowBasicProcessor())

    def start(self, metadata: Dict[str, str], database: int = 1) -> None:
        """Poll the RStudio log once per second until the monitor is told to stop.

        Each iteration checks the ``run`` field of the monitor key in redis,
        processes whatever is new in the mitmproxy log and stores a record when
        nothing is in flight. The monitor key is deleted from redis when the loop
        exits, whether normally or because of an exception.

        Args:
            metadata(dict): A dictionary of data to start the activity monitor
                            (not used by this implementation)
            database(int): The database ID to use

        Returns:
            None
        """
        # Get connection to the DB
        redis_conn = redis.Redis(db=database)

        # TODO DC: Dean asks why we need to use a regex here.
        # This will get hoisted by https://github.com/gigantum/gigantum-client/issues/453
        m = re.match(r".*:activity_monitor:(\w+)$", self.monitor_key)
        if m is None:
            # Without a match there is no log file name to open, so give up here
            # instead of failing later on an unbound `filename`.
            logger.error(f"No active monitor matching {self.monitor_key}")
            return
        filename = f"/mnt/share/mitmproxy/{m.group(1)}.rserver.dump"

        # TODO RB will need to open in write mode later to sparsify parts of the file that have already been read
        # https://github.com/gigantum/gigantum-client/issues/434
        # open the log file; open() signals failure by raising, not by returning a falsy value
        try:
            mitmlog = open(filename, "rb")
        except OSError:
            logger.info(f"Failed to open RStudio log {self.monitor_key}")
            return

        try:
            # the context manager guarantees the log file is closed on every exit path
            with mitmlog:
                while True:
                    still_running = redis_conn.hget(self.monitor_key, "run")
                    # Check if you should exit
                    # sometimes this runs after key has been deleted.  None is shutdown too.
                    if not still_running or still_running.decode() == "False":
                        logger.info(
                            f"Received Activity Monitor Shutdown Message for {self.monitor_key}"
                        )
                        break

                    previous_cells = len(self.cell_data)

                    # Read activity and update aggregated "cell" data
                    self.process_activity(mitmlog)

                    # We are processing every second, then aggregating activity records when idle
                    if previous_cells == len(
                            self.cell_data) and self.current_cell.is_empty():
                        # there are no new cells in the last second, and no cells are in-process
                        self.store_record()

                    # Check for new records every second
                    time.sleep(1)

        except Exception as e:
            logger.error(
                f"Fatal error in RStudio Server Activity Monitor: {e}")
            raise
        finally:
            # Delete the kernel monitor key so the dev env monitor will spin up a new process
            # You may lose some activity if this happens, but the next action will sweep up changes
            logger.info(f"Shutting down RStudio monitor {self.monitor_key}")
            redis_conn.delete(self.monitor_key)

    def store_record(self) -> None:
        """Store R input/output/code to ActivityRecord / git commit

        store_record() should be called after moving any data in self.current_cell to
        self.cell_data. Any data remaining in self.current_cell will be removed.

        Nothing is stored when self.cell_data is empty, but the cell state and the
        console/notebook context flags are reset in every case.

        Returns:
            None
        """
        if self.cell_data:
            t_start = time.time()

            # Process collected data and create an activity record
            if self.is_console:
                codepath = "console"
            else:
                codepath = self.nbname if self.nbname else "Unknown notebook"

            activity_record = self.process(ActivityType.CODE,
                                           list(reversed(self.cell_data)),
                                           {'path': codepath})

            # Commit changes to the related Notebook file
            commit = self.commit_labbook()

            # Create note record
            activity_commit = self.store_activity_record(
                commit, activity_record)

            logger.info(
                f"Created auto-generated activity record {activity_commit} in {time.time() - t_start} seconds"
            )

        # Reset for next execution
        self.current_cell = ExecutionData()
        self.cell_data = list()
        self.is_notebook = False
        self.is_console = False

    def _parse_json_record(self, json_record: Dict) -> None:
        """Extract code and data from the record.

        When context switches between console <-> notebook, we store a record for
        the previous execution and start a new record.

        Args:
            json_record: dictionary parsed from mitmlog

        Returns:
            None
        """
        result = json_record.get('result')
        # No result ignore
        if not result:
            return

        # The type of the first entry decides whether the context changes
        first_type = result[0]['type']

        # execution of new notebook cell
        if first_type == 'chunk_exec_state_changed':
            if self.is_console:
                # switch from console to notebook. store record
                if self.current_cell.code:
                    self.cell_data.append(self.current_cell)
                    self.store_record()
            elif self.is_notebook and self.current_cell.code:
                self.cell_data.append(self.current_cell)
                self.current_cell = ExecutionData()
                self.chunk_id = None

            self.is_notebook = True
            self.is_console = False
            self.current_cell.tags.append('notebook')

        # execution of console code.  you get a message for every line.
        elif first_type == 'console_write_prompt':
            if self.is_notebook:
                # switch to console. store record
                if self.current_cell.code:
                    self.cell_data.append(self.current_cell)
                    self.store_record()
                self.current_cell.tags.append('console')

            # add a tag if this is the first line of console code
            elif not self.is_console:
                self.current_cell.tags.append('console')

            self.is_console = True
            self.is_notebook = False

        # parse the entries in this message
        for entry in result:
            edata = entry.get('data')
            etype = entry.get('type')

            if etype == 'chunk_output':
                # a message may carry a list of outputs, a single output, or both
                outputs = edata.get('chunk_outputs')
                if outputs:
                    for oput in outputs:
                        formatted = format_output(oput)
                        if formatted:
                            self.current_cell.result.append(formatted)

                oput = edata.get('chunk_output')
                if oput:
                    formatted = format_output(oput)
                    if formatted:
                        self.current_cell.result.append(formatted)

            # get notebook code
            if self.is_notebook and etype == 'notebook_range_executed':

                # new cell advance cell
                if self.chunk_id is None or self.chunk_id != edata['chunk_id']:
                    if self.current_cell.code:
                        self.cell_data.append(self.current_cell)
                    self.current_cell = ExecutionData()
                    self.chunk_id = edata['chunk_id']
                    self.current_cell.tags.append('notebook')

                # take code in current cell
                self.current_cell.code.append({'code': edata['code']})

            # console code
            if self.is_console and etype == 'console_write_input':
                # skip a bare newline that would open the cell; trailing
                # whitespace (specifically \n) is stripped from stored code
                if edata['text'] != "\n" or self.current_cell.code != []:
                    self.current_cell.code.append(
                        {'code': edata['text'].rstrip()})

            # this happens in both notebooks and console
            #   ignore if no context (that's the R origination message)
            if etype == 'console_output' and (self.is_console
                                              or self.is_notebook):
                self.current_cell.result.append(
                    {'data': {
                        'text/plain': edata['text']
                    }})

    def _is_error(self, result: List[Dict]) -> bool:
        """Check if there's an error in the message

        Args:
            result: the entries of a message; each entry must have a 'type' key

        Returns:
            True if any entry has type 'console_error', otherwise False
        """
        return any(entry['type'] == 'console_error' for entry in result)

    def _parse_image(self, st: Dict) -> None:
        """Append a gzip-encoded PNG response to the current cell's results.

        The image is stored base64-encoded under 'image/png' when the request
        path matches one of the known image locations.

        Args:
            st: flow state dictionary; 'request'/'path' and 'response'/'content' are read
        """
        path = st['request']['path'].decode()

        image_patterns = (
            # These are from notebooks
            r"/chunk_output/(([\w]+)/)+([\w]+.png)",
            # These are from scripts.
            r"/graphics/(?:[^[\\/:\"*?<>|]+])*([\w-]+).png",
        )

        for pattern in image_patterns:
            if re.match(pattern, path):
                # wbits of 16 + MAX_WBITS makes zlib expect a gzip header and trailer
                img_data = zlib.decompress(st['response']['content'],
                                           16 + zlib.MAX_WBITS)
                # if we actually wanted to work with the image, could do so like this:
                # img = Image.open(io.BytesIO(img_data))
                eimg_data = base64.b64encode(img_data)
                self.current_cell.result.append(
                    {'data': {'image/png': eimg_data}})

    def _parse_json(self, st: Dict, is_gzip: bool) -> bool:
        """Handle a JSON flow: track the notebook name or parse an events message.

        Args:
            st: flow state dictionary; request path/content and response content are read
            is_gzip: True when the response content is gzip-encoded

        Returns:
            False if an events response could not be decoded as JSON, otherwise True
        """
        path = st['request']['path'].decode()

        # get the filename
        if re.match(r"/rpc/refresh_chunk_output.*", path):
            # A new chunk, so potentially a new notebook.
            if self.current_cell.code:
                # NOTE(review): current_cell is not replaced after this append, so the
                # same object can be appended to cell_data again later - confirm intent.
                self.cell_data.append(self.current_cell)
                # RB was always storing a record here
                # with new logic, running cells in two notebooks within a second seems unlikely
                # self.store_record()

            # strict=False allows control codes, as used in tidyverse output
            jdata = json.loads(st['request']['content'], strict=False)
            fullname = jdata['params'][0]

            # pull out the name relative to the "code" directory
            m1 = re.match(r".*/code/(.*)$", fullname)
            if m1:
                self.nbname = m1.group(1)
            else:
                m2 = re.match(r"/mnt/labbook/(.*)$", fullname)
                if m2:
                    self.nbname = m2.group(1)
                else:
                    self.nbname = fullname

        # code or output event
        if re.match(r"/events/get_events", path):
            if is_gzip:
                jdata = zlib.decompress(st['response']['content'],
                                        16 + zlib.MAX_WBITS)
            else:
                jdata = st['response']['content']
            # get text/code fields out of dictionary
            try:
                # strict=False allows control codes, as used in tidyverse output
                self._parse_json_record(json.loads(jdata, strict=False))
            except json.JSONDecodeError as je:
                logger.info(
                    f"Ignoring JSON Decoder Error in process_activity {je}.")
                return False

        return True

    def process_activity(self, mitmlog) -> None:
        """Read the unread tail of the activity log and store what was collected.

        Every flow read from the log is inspected: gzip-encoded PNG responses and
        JSON responses are parsed into the current cell. When the log is
        exhausted (or found corrupted) the collected cells are flushed through
        store_record() and the cell state is reset.

        Args:
            mitmlog(file): open file object

        Returns:
            None
        """
        # get an fstream generator object
        fstream = mitmio.FlowReader(mitmlog).stream()

        # no context yet.  not a notebook or console
        self.is_console = False
        self.is_notebook = False

        while True:
            try:
                f = next(fstream)
            except StopIteration:
                break
            except FlowReadException as e:
                logger.info(f"MITM Flow file corrupted: {e}. Exiting.")
                break

            st: Dict = f.get_state()

            # A flow without a response has nothing to inspect; skip it instead of
            # failing on a None response.
            response = st.get('response')
            if not response:
                continue

            is_png = False
            is_gzip = False
            is_json = False

            # Check response types for images and json
            for header in response['headers']:
                if header[0] == b'Content-Type':
                    if header[1] == b'image/png':
                        # png images
                        is_png = True
                    elif header[1] == b'application/json':
                        # json
                        is_json = True

                if header[0] == b'Content-Encoding' and header[1] == b'gzip':
                    is_gzip = True

            # process images
            if is_png:
                if is_gzip:
                    self._parse_image(st)
                else:
                    logger.error(
                        "RSERVER Found image/png that was not gzip encoded.")

            if is_json:
                self._parse_json(st, is_gzip)

        # Flush cell data IFF anything happened
        if self.current_cell.code:
            self.cell_data.append(self.current_cell)
        self.store_record()
        self.current_cell = ExecutionData()
        self.cell_data = list()
        self.chunk_id = None
Beispiel #11
0
    def test_is_empty(self):
        """Test the constructor"""
        ed = ExecutionData()

        assert ed.is_empty() is True