Example 1
0
def test_ratom_report_enron_027(isolated_cli_runner, enron_dataset_part027,
                                params, expected):
    """Run report generation over Enron dataset part 027 and validate the
    stored contents of one known message against the source PST archive."""

    msg_id = 2390436

    result = generate_report(params, enron_dataset_part027,
                             isolated_cli_runner, expected)

    with db_session_from_cmd_out(result) as session:
        # The whole dataset part must have been ingested
        assert session.query(Message).count() == 9297

        # Fetch the known message record from the DB
        record = session.query(Message).filter_by(pff_identifier=msg_id).one()

        if not expected.with_messages:
            # Message contents were not requested, so none should be stored
            assert record.headers is None
            assert record.body is None
        else:
            # Open the source archive and compare contents field by field
            pst_file = list(enron_dataset_part027.glob("*.pst"))[0]
            with open_mail_archive(pst_file) as archive:
                source_msg = archive.get_message_by_id(msg_id)
                assert cleanup_message_body(
                    *archive.get_message_body(source_msg)) == record.body
                assert archive.get_message_headers(
                    source_msg) == record.headers
Example 2
0
def test_get_mbox_message_by_id(sample_mbox_file):
    """Look up every message in the sample mbox by its 1-based index and
    check that lookup, extraction and formatting agree with iteration."""
    with open_mail_archive(sample_mbox_file) as archive:

        assert archive.message_count == 113

        position = 0
        for message in archive.messages():
            position += 1
            # ID-based lookup must return the same message as iteration
            by_id = archive.get_message_by_id(position)
            assert extract_message_from_archive(archive, position)
            assert archive.format_message(by_id) == archive.format_message(
                message)
            assert archive.get_message_headers(message)
Example 3
0
def test_ratom_entities_enron_001(isolated_cli_runner, enron_dataset_part001,
                                  params, expected):
    """Run entity extraction over Enron dataset part 001 and check that one
    known message's stored body/headers match the source PST archive."""

    msg_id = 2097572

    # Run entity extraction job with message content flag on
    result = extract_entities(params, enron_dataset_part001,
                              isolated_cli_runner, expected)

    # Pull the stored contents for the known message out of the DB
    with db_session_from_cmd_out(result) as session:
        record = session.query(Message).filter_by(pff_identifier=msg_id).one()
        stored_headers, stored_body = record.headers, record.body

    # Access the message directly in the source archive and compare
    pst_file = list(enron_dataset_part001.glob("*.pst"))[0]
    with open_mail_archive(pst_file) as archive:
        source_msg = archive.get_message_by_id(msg_id)
        assert cleanup_message_body(
            *archive.get_message_body(source_msg)) == stored_body
        assert archive.get_message_headers(source_msg) == stored_headers
Example 4
0
def test_apply_spacy_model(sample_pst_file, model_name, expected_entity_types):
    """Run the given spaCy model over a known short message, simulating a
    worker (non-main) process, and check the expected entity types appear."""

    # Extract a known (short) message body straight from the sample archive
    msg_id = 2112164
    with open_mail_archive(sample_pst_file) as archive:
        body = archive.get_message_body(
            archive.get_message_by_id(msg_id))[0]

    # Sanity check: this particular message is 564 characters long
    assert len(body) == 564

    # Pre-load our model to install any missing dependencies
    assert get_cached_spacy_model(model_name)

    # Make process_message() believe it runs in a forked worker process
    with patch(
            "libratom.lib.entities.current_process") as mock_current_process:
        mock_current_process.return_value.name = "NotMainProcess"

        # pylint:disable=no-value-for-parameter
        # Must use dictionary form if function is called explicitly
        job_kwargs = {
            "filepath": sample_pst_file,
            "message_id": msg_id,
            "date": datetime.datetime.utcnow(),
            "body": body,
            "body_type": BodyType.PLAIN,
            "spacy_model_name": model_name,
            "attachments": None,
        }
        res, error = process_message(job_kwargs)

    assert res and not error

    # Every expected entity type must be among the extracted ones
    found_types = {entity[1] for entity in res["entities"]}
    assert expected_entity_types.issubset(found_types)
Example 5
0
def get_file_info(path: Path) -> Tuple[Dict, Optional[str]]:
    """
    For a given file path, returns the size, md5 and sha256 checksums

    Returns a (results, error) tuple. The results dict always carries the
    file's path and name; on successful reads it gains "size", "md5" and
    "sha256", plus either "msg_count" (if the file opens as a mail archive)
    or a per-file "error" entry. The second tuple element is an error string
    when the file itself could not be read, otherwise None.
    """

    path_str, name = str(path), path.name
    res = {"path": path_str, "name": name}

    try:
        size = os.stat(path_str).st_size

        md5 = hashlib.md5()
        sha256 = hashlib.sha256()

        # First we read the file one block at a time and update digests.
        # 64 KiB blocks keep syscall overhead low on large archives (the
        # previous 128-byte reads were needlessly slow); digests are the
        # same regardless of block size.
        with open(path_str, "rb") as f:
            for block in iter(partial(f.read, 65536), b""):
                md5.update(block)
                sha256.update(block)

        md5, sha256 = md5.hexdigest(), sha256.hexdigest()

        res.update({"size": size, "md5": md5, "sha256": sha256})

        # Then we try to get a message count; a failure here is recorded
        # in the results but does not fail the whole call
        try:
            with open_mail_archive(path) as archive:
                res["msg_count"] = archive.message_count

        except Exception as exc:
            res["error"] = str(exc)

    except Exception as exc:
        # Best effort: return whatever was gathered along with the error
        return res, str(exc)

    return res, None
Example 6
0
def get_messages(
    files: Iterable[Path],
    progress_callback: Callable,
    with_content=True,
    with_headers=False,
    **kwargs,
) -> Generator[Dict, None, None]:
    """
    Message generator to feed a pool of processes from a directory of PST files

    Yields one keyword-argument dict per message (for process_message()).
    Files or messages that raise are logged and skipped; progress_callback
    is invoked every RATOM_MSG_PROGRESS_STEP messages and once at the end
    with the remainder.
    """

    processed = 0

    for path in files:
        try:
            with open_mail_archive(path) as archive:
                # Walk every message in this archive
                for msg in archive.messages():
                    try:
                        # Keyword arguments for process_message()
                        res = {
                            "filepath": archive.filepath,
                            "message_id": getattr(msg, "identifier", None),
                            "attachments":
                                archive.get_attachment_metadata(msg),
                        }

                        # Date extraction is best-effort
                        try:
                            res["date"] = archive.get_message_date(msg)
                        except Exception as err:
                            res["date"] = None

                            logger.debug(
                                "Unable to extract date from message: {message_id} in file: {filepath}"
                                .format(**res))
                            logger.debug(err, exc_info=True)

                        if with_content:
                            res["body"], res["body_type"] = \
                                archive.get_message_body(msg)

                        if with_headers:
                            res["headers"] = archive.get_message_headers(msg)

                        # Add any optional arguments
                        res.update(kwargs)

                        yield res

                    except Exception as err:
                        # Log and move on to the next message
                        identifier = getattr(msg, "identifier", None)
                        label = (f"message {identifier}"
                                 if identifier else "a message")
                        logger.info(f"Skipping {label} from {path}")
                        logger.debug(err, exc_info=True)

                    finally:
                        processed += 1

                        # Update progress every N messages
                        if not processed % RATOM_MSG_PROGRESS_STEP:
                            progress_callback(RATOM_MSG_PROGRESS_STEP)

        except Exception as err:
            # Log and move on to the next file
            logger.info(f"Skipping file {path}")
            logger.debug(err, exc_info=True)

    # Update progress with remaining message count
    progress_callback(processed % RATOM_MSG_PROGRESS_STEP)