Esempio n. 1
0
def events() -> Iterator[Event]:
    """Yield events parsed from every CSV file inside the matched export."""
    with match_structure(input(), expected=EXPECTED, partial=True) as exports:
        for export_dir in exports:
            # walk the whole export tree; only regular files are parseable
            data_files = (p for p in export_dir.rglob("*") if p.is_file())
            for data_file in data_files:
                yield from _csv_to_json(data_file)
Esempio n. 2
0
def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
    """Yield de-duplicated events merged across all Google Takeout exports.

    Errors from the parser are silently dropped (error_policy="drop").
    """
    total_seen = 0
    seen = GoogleEventSet()
    # reversed shouldn't really matter? but logic is to use newer
    # takeouts if they're named according to date, since JSON Activity
    # is nicer than HTML Activity
    for takeout_path in reversed(inputs()):
        with match_structure(takeout_path, expected=EXPECTED, partial=True) as matches:
            for match in matches:
                # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip") -> 'Takeout-1634932457'
                # means that zipped takeouts have nice filenames from cachew
                cachew_id = takeout_path.name.rpartition(".")[0]
                # each takeout result is cached as well, in individual databases per-type
                parser = TakeoutParser(match, cachew_identifier=cachew_id, error_policy="drop")
                for ev in parser.parse(cache=not disable_takeout_cache):
                    total_seen += 1
                    # skip parse errors and anything already emitted
                    if isinstance(ev, Exception) or ev in seen:
                        continue
                    seen.add(ev)
                    yield ev  # type: ignore[misc]
    logger.debug(
        f"HPI Takeout merge: from a total of {total_seen} events, removed {total_seen - len(seen)} duplicates"
    )
Esempio n. 3
0
def test_gdpr_unzip() -> None:
    """A zipped export is extracted, matched, and cleaned up on exit."""
    with match_structure(structure_data / "gdpr_export.zip",
                         expected=gdpr_expected) as results:
        assert len(results) == 1
        (extracted,) = results
        assert (extracted / "messages" / "index.csv").read_text().strip() == "test message"

    # make sure the temporary directory this created no longer exists
    assert not extracted.exists()
Esempio n. 4
0
def activity() -> Iterator[Activity]:
    """Yield de-duplicated Discord activity events from every export."""
    seen: Set[str] = set()
    for export in get_files(config.export_path):
        with match_structure(
                export, expected=EXPECTED_DISCORD_STRUCTURE) as discord_export:
            for export_root in discord_export:
                for act in parse_activity(export_root / "activity"):
                    # yield each event_id only once across all exports
                    if act.event_id not in seen:
                        yield act
                        seen.add(act.event_id)
Esempio n. 5
0
def messages() -> Iterator[Message]:
    """Yield de-duplicated Discord messages, stripping link-suppression markers."""
    seen: Set[int] = set()
    for export in get_files(config.export_path):
        with match_structure(
                export, expected=EXPECTED_DISCORD_STRUCTURE) as discord_export:
            for export_root in discord_export:
                for raw in parse_messages(export_root / "messages"):
                    # message_id de-duplicates across overlapping exports
                    if raw.message_id in seen:
                        continue
                    yield Message(
                        message_id=raw.message_id,
                        timestamp=raw.timestamp,
                        channel=raw.channel,
                        content=_remove_link_suppression(raw.content),
                        attachments=raw.attachments,
                    )
                    seen.add(raw.message_id)
Esempio n. 6
0
def events(disable_takeout_cache: bool = DISABLE_TAKEOUT_CACHE) -> CacheResults:
    """Yield de-duplicated events merged from every Google Takeout export.

    Parser exceptions are handled according to ``config.error_policy``:
    yielded to the caller, re-raised, or dropped. Duplicate events across
    overlapping takeouts are emitted only once.

    Args:
        disable_takeout_cache: when True, bypasses the per-takeout cachew cache.
    """
    error_policy = config.error_policy
    # total events seen across all takeouts, including errors and duplicates
    count = 0
    emitted = GoogleEventSet()
    # reversed shouldn't really matter? but logic is to use newer
    # takeouts if they're named according to date, since JSON Activity
    # is nicer than HTML Activity
    for path in reversed(inputs()):
        with ExitStack() as exit_stack:
            if config._use_zippath:
                from my.core.kompress import ZipPath
                # for later takeouts it's just 'Takeout' dir,
                # but for older (pre 2015) it contains email/date in the subdir name
                # NOTE: this branch never registers anything with exit_stack —
                # ZipPath needs no extraction/cleanup, unlike match_structure
                results = tuple(cast(Sequence[Path], ZipPath(path).iterdir()))
            else:
                results = exit_stack.enter_context(match_structure(path, expected=EXPECTED, partial=True))
            for m in results:
                # e.g. /home/sean/data/google_takeout/Takeout-1634932457.zip") -> 'Takeout-1634932457'
                # means that zipped takeouts have nice filenames from cachew
                cw_id, _, _ = path.name.rpartition(".")
                # each takeout result is cached as well, in individual databases per-type
                tk = TakeoutParser(m, cachew_identifier=cw_id, error_policy=error_policy)
                # TODO might be nice to pass hpi cache dir?
                for event in tk.parse(cache=not disable_takeout_cache):
                    count += 1
                    if isinstance(event, Exception):
                        # dispatch on the configured policy; in every case the
                        # exception is not treated as a real event
                        if error_policy == 'yield':
                            yield event
                        elif error_policy == 'raise':
                            raise event
                        elif error_policy == 'drop':
                            pass
                        continue
                    if event in emitted:
                        continue
                    emitted.add(event)
                    yield event  # type: ignore[misc]
    logger.debug(
        f"HPI Takeout merge: from a total of {count} events, removed {count - len(emitted)} duplicates"
    )
Esempio n. 7
0
def test_not_directory() -> None:
    """Matching a plain file (not a dir or zip) raises NotADirectoryError."""
    target = structure_data / "messages/index.csv"
    with pytest.raises(NotADirectoryError,
                       match=r"Expected either a zipfile or a directory"), \
            match_structure(target, expected=gdpr_expected):
        pass
Esempio n. 8
0
def test_match_partial() -> None:
    """A partial match should match both the 'broken' and 'gdpr_export' dirs."""
    matcher = match_structure(structure_data / "gdpr_subdirs",
                              expected=gdpr_expected,
                              partial=True)
    with matcher as results:
        assert len(results) == 2
Esempio n. 9
0
def test_gdpr_structure_exists() -> None:
    """A full (non-partial) match resolves to exactly the gdpr_export dir."""
    expected_dir = structure_data / "gdpr_subdirs" / "gdpr_export"
    with match_structure(structure_data, expected=gdpr_expected) as results:
        assert results == (expected_dir,)
Esempio n. 10
0
def accounts() -> Sequence[Path]:
    """Collect the account directories matched in each export file.

    Returns:
        A list of paths, one per matched account structure, across every
        file under ``config.export_path``.
    """
    # NOTE: the original local was also named `accounts`, shadowing this
    # function; renamed to avoid confusion and accidental self-reference
    found = []
    for f in get_files(config.export_path):
        with match_structure(f, EXPECTED) as match:
            # match is already iterable — no need to wrap it in list()
            found.extend(match)
    return found
Esempio n. 11
0
def export_dirs() -> List[Path]:
    """Return the export directories containing an animelist.xml file."""
    root: Path = Path(config.export_path).expanduser().absolute()
    with match_structure(root, expected="animelist.xml") as matched:
        return list(matched)