Beispiel #1
0
def test_annotated_pdfs_with_filelist() -> None:
    """
    Run annotated_pdfs over a single real PDF taken from the test data.

    Checks that annotated_pdfs is a generator function, that none of the
    yielded items is an Exception, and that the highlights collected from
    all yielded objects are exactly the three EXPECTED_HIGHLIGHTS.
    """
    # imported inside the test rather than at module level
    from my.pdfs import annotated_pdfs

    pdf_path = (
        testdata()
        / 'pdfs'
        / 'Information Architecture for the World Wide Web.pdf'
    )
    results = annotated_pdfs(filelist=[pdf_path])

    import inspect
    assert inspect.isgeneratorfunction(annotated_pdfs)

    # drain the generator up front, then inspect each yielded item
    materialized = list(results)

    collected = []
    for item in materialized:
        assert not isinstance(item, Exception)
        for annotation in item.annotations:
            collected.append(annotation.highlight)

    assert len(collected) == 3
    assert set(collected) == EXPECTED_HIGHLIGHTS
Beispiel #2
0
def test_module(with_config) -> None:
    """
    Smoke-test my.pdfs: expect at least three annotations and at least one
    annotated PDF.

    with_config is not used in the body; presumably it is a fixture that
    prepares the configuration before my.pdfs is imported (TODO confirm).
    """
    # TODO crap. If the module is imported too early (at the top level), it
    # becomes super hard to override the config; need to at least detect it...
    from my.pdfs import annotations, annotated_pdfs

    # todo check types etc as well
    annotation_count = ilen(annotations())
    assert annotation_count >= 3

    pdf_count = ilen(annotated_pdfs())
    assert pdf_count >= 1
Beispiel #3
0
def test_annotated_pdfs_with_filelist():
    """
    Run annotated_pdfs over one real PDF located under ROOT.

    Checks that annotated_pdfs is a generator function and that the
    highlights gathered from every yielded object are exactly the three
    EXPECTED_HIGHLIGHTS.
    """
    pdf_path = Path(ROOT / 'Information Architecture for the World Wide Web.pdf')
    generator = annotated_pdfs(filelist=[pdf_path], roots=None)

    assert inspect.isgeneratorfunction(annotated_pdfs)

    # flatten the annotations of every yielded object into one list
    highlights = [
        annotation.highlight
        for pdf in list(generator)
        for annotation in pdf.annotations
    ]

    assert len(highlights) == 3
    assert set(highlights) == EXPECTED_HIGHLIGHTS
Beispiel #4
0
    def get_items(self) -> Mirror.Results:
        """
        Yield one node per annotated PDF, with one child node per annotation.

        Items from my.pdfs.annotated_pdfs() are sorted by creation time;
        Exceptions and PDFs without a creation time sort first. An Exception
        item is yielded through error() instead of being rendered as a PDF.
        """
        import my.pdfs as pdfs

        def creation_key(p):
            # Exceptions and undated PDFs share the smallest possible key.
            if isinstance(p, Exception) or p.created is None:
                return datetime.min
            # tzinfo is dropped so the value is comparable with the naive
            # datetime.min used above.
            return p.created.replace(tzinfo=None)

        def annotation_nodes(pdf: 'pdfs.Pdf'):
            # One node per annotation: the body is the non-empty parts among
            # highlight, author and comment, one per line.
            for a in pdf.annotations:
                highlight = (a.highlight or '').strip()
                author = (a.author or '').strip()
                comment = (a.comment or '').strip()

                pieces = []
                if highlight:
                    pieces.append(literal(highlight))
                if author:
                    pieces.append(f'by {author}')
                if comment:
                    pieces.append(comment)
                text = '\n'.join(pieces)

                # presumably a.page is zero-based and the link expects a
                # one-based page number -- TODO confirm
                page_number = a.page + 1
                link_to_page = docview_link(
                    path=pdf.path,
                    title=f'page {page_number}',
                    page1=page_number,
                )
                heading = dt_heading(a.created, link_to_page)
                yield node(heading, body=text)

        ordered = sorted(pdfs.annotated_pdfs(), key=creation_key)
        for entry in ordered:
            if isinstance(entry, Exception):
                yield error(entry)
                continue

            # todo would be nice to extract metadata for title
            link_to_pdf = docview_link(path=entry.path, title=str(entry.path))
            heading = dt_heading(entry.created, link_to_pdf)
            yield node(heading, children=list(annotation_nodes(entry)))