def test_annotated_pdfs_with_filelist() -> None:
    """
    Run annotated_pdfs over an explicit file list holding one real PDF.

    annotated_pdfs must be a generator function, none of the items it yields
    may be an exception, and the highlights collected from the yielded
    objects' annotations must be exactly the three in EXPECTED_HIGHLIGHTS.
    """
    import inspect

    # NOTE(review): kept as a function-local import rather than hoisted to
    # module level; presumably so my.pdfs is not imported before the test
    # configuration is in place -- confirm before moving it.
    from my.pdfs import annotated_pdfs

    filelist = [
        testdata() / 'pdfs' / 'Information Architecture for the World Wide Web.pdf'
    ]
    annotations_generator = annotated_pdfs(filelist=filelist)

    assert inspect.isgeneratorfunction(annotated_pdfs)

    highlights_from_pdfs = []
    # Consume the generator directly; materializing it into a list first only
    # builds a temporary that is thrown away after the loop.
    for pdf_object in annotations_generator:
        assert not isinstance(pdf_object, Exception)
        highlights_from_pdfs.extend(a.highlight for a in pdf_object.annotations)

    # Check the count separately from the set comparison so duplicated
    # highlights cannot hide behind set() de-duplication.
    assert len(highlights_from_pdfs) == 3
    assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS
def test_module(with_config) -> None:
    """
    Smoke-test my.pdfs: annotations() must produce at least 3 items and
    annotated_pdfs() at least 1.

    with_config is not referenced in the body; presumably it is a fixture
    that puts the test configuration in place -- verify against its definition.
    """
    # TODO: if the module is imported too early (at the top level), it becomes
    # very hard to override the config. We need to at least detect that case.
    from my.pdfs import annotated_pdfs, annotations

    # todo check types etc as well
    annotation_count = ilen(annotations())
    assert annotation_count >= 3

    pdf_count = ilen(annotated_pdfs())
    assert pdf_count >= 1
def test_annotated_pdfs_with_filelist() -> None:
    """
    Run annotated_pdfs over an explicit file list holding one real PDF.

    annotated_pdfs must be a generator function, and the highlights collected
    from the yielded objects' annotations must be exactly the three in
    EXPECTED_HIGHLIGHTS.
    """
    filelist = [
        Path(ROOT / 'Information Architecture for the World Wide Web.pdf')
    ]
    # roots=None is passed explicitly so that only the file list is supplied.
    annotations_generator = annotated_pdfs(filelist=filelist, roots=None)

    assert inspect.isgeneratorfunction(annotated_pdfs)

    highlights_from_pdfs = []
    # Consume the generator directly; materializing it into a list first only
    # builds a temporary that is thrown away after the loop.
    for pdf_object in annotations_generator:
        # Fail with a clear assertion if an exception object is yielded,
        # instead of an AttributeError on `.annotations` below.
        assert not isinstance(pdf_object, Exception)
        highlights_from_pdfs.extend(a.highlight for a in pdf_object.annotations)

    # Check the count separately from the set comparison so duplicated
    # highlights cannot hide behind set() de-duplication.
    assert len(highlights_from_pdfs) == 3
    assert set(highlights_from_pdfs) == EXPECTED_HIGHLIGHTS
def get_items(self) -> Mirror.Results:
    """
    Yield one node per annotated PDF, sorted by creation time, each carrying
    one child node per annotation.

    Items that annotated_pdfs() reports as exceptions are yielded as error
    entries instead of PDF nodes.
    """
    import my.pdfs as pdfs

    def creation_key(item):
        # Exceptions and PDFs without a creation time sort to the front.
        if isinstance(item, Exception) or item.created is None:
            return datetime.min
        # Strip tzinfo so the value is comparable with the naive datetime.min.
        return item.created.replace(tzinfo=None)

    def annotation_nodes(pdf: pdfs.Pdf):
        # One node per annotation; the body joins whichever of
        # highlight / author / comment are non-empty, one per line.
        for annotation in pdf.annotations:
            highlight = (annotation.highlight or '').strip()
            author = (annotation.author or '').strip()
            comment = (annotation.comment or '').strip()

            lines = []
            if highlight:
                lines.append(literal(highlight))
            if author:
                lines.append(f'by {author}')
            if comment:
                lines.append(comment)
            body = '\n'.join(lines)

            # NOTE(review): presumably annotation.page is zero-based and the
            # link wants a one-based page number -- confirm.
            page1 = annotation.page + 1
            page_link = docview_link(path=pdf.path, title=f'page {page1}', page1=page1)
            yield node(
                dt_heading(annotation.created, page_link),
                body=body,
            )

    for entry in sorted(pdfs.annotated_pdfs(), key=creation_key):
        if isinstance(entry, Exception):
            yield error(entry)
            continue

        # todo would be nice to extract metadata for title
        pdf_link = docview_link(path=entry.path, title=str(entry.path))
        yield node(
            dt_heading(entry.created, pdf_link),
            children=list(annotation_nodes(entry)),
        )