Example #1
0
    def update_preview(self):
        """Refresh the preview label.

        When ``self.pages`` is set, the stream selected by the 1-based
        ``self.current_page`` is opened as a PDF, its first page is rendered
        and shown scaled (aspect ratio kept) to the parent container's size
        multiplied by ``self.zoom_factor``.  Otherwise a placeholder with the
        application logo and a hint text is displayed.
        """
        if self.pages is not None:
            pdf_file = self.pages[self.current_page - 1]
            # Rewind the stream before handing it to the PDF parser.
            pdf_file.seek(0)
            document = Document(stream=pdf_file, filetype="PDF")
            try:
                page = document.loadPage(0)

                container_size = self.preview_box.parent().size()

                # NOTE(review): this is page size / container size; a
                # "fit to container" factor would normally be the inverse.
                # The pixmap is rescaled below anyway, so this only affects
                # the render resolution - confirm the intent.
                normalized_zoom_factor = min(
                    page.rect.height / container_size.height(),
                    page.rect.width / container_size.width(),
                )
                scale_mat = Matrix(normalized_zoom_factor,
                                   normalized_zoom_factor)

                png_data = page.getPixmap(matrix=scale_mat).getPNGData()
            finally:
                # The PNG bytes are all we need; release the document even if
                # rendering failed.
                document.close()

            image = QPixmap()
            image.loadFromData(png_data)
            image_size = container_size * self.zoom_factor
            self.preview_box.setPixmap(
                image.scaled(image_size, Qt.KeepAspectRatio))
            self.preview_box.resize(image_size)
        else:
            self.preview_box.setTextFormat(Qt.RichText)
            self.preview_box.setStyleSheet("QLabel { color : darkgray; }")
            # The <img> tag previously carried a stray ")" after the src
            # attribute, producing malformed markup.
            self.preview_box.setText(
                f'<center><img src="{appctxt.get_resource("logo.png")}" /><br />Wähle auf der rechten Seite<br />eine Datei und ein Papierformat<br />aus um die Vorschau anzuzeigen</center>'
            )

            self.preview_box.resize(QSize(200, 200))
Example #2
0
def _manga_load_pixmap(file_path, width, height):
    """Load one image as a Pixmap, rescaled when a target size is given."""
    from fitz import Pixmap

    pixmap = Pixmap(file_path)
    if width and height:
        return Pixmap(pixmap, width, height, None)
    if width:
        # Only the width is fixed: derive the height from the aspect ratio.
        return Pixmap(pixmap, width,
                      int(pixmap.height / pixmap.width * width), None)
    if height:
        # Only the height is fixed: derive the width from the aspect ratio.
        return Pixmap(pixmap, int(pixmap.width / pixmap.height * height),
                      height, None)
    return pixmap


def _manga_write_pdf(image_paths, pdf_path, width, height):
    """Write the given images, one per page, into a new PDF at pdf_path."""
    from fitz import Document, Rect

    with Document() as doc:
        for file_path in image_paths:
            pixmap = _manga_load_pixmap(file_path, width, height)
            rect = Rect(0, 0, pixmap.width, pixmap.height)
            page = doc.newPage(width=pixmap.width, height=pixmap.height)
            page.insertImage(rect, pixmap=pixmap)
        doc.save(pdf_path, deflate=True)


def manga_to_PDF(dir_path, one_file=True, width=None, height=None):
    """Convert the ``*.jpg`` images of a manga folder into PDF file(s).

    The folder is expected to contain one sub-folder per chapter, each
    holding the chapter's ``*.jpg`` pages.  Images are added in sorted path
    order so the output is deterministic.

    :param dir_path: manga folder; trailing path separators are ignored.
    :param one_file: if true, write a single ``<folder name>.pdf`` containing
        every chapter; otherwise write one ``<chapter name>.pdf`` per
        chapter.  All PDFs are saved inside ``dir_path``.
    :param width: target page width in pixels (optional).
    :param height: target page height in pixels (optional).  When only one
        of ``width``/``height`` is given, the other is derived from the image
        aspect ratio.
    :raises ValueError: if ``dir_path`` is not an existing directory.
    """
    # rstrip also copes with an empty string, which used to raise IndexError.
    dir_path = dir_path.rstrip('\\/')
    if not os.path.isdir(dir_path):
        raise ValueError('传入的路径并非文件夹')

    # Public glob API only: the private ``glob._iglob`` changed its positional
    # signature between Python releases.
    from glob import escape, glob

    # Escape the directory part so names such as "[Author] Title" are not
    # interpreted as glob character classes.
    root_pattern = escape(dir_path)

    if one_file:
        title = os.path.basename(dir_path)
        images = sorted(glob(os.path.join(root_pattern, "*", "*.jpg")))
        _manga_write_pdf(images, os.path.join(dir_path, title + ".pdf"),
                         width, height)
    else:
        chapters = sorted(
            entry for entry in glob(os.path.join(root_pattern, "*"))
            if os.path.isdir(entry))
        for chap in chapters:
            title = os.path.basename(chap)
            images = sorted(glob(os.path.join(escape(chap), "*.jpg")))
            _manga_write_pdf(images, os.path.join(dir_path, title + ".pdf"),
                             width, height)
Example #3
0
def inspect(file: Union[str, bytes, Path]) -> PdfRedactionsDict:
    """
    Inspect a file for bad redactions and return a Dict with their info

    :file: The PDF to process, as bytes if you have the file in memory (useful
    if it's coming from the network), as a unicode string if you know the
    path to the file on your local disk, or as a pathlib.Path object. A
    string starting with "https://" is downloaded first (10 second timeout;
    HTTP error statuses raise via ``raise_for_status``).
    :return: A dict with the bad redaction information, keyed by 1-based page
    number and post-processed by ``check_if_all_dates``. If no bad redactions
    are found, returns an empty dict.
    """
    if isinstance(file, bytes):
        pdf = Document(stream=file, filetype="pdf")
    elif isinstance(file, str) and file.startswith("https://"):
        r = requests.get(file, timeout=10)
        r.raise_for_status()
        pdf = Document(stream=r.content, filetype="pdf")
    else:
        # str filepath or Pathlib Path
        pdf = Document(file)

    bad_redactions = {}
    try:
        for page_number, page in enumerate(pdf, start=1):
            redactions = get_bad_redactions(page)
            if redactions:
                bad_redactions[page_number] = redactions
    finally:
        # Release the document even when page analysis raises.
        pdf.close()

    return check_if_all_dates(bad_redactions)
Example #4
0
def extract_meta(doc: Document,
                 pattern: str,
                 page: Optional[int] = None,
                 ign_case: bool = False) -> List[dict]:
    """Collect the matches of `pattern` found by `search_in_page`

    Arguments
      doc: document from pymupdf
      pattern: a regular expression pattern
      page: 1-based page number to search; None searches every page of the
            document, which is highly discouraged. An out-of-range number
            yields an empty list.
      ign_case: compile the pattern case-insensitively?
    """
    if page is None:
        targets = doc.pages()
    else:
        if not (1 <= page <= doc.pageCount):
            # page out of range: nothing to search
            return []
        targets = [doc[page - 1]]

    flags = re.IGNORECASE if ign_case else 0
    regex = re.compile(pattern, flags)

    # Sequential on purpose: callers are expected to pass a page number,
    # so there is normally just one page to scan.
    found = []
    for current in targets:
        found += search_in_page(regex, current)

    return found
Example #5
0
def attact_annotation(annot: Annotation, doc: fitz.Document):
    """Re-create `annot` on the matching page of `doc`.

    The page is looked up via ``annot.pageNum`` and a new annotation of the
    same kind is added.  Supported kinds are free text, text (sticky note),
    highlight, strike-out, squiggly and underline; any other kind is reported
    on stdout and skipped.  For a created annotation the info dict, colors,
    border, line ends (if any) and opacity are copied from `annot`.

    Arguments
      annot: description of the annotation to attach
      doc: target document from pymupdf
    """
    page: fitz.Page = doc.loadPage(annot.pageNum)
    atype = annot.annotType[0]
    ann_new = None
    if atype == fitz.ANNOT_FREETEXT:
        ann_new = page.addFreetextAnnot(annot.rect, annot.info['content'])
    elif atype == fitz.ANNOT_TEXT:
        ann_new = page.addTextAnnot(annot.point, annot.info['content'])
    elif atype == fitz.ANNOT_HIGHLIGHT:
        ann_new = page.addHighlightAnnot(annot.quads)
    elif atype == fitz.ANNOT_STRIKEOUT:
        # The method is spelled addStrikeoutAnnot; the former
        # "addStrikeOUTAnnot" raised AttributeError.
        ann_new = page.addStrikeoutAnnot(annot.quads)
    elif atype == fitz.ANNOT_SQUIGGLY:
        ann_new = page.addSquigglyAnnot(annot.quads)
    elif atype == fitz.ANNOT_UNDERLINE:
        ann_new = page.addUnderlineAnnot(annot.quads)
    else:
        print(
            f'Annotation type {annot.annotType} is not supported. Ignore: {annot}'
        )

    if ann_new is not None:
        ann_new.setInfo(annot.info)
        ann_new.setColors(annot.colors)
        ann_new.setBorder(annot.border)
        if annot.lineEnds:
            ann_new.setLineEnds(*annot.lineEnds)
        ann_new.setOpacity(annot.opacity)
        # NOTE(review): PyMuPDF usually needs ann_new.update() after
        # color/border/opacity changes for them to show - confirm whether
        # the caller does that.
def extract_info_from_pdf(doc: fitz.Document, user: str) -> [Train]:
    """Build the list of trains found in `doc`, one page at a time.

    Each page's plain text is passed to `extract_info_from_text` together
    with `user`; pages for which that returns None are left out.
    """
    # Lazy pipeline: each page is loaded, read and parsed in turn.
    page_texts = (
        doc.loadPage(index).getText("text") for index in range(doc.pageCount)
    )
    parsed = (extract_info_from_text(content, user) for content in page_texts)
    return [entry for entry in parsed if entry is not None]
Example #7
0
def read_book(file):
    """
    Open a book file and return it as a Document.

    :file: path string of a .pdf, .epub or .fb2 file (the extension is
        matched case-insensitively).
    :return: the opened Document.
    :raises ValueError: if the file has any other extension.
    """
    # Honour the documented contract: unsupported formats used to fall
    # through and silently return None.
    if not file.lower().endswith(('.pdf', '.epub', '.fb2')):
        raise ValueError(f"Unsupported book format: {file!r}")
    return Document(file)
Example #8
0
def save_transform_pdf(
    case_id: str,
    pdf_file: FileStorage,
):
    """Save an uploaded PDF and export its embedded images as PNG files.

    The upload is stored under ``parent_dir`` with a random 8-digit name.
    Every xref object whose source matches both the ``checkXO`` and
    ``checkIM`` patterns is queued as one ``(pdf_path, xref, png_path)`` job;
    PNG paths are numbered from 1 and prefixed with the case directory path.
    The queue is then handed to 64 worker processes running ``render`` and
    the total elapsed time is printed once they have all finished.

    :param case_id: identifier used to derive the output directory.
    :param pdf_file: uploaded file object; only its ``save`` method is used.
    """
    started_at = time.time()

    case_dir_path = __case_dir_path(case_id)
    os.makedirs(case_dir_path, exist_ok=True)
    # randint needs integer bounds; the float literal 1e7 is rejected by
    # newer Python versions.
    pdf_path = "%s%08d.pdf" % (parent_dir, rd.randint(0, 10 ** 7))
    pdf_file.save(pdf_path)

    q = Queue()
    img_cnt = 0
    pdf = Document(pdf_path)
    try:
        refs_size = pdf._getXrefLength()
        # Walk every object of the PDF.
        for i in range(1, refs_size):
            # Source text of the object definition.
            text = pdf._getXrefString(i)
            is_object = re.search(checkXO, text)
            # Regular expression check for an image.
            is_img = re.search(checkIM, text)
            # Skip anything that is not both an object and an image.
            if not is_object or not is_img:
                continue
            img_cnt += 1
            path = '%s%08d.png' % (case_dir_path, img_cnt)
            # Queue the job; workers render the image from its xref.
            q.put((pdf_path, i, path))
    finally:
        # Workers receive the path, not this handle, so release it here.
        pdf.close()

    procs = [
        Process(target=render, name="P%d" % _pi, args=(q, ))
        for _pi in range(64)
    ]
    for proc in procs:
        print('start', proc.name)
        proc.start()
    for proc in procs:
        proc.join()

    print("total time: %.4f" % (time.time() - started_at))
Example #9
0
def _is_device_cs(xref, doc: fitz.Document):
    '''Tell whether the color space object `xref` is device based.

    Only /ICCBased color spaces are examined for now; every other kind is
    reported as not device based (extend when such cases show up).
    '''
    # color space definition, e.g.  [ /ICCBased 15 0 R ]
    definition = doc.xrefObject(xref)

    if '/ICCBased' not in definition:
        return False

    # Drop the surrounding brackets; the second token is the xref of the
    # ICC profile object, which looks like:
    #
    # <<
    #   /Alternate /DeviceRGB
    #   /Filter /FlateDecode
    #   /Length 2597
    #   /N 3
    # >>
    _kind, profile_xref, *_rest = definition[1:-1].strip().split()
    profile = doc.xrefObject(int(profile_xref))

    # Device based when /Alternate names one of /Device[Gray|RGB|CMYK].
    return '/Alternate /Device' in profile
Example #10
0
def return_page(doc: Document, page_num=0):
    """
    Load and return a single page object of the document.

    :doc: Document with all pages.

    :page_num: 0-based number of the document page, default value = 0.

    :return: the page loaded from the document.

    :raises IndexError: if page_num is negative or not below the page count.
    """
    if not 0 <= page_num < doc.page_count:
        raise IndexError("Wrong page")

    # load_page is the snake_case name matching page_count used above; the
    # camelCase loadPage alias is deprecated.
    return doc.load_page(page_num)
Example #11
0
def show_page(doc: Document, page_num=0):
    """
    Render one page of the document and return it as a QImage.

    :doc: Document with all pages.

    :page_num: 0-based number of the document page, default value = 0.

    :return: QImage in RGBA8888 format when the rendered pixmap has an alpha
        channel, RGB888 otherwise.

    :raises IndexError: if page_num is negative or not below the page count.
    """
    if not 0 <= page_num < doc.page_count:
        raise IndexError("Wrong page")

    # load_page is the snake_case name matching page_count / get_pixmap used
    # here; the camelCase loadPage alias is deprecated.
    pix = doc.load_page(page_num).get_pixmap()
    fmt = QImage.Format_RGBA8888 if pix.alpha else QImage.Format_RGB888
    qtimg = QImage(pix.samples, pix.width, pix.height, pix.stride, fmt)
    # QImage built from a raw buffer does not own the pixel data, and the
    # pixmap goes out of scope on return; copy() hands back an image that
    # owns its pixels.
    return qtimg.copy()
Example #12
0
def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]:
    """Gather the toc entries the recipe extracts from a document

    Every text block of every page is passed to `recipe.extract_block`
    together with the 1-based page number.

    Arguments
      doc: a pdf document
      recipe: recipe from user
    Returns
      a list of toc entries in the document, in page and block order
    """
    return [
        entry
        for page in doc.pages()
        for blk in page.getTextPage().extractDICT().get('blocks', [])
        for entry in recipe.extract_block(blk, page.number + 1)
    ]
Example #13
0
def render(q: Queue):
    """Worker loop: take image jobs from `q` and write each one as a PNG.

    Every job is a ``(pdf_path, xref, save_to)`` tuple.  The most recently
    opened document is reused while consecutive jobs refer to the same path.
    The worker returns once no job arrives within a short timeout.

    :param q: queue of jobs shared with the other workers.
    """
    from queue import Empty

    current_path = None
    current_doc = None
    try:
        while True:
            # Checking q.empty() and then calling q.get() is racy: another
            # worker may take the last job in between, leaving this one
            # blocked forever.  A timed get avoids that.
            try:
                pdf_path, xref, save_to = q.get(timeout=0.2)
            except Empty:
                break

            if pdf_path != current_path:
                print('Worker Read')
                if current_doc is not None:
                    current_doc.close()
                current_doc = Document(pdf_path)
                current_path = pdf_path

            pix = fitz.Pixmap(current_doc, xref)
            # GRAY or RGB (with or without alpha) can be written directly.
            if pix.n - pix.alpha < 4:
                pix.writePNG(save_to)
            # Otherwise convert CMYK to RGB first.
            else:
                fitz.Pixmap(fitz.csRGB, pix).writePNG(save_to)
    finally:
        if current_doc is not None:
            current_doc.close()
Example #14
0
def _check_device_cs(doc:fitz.Document, page:fitz.Page):
    '''Map each color space name used by the page to whether it is device based.

    The three standard device color spaces are always present and True; the
    names listed in the page's /ColorSpace block are added with the verdict
    of `_is_device_cs`.
    '''
    # the device color spaces are device based by definition
    cs = dict.fromkeys(('/DeviceGray', '/DeviceRGB', '/DeviceCMYK'), True)

    # The page object source looks like this:
    # <<
    # ...
    # /Resources <<
    #     ...
    #     /ColorSpace <<
    #     /Cs6 14 0 R
    #     >>
    # >>
    # /Rotate 0
    # /Type /Page
    # >>
    stripped_lines = (
        raw.strip() for raw in doc.xrefObject(page.xref).splitlines()
    )

    # advance to the line opening the color space block
    for line in stripped_lines:
        if line.startswith('/ColorSpace'):
            break
    else:
        # no color space block on this page
        return cs

    # entries of the block, e.g. /Cs6 14 0 R, up to the closing '>>'
    for line in stripped_lines:
        if line == '>>':
            break
        cs_name, xref, *_ = line.split()
        cs[cs_name] = _is_device_cs(int(xref), doc)

    return cs
 def test_reading_books_test1(self):
     """read_book must report the same page count as opening the file directly."""
     sample_path = "c++_trofumenko.pdf"
     pages_via_read_book = read_book(sample_path).page_count
     pages_via_document = Document(sample_path).page_count
     self.assertEqual(pages_via_read_book, pages_via_document)
Example #16
0
def write_toc(doc: Document, toc: List[ToCEntry]):
    """Store `toc` as the document's table of contents.

    Each entry is converted with its `to_fitz_entry` method before the
    resulting list is passed to `doc.set_toc`.
    """
    converted = [entry.to_fitz_entry() for entry in toc]
    doc.set_toc(converted)
Example #17
0
def read_toc(doc: Document) -> List[ToCEntry]:
    """Return the document's table of contents as ToCEntry objects.

    Every item returned by `doc.get_toc` is unpacked into the ToCEntry
    constructor.
    """
    entries = []
    for raw_entry in doc.get_toc():
        entries.append(ToCEntry(*raw_entry))
    return entries
Example #18
0
    page_delta = args.delta[0] + 1
else:
    page_delta = 1
if args.start:
    start = args.start[0]
else:
    start = 0
if args.end:
    end = args.end[0]
results_file = args.output[0]
font_standard = args.font_size[0]
pdf_file = args.pdf_file[0]
names_file = args.names[0]
with open(names_file, "r") as file:
    names_list = tuple(map(lambda x: str(x).strip("\n"), file.readlines()))
doc = Document(pdf_file)
if end is None:
    end = len(doc)
results = open(results_file, "a")
for name in names_list:
    print(f"Parsing: {name}")
    word_in_pages = []
    results.write(f"{name} ")
    for page in range(start, end):
        lines = unescape(doc[page].getText("html")).split("\n")
        for line in lines:
            item = line.replace("\t", " ")
            bs = BeautifulSoup(item, "html.parser")
            if bs.p is not None:
                if name.title() in str(bs.p.text) and str(
                        bs.p["style"]).find("top:84pt") == -1: