Ejemplo n.º 1
0
def parse_state(state):
    """Parse one mobility-report PDF into a DataFrame of per-plot entries.

    ``state`` is handled as a US state when it is among the names derived from
    the files matching ``mobilityData/US/*.pdf``; otherwise it is handled as a
    country and the ``2020-04-11`` country report is read.

    :param state: state or country identifier used in the report file name
    :returns: DataFrame with the columns ``state`` (or ``country``),
        ``county``, ``category``, ``change``, ``changecalc``, ``dates``,
        ``values`` and ``page``; an empty DataFrame when nothing was parsed
    """
    us_states = [
        x.split("_US_", 1)[1].split("_Mobility", 1)[0]
        for x in glob.glob("mobilityData/US/*.pdf")
    ]
    if state in us_states:
        label = "state"
        # NOTE(review): `date` is not defined in this function; it is
        # presumably a module-level value - confirm.
        path = f"mobilityDataPDF/US/{date}_US_{state}_Mobility_Report_en.pdf"
    else:
        label = "country"
        path = f"mobilityDataPDF/2020-04-11_{state}_Mobility_Report_en.pdf"

    data = []
    # The context manager closes the document even when parsing fails.
    with fitz.Document(path) as document:
        # The first two pages and the last page are not parsed here.
        for i in range(2, document.pageCount - 1):
            for entry in parse_page(document, i):
                entry[label] = state
                entry["page"] = i
                data.append(entry)

    df = pd.DataFrame(data)
    if not data:
        # Selecting columns on an empty frame would raise KeyError.
        return df
    return df[[
        label, "county", "category", "change", "changecalc", "dates",
        "values", "page"
    ]]
Ejemplo n.º 2
0
def merge_pdfs(outfile, pdf_files):
    """Merge the given PDF files, in order, into one PDF file.

    :param outfile: path of the merged PDF file to write; missing parent
        directories are created
    :param pdf_files: iterable of paths of the PDF files to merge
    """
    print("reading input pdf files...", flush=True)

    # The context manager releases the merged document even when reading an
    # input or saving the result fails.
    with fitz.Document() as result_pdf:
        for pdf_file in pdf_files:
            with fitz.Document(pdf_file) as pdf_doc:
                result_pdf.insertPDF(pdf_doc)

        print("pdf files have been read")

        # Create the directories needed to write the outfile.
        # os.path.normpath() turns an empty directory path ("") into ".",
        # which os.makedirs() accepts (https://bugs.python.org/issue33968).
        os.makedirs(os.path.normpath(os.path.dirname(outfile)), exist_ok=True)

        print(f"saving the merged pdf document into: {outfile}")
        print(f"  ({os.path.abspath(outfile)})")

        result_pdf.save(outfile)
Ejemplo n.º 3
0
def process(pdf_dir, out_dir):
    """Merge the PDF files found in ``pdf_dir`` into one new PDF in ``out_dir``.

    The input files are those returned by ``files_in_folder`` for the ``pdf``
    extension. The output file name contains a fresh ``uuid.uuid1()`` value.

    :param pdf_dir: directory that is searched for PDF files
    :param out_dir: directory the merged PDF is written to
    """
    # The context manager closes the merged document even when an input file
    # cannot be read or saving fails.
    with fitz.Document() as out_pdf:
        files_path = files_in_folder(pdf_dir, filter_extend=['pdf'])
        for file_path in files_path:
            Message.info(f'开始提取:{file_path}')
            with fitz.Document(file_path) as now_pdf:
                out_pdf.insert_pdf(now_pdf)

        out_pdf_path = os.path.join(out_dir, f"PDF合并文件-{uuid.uuid1()}.pdf")
        out_pdf.save(out_pdf_path)
Ejemplo n.º 4
0
def image_pdf(file_dir):
    """Combine the image files of ``file_dir`` into a single PDF.

    Every directory entry is opened with fitz, converted to PDF and appended
    to the result. The output path is built from the two values returned by
    ``get_dir_name(file_dir)`` plus the ``.pdf`` suffix.

    :param file_dir: directory containing the images
    """
    dir_name, base_name = get_dir_name(file_dir)
    doc = fitz.Document()
    try:
        # os.listdir() returns entries in arbitrary order; sort by name so the
        # page order is deterministic.
        for img_name in sorted(os.listdir(file_dir)):
            img_path = os.path.join(file_dir, img_name)
            with fitz.Document(img_path) as img_doc:
                pdf_bytes = img_doc.convertToPDF()  # image as PDF bytes
            # Wrap the bytes in a PDF document and append it to the result.
            with fitz.Document("pdf", pdf_bytes) as img_pdf:
                doc.insertPDF(img_pdf)
        doc.save(dir_name + os.sep + base_name + ".pdf")
    finally:
        # Release the result document on failure as well.
        doc.close()
Ejemplo n.º 5
0
def image_pdf(file_dir):
    """Combine the image files of ``file_dir`` into a single PDF.

    Files are processed in order of modification time, oldest first. The
    output path is built from the two values returned by
    ``get_dir_name(file_dir)`` plus the ``.pdf`` suffix. A message box is
    shown once the PDF has been saved.

    :param file_dir: directory containing the images
    """
    dir_name, base_name = get_dir_name(file_dir)
    doc = fitz.Document()
    try:
        # os.path.join keeps the pattern valid on every platform; a literal
        # backslash separator only works on Windows.
        pattern = os.path.join(file_dir, '*')
        for img in sorted(glob.glob(pattern), key=os.path.getmtime):
            with fitz.Document(img) as img_doc:
                pdf_bytes = img_doc.convertToPDF()  # image as PDF bytes
            # Wrap the bytes in a PDF document and append it to the result.
            with fitz.Document("pdf", pdf_bytes) as img_pdf:
                doc.insertPDF(img_pdf)
        doc.save(dir_name + os.sep + base_name + ".pdf")
    finally:
        # Release the result document on failure as well.
        doc.close()
    messagebox.showinfo('提示', '转换成功!')
Ejemplo n.º 6
0
def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
                             resources, outpdf):
    """Check that the output PDF has the same table of contents as the input."""
    source_pdf = resources / 'toc.pdf'
    toc_before = fitz.Document(str(source_pdf)).getToC()

    # Run the conversion with the parametrized OCR option and output type.
    cli_args = [ocr_option, '--output-type', output_type]
    check_ocrmypdf(source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop)

    toc_after = fitz.Document(str(outpdf)).getToC()
    # Print both tables so a failing run shows them.
    for toc in (toc_before, toc_after):
        print(toc)
    assert toc_before == toc_after
Ejemplo n.º 7
0
def check_docs_keywords(folder_name):
    """Download a folder's documents and return the first keyword group found.

    The documents are fetched with ``download_docs`` and every file matching
    ``/LOTS/{folder_name}/*.*`` is scanned in turn. ``.pdf`` files are read
    page by page, ``.docx`` files through ``docx2txt`` and ``.doc`` files via
    a text copy that is re-read as cp1251. Files with other extensions are
    ignored. Scanning stops at the first file that contains a keyword.

    :param folder_name: name of the folder below ``/LOTS``
    :returns: the key of ``keywords_files`` whose keyword list has an entry
        occurring in a document, or ``None`` when no document matches
    """
    download_docs(folder_name, create_list_links(folder_name))
    list_files = glob.glob(f"/LOTS/{folder_name}/*.*")

    def find_keywords(text):
        """Return the first ``keywords_files`` key with a keyword in ``text``."""
        for keywords_key, keywords_list in keywords_files.items():
            for keywords in keywords_list:
                if keywords in text:
                    return keywords_key
        return None

    for file in list_files:
        # Decide by extension only: a substring test on the whole path would
        # also match folder names or leftover "*.doc.txt" copies.
        lowered = file.lower()
        found_keywords = None

        if lowered.endswith(".pdf"):
            with fitz.Document(file) as pdf:
                for i_page in range(pdf.pageCount):
                    page_text = pdf.loadPage(i_page).getText("text")
                    found_keywords = find_keywords(page_text)
                    if found_keywords:
                        break

        elif lowered.endswith(".docx"):
            found_keywords = find_keywords(docx2txt.process(file))

        elif lowered.endswith(".doc"):
            # Copy the file line by line into a ".txt" sibling, then read that
            # copy back as cp1251 text.
            with open(file) as file_in, open(f"{file}.txt", "w") as file_out:
                for line in file_in:
                    file_out.write(line)
            with open(f"{file}.txt", encoding="cp1251") as txt_text:
                found_keywords = find_keywords(txt_text.read())

        # Keep scanning the remaining files until one of them matches.
        if found_keywords:
            return found_keywords

    return None
Ejemplo n.º 8
0
def _conver_img(pdf_path, pdf_save_path, pdf_name):
    """Render every page of a PDF to an image file at 2x zoom.

    :param pdf_path: prefix prepended verbatim to each output file name, so
        it must end with a path separator to denote a directory
    :param pdf_save_path: path of the PDF document to open
    :param pdf_name: PDF file name; the text before its first ``.`` becomes
        the stem of the image names
    :returns: list of the generated file names (without ``pdf_path``), in
        page order, numbered from ``0001``
    """
    pdf_name_without_ext = pdf_name.split(".")[0]
    jpg_dir = []
    # The context manager closes the document even when rendering fails.
    with fitz.Document(pdf_save_path) as doc:
        for pg in range(doc.pageCount):
            # A zoom factor of 2 in each dimension gives an image with four
            # times as many pixels; no rotation is applied.
            trans = fitz.Matrix(2.0, 2.0).preRotate(0)
            pm = doc[pg].getPixmap(matrix=trans, alpha=False)
            jpg_name = '{0}-{1}.jpg'.format(pdf_name_without_ext,
                                            "%04d" % (pg + 1))
            # NOTE: writePNG is used although the name ends in ".jpg"; the
            # name is kept because callers receive it in the returned list.
            pm.writePNG(pdf_path + jpg_name)
            jpg_dir.append(jpg_name)

    return jpg_dir
Ejemplo n.º 9
0
def extract_all_lines_slides(filename):
    """Collect the lines matching each financial metric in a slide deck.

    For every metric, ``extract_line_slides`` is called once per search
    keyword and the returned dictionaries are merged.

    :param filename: path of the document to open
    :returns: dictionary mapping each metric name to the merged result
        dictionary (empty when no keyword produced a result)
    """
    # Metric name -> keywords to search for.
    dict_keywords = {
        "Net property income": ["Net property income"],
        "Distribution per unit": ["Distribution per unit", "DPU"],
        "Total assets": ["Total assets"],
        "Total liabilities": ["Total liabilities"],
        "Total debts": ["Total debts"],
        "Units": ["Units in issue"],
        "Net asset value": ["Net asset value", "NAV"],
        "Gearing": ["Aggregate Leverage", "Gearing"],
        "Cost of debt": ["Cost of debt"],
        "Interest cover": ["Interest cover"],
        "Average term to maturity": ["Average term to maturity"],
        "WALE": ["WALE", "Weighted average"]
    }
    doc = fitz.Document(filename)

    results = {}
    for key in dict_keywords:
        merged = {}
        for keyword in dict_keywords[key]:
            found = extract_line_slides(doc, keyword)
            if found is None:
                continue
            merged.update(found)
        results[key] = merged
    return results
Ejemplo n.º 10
0
def make_pdf(input_folder, output_path, fname, quiet):
    """Render the SVG files of ``input_folder`` into one PDF.

    Each ``.svg`` file (case-insensitive) is converted to PDF and appended to
    the output document; other files are collected and reported as ignored.

    :param input_folder: directory scanned (non-recursively) for SVG files
    :param output_path: path the combined PDF is saved to
    :param fname: name shown in the success message
    :param quiet: when true, the success report is not printed
    :returns: ``0`` on success, ``1`` when the folder holds no SVG file or
        saving fails
    """
    non_svgs = []
    n = 0
    # Context managers release the documents on every exit path.
    with fitz.Document() as output:
        # Sort the listing: os.listdir() order is arbitrary, which would make
        # the page order differ between systems.
        for file_path in sorted(os.listdir(input_folder)):
            if not file_path.lower().endswith('.svg'):
                non_svgs.append(file_path)
                continue
            n += 1
            im = svg2rlg(os.path.join(input_folder, file_path))
            b = renderPDF.drawToString(im)  # convert to pdf
            with fitz.open('pdf', b) as img_pdf:  # open as pdf
                output.insertPDF(img_pdf)

        if not n:
            print('Error - no SVGs in input folder\n', file=sys.stderr)
            return 1

        try:
            output.save(output_path)
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as a save failure.
            print('Error - something went wrong while saving the file',
                  file=sys.stderr)
            return 1

    if not quiet:
        print("Successfully rendered " + str(n) + " SVGs to " + fname)
        if non_svgs:
            print("Ignored " + str(len(non_svgs)) + " non-svg files:")
            for line in non_svgs:
                print('\t' + line)

    return 0
Ejemplo n.º 11
0
def main():
    """Command-line entry point: write a table of contents into a PDF file.

    Positional arguments are the input PDF, the table-of-contents text file
    and the output PDF. ``--offset`` is forwarded to ``parse_toc`` together
    with the opened table-of-contents file.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Adds "table of contents" to pdf files.')

    parser.add_argument("--offset",
                        type=int,
                        default=0,
                        help="site offset when the first chapter starts.")

    parser.add_argument("input")

    parser.add_argument("toc", type=argparse.FileType("r"))

    parser.add_argument("output")

    args = parser.parse_args()

    try:
        toc = parse_toc(args.toc, args.offset)
        # write toc; the context manager closes the PDF when done
        with fitz.Document(args.input) as doc:
            inserted = doc.setToC(toc)
            doc.save(args.output)
    finally:
        # argparse.FileType opened this file during parsing. It is closed
        # only after the ToC has been written, in case parse_toc reads lazily.
        args.toc.close()
    # done
    print("Done setting {} chapters".format(inserted))
Ejemplo n.º 12
0
    def on_treeWidget_imagenamelist_itemDoubleClicked(self, qtreeitem, p_int):
        """Load the image or PDF stored for the double-clicked tree item.

        The record id is read from column 1 of the item. PDFs are opened with
        fitz and the page navigation controls are shown; any other format is
        displayed directly as a pixmap and the controls are hidden.
        """
        record_id = int(qtreeitem.text(1))
        rows = self.IC.get_data(1, False, *VALUE_TUPLE_IM,
                                **{'autoid': record_id})
        if len(rows) == 0:
            return
        record = rows[0]
        ext = record['ext']
        image = record['img']

        is_pdf = ext.lower() == 'pdf'
        # Page navigation only makes sense for PDFs.
        for control in (self.comboBox_jumpto, self.pushButton_prepage,
                        self.pushButton_nextpage):
            control.setVisible(is_pdf)

        if is_pdf:
            self.current_img = fitz.Document(stream=image, filetype='pdf')
            page_total = self.current_img.pageCount
            self.comboBox_jumpto.clear()
            # One combo box entry per page, numbered from 1.
            self.comboBox_jumpto.addItems(
                ['第' + str(number) + '页'
                 for number in range(1, page_total + 1)])
            self.current_page = self.current_img.loadPage(0)
        else:
            self.current_img = QPixmap.fromImage(QImage.fromData(image))
            self.label_image.setPixmap(self.current_img)

        # Per the original note: the default zoom is 3x, and setting the
        # slider also triggers on_horizontalSlider_zoom_valueChanged.
        self.horizontalSlider_zoom.setValue(30)
Ejemplo n.º 13
0
    def make_page(self, page:fitz.Page, debug=True):
        ''' Parse one pdf page and create the corresponding docx page.
            With debug=True, a pdf plotting the layout (``debug_`` + source
            file name) and ``layout.json`` are written next to the source pdf.
        '''
        # debug artefacts live in the folder of the source pdf:
        # demo.pdf -> debug_demo.pdf, layout.json
        folder, pdf_name = os.path.split(self.filename_pdf)
        layout_json = os.path.join(folder, 'layout.json')
        debug_doc = fitz.Document() if debug else None
        debug_pdf = os.path.join(folder, f'debug_{pdf_name}')

        # init page layout and, in debug mode, plot the raw source content
        self.initialize(page)
        if debug:
            self._layout.plot(debug_doc, 'Source Text Blocks')
            self._paths.plot(debug_doc, 'Source Shapes', self._layout.width, self._layout.height)

        # parse the layout, create the docx page and save
        parsed_layout = self.layout.parse(debug=debug, doc=debug_doc, filename=debug_pdf)
        parsed_layout.make_page(self.doc_docx)
        self.save()

        if debug:
            # layout plots as pdf, layout data as json
            debug_doc.save(debug_pdf)
            self.layout.serialize(layout_json)

        return self
Ejemplo n.º 14
0
    def debug_page(self, i:int, docx_filename:str=None, debug_pdf=None, layout_file=None, config:dict=None):
        ''' Convert a single page with debug output enabled.
            ---
            Args:
            - i (int): page index to convert
            - docx_filename (str): DOCX filename to write to
            - debug_pdf (str): pdf file receiving the layout plots
              (source file name prefixed with "debug_" by default)
            - layout_file (str): json file receiving the parsed layout data
              (layout.json next to the source pdf by default)
            - config (dict): conversion settings; the debug entries are added
              to this dictionary in place
        '''
        config = config or {}

        # default debug files live next to the source pdf:
        # demo.pdf -> debug_demo.pdf, layout.json
        folder, pdf_name = os.path.split(self.filename_pdf)
        debug_pdf = debug_pdf or os.path.join(folder, f'debug_{pdf_name}')
        layout_file = layout_file or os.path.join(folder, 'layout.json')

        # switch on debug mode with a fresh fitz document for the plots
        config['debug'] = True
        config['debug_doc'] = fitz.Document()
        config['debug_filename'] = debug_pdf

        # parse and create docx
        self.convert(docx_filename, pages=[i], config=config)

        # layout information for debugging
        self.serialize(layout_file)
def get_table_area(pdf_data):
    """Return the bounding box of the Race/Ethnicity table on page 3 of a PDF.

    The box starts at the top-left corner of the word "White", ends at the
    bottom of the word "Total" that is left-aligned with it and lies below
    it, and extends to the page's right bound. When a word occurs more than
    once, the last occurrence is used.

    :param pdf_data: the PDF document as in-memory data
    :returns: ``fitz.Rect`` covering the table
    :raises ValueError: when "White", or a "Total" aligned below it, is not
        found on page 3
    """
    # Read everything needed from the page, then close the document.
    with fitz.Document(stream=pdf_data, filetype='pdf') as doc:
        page3 = doc[2]  # page indexes start at 0
        words = page3.getText('words')
        page_right = page3.bound().x1

    white_bbox = None
    for (x0, y0, x1, y1, word, block_no, line_no, word_no) in words:
        if word == 'White':
            white_bbox = fitz.Rect(x0, y0, x1, y1)
    if white_bbox is None:
        raise ValueError('word "White" not found on page 3')

    total_bbox = None
    for (x0, y0, x1, y1, word, block_no, line_no, word_no) in words:
        if word != 'Total':
            continue
        # Same left edge as "White" (after rounding) and located below it.
        if (round(x0) == round(white_bbox.x0)
                and round(y0) > round(white_bbox.y0)):
            total_bbox = fitz.Rect(x0, y0, x1, y1)
    if total_bbox is None:
        raise ValueError('no "Total" aligned below "White" on page 3')

    return fitz.Rect(white_bbox.x0, white_bbox.y0, page_right, total_bbox.y1)
Ejemplo n.º 16
0
def _decode_page(page_data):
    """
    Render the first page of a PDF to a PNG and scan it for QR codes.

    The page is rendered at 5x zoom and stored as ``page0.png`` in the
    current working directory before it is passed to ZBar.

    :param bytes page_data: Data of the PDF single page
    :returns: the result of ``zbar_wrapper.scan_qrcode`` and the path of the
              rendered PNG file
    :rtype: tuple
    """
    tic = time()

    doc = fitz.Document("pdf", page_data)
    try:
        # get first page
        page = next(doc.pages())

        zoom = (5.0, 5.0)
        mat = fitz.Matrix(*zoom)  # zoom factor in each dimension
        # use 'mat' instead of the identity matrix
        pix = page.get_pixmap(matrix=mat, alpha=0)

        img_url = os.path.join(os.getcwd(), "page0.png")
        pix.save(img_url)  # store image as a PNG

        # NOTE: the original code also carried a disabled ZXing variant
        # (zxing_wrapper.scan_qrcode) of the call below.
        qr_data = zbar_wrapper.scan_qrcode(img_url, page)
        _logger.debug(f"\t\tQRCode decoded using ZBar in {time() - tic:.3} sec")
    finally:
        # Close the document even when rendering or scanning raises.
        doc.close()

    return qr_data, img_url
Ejemplo n.º 17
0
 def __init__(self, pdf_file, debug=False, text_gray=218):
     """Open ``pdf_file`` with fitz and store the page count and settings."""
     super(Walker, self).__init__()
     # keep the source path and the opened document
     self.pdf_file = pdf_file
     document = fitz.Document(pdf_file)
     self.pdf = document
     self.page_count = len(document)
     # settings supplied by the caller
     self.DEBUG, self.TEXT_GRAY = debug, text_gray
Ejemplo n.º 18
0
    def debug_page(self, page:fitz.Page):
        ''' Parse one page in debug mode and create the docx page.
            A pdf plotting the layout (``debug_`` + source file name) and
            ``layout.json`` are written next to the source pdf.
        '''
        # debug artefacts live in the folder of the source pdf:
        # demo.pdf -> debug_demo.pdf, layout.json
        folder, pdf_name = os.path.split(self.filename_pdf)
        layout_json = os.path.join(folder, 'layout.json')
        debug_doc = fitz.Document()
        debug_pdf = os.path.join(folder, f'debug_{pdf_name}')
        debug_kwargs = dict(debug=True, doc=debug_doc, filename=debug_pdf)

        # init page layout and plot the raw source content
        self.initialize(page)
        self._layout.plot(**debug_kwargs)
        self._paths_extractor.paths.plot(debug_doc, 'Source Paths', self._layout.width, self._layout.height)

        # parse, then write the debug files
        self.layout.parse(**debug_kwargs)
        if len(debug_doc):
            # layout plotting; only saved when the document has pages
            debug_doc.save(debug_pdf)
        self.layout.serialize(layout_json) # layout information

        # make docx page
        self._layout.make_page(self.doc_docx)
        self.save()

        return self
Ejemplo n.º 19
0
    def __parse_pdf(self, path, result_dir, **kwargs):
        """Render the requested PDF pages and pass each image to the parser.

        Every page is rendered at 1.25x scale to PNG data and handed to
        ``__parse_img``. A page that fails is logged and skipped so that the
        remaining pages are still processed.

        :param path: path of the PDF to open
        :param result_dir: forwarded to ``__parse_img``
        :param kwargs: must contain ``range``, whose ``pages`` attribute
            holds the page numbers (1 is the first page)
        """
        import logging

        # The context manager closes the document on every exit path.
        with fitz.Document(path) as doc:
            pages = kwargs['range'].pages
            p_layer = self.progress.add_layer((0, len(pages)))

            for page_number in pages:
                page = doc.load_page(page_number - 1)
                self.send_update('Rendering {}-th page of PDF'.format(
                    str(page.number + 1)))

                try:
                    scale = 1.25
                    # get image 'scale' times larger than page.bound()
                    scale_matrix = fitz.Matrix(scale, scale)
                    png = page.get_pixmap(matrix=scale_matrix).getPNGData()
                    png = np.frombuffer(png, dtype=np.int8)
                    self.__parse_img(
                        png,
                        result_dir,
                        file_prefix='page-{}-'.format(str(page.number + 1)),
                        board_title_fmt="Страница {}, доска {{}} из {{}}".format(
                            page.number + 1))
                except Exception:
                    # Best effort per page. Unlike a bare except, this lets
                    # KeyboardInterrupt and SystemExit propagate, and the
                    # failure is logged instead of vanishing silently.
                    logging.getLogger(__name__).exception(
                        "Failed to process PDF page %s", page.number + 1)
                self.progress.append_progress(p_layer, 1)

            self.progress.pop_layer(p_layer)
Ejemplo n.º 20
0
    def PdfFileRead(self):
        """
        Extract the text of the PDF returned by ``self.get_contents()``.

        PyMuPDF (``fitz``) is tried first. If importing or using it raises,
        the method falls back to PyPDF2, a workaround for environments where
        MuPDF is not usable. In both cases the text of all pages is
        concatenated without a separator; no whitespace filtering is done.

        :returns: the extracted text as a single string
        """
        import io

        contents = self.get_contents()

        try:
            import fitz

            # The context manager closes the document after reading.
            with fitz.Document(stream=contents, filetype="pdf") as pdf_file:
                text = "".join(page.get_text("text") for page in pdf_file)
        except Exception:
            import PyPDF2

            # PdfFileReader expects a path or a file-like object, so raw
            # bytes are wrapped; other objects are passed through unchanged.
            if isinstance(contents, (bytes, bytearray)):
                stream = io.BytesIO(contents)
            else:
                stream = contents
            pdf_reader = PyPDF2.PdfFileReader(stream)
            text = "".join(page.extractText() for page in pdf_reader.pages)
        return text
Ejemplo n.º 21
0
    def debug_page(self,
                   i: int,
                   docx_filename: str = None,
                   config: dict = None):
        ''' Convert a single page with debug output enabled.
            The debug entries (flag, fitz document for the plots, and the
            ``debug_``-prefixed pdf path) are added to ``config`` in place;
            the layout data is written to ``layout.json`` next to the pdf.
            Returns the layout of the converted page.
        '''
        config = config or {}

        # debug artefacts live in the folder of the source pdf:
        # demo.pdf -> debug_demo.pdf, layout.json
        folder, pdf_name = os.path.split(self.filename_pdf)
        layout_json = os.path.join(folder, 'layout.json')
        debug_doc = fitz.Document()
        config['debug'] = True
        config['doc'] = debug_doc
        config['filename'] = os.path.join(folder, f'debug_{pdf_name}')

        # parse and make page
        layouts = self.make_docx(docx_filename, pages=[i], config=config)
        page_layout = layouts[0]

        # layout information for debugging
        page_layout.serialize(layout_json)

        return page_layout
Ejemplo n.º 22
0
    def __init__(self, pdf_file: str):
        ''' Store the pdf path and open the file as a fitz document.'''
        pdf_path = pdf_file

        # remember where the pdf came from
        self.filename_pdf = pdf_path

        # fitz document used for reading the pdf
        self._doc_pdf = fitz.Document(pdf_path)
Ejemplo n.º 23
0
 def add_pages(self):
     """Download the document at ``self.file`` and show its pages.

     Each page is rendered at 1.5x zoom and added to the page container as
     a label. When no file is set, or the download or opening fails, a
     message label is added instead.
     """
     container_layout = self.page_container.layout()
     # request the file
     if not self.file:
         container_layout.addWidget(QLabel('没有文件.'))
         return
     try:
         response = requests.get(self.file)
         # Report HTTP errors directly instead of trying to open an error
         # page as a document.
         response.raise_for_status()
         doc = fitz.Document(filename='a', stream=response.content)
     except Exception as e:
         container_layout.addWidget(QLabel('获取文件内容失败.\n{}'.format(e)))
         return
     try:
         zoom_matrix = fitz.Matrix(1.5, 1.5)  # image zoom factor
         for page_index in range(doc.pageCount):
             page = doc.loadPage(page_index)
             page_label = QLabel()
             # show PDF content
             pagePixmap = page.getPixmap(
                 matrix=zoom_matrix,
                 alpha=False)
             imageFormat = QImage.Format_RGB888  # get image format
             pageQImage = QImage(
                 pagePixmap.samples,
                 pagePixmap.width,
                 pagePixmap.height,
                 pagePixmap.stride,
                 imageFormat)  # init QImage
             page_map = QPixmap()
             page_map.convertFromImage(pageQImage)
             page_label.setPixmap(page_map)
             page_label.setScaledContents(True)  # pixmap resize with label
             container_layout.addWidget(page_label)
     finally:
         # All pages are converted to QPixmap by now; release the document.
         doc.close()
Ejemplo n.º 24
0
def train_from_pdf(bot: Bot, update: Update, conn):
    """Store the trains found in a PDF sent to the bot and reply to the user.

    The attached document is downloaded, parsed with ``pdf_extraction`` and
    every extracted train is inserted through ``db_utils``. The inserts are
    committed once all trains are processed, then a summary is sent. On
    failure the error is logged and an error sticker and message are sent.

    :param bot: bot used to send stickers and messages
    :param update: incoming update carrying the PDF document
    :param conn: database connection used for the inserts and the commit
    """
    try:
        pdf_bytes = update.message.document.get_file().download_as_bytearray()
        pdf_doc = fitz.Document(stream=pdf_bytes, filetype="pdf")
        trains = pdf_extraction.extract_info_from_pdf(
            pdf_doc, update.effective_chat.id)
        if not trains:
            raise Exception(f"Train extraction list was empty :( {trains}")

        # Header first, then one info block per inserted train.
        message = f"{_(app_strings.added_train)}"
        for train in trains:
            db_utils.insert_train_in_db(train, conn, False)
            departure = format_date(train.depart_date,
                                    check_daily=False,
                                    check_interval="")
            message += get_train_info_message(train, departure, conn)

        conn.commit()
        bot.send_sticker(update.message.from_user.id, stickers.drake_approving)
        update.message.reply_text(message)
    except TrainInPastError as past_error:
        logging.error(past_error)
        bot.send_sticker(update.message.from_user.id, stickers.tom_puzzled)
        update.message.reply_text(_(app_strings.train_in_past_error))
    except Exception as error:
        logging.error(error)
        bot.send_sticker(update.effective_chat.id, stickers.blackman_crying)
        bot.send_message(update.effective_chat.id, _(app_strings.error_pdf))
Ejemplo n.º 25
0
    def debug_page(self,
                   i: int,
                   docx_filename: str = None,
                   debug_pdf: str = None,
                   layout_file: str = None,
                   kwargs: dict = None):
        '''Convert a single page with debug output enabled.

        Args:
            i (int): Page index to convert.
            docx_filename (str): docx filename to write to.
            debug_pdf (str): pdf file receiving the layout plots. Defaults to
                the source file name prefixed with ``debug_``.
            layout_file (str): json file receiving the parsed layout data.
                Defaults to ``layout.json`` next to the source pdf.
            kwargs (dict): Conversion settings; the debug entries are added
                to this dictionary in place.
        '''
        kwargs = kwargs or {}

        # default debug files live next to the source pdf:
        # demo.pdf -> debug_demo.pdf, layout.json
        folder, pdf_name = os.path.split(self.filename_pdf)
        debug_pdf = debug_pdf or os.path.join(folder, f'debug_{pdf_name}')
        layout_file = layout_file or os.path.join(folder, 'layout.json')

        # switch on debug mode with a fresh fitz document for the plots
        kwargs['debug'] = True
        kwargs['debug_doc'] = fitz.Document()
        kwargs['debug_filename'] = debug_pdf

        # parse and create docx
        self.convert(docx_filename, pages=[i], kwargs=kwargs)

        # layout information for debugging
        self.serialize(layout_file)
Ejemplo n.º 26
0
    def on_treeWidget_imagenamelist_itemDoubleClicked(self, qtreeitem, p_int):
        """Load the image or PDF linked to the double-clicked tree item.

        Nothing happens when ``self.power[1]`` is ``'0'``. Otherwise the
        first entry of ``self.images_list`` whose ``autoid`` equals the id in
        column 0 is shown: PDFs are opened with fitz and the page navigation
        controls are shown, other formats are displayed directly as a pixmap.
        """
        if self.power[1] == '0':
            return
        rela_id = int(qtreeitem.text(0))

        # First list entry with the requested id, if any.
        selected = next(
            (entry for entry in self.images_list if entry.autoid == rela_id),
            None)
        if selected is not None:
            stored = selected.imgid
            is_pdf = stored.ext.lower() == 'pdf'
            # Page navigation only makes sense for PDFs.
            for control in (self.comboBox_jumpto, self.pushButton_prepage,
                            self.pushButton_nextpage):
                control.setVisible(is_pdf)

            if is_pdf:
                self.current_img = fitz.Document(stream=stored.img,
                                                 filetype='pdf')
                page_total = self.current_img.pageCount
                self.comboBox_jumpto.clear()
                # One combo box entry per page, numbered from 1.
                self.comboBox_jumpto.addItems(
                    ['第' + str(number) + '页'
                     for number in range(1, page_total + 1)])
                self.current_page = self.current_img.loadPage(0)
            else:
                self.current_img = QPixmap.fromImage(
                    QImage.fromData(stored.img))
                self.label_image.setPixmap(self.current_img)

        # Per the original note: the default zoom is 3x, and setting the
        # slider also triggers on_horizontalSlider_zoom_valueChanged.
        self.horizontalSlider_zoom.setValue(30)
Ejemplo n.º 27
0
def pdf_image(pdf_name, Gray=False):
    """Render every page of a PDF to an image file next to it, at 3x zoom.

    The files are named ``<pdf path without extension>_<page number>.jpg``,
    with page numbers starting at 1.

    :param pdf_name: path of the PDF
    :param Gray: when true, each rendered image is re-opened, converted with
        ``convert('L')`` and saved back to the same path
    :returns: list of the image paths in page order
    """
    import os

    # splitext handles any extension length; slicing off four characters
    # only worked for names ending in a three-letter extension.
    stem = os.path.splitext(pdf_name)[0]
    img_paths = []
    # The context manager closes the document even when rendering fails.
    with fitz.Document(pdf_name) as pdf:
        for pg in range(pdf.pageCount):
            trans = fitz.Matrix(3.0, 3.0).preRotate(0)
            pm = pdf[pg].getPixmap(matrix=trans, alpha=False)
            img_path = stem + '_' + str(pg + 1) + '.jpg'
            # NOTE: writePNG is used although the name ends in ".jpg".
            pm.writePNG(img_path)
            img_paths.append(img_path)

            if Gray:  # convert to grayscale
                with Image.open(img_path) as img:
                    low = img.convert('L')
                low.save(img_path)
                # An OpenCV variant (cv2.imread(path, 0) + cv2.imwrite) was
                # noted by the original author as producing larger files.

    return img_paths
Ejemplo n.º 28
0
def pdf_format_2(input_file, page_no):
    """Extract the 'DECLARACIÓN NUTRIMENTAL' entries from one PDF page.

    The text blocks of the page are searched for blocks that contain both
    the heading and a ``;``. Such a block is split on ``;``; in each piece
    everything up to the first ``[`` and the heading itself are removed, and
    ``]`` acts as a line break. When several blocks match, the last wins.

    :param input_file: path of the PDF
    :param page_no: page number, where 1 is the first page (int or numeric
        string)
    :returns: list of stripped strings (empty when no block matches), or
        ``None`` after printing a message when the page cannot be processed
    """
    doc = fitz.Document(input_file)
    try:
        page = doc[int(page_no) - 1]
        outer_list = []
        for content in page.get_text("blocks"):
            block_text = content[4]
            if not ('DECLARACIÓN NUTRIMENTAL' in block_text
                    and ';' in block_text):
                continue
            outer_list = []
            for each_content in block_text.split(';'):
                # Drop everything up to and including the first "[".
                cleaned = re.sub(r'^.*?\[', '', each_content)
                cleaned = cleaned.replace('DECLARACIÓN NUTRIMENTAL', '')
                if ']' in cleaned:
                    lines = cleaned.replace(']', '\n').strip().split('\n')
                    outer_list.extend(lines)
                else:
                    outer_list.append(cleaned)
        return [i.strip() for i in outer_list if i != '']
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not swallowed; the best-effort contract is kept.
        print("page Num doesn't exist")
        return None
    finally:
        doc.close()
Ejemplo n.º 29
0
def parse_place(place, pdf_path, args):
    """Parse the mobility report PDF of one place into a DataFrame.

    Entries from ``parse_front_pages`` are recorded for county ``"Overall"``
    on page 1; entries from ``parse_page`` are recorded with their page
    index. A summary line is printed when at least one entry was parsed.

    :param place: name stored in the ``state`` column
    :param pdf_path: path of the PDF to parse
    :param args: not used by this function
    :returns: DataFrame with the columns ``state``, ``county``,
        ``category``, ``change``, ``changecalc``, ``dates``, ``values`` and
        ``page``; an empty DataFrame when nothing was parsed
    """
    data = []
    # The context manager closes the document even when parsing fails.
    with fitz.Document(pdf_path) as doc:
        for entry in parse_front_pages(doc):
            entry["state"] = place
            entry["page"] = 1
            entry["county"] = "Overall"
            data.append(entry)
        # Pages from index 2 up to, but excluding, the last page.
        for i in range(2, doc.pageCount - 1):
            for entry in parse_page(doc, i):
                entry["state"] = place
                entry["page"] = i
                data.append(entry)

    df = pd.DataFrame(data)
    if len(df) == 0:
        return df

    ncounties = df['county'].nunique()
    print(f"Parsed {len(df)} plots for {ncounties} counties in {place}")
    df = df[[
        "state", "county", "category", "change", "changecalc", "dates",
        "values", "page"
    ]]
    return df
Ejemplo n.º 30
0
def main(input: str, output: str, dpi: int, first_page: Optional[int],
         last_page: Optional[int], ocr: bool, clean: bool):
    """Clean document pages in parallel and write them to a PDF or a folder.

    When ``input`` ends in ``.pdf`` its pages are processed; otherwise
    ``input`` is a glob pattern and each matching file is one page. Pages are
    handled by ``clean_single_page`` in a process pool. When ``output`` ends
    in ``.pdf`` the results are merged into that file, otherwise they are
    saved as ``<index>.jpg`` in the ``output`` directory.

    :param input: PDF path or glob pattern
    :param output: PDF path or directory (created when missing)
    :param dpi: passed to ``clean_single_page`` in PDF mode
    :param first_page: first page to process, counted from 1; ``None``
        means the first one
    :param last_page: last page to process (inclusive); ``None`` means all
    :param ocr: passed to ``clean_single_page``; only valid with PDF output
    :param clean: passed to ``clean_single_page``
    :raises FileNotFoundError: when the input PDF does not exist
    :raises RuntimeError: when ``output`` is an existing non-PDF file, or
        ``ocr`` is requested together with directory output
    """
    if os.path.splitext(input)[1].lower() == ".pdf":
        # PDF mode
        if not os.path.exists(input):
            # Explicit raise: an assert would be stripped under "python -O".
            raise FileNotFoundError(input)
        # Only the page count is needed here, so close the file right away.
        with fitz.Document(input) as doc:
            page_count = doc.page_count
        first_page = 0 if first_page is None else first_page - 1
        last_page = page_count if last_page is None else last_page
        args = zip(repeat(input), range(first_page, last_page), repeat(dpi),
                   repeat(ocr), repeat(clean))
    else:
        # Glob mode
        files = sorted(glob.glob(input, recursive=True))
        first_page = 0 if first_page is None else first_page - 1
        last_page = len(files) if last_page is None else last_page
        args = zip(files[first_page:last_page], repeat(0), repeat(-1),
                   repeat(ocr), repeat(clean))
    total = last_page - first_page

    # Validate the output mode before any worker process is started.
    to_pdf = os.path.splitext(output)[1].lower() == ".pdf"
    if not to_pdf:
        if os.path.exists(output) and not os.path.isdir(output):
            raise RuntimeError("invalid output format.")
        if ocr:
            raise RuntimeError("the OCR flag is useless because we are "
                               "writing images (not PDF) to the output "
                               "directory.")

    with Pool() as p:
        results = tqdm(p.imap(clean_single_page, args), total=total)
        if to_pdf:
            merge_to_pdf(results, output)
        else:
            Path(output).mkdir(parents=True, exist_ok=True)
            for (index, page) in enumerate(results):
                file_path = os.path.join(output, f"{index}.jpg")
                assert isinstance(page, Image.Image)
                page.save(file_path)