Esempio n. 1
0
    def __init__(self, pdf_path):
        """Open the PDF at ``pdf_path`` and set up page access.

        Stores the path, a ``SimplePDFViewer`` built on the opened file and
        the list of page objects produced by ``PDFDocument.pages()``.

        The file handle is not closed here: the stored viewer was
        constructed from it.
        """
        self.pdf_path = pdf_path
        stream = open(pdf_path, "rb")

        document = PDFDocument(stream)
        self.viewer = SimplePDFViewer(stream)
        # Materialise the page iterator once so pages can be indexed later.
        self.pages = list(document.pages())
Esempio n. 2
0
def cmap_extraction_example():
    """Dump the ToUnicode CMap of font ``R26`` from the tutorial PDF.

    Prints the font resources of the third page, some details of the
    ``R26`` font and its CMap stream, then writes the decoded CMap data to
    ``./sample-cmap.txt``.
    """
    from itertools import islice

    pdf_filepath = './tutorial-example.pdf'

    # A ``with`` block closes the file on every path and, unlike
    # ``try: fd = open(...) finally: fd.close()``, does not mask a failed
    # open with an unbound-variable error raised from the cleanup code.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # islice(..., 2, 3) yields only the item at index 2: the third page.
        page = next(islice(doc.pages(), 2, 3))
        print('page.Resources.Font = {}.'.format(page.Resources.Font))
        print('len(page.Resources.Font) = {}.'.format(len(page.Resources.Font)))

        font = page.Resources.Font['R26']
        print('font.Subtype = {}, bool(font.ToUnicode) = {}.'.format(font.Subtype, bool(font.ToUnicode)))

        # It is PostScript Type1 font, and texts use CMap provided by ToUnicode attribute.
        # Font's ToUnicode attribute contains a reference to the CMap file data stream.
        cmap = font.ToUnicode
        print('type(cmap) = {}.'.format(type(cmap)))
        print('cmap.Filter = {}.'.format(cmap.Filter))

        # ``filtered`` is the decoded stream content; it is written as bytes.
        data = cmap.filtered
        with open('./sample-cmap.txt', 'wb') as fd2:
            fd2.write(data)
Esempio n. 3
0
def get_text_pypdf(DOI:str) -> str:
    """Download the medRxiv PDF for ``DOI`` and return its text in lower case.

    The PDF is fetched from
    ``https://www.medrxiv.org/content/<DOI>v1.full.pdf`` and saved in the
    ``pdfs`` directory next to this module, under a name built from the
    host name and the DOI with ``/`` removed.  Every page is then rendered
    and its strings are joined, one line per page.

    Args:
        DOI: The DOI used to build both the download URL and the file name.

    Returns:
        The extracted text, lower-cased, or ``""`` if anything fails (the
        exception and the DOI are printed in that case).
    """
    try:
        hostname = socket.gethostname()
        path = pathlib.Path(__file__).parent.absolute()
        name = hostname + str(DOI).replace("/", "") + ".pdf"
        fp = Path(path / "pdfs" / name)  # build filepath
        url = "https://www.medrxiv.org/content/" + str(DOI) + "v1.full.pdf"  # build url
        response = requests.get(url)
        fp.write_bytes(response.content)  # save .pdf

        page_texts = []
        # Reopen the file just written; ``with`` guarantees the handle is
        # closed even when parsing or rendering raises.
        with fp.open("rb") as fd:
            doc = PDFDocument(fd)
            page_count = sum(1 for _ in doc.pages())
            viewer = SimplePDFViewer(fd)
            for page_number in range(1, page_count + 1):
                viewer.navigate(page_number)
                try:
                    viewer.render()
                    # NOTE(review): text is encoded with the stdout encoding
                    # and decoded as windows-1252; kept as-is, but this
                    # round trip can garble non-ASCII text - confirm intent.
                    page_text = u"".join(viewer.canvas.strings).encode(sys.stdout.encoding, errors='replace').decode("windows-1252")
                except OverflowError as e:
                    # Best effort: report the page that failed and carry on
                    # instead of dropping it silently.
                    print(e, DOI)
                    continue
                page_texts.append(page_text + '\n')
        return "".join(page_texts).lower()
    except Exception as e:
        # Deliberate catch-all boundary: callers always get a string back.
        print(e, DOI)
        return ""
Esempio n. 4
0
def encrypted_and_password_protected_pdf_tutorial():
    """Show how to open a password-protected PDF with pdfreader.

    Renders the viewer's initial page and prints its text, prints the
    ``Contents`` of the first page obtained through ``PDFDocument``, and
    finally constructs a ``PDFDocument`` inside a ``try`` that reports a
    ``ValueError`` if one is raised.
    """
    pdf_filepath = './encrypted-with-qwerty.pdf'

    # ``with`` closes the file on every path and lets a failed open surface
    # as the real OSError rather than an unbound-``fd`` error from cleanup.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd, password='******')

        viewer.render()

        text = ''.join(viewer.canvas.strings)
        print('text = {}.'.format(text))

        #--------------------
        doc = PDFDocument(fd, password='******')

        page_one = next(doc.pages())
        print('page_one.Contents = {}.'.format(page_one.Contents))

        #--------------------
        # NOTE(review): presumably meant to demonstrate a wrong password;
        # the password literals are masked in this source - confirm.
        try:
            doc = PDFDocument(fd, password='******')
            #viewer = SimplePDFViewer(fd, password='******')
        except ValueError as ex:
            print('ValueError raised: {}.'.format(ex))
Esempio n. 5
0
def xobject_image_example():
    """Show three ways of getting images out of a PDF with pdfreader.

    1. Read the ``img0`` XObject from the first page's resources and
       convert it to a Pillow image.
    2. Render the viewer's initial page and look the images up on the
       canvas (``images`` and ``inline_images``).
    3. On page 5 of the tutorial PDF, pick the first inline image flagged
       as an image mask and convert it to a Pillow image.

    Raises:
        StopIteration: If page 5 of the tutorial PDF has no inline image
            with ``ImageMask`` set.
    """
    pdf_filepath = './example-image-xobject.pdf'

    # ``with`` closes the file on every path and lets a failed open surface
    # as the real OSError rather than an unbound-``fd`` error from cleanup.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # Extract XObject image.
        page = next(doc.pages())
        print('page.Resources.XObject = {}.'.format(page.Resources.XObject))

        xobj = page.Resources.XObject['img0']
        print('xobj.Type = {}, xobj.Subtype = {}.'.format(xobj.Type, xobj.Subtype))

        # The result is unused here; the call demonstrates the conversion.
        pil_image = xobj.to_Pillow()
        #pil_image.save('./extract-logo.png')

        #--------------------
        # Extract Images: a very simple way.
        viewer = SimplePDFViewer(fd)
        viewer.render()

        all_page_images = viewer.canvas.images
        if 'img0' in all_page_images:
            img = all_page_images['img0']
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

        all_page_inline_images = viewer.canvas.inline_images
        if all_page_inline_images:
            img = all_page_inline_images[0]
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

    #--------------------
    pdf_filepath = './tutorial-example.pdf'

    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)

        # Extract image masks.
        viewer.navigate(5)
        viewer.render()

        inline_images = viewer.canvas.inline_images
        image_mask = next(img for img in inline_images if img.ImageMask)

        # The result is unused here; the call demonstrates the conversion.
        pil_img = image_mask.to_Pillow()
        #pil_img.save('./mask.png')
Esempio n. 6
0
def init_cmb_from_pdf(month, year=2020):
    """Parse one monthly statement PDF into a DataFrame of transactions.

    The file name is ``FILE_PATH`` formatted with the zero-padded month.
    The strings of every page (minus the first four of each page) are
    collected, cut down to the span between the first ``'记账日'`` and the
    last ``'本期还款总额'``, stripped of the 11 leading title strings and
    reshaped into rows of six fields.

    Args:
        month: Month number; zero-padded to two digits for the file name.
        year: Year prefixed to the transaction dates before they are
            parsed with ``%Y/%m/%d``.  Defaults to 2020, the value that
            was previously hard-coded.

    Returns:
        A DataFrame with the columns ``transaction_date``,
        ``transction_amount``, ``transaction_description`` and ``type``
        (always ``'cmb'``).  The misspelt ``transction_amount`` is kept
        because it is part of the returned schema.

    Raises:
        IndexError: If either marker string is missing from the PDF text.
        ValueError: If the remaining strings do not form whole rows of six.
    """
    filename = FILE_PATH.format(str(month).zfill(2))

    # Collect into a plain list and convert once; growing an array with
    # np.append inside the loop copies the whole array on every page.
    page_strings = []
    with open(filename, "rb") as fd:
        doc = PDFDocument(fd)
        page_count = sum(1 for _ in doc.pages())

        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            # The first four strings of each page are skipped.
            page_strings.extend(viewer.canvas.strings[4:])

    records = np.array(page_strings, dtype=str)

    head = np.where(records == '记账日')[0][0]
    tail = np.where(records == '本期还款总额')[0][-1]
    records = records[head:tail]

    # The first 11 strings are the column titles (5 + 6 entries according to
    # the slices this code used to keep for reference); drop them.
    records = records[11:]

    # Source columns, in order: 交易日, 交易摘要, 人民币金额, 卡号末四位, 记账日,
    # 交易地金额.
    # Example row: ['' '掌上生活还款' '-3,011.49' '9978' '07/24' '-3,011.49']
    column_en = ['transaction_date', 'transaction_description', 'transction_amount',
                 'card_number', 'bill_date', 'str_rmb']

    df = pd.DataFrame(records.reshape((len(records) // 6, 6)), columns=column_en)

    df['type'] = 'cmb'

    year_prefix = '{}/'.format(year)
    df['transaction_date'] = df['transaction_date'].apply(
        lambda month_day: year_prefix + month_day)
    # Rows whose date cannot be parsed (e.g. an empty field) become NaT.
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y/%m/%d", errors='coerce')

    df['transction_amount'] = df['transction_amount'].apply(decimal_from_value)

    df = df[['transaction_date', 'transction_amount',
             'transaction_description', 'type']]

    return df
Esempio n. 7
0
def grade_document(document, verbose=False, point_flags=('%', '%')):
    """Return the sum of ``grade_page`` over every page of ``document``.

    ``document`` is handed to both ``PDFDocument`` (to enumerate pages) and
    ``SimplePDFViewer`` (to render them).  With ``verbose`` set, a
    separator and the zero-based page index are printed before each page
    is graded.  ``verbose`` and ``point_flags`` are forwarded to
    ``grade_page`` unchanged.
    """
    pdf = PDFDocument(document)
    renderer = SimplePDFViewer(document)

    grade = 0
    # The page objects themselves are not needed, only their positions.
    for position, _ in enumerate(pdf.pages(), start=1):
        if verbose:
            print('------------------')
            print('Page:', position - 1)

        renderer.navigate(position)
        renderer.render()

        grade += grade_page(renderer.canvas, verbose=verbose, point_flags=point_flags)

    return grade
Esempio n. 8
0
def main():
    """Download the Arkansas county clerks PDF, walk its pages and print the result.

    The downloaded bytes are given to a ``PDFDocument`` (page iteration)
    and a ``SimplePDFViewer`` (string extraction), both are passed to
    ``navigate_pages``, and ``ELECTION_OFFICE_INFO`` is pretty-printed
    afterwards (presumably filled in by ``navigate_pages``).
    """
    source_url = "https://www.sos.arkansas.gov/uploads/elections/ARCountyClerks.pdf"
    pdf_bytes = requests.get(source_url).content

    # The same byte string backs both the parser and the viewer.
    navigate_pages(PDFDocument(pdf_bytes), SimplePDFViewer(pdf_bytes))
    pprint(ELECTION_OFFICE_INFO)
Esempio n. 9
0
def pdf_object_navigation_example():
    """Walk the object tree of the tutorial PDF and print what is found.

    Covers the document catalog and its metadata, the page tree root,
    how missing attributes behave, array indexing, raw versus decoded
    stream content, and locating an object by number and generation.
    """
    pdf_filepath = './tutorial-example.pdf'

    # ``with`` closes the file on every path and lets a failed open surface
    # as the real OSError rather than an unbound-``fd`` error from cleanup.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        catalog = doc.root
        print('catalog.Type = {}.'.format(catalog.Type))
        print('catalog.Metadata.Type = {}, catalog.Metadata.Subtype = {}.'.format(catalog.Metadata.Type, catalog.Metadata.Subtype))

        pages_tree_root = catalog.Pages
        print('pages_tree_root.Type = {}.'.format(pages_tree_root.Type))

        # Attribute names are case sensitive.
        # Missing or non-existing attributes have value of None.
        print('(catalog.type is None) = {}.'.format(catalog.type is None))
        print('(catalog.Metadata.subType is None) = {}.'.format(catalog.Metadata.subType is None))
        print('(catalog.Metadata.UnkNown_AttriBute is None) = {}.'.format(catalog.Metadata.UnkNown_AttriBute is None))

        # If object is an array, access its items by index.
        first_page = pages_tree_root.Kids[0]
        print('first_page.Type = {}.'.format(first_page.Type))
        print('first_page.Contents.Length = {}.'.format(first_page.Contents.Length))

        # If object is a stream, you can get either raw data (deflated in this example) or decoded content.
        raw_data = first_page.Contents.stream
        print('(first_page.Contents.Length == len(raw_data)) = {}.'.format(first_page.Contents.Length == len(raw_data)))
        print('first_page.Contents.Filter = {}.'.format(first_page.Contents.Filter))

        decoded_content = first_page.Contents.filtered
        print('len(decoded_content) = {}.'.format(len(decoded_content)))
        print('decoded_content.startswith(b"BT\n0 0 0 rg\n/GS0 gs") = {}.'.format(decoded_content.startswith(b'BT\n0 0 0 rg\n/GS0 gs')))

        # On the file structure level all objects have unique number and generation to identify them.
        num, gen = 2, 0
        raw_obj = doc.locate_object(num, gen)
        obj = doc.build(raw_obj)
        print('obj.Type = {}.'.format(obj.Type))
Esempio n. 10
0
def get_text(DOI: str) -> str:
    """Download the medRxiv PDF for ``DOI`` and return its text.

    The PDF is fetched from
    ``https://www.medrxiv.org/content/<DOI>v1.full.pdf`` and saved as
    ``pdfs/curr.pdf`` under the current working directory, overwriting any
    previous download.  Every page is rendered and its strings joined,
    one line per page.

    Args:
        DOI: The DOI inserted into the download URL.

    Returns:
        The extracted text with a trailing newline after each page.
    """
    fp = Path(Path.cwd() / "pdfs" / "curr.pdf")  # build filepath
    url = "https://www.medrxiv.org/content/" + DOI + "v1.full.pdf"  # build url
    response = requests.get(url)
    fp.write_bytes(response.content)  # save .pdf

    page_texts = []
    # Reopen the exact path that was just written.  The previous hard-coded
    # r"pdfs\curr.pdf" only named the same file on Windows.
    with fp.open("rb") as fd:
        doc = PDFDocument(fd)
        page_count = sum(1 for _ in doc.pages())
        viewer = SimplePDFViewer(fd)

        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            # NOTE(review): text is encoded with the stdout encoding and
            # decoded as windows-1252; kept as-is, but this round trip can
            # garble non-ASCII text - confirm intent.
            page_texts.append(u"".join(viewer.canvas.strings).encode(
                sys.stdout.encoding,
                errors='replace').decode("windows-1252") + '\n')

    return "".join(page_texts)
Esempio n. 11
0
def font_extraction_example():
    """Inspect font ``T1_0`` on the first page of the example PDF.

    Prints the sorted font resource names, some attributes of the
    ``T1_0`` font and of its embedded font file stream, and reads the
    decoded font file data (saving it is left commented out).
    """
    pdf_filepath = './example-font.pdf'

    # ``with`` closes the file on every path and lets a failed open surface
    # as the real OSError rather than an unbound-``fd`` error from cleanup.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        page = next(doc.pages())
        print('sorted(page.Resources.Font.keys()) = {}.'.format(sorted(page.Resources.Font.keys())))

        font = page.Resources.Font['T1_0']
        print('font.Subtype = {}, font.BaseFont = {}, font.Encoding = {}.'.format(font.Subtype, font.BaseFont, font.Encoding))

        font_file = font.FontDescriptor.FontFile
        print('type(font_file) = {}.'.format(type(font_file)))
        print('font_file.Filter = {}.'.format(font_file.Filter))

        # ``filtered`` is the decoded stream content.
        data = font_file.filtered
        #with open('./sample-font.type1', 'wb') as fd2:
        #    fd2.write(data)
Esempio n. 12
0
def GradeDocSafe(document, verbose=False, point_flags=defaut_point_flags):
    """Return the sum of ``GradePage`` over every page of ``document``.

    ``document`` is handed to both ``PDFDocument`` (to enumerate pages) and
    ``SimplePDFViewer`` (to render them).  Each rendered page's strings
    are passed to ``GradePage`` together with ``verbose`` and
    ``point_flags``.  With ``verbose`` set, a separator and the one-based
    page number are printed before each page is graded.
    """
    pdf = PDFDocument(document)
    renderer = SimplePDFViewer(document)

    grade = 0
    # Only the page positions are needed; counting starts at 1 because that
    # is both what gets printed and what ``navigate`` is called with.
    for page_no, _ in enumerate(pdf.pages(), start=1):
        if verbose:
            print('------------------')
            print('Page:', page_no)

        renderer.navigate(page_no)
        renderer.render()

        page_strings = renderer.canvas.strings
        grade += GradePage(page_strings, verbose=verbose, point_flags=point_flags)

    return grade
Esempio n. 13
0
def navigate_pages(doc: PDFDocument, viewer: SimplePDFViewer):
    """Render every page of ``doc`` in ``viewer`` and process its strings.

    For each page the canvas strings are copied, passed to
    ``get_line_ranges``, normalised by ``establish_uniformity`` using those
    ranges, and finally handed to ``get_county_election_office_info``.
    """
    page_number = 0
    for _ in doc.pages():
        page_number += 1

        # Position the viewer on the page, then render it.
        viewer.navigate(page_number)
        viewer.render()

        # Work on a copy so the later steps never touch the canvas's own list.
        raw_strings: List[str] = viewer.canvas.strings.copy()

        line_ranges = get_line_ranges(strings_list=raw_strings)
        uniform_strings = establish_uniformity(strings_list=raw_strings,
                                               line_range_list=line_ranges)

        get_county_election_office_info(strings_list=uniform_strings)
Esempio n. 14
0
def uploaded_file():
    """Store an uploaded PDF, extract its words and insert a record.

    On a POST request the uploaded file is saved under the configured
    ``UPLOAD_FOLDER`` (with a sanitised name), its creation date is read
    from the PDF metadata, the strings of every page are collected and
    grouped into words, and a document with ``name``, ``creationDate`` and
    ``text`` is inserted into ``collection``.

    Returns:
        ``'file uploaded successfully'`` for a POST request; ``None`` for
        any other method.
    """
    if request.method == 'POST':
        f = request.files['file']
        safe_name = secure_filename(f.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
        f.save(filepath)

        # ``with`` guarantees the saved file is closed after extraction.
        with open(filepath, "rb") as fd:
            doc = PDFDocument(fd)
            print(doc.metadata)
            creationDate = doc.metadata.get('CreationDate')

            viewer = SimplePDFViewer(fd)
            textData = []
            for canvas in viewer:
                textData += canvas.strings

        # Group once, after all pages are read.  Doing it inside the page
        # loop re-scanned every earlier page and left the result undefined
        # for a PDF without pages.
        textWords = _group_into_words(textData)

        print(safe_name)
        print(creationDate)
        print(textWords)

        fileDocument = {
            "name": safe_name,
            "creationDate": creationDate,
            "text": textWords
        }

        collection.insert_one(fileDocument)
        return 'file uploaded successfully'


def _group_into_words(fragments):
    """Concatenate consecutive fragments into words, splitting on ``' '`` fragments."""
    words = []
    current = ''
    for fragment in fragments:
        if fragment != ' ':
            current += fragment
        elif current:
            words.append(current)
            current = ''
    # Keep the final word even when the text does not end with a separator.
    if current:
        words.append(current)
    return words
Esempio n. 15
0
def document_tutorial():
    """Tour of ``PDFDocument``: construction, metadata, catalog and pages.

    First builds a document from an in-memory ``BytesIO`` copy of the file,
    then from the open file itself, printing header, metadata, catalog and
    outline details followed by several ways of reaching individual pages.
    """
    from io import BytesIO

    pdf_filepath = './tutorial-example.pdf'

    # Variant 1: read the whole file into memory and parse the buffer.
    with open(pdf_filepath, 'rb') as fd:
        stream = BytesIO(fd.read())
    doc = PDFDocument(stream)

    # Variant 2: parse straight from the open file; everything below needs
    # the file to stay open, hence the enclosing ``with``.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        header = doc.header
        print('doc.header.version = {}.'.format(header.version))
        print('doc.metadata = {}.'.format(doc.metadata))

        catalog = doc.root
        print('doc.root.Type = {}.'.format(catalog.Type))
        print('doc.root.Metadata.Subtype = {}.'.format(catalog.Metadata.Subtype))
        print('doc.root.Outlines.First["Title"] = {}.'.format(catalog.Outlines.First['Title']))

        #--------------------
        # Browse document pages.
        # pages() is called afresh for each access below.
        page_one = next(doc.pages())

        all_pages = list(doc.pages())
        print('len(all_pages) = {}.'.format(len(all_pages)))

        # islice(it, n, n + 1) yields just the item at zero-based index n.
        page_six = next(itertools.islice(doc.pages(), 5, 6))
        page_five = next(itertools.islice(doc.pages(), 4, 5))
        page_eight = all_pages[7]

        parent = page_six.Parent
        print('page_six.MediaBox = {}.'.format(page_six.MediaBox))
        print('page_six.Annots[0].Subj = {}.'.format(page_six.Annots[0].Subj))
        print('page_six.Parent.Type = {}.'.format(parent.Type))
        print('page_six.Parent.Count = {}.'.format(parent.Count))
        print('len(page_six.Parent.Kids) = {}.'.format(len(parent.Kids)))
Esempio n. 16
0
# import tabula
# # df = read_pdf("Activity_Report.pdf")

# tabula.convert_into("Activity_Report.pdf", "output.csv", output_format="csv", pages='all')

# import pdftables_api

# c = pdftables_api.Client('r0tedshcbejj')
# c.xlsx('Acentria Activity Report.pdf', 'Acentria Activity Report.xlsx')
# c.xlsx('Integration/Tam_Weaver/RWI Policy Types Sample 0.pdf', 'Integration/Tam_Weaver/RWI_PolicyTypes_Sample0.xlsx')
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer

# Read the first page and print the document catalog.  The ``with`` block
# closes the report once both have been read, instead of leaving the handle
# open for the rest of the script.
with open("Acentria Activity Report.pdf", "rb") as fd:
    doc = PDFDocument(fd)
    page = next(doc.pages())
    print(doc.root)
# df = tabula.read_pdf('Acentria Activity Report.pdf', pages = 3, lattice = True)[1]
# import os
# import sys
# import pdftables_api
# from PyPDF2 import PdfFileWriter, PdfFileReader

# if len(sys.argv) < 3:
#     command = os.path.basename(__file__)
#     sys.exit('Usage: {} pdf-file page-number, ...'.format(command))

# pdf_input_file = sys.argv[1];
# pages_args = ",".join(sys.argv[2:]).replace(" ","")
# pages_required = [int(p) for p in filter(None, pages_args.split(","))]
Esempio n. 17
0
            elif centre_type:
                print("TYPE 2")
                centre_scraping_one_page(text_on_page, writer, county_name)
                print("We are " + str((i / len(all_pages)) * 100) + "% done.")
            elif chester_type:
                print("TYPE 3")
                ches_scraping_one_page(text_on_page, writer, county_name)
                print("We are " + str((i / len(all_pages)) * 100) + "% done.")
            else:
                print("LAST TYPE")
                beav_scraping_one_page(text_on_page, writer, county_name)
                print("We are " + str((i / len(all_pages)) * 100) + "% done.")


#short function to grab the county name from the file name of the input pdf
def get_county(file_name):
    """Return the part of ``file_name`` that precedes its first underscore.

    For example ``"blair_results_2020.pdf"`` gives ``"blair"``.  A name
    without any underscore is returned unchanged; before, that case failed
    with an unbound-variable error.
    """
    county, _, _ = file_name.partition("_")
    return county


if __name__ == "__main__":
    file_name = "blair_results_2020.pdf"
    # The county name is the part of the file name before the first "_".
    county = get_county(file_name)
    # Keep the PDF open only for the duration of the scrape; ``with`` closes
    # it even if the scrape raises.
    with open(file_name, "rb") as fd:
        doc = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        big_scrape(viewer, doc, county)
                number_of_races += 1

    #print(number_of_races)
    if "PRESIDENT OF THE UNITED STATES" in text_on_page:
        presidental_race(text_on_page, csv_writer, office, party)
    elif number_of_races == 1:
        single_race(text_on_page, csv_writer)
    elif any('DELEGATE' in item for item in text_on_page):
        pass
    else:
        double_race_page(text_on_page, csv_writer)


if __name__ == "__main__":
    # Script entry point: open the Bradford results PDF and give the same
    # handle to both the document parser and the viewer.
    fd = open("bradford_results_2020.pdf", "rb")
    doc = PDFDocument(fd)
    viewer = SimplePDFViewer(fd)
    # Lookup lists of party codes, office titles and candidate names.
    # NOTE(review): the code that consumes them is not visible in this
    # excerpt.
    parties = ["DEM", "REP", "NPA"]
    offices = [
        "PRESIDENT OF THE UNITED STATES", "ATTORNEY GENERAL",
        "AUDITOR GENERAL", "STATE TREASURER"
    ]
    presidential_candidates = [
        "BERNIE SANDERS", "JOSEPH R. BIDEN", "TULSI GABBARD",
        "DONALD TRUMP  (W)", "Total", "Write-in", "DONALD J. TRUMP",
        "ROQUE ROCKY DE LA FUENTE", "BILL WELD", "BERNIE SANDERS (W)"
    ]
    # Materialise every page object of the document into a list.
    all_pages = [p for p in doc.pages()]
    # newline='' is how the csv module's documentation says to open a file
    # that is handed to csv.writer.
    with open('20200602__pa__primary__bradford__precinct.csv', 'w',
              newline='') as csvfile:
        writer = csv.writer(csvfile)
        # NOTE(review): the excerpt ends here; whatever writes rows through
        # ``writer`` (and closes ``fd``) is not visible.
def main():
    """Extract the XObject images of a PDF and save the suitably sized ones.

    Command-line driven.  For every page in ``[first_page, last_page)``
    (zero-based), each XObject whose key contains ``--image_string`` or
    ``"im"`` is converted to a Pillow image.  Images strictly inside the
    min/max width and height limits are saved as
    ``<output>/page<i>_<key>.png`` and, unless ``--make_transparent`` is
    off, passed to ``_do_transparent``.

    Invalid arguments are reported through ``parser.error`` (exit status 2)
    rather than ``assert``, which is stripped when Python runs with ``-O``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    if args.verbose:
        print(f"Args:\n\t{args}")

    # Obtain the base filename
    file_name = args.file_name
    if not os.path.exists(file_name):
        parser.error(f"file not found: {file_name}")
    if file_name[-4:] != ".pdf":
        parser.error("must provide '.pdf' file")
    # Drop the extension, then any directory part written with either kind
    # of slash.
    base_file_name = file_name[:-4].split("/")[-1].split("\\")[-1]
    if not base_file_name:
        parser.error("file name has no base name before '.pdf'")

    # Check pages
    if args.first_page < 0 or args.last_page < 0:
        parser.error("first_page and last_page must not be negative")
    if args.last_page <= args.first_page:
        parser.error("last_page must be greater than first_page")

    # Make the output directory
    if args.output is not None:
        output = args.output
    else:
        output = base_file_name + "_images"
        if args.verbose:
            print(f"No output file given; outputing to {output}/")
    os.makedirs(output, exist_ok=True)

    # ``with`` closes the PDF on every path, including errors.
    with open(file_name, "rb") as fd:
        doc = PDFDocument(fd)

        # Loop over pages
        for i, page in enumerate(doc.pages()):
            if i < args.first_page:
                continue
            if i >= args.last_page:
                # Past the requested range: stop and return normally instead
                # of terminating the interpreter from inside the loop.
                break

            # Missing attributes read as None in pdfreader, so a page with
            # no resources or no XObjects is treated as having none.
            resources = page.Resources
            xobjects = (resources.XObject if resources is not None else None) or {}
            if args.verbose:
                nkeys = len(xobjects.keys())
                print(f"On page {i} -- {nkeys} XObjects detected")

            # Loop over possible image objects
            for key in xobjects.keys():
                if not (args.image_string in key or "im" in key):
                    continue
                xobj = xobjects[key]
                try:
                    pil_image = xobj.to_Pillow()
                except IndexError:
                    if args.verbose:
                        print(
                            f"IndexError raised on page {i} {key} - skipping"
                        )
                    continue

                # Both dimensions must lie strictly between the limits.
                width, height = pil_image.size
                fits_width = args.min_width < width < args.max_width
                fits_height = args.min_height < height < args.max_height
                if not (fits_width and fits_height):
                    continue

                if args.verbose:
                    print(
                        f"Saving image {key} on page{i}: "
                        f"(w,h)={pil_image.size}"
                    )
                pil_image.save(f"{output}/page{i}_{key}.png")
                if args.make_transparent:
                    _do_transparent(args, i, key, pil_image, output)


def _build_arg_parser():
    """Return the argument parser for the image-extraction command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument("file_name")
    parser.add_argument("-o", "--output", default=None,
                        help="sets the output directory")
    parser.add_argument("-v", "--verbose", default=False, type=str2bool,
                        const=True, nargs='?',
                        help="increase output verbosity")
    # Page and size limits are compared with integers, so values given on
    # the command line must be converted; without ``type=int`` they arrive
    # as strings and the comparisons raise TypeError.
    parser.add_argument("-fp", "--first_page", default=0, type=int,
                        help="first page to extract from")
    parser.add_argument("-lp", "--last_page", default=1000, type=int,
                        help="last page to extract from")
    parser.add_argument("-mw", "--min_width", default=200, type=int,
                        help="minimum pixel width")
    parser.add_argument("-mh", "--min_height", default=200, type=int,
                        help="minimum pixel height")
    parser.add_argument("-xw", "--max_width", default=1210, type=int,
                        help="maximum pixel width")
    parser.add_argument("-xh", "--max_height", default=1570, type=int,
                        help="maximum pixel height")
    parser.add_argument("-mt", "--make_transparent", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="flag to make the background transparent")
    parser.add_argument("-wt", "--white_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn white pixels transparent")
    parser.add_argument("-bt", "--black_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn black pixels transparent")
    # NOTE(review): the fuzz options are left untyped because how
    # _do_transparent consumes them is not visible here.
    parser.add_argument("-wf", "--white_fuzz", default=1,
                        help="fuzz percent (0-100) for white transparency")
    parser.add_argument("-bf", "--black_fuzz", default=1,
                        help="fuzz percent (0-100) for black transparency")
    parser.add_argument("-ims", "--image_string", default="Im",
                        help="string that appears in all image names")
    return parser
Esempio n. 20
0
            glyph.attrib['name'] = ''
            glyphs[ET.tostring(glyph).strip()] = name

    shares = []
    data = b''
    glyphOrder = []
    i = 0
    prime = None
    for line in f:
        data += line
        if line == b'%%EOF\n':
            with TmpFile() as pdf, TmpFile() as fontfile, TmpFile() as ttxfile:
                pdf.write(data)

                # Convert font to xml and extract character map
                font = next(PDFDocument(pdf).pages(
                )).Resources.Font['F2+0'].FontDescriptor.FontFile2.filtered
                fontfile.write(font)
                ttx.ttDump(fontfile.name, ttxfile.name, ttx.Options('', 0))
                scrambled = ET.parse(ttxfile.name).getroot()
                cmap = scrambled.find('.//cmap/cmap_format_6')

                # Match glyphs to their names based on their shape
                for c in cmap.findall('map'):
                    code = int(c.attrib['code'], 16)
                    if 0x20 <= code <= 0x7e:
                        name = c.attrib['name']
                        scrambled_glyph = scrambled.find(
                            f'.//glyf/TTGlyph[@name="{name}"]')
                        scrambled_glyph.attrib['name'] = ''
                        glyphOrder.append(
                            glyphs[ET.tostring(scrambled_glyph).strip()])
Esempio n. 21
0
def main():
    """Turn the invoice report PDF into a text dump and an Excel workbook.

    Steps:
      1. Decrypt the input PDF (empty password) with PyPDF2 and write the
         pages to ``temp_pdf.pdf``.
      2. Ask for an output base name, render every page of the decrypted
         copy with pdfreader and write its strings, one per line, to
         ``<name>.txt`` (non-ASCII characters become ``?``).
      3. Scan the text file line by line, tracking the current company,
         vendor and account/center, and collect one row per invoice line.
      4. Write the rows to the ``DATA`` sheet of ``<name>.xlsx`` with
         currency formatting, an autofilter and a frozen header row.
    """
    print('Opening pdf and writing to decrypted copy')
    pdfWriter = PyPDF2.PdfFileWriter()
    # The source must stay open until the writer has written its pages, so
    # the temporary copy is produced inside this ``with`` block.
    with open('AP12176A_20200701_142127.pdf', 'rb') as p_pdf:  #this will change
        pdfReader = PyPDF2.PdfFileReader(p_pdf)
        pdfReader.decrypt('')

        #write all the pages in the unlocked file to a new pdf
        print('Writing to decrypted copy.')
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        # The name entered here is used for the .txt and .xlsx outputs.
        u_name = input('File name for decrypted copy? (excluding ".pdf"):  ')

        print('Finishing writing to decrypted copy')
        with open('temp_pdf.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)  #write to the temporary unlocked pdf

    #open the unlocked pdf.  We'll use PDFDocument and SimplePDFViewer to pull all the text
    with open('temp_pdf.pdf', "rb") as u_pdf:
        doc = PDFDocument(u_pdf)
        reader = SimplePDFViewer(u_pdf)

        print('counting pages in pdf')
        page_ct = sum(1 for _ in doc.pages())

        print('cycling through pages')
        with open(u_name + '.txt', 'w') as g:
            for pg in range(1, page_ct + 1):
                reader.navigate(pg)
                reader.render()
                if pg % 10 == 0:
                    print('processing page ' + str(pg) + ' of ' + str(page_ct))
                # canvas.strings holds one entry per text line.
                for raw_line in reader.canvas.strings:
                    #turn unknown chars into ?
                    ln = raw_line.encode('ascii', 'replace').decode('utf-8')
                    g.write(ln + '\n')

    # Raw strings: the patterns contain \s, \d and \w, which are invalid
    # escape sequences in ordinary string literals.
    group_exp = r'^\s(.{8})\s(.{17})\s\s(.{8})\s\s(.{10})\s\s(.{21})\s(.{10})\s(.{18})\s(.{12})(.+)?$'  #regex for grouping an invoice..
    group_inv = re.compile(group_exp)
    acct_exp = r'ACCOUNT.+(\d{6}).+DEPARTMENT\s+(\w{8})'  #find acct and center
    find_acct = re.compile(acct_exp)
    vend_exp = r'^([\w]{10})\s+(\S.+\S)\s+$'  #find 10 'word' chars, then whitespace, then everything up to the trailing whitespace
    find_vendor = re.compile(vend_exp)
    corp_exp = r'COMPANY\s(\w{4})\s+DATE'  #find the 4 characters between COMPANY and DATE
    find_corp = re.compile(corp_exp)

    # Placeholders used until the first header line of each kind is seen.
    corp = 'XXXX'
    center = 'XXXXXXXX'
    account = 'XXXXXX'
    v_short = 'XXXX'
    v_long = 'XXXXXXXX'

    data_tmp = []

    ct = 0

    with open(u_name + '.txt', 'r') as h:
        for line in h:
            # Deliberately not stripped: vend_exp requires trailing
            # whitespace, which the line's newline provides.
            ln = str(line)

            corp_match = find_corp.search(ln)
            if corp_match:  #look for a new corp
                corp = corp_match.group(1)
                continue

            vendor_match = find_vendor.search(ln)
            if vendor_match:  #look for a new vendor
                # The long name only changes together with the short code.
                if v_short != vendor_match.group(1):
                    v_short = vendor_match.group(1)
                    v_long = vendor_match.group(2)
                continue

            acct_match = find_acct.search(ln)
            if acct_match:  #look for a new acct/center
                center = acct_match.group(2)
                account = acct_match.group(1)
                continue

            if not is_inv(ln):  #look for an invoice
                continue

            tmp = group_inv.search(ln)
            gl_eff = tmp.group(1).strip()
            inv_num = tmp.group(2).strip()
            inv_date = tmp.group(3).strip()
            po = tmp.group(4).strip()
            desc = tmp.group(5).strip()

            q = tmp.group(6).strip()
            qty = q.replace(',', '') if q else 0  #qty

            prod_id = tmp.group(7).strip()

            exp_text = tmp.group(8).strip()  #expense
            exp = float(exp_text.replace(',', '')) if exp_text else 0

            # Group 9 is optional in the pattern and is None when the line
            # ends after the expense column.
            cred_text = (tmp.group(9) or '').strip()  #credit
            cred = float(cred_text.replace(',', '')) if cred_text else 0

            if int(qty) == 0:
                per_unit = 0
            else:
                per_unit = round(float(exp) / float(qty), 2)

            data_tmp.append([
                corp, center, account, v_short, v_long, gl_eff, inv_num,
                inv_date, po, desc, qty, prod_id, exp, cred, per_unit
            ])
            ct += 1
            if ct % 1000 == 0:
                print('Finished adding row ' + str(ct))

    data_cols = [
        'Company', 'Center', 'Account', 'Vendor_Short', 'Vendor_Long',
        'GL_Effective_Date', 'Inv_Number', 'Inv_Date', 'PO', 'Description',
        'Qty', 'ProdID', 'Expense', 'Credit', 'Per_Unit_Cost'
    ]
    col_widths = [14, 12, 12, 18, 35, 22, 20, 13, 12, 31, 10, 20, 15, 15, 18]
    data_inv = pd.DataFrame(data=data_tmp, columns=data_cols)
    data_inv['GL_Effective_Date'] = pd.to_datetime(
        data_inv['GL_Effective_Date'])
    data_inv['Inv_Date'] = pd.to_datetime(data_inv['Inv_Date'])
    data_inv['Account'] = data_inv['Account'].astype('int64')
    data_inv['Qty'] = data_inv['Qty'].astype('int64')
    i_rows = data_inv['Company'].size

    with pd.ExcelWriter(u_name + '.xlsx',
                        engine='xlsxwriter',
                        datetime_format='m/d/yyyy') as writer:
        data_inv.to_excel(writer, sheet_name='DATA', index=False)
        workbook = writer.book
        worksheet = writer.sheets['DATA']
        curr_format = workbook.add_format(
            {'num_format': '$#,##0.00;[Red]($#,##0.00)'})
        worksheet.autofilter('A1:O' + str(i_rows + 1))
        worksheet.freeze_panes(1, 0)  #freeze 1st row
        # Set each column once, width and format together.  Expense and
        # Credit (columns 12 and 13) carry the currency format; setting
        # their width in a later format-less call would replace it.
        currency_columns = {12, 13}
        for col, width in enumerate(col_widths):
            worksheet.set_column(
                col, col, width,
                curr_format if col in currency_columns else None)

    print('DONE - pulled ' + str(i_rows) + ' lines into ' + u_name + '.xlsx')
Esempio n. 22
0
# os.path functions do not expand "~" themselves; without expanduser these
# would refer to a directory literally named "~" under the current directory.
src_dir = os.path.expanduser('~/Olds')  # source directory
des_dir = os.path.expanduser('~/News')  # destination directory
num = 0

# Create the destination directory if it does not exist yet.
os.makedirs(des_dir, exist_ok=True)

if os.path.exists(src_dir):
    dirs = os.listdir(src_dir)  # entries of the source directory
    print(dirs)
    for dirc in dirs:  # for every entry in the directory
        if "pdf" not in dirc:
            continue
        # ``with`` closes each PDF before the next one is opened.
        with open(os.path.join(src_dir, dirc), 'rb') as fd:
            doc = PDFDocument(fd)  # open and build a PDF document object
            viewer = SimplePDFViewer(fd)
            reader = PyPDF2.PdfFileReader(fd)
            print(reader.documentInfo)
            #print(reader.getPage(0).extractText())
            #paper_title = pdf_reader.getDocumentInfo()                         # get the PDF title
            viewer.render()
        #print(doc.root)
        #print(viewer.canvas.text_content)
        #print("num : %s" % num , doc)                                    # show which file is being processed
        # num += 1
        # paper_title = str(paper_title)                                           # title as a string

        # if paper_title.find('/') != -1:       # '/' cannot appear in a file name, replace it with '_'
        #     new_paper_title = paper_title.replace('/','_')
        #     paper_title = new_paper_title
Esempio n. 23
0
def read_file(path_to_file):
    """Open the PDF at ``path_to_file`` and return a rendered viewer.

    The viewer is rendered once, on whatever page a freshly created
    ``SimplePDFViewer`` is positioned (presumably the first), before it is
    returned.  The file handle is not closed here because the returned
    viewer was built on it.
    """
    handle = open(path_to_file, "rb")
    # Built although the result is unused; constructing it still reads from
    # the handle.  NOTE(review): confirm whether this can be dropped.
    PDFDocument(handle)
    pdf_viewer = SimplePDFViewer(handle)
    pdf_viewer.render()
    return pdf_viewer
Esempio n. 24
0
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
from functions import cleanString
from ItemsClass import Item
from datetime import datetime
import json
import re

# Parse the order PDF; the handle stays open for the viewer created below.
fd = open("order1.pdf", "rb")
doc = PDFDocument(fd)
# Count the pages by exhausting the page iterator.
numpages = sum(1 for _ in doc.pages())
viewer = SimplePDFViewer(fd)
strings = []
for num in range(numpages):
    # navigate() is called with one-based page numbers.
    viewer.navigate(num + 1)
    viewer.render()
    # The first four strings of every page are skipped.
    strings.extend(viewer.canvas.strings[4:])

print(strings)

# Parser state for the code that follows.
prev = ""
Items = []
item = Item()
Order = {
    "Items": [],
    "Request": "",
    "Total": "",
    "Customer": "",
    "Delivery": "",
}
requesting = False
Esempio n. 25
0
def main():
    """Interactively convert a PDF in the current directory to a text file.

    Lists the ``.pdf`` / ``.PDF`` files in the working directory and asks the
    user to pick one by number (``q`` quits).  A copy of the chosen file,
    decrypted with an empty password, is written to ``temp_pdf.pdf``; every
    string ``SimplePDFViewer`` reports for each page of that copy is then
    written to ``<output name>.txt``, one string per line, with non-ASCII
    characters replaced by ``?``.  The temporary PDF is removed afterwards,
    including when the conversion fails part-way.
    """
    print('*' * 40 + '\nERT PDF TO TXT CONVERTER\n')
    print('Showing PDF files in ' + os.getcwd())

    # Keep the full file names: rebuilding them as stem + '.pdf' would fail
    # for files listed with an upper-case '.PDF' extension on case-sensitive
    # filesystems.
    files = [name for name in os.listdir() if name.endswith(('.pdf', '.PDF'))]
    f_count = len(files)
    for idx, name in enumerate(files):
        print('(' + str(idx) + ')  ' + name)

    prompt = 'Enter the number corresponding to the target pdf, or q to quit: '
    choice = input(prompt)
    while not is_valid(choice, f_count):
        if choice == 'q':
            print('Quitting.')
            sys.exit()
        print('Invalid choice - try again.')
        choice = input(prompt)
    if choice == 'q':
        # Only reachable if is_valid() accepts 'q'; quit instead of falling
        # through with no file selected.
        print('Quitting.')
        sys.exit()
    pdf_to_open = files[int(choice)]
    print(pdf_to_open)

    temp_pdf = 'temp_pdf.pdf'

    #open the protected pdf and remove the password
    print('Converting to unlocked PDF')
    try:
        with open(pdf_to_open, 'rb') as p_pdf:
            pdfReader = PyPDF2.PdfFileReader(p_pdf)
            pdfReader.decrypt('')
            pdfWriter = PyPDF2.PdfFileWriter()

            #write all the pages in the unlocked file to a new pdf
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

            #prompt for the base name of the output text file
            u_name = input('Output file name? (excluding ".txt"):  ')

            # The source file is kept open while the copy is written, as in
            # the original flow.
            with open(temp_pdf, 'wb') as pdfOutputFile:
                pdfWriter.write(pdfOutputFile)

        #open the unlocked pdf.  We'll use PDFDocument and SimplePDFViewer to pull all the text
        with open(temp_pdf, 'rb') as u_pdf:
            doc = PDFDocument(u_pdf)
            reader = SimplePDFViewer(u_pdf)
            start_time = time.time()
            page_ct = sum(1 for _ in doc.pages())  #count number of pages
            print('Writing ' + str(page_ct) + ' pages to ' + u_name +
                  '.txt ...')

            with open(u_name + '.txt', 'w') as g:
                for pg in range(1, page_ct + 1):  #viewer pages start at 1
                    reader.navigate(pg)
                    reader.render()
                    if pg % 10 == 0:
                        print('Processing page ' + str(pg) + ' of ' +
                              str(page_ct))
                    #canvas.strings is a list with 1 line per element
                    for line in reader.canvas.strings:
                        #turn unknown chars into ?
                        ascii_line = line.encode('ascii', 'replace')
                        g.write(ascii_line.decode('ascii', 'strict') + '\n')
    finally:
        # Never leave the unlocked temporary copy behind, even on failure.
        if os.path.exists(temp_pdf):
            os.remove(temp_pdf)

    print('Saved as ' + u_name + '.txt')
    print("This took %s seconds." % round((time.time() - start_time), 2))
Esempio n. 26
0
async def get_election_offices():
    """Download the PDF at ``URL`` and extract one record per election office.

    The PDF is rendered page by page and its strings are scanned in the order
    the viewer yields them.  County name, municipal address, mailing address,
    clerk name and website are collected as they are encountered; a record is
    emitted when a ``Phone 1:`` line is found, after which all collected
    fields are cleared for the next office.

    The resulting list is also written as JSON to
    ``<ROOT_DIR>/scrapers/wisconsin/wisconsin.json``.

    Returns:
        list[dict]: one dict per office with the keys ``countyName``,
        ``physicalAddress``, ``mailingAddress``, ``phone``,
        ``officeSupervisor``, ``supervisorTitle`` and ``website``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as r:
            text = await r.read()

    # Prep helper vars
    phone = office_supervisor = website = location_name = county_name = ""
    physical_address, mailing_address = {}, {}

    doc = PDFDocument(text)
    viewer = SimplePDFViewer(text)
    mapping = electionsaver.addressSchemaMapping  # loop-invariant
    election_offices = []
    for i, _page in enumerate(doc.pages(), 1):
        viewer.navigate(i)
        viewer.render()
        strings = viewer.canvas.strings
        # This is parsed in the order at which pdf elements are read by the viewer.
        for j, s in enumerate(strings):
            # Addresses continue on the following string; fall back to an
            # empty continuation when the label is the last string on the page.
            following = strings[j + 1] if j + 1 < len(strings) else ""

            if not county_name:
                m = re.search(r"\D+(?=\s-)", s)
                if m:
                    county_name = m.group(0).split(maxsplit=1)[0].capitalize()
                    location_name = f"{county_name} Election Office"

            if not physical_address:
                m = re.search(r"(?<=MUNICIPAL ADDRESS :).*", s)
                if m:
                    physical_address = usaddress.tag(
                        f"{m.group(0)} {following}".title(),
                        tag_mapping=mapping,
                    )[0]
                    # .title() lower-cases the second letter of the state
                    # abbreviation; str.upper() returns a new string, so the
                    # result has to be stored back.
                    physical_address["state"] = physical_address[
                        "state"].upper()
                    physical_address["locationName"] = location_name
            if not mailing_address:
                m = re.search(r"(?<=MAILING ADDRESS :).*", s)
                if m:
                    mailing_address = usaddress.tag(
                        f"{m.group(0)} {following}".title(),
                        tag_mapping=mapping,
                    )[0]
                    mailing_address["state"] = mailing_address[
                        "state"].upper()
                    mailing_address["locationName"] = location_name
            if not phone:
                m = re.search(r"(?<=Phone 1: ).*", s)
                if m:
                    phone = m.group(0)
                    election_offices.append({
                        "countyName": county_name,
                        "physicalAddress": physical_address,
                        "mailingAddress": mailing_address,
                        "phone": phone,
                        "officeSupervisor": office_supervisor,
                        "supervisorTitle": "County Clerk",
                        "website": website,
                    })
                    # reset for next round; the addresses are rebound to
                    # fresh dicts so the next office does not inherit the
                    # previous office's addresses.
                    phone = office_supervisor = website = ""
                    location_name = county_name = ""
                    physical_address, mailing_address = {}, {}
            if not office_supervisor:
                m = re.search(r"(?<=COUNTY CLERK: ).*", s)
                if m:
                    office_supervisor = m.group(0).title()
            if not website:
                m = re.search(r"http.*", s)
                if m:
                    website = m.group(0)

    with open(
            os.path.join(ROOT_DIR, "scrapers", "wisconsin", "wisconsin.json"),
            "w") as f:
        json.dump(election_offices, f)
    return election_offices