Ejemplo n.º 1
0
def text_parsing_example():
    """Render a sample crash-report PDF and extract fields from its markup.

    No page navigation is done, so whatever page the viewer starts on is the
    one rendered. The page's ``text_content`` and ``strings`` are printed,
    the markup is written to ``./example-crash-markdown.txt``, and three
    values are extracted with plain string splitting and printed.

    Raises:
        OSError: If the sample PDF cannot be opened or the dump file cannot
            be written.
        IndexError: If an expected marker (or the parenthesised value after
            it) is missing from the markup.
    """
    pdf_filepath = './example-text-crash-report.pdf'

    def first_group_after(markup, marker):
        # The wanted value is the first "(...)" group that follows the marker.
        return markup.split(marker, 1)[1].split('(', 1)[1].split(')', 1)[0]

    # A with-block replaces the old try/finally: that finally clause touched
    # ``fd`` even when open() itself had failed, hiding the real error behind
    # an unbound-name error.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()

        markdown = viewer.canvas.text_content
        print('markdown = {}.'.format(markdown))

        print('viewer.canvas.strings = {}.'.format(viewer.canvas.strings))

        # Parse PDF markdown.
        print('isinstance(markdown, str) = {}.'.format(isinstance(markdown, str)))

        with open('./example-crash-markdown.txt', 'w') as fd2:
            fd2.write(markdown)

        # From here on any text tool (regex, grep, custom parser) would do;
        # simple splitting is enough for this sample.
        reporting_agency = first_group_after(markdown, '(REPORTING AGENCY NAME *)')
        print('reporting_agency = {}.'.format(reporting_agency))

        local_report_number = first_group_after(markdown, '(LOCAL REPORT NUMBER *)')
        print('local_report_number = {}.'.format(local_report_number))

        crash_severity = first_group_after(markdown, '( ERROR)')
        print('crash_severity = {}.'.format(crash_severity))
Ejemplo n.º 2
0
def parse_vaccinations(filename):
    """Infer vaccination totals from the numbers found in a PDF page.

    Every rendered string is passed through ``clean_count``; strings it
    rejects are skipped. The four largest values are taken, in descending
    order, as total vaccinations, people vaccinated, people fully vaccinated
    and total boosters.

    Args:
        filename: Path of the PDF to read.

    Returns:
        Tuple ``(total_vaccinations, people_vaccinated,
        people_fully_vaccinated, total_boosters)``.

    Raises:
        IndexError: If fewer than four numbers could be extracted.
        ValueError: If the three partial figures do not add up to the total.
    """
    # Read pdf (for metrics)
    with open(filename, mode="rb") as f:
        viewer = SimplePDFViewer(f)
        viewer.render()

    # Infer figures. The loop variable used to be called ``str``, which
    # shadowed the builtin for the rest of the function.
    numbers = []
    for text in viewer.canvas.strings:
        try:
            numbers.append(clean_count(text))
        except Exception:
            # Best effort: non-numeric strings are expected and skipped.
            # (A bare ``except`` would also swallow KeyboardInterrupt.)
            continue
    numbers.sort()

    total_vaccinations = numbers[-1]
    people_vaccinated = numbers[-2]
    people_fully_vaccinated = numbers[-3]
    total_boosters = numbers[-4]

    # Sanity check
    if people_vaccinated + people_fully_vaccinated + total_boosters != total_vaccinations:
        raise ValueError(
            f"people_vaccinated + people_fully_vaccinated + total_boosters != total_vaccinations ({people_vaccinated} + {people_fully_vaccinated} + {total_boosters} != {total_vaccinations})"
        )
    return total_vaccinations, people_vaccinated, people_fully_vaccinated, total_boosters
Ejemplo n.º 3
0
    def __init__(self, pdf_path):
        """Open ``pdf_path`` and prepare a viewer plus the list of its pages.

        Args:
            pdf_path: Path of the PDF file to open.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.pdf_path = pdf_path
        # The handle is deliberately left open on success: the viewer built
        # on it is stored on the instance and presumably reads from it later.
        fd = open(pdf_path, "rb")
        try:
            doc = PDFDocument(fd)
            self.viewer = SimplePDFViewer(fd)
            self.pages = list(doc.pages())
        except Exception:
            # Do not leak the handle when the PDF cannot be parsed.
            fd.close()
            raise
Ejemplo n.º 4
0
def GetSiteKeys():
    """Collect work-order ids from the invoice PDFs in ``./Invoices``.

    Each file in the ``Invoices`` directory under the current working
    directory is rendered; in the space-split text, the token following the
    word ``Order`` is split on ``-`` and its second part is recorded.
    Invoices without such a token are skipped.

    Returns:
        List of work-order id strings. It always starts with the hard-coded
        seed id ``'372856'``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    invoice_directory = pathlib.Path().absolute() / 'Invoices'
    invoice_list = os.listdir(invoice_directory)

    WorkIDList = ['372856']

    print(invoice_list)

    for invoice in invoice_list:
        # The with-block closes each invoice; previously every handle leaked.
        with open(invoice_directory / invoice, "rb") as fd:
            viewer = SimplePDFViewer(fd)
            viewer.render()
            raw_invoice_data = viewer.canvas.strings

        words = listToString(raw_invoice_data).split(' ')
        try:
            work_order = words[words.index('Order') + 1].split('-')[1]
        except (ValueError, IndexError):
            # No 'Order' token, nothing after it, or no '-' in that token:
            # this invoice simply carries no work order.
            continue
        WorkIDList.append(work_order)

    print("Work Orders", WorkIDList)
    return WorkIDList
Ejemplo n.º 5
0
class PDFPageIterator:
    """Iterator that steps through the pages of a PDF one at a time.

    Each ``next()`` advances to the following page and returns the iterator
    itself; ``get_strings()`` then renders the current page on demand.
    Iteration stops when the viewer raises ``PageDoesNotExist``.

    The underlying file stays open while iterating. Call ``close()`` or use
    the object as a context manager to release it.
    """

    def __init__(self, filename):
        """Open ``filename`` for iteration; a falsy name creates no viewer."""
        self._pdf_file = None
        self._pdf_viewer = None
        self._page_number = 0
        self._rendered = False
        if filename:
            # Keep a reference to the handle so it can be closed later;
            # it used to be opened inline and was never closed.
            self._pdf_file = open(filename, 'rb')
            try:
                self._pdf_viewer = SimplePDFViewer(self._pdf_file)
            except Exception:
                self.close()
                raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        try:
            self._go_to_next_pdf_page()
            return self
        except PageDoesNotExist as e:
            raise StopIteration(e)

    def close(self):
        """Close the underlying file, if any. Safe to call more than once."""
        if self._pdf_file is not None:
            self._pdf_file.close()
            self._pdf_file = None

    def get_page_number(self):
        """Return the 1-based number of the current page (0 before the first ``next()``)."""
        return self._page_number

    def get_strings(self):
        """Return the strings of the current page, rendering it at most once."""
        if not self._rendered:
            self._pdf_viewer.render()
            self._rendered = True
        return self._pdf_viewer.canvas.strings

    def _go_to_next_pdf_page(self):
        # The first call only claims page 1 without moving the viewer;
        # every later call moves the viewer forward first.
        if self._page_number != 0:
            self._pdf_viewer.next()
        self._page_number += 1
        self._rendered = False
Ejemplo n.º 6
0
def encrypted_and_password_protected_pdf_tutorial():
    """Open a password-protected sample PDF with both pdfreader entry points.

    Renders the starting page with ``SimplePDFViewer`` and prints its text,
    prints the ``Contents`` of the first page via ``PDFDocument``, then
    constructs ``PDFDocument`` once more and prints any ``ValueError`` it
    raises.

    Raises:
        OSError: If the sample PDF cannot be opened.
    """
    pdf_filepath = './encrypted-with-qwerty.pdf'

    # A with-block replaces try/finally: the old finally clause referenced
    # ``fd`` even when open() had failed.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd, password='******')
        viewer.render()

        text = ''.join(viewer.canvas.strings)
        print('text = {}.'.format(text))

        #--------------------
        doc = PDFDocument(fd, password='******')

        page_one = next(doc.pages())
        print('page_one.Contents = {}.'.format(page_one.Contents))

        #--------------------
        try:
            doc = PDFDocument(fd, password='******')
        except ValueError as ex:
            print('ValueError raised: {}.'.format(ex))
Ejemplo n.º 7
0
    def get_simple_pdf_text(self, file):
        """Extract and normalise the text that follows the last '% Chg' marker.

        The strings of every canvas yielded by the viewer are concatenated.
        Whitespace is collapsed, everything up to the last ``% Chg`` is
        dropped, and a space is inserted after each ``%`` and after each run
        of letters, parentheses or ``&``.

        Args:
            file: Object accepted by ``SimplePDFViewer``.

        Returns:
            The normalised text, or ``""`` when the PDF yields only whitespace.
        """
        viewer = SimplePDFViewer(file)
        viewer.render()
        raw = "".join("".join(page_canvas.strings) for page_canvas in viewer)

        if raw.strip() == "":
            return ""

        text = re.sub(r'\s+', ' ', raw)
        text = text.replace('% Chg', ' % Chg ')
        text = text.split('% Chg')[-1].strip()
        text = text.replace('%', '% ')

        # Collect every match first: the replacements below change the text
        # the pattern would otherwise be scanning.
        word_pattern = r"(?P<name>[a-zA-Z\(\)\&]+)"
        matched_words = [hit["name"] for hit in re.finditer(word_pattern, text)]

        for matched in matched_words:
            text = text.replace(matched, f'{matched} ')

        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
Ejemplo n.º 8
0
 def _text_from_pdf(self, pdf_link: str):
     """Download the PDF at ``pdf_link`` and return its rendered strings joined.

     Args:
         pdf_link: URL of the PDF to download.

     Returns:
         The strings of the rendered page concatenated without separator.
     """
     with tempfile.NamedTemporaryFile() as tf:
         # Use the temporary file's own handle (binary read/write by default)
         # instead of reopening it by name: whether a second open succeeds
         # while the file is still open is platform-dependent.
         tf.write(requests.get(pdf_link).content)
         tf.flush()
         tf.seek(0)
         viewer = SimplePDFViewer(tf)
         viewer.render()
         return "".join(viewer.canvas.strings)
Ejemplo n.º 9
0
def calculate_page_count(filepath):
    """Return the number of pages in the PDF at ``filepath``.

    Pages are probed one by one with ``viewer.navigate`` until the viewer
    reports ``PageDoesNotExist``.

    Note: the previous implementation incremented its counter before the
    failing probe and therefore returned one more than the real page count.

    Args:
        filepath: Path of the PDF file.

    Returns:
        The number of pages that could be navigated to (0 if none).
    """
    with open(filepath, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        pages_found = 0
        while True:
            try:
                viewer.navigate(pages_found + 1)
            except PageDoesNotExist:
                return pages_found
            # Count a page only after navigating to it succeeded.
            pages_found += 1
Ejemplo n.º 10
0
def xobject_image_example():
    """Show three ways of getting images out of sample PDFs.

    First file: reads the ``img0`` XObject from the first page's resources
    and converts it to a Pillow image, then looks the image up again on the
    rendered canvas (``images`` and ``inline_images``).

    Second file: navigates to page 5, renders it and converts the first
    inline image flagged as ``ImageMask`` to a Pillow image.

    Raises:
        OSError: If either sample PDF cannot be opened.
        StopIteration: If the document has no pages, or page 5 of the second
            file has no inline image mask.
    """
    pdf_filepath = './example-image-xobject.pdf'

    # with-blocks replace try/finally: the old finally clauses referenced
    # ``fd`` even when open() itself had failed.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # Extract XObject image.
        page = next(doc.pages())
        print('page.Resources.XObject = {}.'.format(page.Resources.XObject))

        xobj = page.Resources.XObject['img0']
        print('xobj.Type = {}, xobj.Subtype = {}.'.format(xobj.Type, xobj.Subtype))

        pil_image = xobj.to_Pillow()
        #pil_image.save('./extract-logo.png')

        #--------------------
        # Extract Images: a very simple way.
        viewer = SimplePDFViewer(fd)
        viewer.render()

        all_page_images = viewer.canvas.images
        if 'img0' in all_page_images:
            img = all_page_images['img0']
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

        all_page_inline_images = viewer.canvas.inline_images
        if all_page_inline_images:
            img = all_page_inline_images[0]
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

    #--------------------
    pdf_filepath = './tutorial-example.pdf'

    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)

        # Extract image masks.
        viewer.navigate(5)
        viewer.render()

        inline_images = viewer.canvas.inline_images
        image_mask = next(img for img in inline_images if img.ImageMask)

        pil_img = image_mask.to_Pillow()
        #pil_img.save('./mask.png')
Ejemplo n.º 11
0
def read(path):
    """Extract the transaction rows from a Nubank invoice PDF.

    Pages are rendered from page 1 onwards until one contains the string
    ``'TRANSAÇÕES'``. The non-blank strings of that page, minus the first 3
    and last 6, are grouped into rows of three.

    Args:
        path: Path of the invoice PDF.

    Returns:
        A list whose first element is ``['Fatura Nubank']`` followed by one
        three-item list per transaction, or ``False`` when the file cannot
        be read, no transactions page exists, or the rows are incomplete.
    """
    try:
        print('\n=> Nubank Robot: In Progress...')
        with open(path, 'rb') as file:
            viewer = SimplePDFViewer(file)

            viewer.navigate(1)

            while True:
                try:
                    viewer.render()
                    if 'TRANSAÇÕES' in viewer.canvas.strings:
                        break
                    viewer.next()
                except Exception:
                    print(
                        'Nubank Robot: Não foi achado dados de transações na fatura do nubank'
                    )
                    # Give up here. The old handler only printed and looped
                    # again, which never terminated once pages ran out.
                    return False

            content = [s for s in viewer.canvas.strings if s.strip()][3:-6]

            # An incomplete trailing row raises IndexError, which the outer
            # handler turns into a ``False`` result.
            result = [[value, content[index * 3 + 1], content[index * 3 + 2]]
                      for index, value in enumerate(content[::3])]

            result.insert(0, ['Fatura Nubank'])

            print('     * Nubank Robot: Done\n')

            return result
    except Exception:
        # Top-level boundary: callers expect False rather than an exception.
        print('Nubank Robot: Tivemos um erro ao ler a fatura do nubank\n')
        return False
Ejemplo n.º 12
0
def create_sample_pdf(pdf_path):
    """Load ``pdf_path`` with several PDF libraries; currently returns nothing.

    The function only exercises the readers (``SimplePDFViewer``,
    ``PdfReader``, ``PdfFileReader``) on the same file. None of the values it
    collects are used or returned yet.

    Args:
        pdf_path: Path of the PDF to load.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The with-block closes the handle, which used to be left open.
    with open(pdf_path, "rb") as fd:
        reader_viewer = SimplePDFViewer(fd)
        reader_viewer.render()
        markdown = reader_viewer.canvas.text_content
        pdf_str = reader_viewer.canvas.strings

    rw_viewer = PdfReader(pdf_path)
    rw_content = rw_viewer.pages[0].Contents.stream
    pdf = PdfFileReader(pdf_path)
    pdf_writer = PdfFileWriter()
    report_page = pdf.getPage(0)
    report_page.extractText()

    return
Ejemplo n.º 13
0
def navigate_pages(doc: PDFDocument, viewer: SimplePDFViewer):
    """Render every page and feed its strings to the election-office parser.

    Args:
        doc: Document used only to enumerate the pages.
        viewer: Viewer navigated to each page in turn and rendered.
    """
    for page_number, _page in enumerate(doc.pages(), start=1):
        viewer.navigate(page_number)
        viewer.render()

        # Work on a copy so the viewer's own list is left untouched.
        strings: List[str] = viewer.canvas.strings.copy()

        line_ranges = get_line_ranges(strings_list=strings)
        strings = establish_uniformity(
            strings_list=strings,
            line_range_list=line_ranges,
        )

        get_county_election_office_info(strings_list=strings)
Ejemplo n.º 14
0
def readPDF(pdfFile):
    """Build a ``{label: number}`` mapping from the strings of a PDF page.

    Consecutive non-numeric strings are concatenated into a label; the next
    numeric string becomes that label's value. Reading stops right after the
    label ``'Wright'`` has been stored.

    Args:
        pdfFile: Path of the PDF to read.

    Returns:
        Dict mapping each label to the numeric string that followed it.

    Raises:
        OSError: If the file cannot be opened.
    """
    from pdfreader import SimplePDFViewer

    # The with-block closes the handle, which used to be left open.
    with open(pdfFile, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()
        strings = viewer.canvas.strings

    countyHospitalData = {}
    label_parts = []
    for stringData in strings:
        if not stringData.isnumeric():
            label_parts.append(stringData)
            continue
        label = "".join(label_parts)
        countyHospitalData[label] = stringData
        if label == 'Wright':
            break
        label_parts = []
    return countyHospitalData
Ejemplo n.º 15
0
    def parse(self):
        """Parse every PDF statement in ``self.input_dir`` into one activity list.

        Statements that yield no activities are skipped. The rest are ordered
        by the ``trade_date`` of the activity selected by
        ``get_first_non_ssp_activity_index`` and then flattened.

        Returns:
            A flat list of activities from all statements.

        Raises:
            SystemExit: With code 1 when no statement files are found.
        """
        statement_files = list_statement_files(self.input_dir, "pdf")
        if not statement_files:
            logger.error(f"No statement files found.")
            raise SystemExit(1)

        logger.info(
            f"Collected statement files for processing: {statement_files}.")

        per_statement = []
        for path in statement_files:
            logger.debug(f"Processing statement file[{path}]")

            with open(path, "rb") as fd:
                extracted = self.extract_activities(SimplePDFViewer(fd))
            if extracted:
                per_statement.append(extracted)

        def ordering_date(activities):
            anchor = self.get_first_non_ssp_activity_index(activities)
            return activities[anchor]["trade_date"]

        per_statement.sort(key=ordering_date)

        flattened = []
        for activities in per_statement:
            flattened.extend(activities)
        return flattened
Ejemplo n.º 16
0
def form_text_extraction_example():
    """Render a sample PDF and print text from the page and one of its forms.

    Prints whether the page text contains ``'Farmworkers and Laborers'``,
    the sorted names of the forms on the canvas, and the joined strings of
    the form named ``'Fm9'``.

    Raises:
        OSError: If the sample PDF cannot be opened.
        KeyError: If the canvas has no form named ``'Fm9'`` (assuming
            ``forms`` behaves like a dict).
    """
    pdf_filepath = './example-form.pdf'

    # A with-block replaces try/finally: the old finally clause referenced
    # ``fd`` even when open() itself had failed.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)
        viewer.render()

        plain_text = ''.join(viewer.canvas.strings)
        print('("Farmworkers and Laborers" in plain_text) = {}.'.format('Farmworkers and Laborers' in plain_text))

        print('sorted(list(viewer.canvas.forms.keys())) = {}.'.format(sorted(viewer.canvas.forms.keys())))

        form9_canvas = viewer.canvas.forms['Fm9']
        print('"".join(form9_canvas.strings) = {}.'.format(''.join(form9_canvas.strings)))
Ejemplo n.º 17
0
class ParserInterface:
    """Base for PDF parsers: holds a viewer and the page list of one PDF."""

    def __init__(self, pdf_path):
        """Open ``pdf_path`` and prepare a viewer plus the list of its pages.

        Args:
            pdf_path: Path of the PDF file to open.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.pdf_path = pdf_path
        # The handle stays open on success because ``contains`` keeps
        # navigating and rendering through the viewer built on it.
        fd = open(pdf_path, "rb")
        try:
            doc = PDFDocument(fd)
            self.viewer = SimplePDFViewer(fd)
            self.pages = list(doc.pages())
        except Exception:
            # Do not leak the handle when the PDF cannot be parsed.
            fd.close()
            raise

    def contains(self, msg: str, page: int) -> bool:
        """Return whether ``msg`` is exactly one of the strings on ``page``.

        This is a membership test against the rendered strings, not a
        substring search. The viewer is left positioned on ``page``.
        """
        self.viewer.navigate(page)
        self.viewer.render()
        return msg in self.viewer.canvas.strings

    def process(self, show_progress: bool) -> PDFContents:
        """Placeholder with no implementation here; presumably overridden by subclasses."""
        pass
Ejemplo n.º 18
0
def main():
    """Download the Arkansas county-clerks PDF, parse it and print the result."""
    response = requests.get(
        "https://www.sos.arkansas.gov/uploads/elections/ARCountyClerks.pdf")
    pdf_bytes = response.content

    # The same bytes feed both objects: the document enumerates the pages,
    # the viewer reads the strings on each of them.
    document = PDFDocument(pdf_bytes)
    page_viewer = SimplePDFViewer(pdf_bytes)

    navigate_pages(document, page_viewer)
    pprint(ELECTION_OFFICE_INFO)
Ejemplo n.º 19
0
def parse_vaccinations(filename):
    """Read total, first-dose and second-dose counts from a PDF page.

    The rendered strings are split into three sections at the labels for the
    total, dose 1 and dose 2; the largest numeric string in each section is
    taken as that section's figure.

    Args:
        filename: Path of the PDF to read.

    Returns:
        Tuple ``(total_vaccinations, dose_1, dose_2)``.

    Raises:
        ValueError: If a label is missing, a section has no number, or the
            two doses do not add up to the total.
    """
    with open(filename, mode="rb") as f:
        viewer = SimplePDFViewer(f)
        viewer.render()
    strs = viewer.canvas.strings

    def largest_figure(section):
        return max(int(item) for item in section if item.isnumeric())

    # Section boundaries
    start_total = strs.index("ümumi sayı")
    start_dose_1 = strs.index("1-ci mərhələ üzrə ")
    start_dose_2 = strs.index("2-ci mərhələ üzrə ")

    # Metrics
    total_vaccinations = largest_figure(strs[start_total:start_dose_1])
    dose_1 = largest_figure(strs[start_dose_1:start_dose_2])
    dose_2 = largest_figure(strs[start_dose_2:])

    # Sanity check
    if dose_1 + dose_2 == total_vaccinations:
        return total_vaccinations, dose_1, dose_2
    raise ValueError(
        f"Apparently, dose_1 + dose_2 != total_vaccinations ({dose_1} + {dose_2} != {total_vaccinations})"
    )
Ejemplo n.º 20
0
def read(path):
    """Extract the transaction rows from page 2 of a Neon invoice PDF.

    The page's strings are joined and the part between the table header and
    the "Fique atento" footer is kept. Separators are inserted around dates
    and currency amounts, the card-type column and ``R$`` markers are
    removed, and the remaining fields are grouped into rows of three.

    Args:
        path: Path of the invoice PDF.

    Returns:
        A list whose first element is ``['Fatura neon']`` followed by one
        three-item list per transaction.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If the extracted fields do not form complete rows.
    """
    print('=> Neon Robot: In Progress...')
    with open(path, 'rb') as file:
        viewer = SimplePDFViewer(file)

        viewer.navigate(2)
        viewer.render()

        full_string = ''.join(viewer.canvas.strings)

        # All patterns are raw strings now. The old plain literals relied on
        # invalid escapes such as "\$" and "\d" being passed through, which
        # current Python flags with a SyntaxWarning. The compiled patterns
        # are unchanged.
        re_pattern = r'(.*R\$CartãoData)(.*)(Fique atento:Pagamento Mínimo:.*)'

        bill_string = sub(re_pattern, r'\2', full_string)
        after_date_spaces = sub(r'(.\d{2}\/\d{2}\/\d{4})(.)', r'\1--space--\2',
                                bill_string)
        before_date_spaces = sub(r'(.)(\d{2}\/\d{2}\/\d{4})', r'\1--space--\2',
                                 after_date_spaces)
        currency_spaces = sub(r'(.)(R\$\d)', r'\1--space--\2',
                              before_date_spaces)
        remove_card_column = sub(r'(Físico|Virtual)', '', currency_spaces)
        remove_currency_string = sub(r'R\$', '', remove_card_column)

        bill_list = remove_currency_string.split('--space--')

        content = [field for field in bill_list if field.strip()]

        result = [[value, content[index * 3 + 1], content[index * 3 + 2]]
                  for index, value in enumerate(content[::3])]

        result.insert(0, ['Fatura neon'])

        print('     * Neon Robot: Done\n')

        return result
Ejemplo n.º 21
0
def get_text_pypdf(DOI:str) -> str:
    """Download the medRxiv PDF for ``DOI`` and return its text in lower case.

    The PDF is saved under ``pdfs/`` next to this module, with the host name
    as a file-name prefix, then rendered page by page. Pages whose rendering
    raises ``OverflowError`` are skipped.

    Args:
        DOI: DOI of the preprint; ``v1.full.pdf`` is appended to build the URL.

    Returns:
        The lower-cased text with one line break after each page, or ``""``
        if anything fails (the error is printed together with the DOI).
    """
    try:
        hostname = socket.gethostname()
        path = pathlib.Path(__file__).parent.absolute()
        name = hostname + str(DOI).replace("/", "") + ".pdf"
        fp = Path(path / "pdfs" / name)  # build filepath
        url = "https://www.medrxiv.org/content/" + str(DOI) + "v1.full.pdf"  # build url
        response = requests.get(url)
        fp.write_bytes(response.content)  # save .pdf

        page_texts = []
        # The with-block closes the handle even when parsing fails;
        # it used to leak on every error path.
        with fp.open("rb") as fd:
            doc = PDFDocument(fd)
            page_total = sum(1 for _ in doc.pages())
            viewer = SimplePDFViewer(fd)
            for page_number in range(1, page_total + 1):
                viewer.navigate(page_number)
                try:
                    viewer.render()
                    raw = "".join(viewer.canvas.strings)
                    # Round-trip through the console encoding so characters
                    # it cannot represent are replaced.
                    cleaned = raw.encode(sys.stdout.encoding, errors='replace').decode("windows-1252")
                    page_texts.append(cleaned + '\n')
                except OverflowError:
                    pass
        return "".join(page_texts).lower()
    except Exception as e:
        # Top-level boundary: report and fall back to an empty text.
        print(e, DOI)
        return ""
Ejemplo n.º 22
0
def parse_statements(statement_files):
    """Parse PDF statements into one flat list of activities.

    Statements that yield no activities are skipped. The rest are ordered by
    the ``trade_date`` of their first activity before being flattened.

    Args:
        statement_files: Iterable of paths to PDF statements.

    Returns:
        A flat list of activities from all statements.
    """
    per_statement = []
    for path in statement_files:
        with open(path, "rb") as fd:
            extracted = extract_activities(SimplePDFViewer(fd))
        if extracted:
            per_statement.append(extracted)

    per_statement.sort(key=lambda group: group[0]["trade_date"])

    flattened = []
    for group in per_statement:
        flattened.extend(group)
    return flattened
Ejemplo n.º 23
0
def page_extractor(filepath, page_number):
    """Return a ``Page`` holding the joined strings of one page of a PDF.

    A progress line is written to stderr once the page has been extracted.

    Args:
        filepath: Path of the PDF file.
        page_number: Page number handed to ``viewer.navigate``.

    Returns:
        ``Page(page_number, text)``.
    """
    with open(filepath, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        viewer.navigate(page_number)
        viewer.render()

        page_text = ''.join(viewer.canvas.strings)
        print(f'extracted page {page_number}', file=sys.stderr)
        return Page(page_number, page_text)
def pdfToText(string: str) -> list:
    """Return the strings of every page of the PDF at path ``string``.

    Pages are rendered in order, starting from the viewer's initial page,
    until the viewer raises ``PageDoesNotExist``.

    Args:
        string: Path of the PDF file.

    Returns:
        One flat list with the strings of all pages.

    Raises:
        OSError: If the file cannot be opened.
    """
    plain_text = []
    # The with-block closes the handle, which used to be left open.
    with open(string, "rb") as fd:
        viewer = SimplePDFViewer(fd)
        try:
            while True:
                viewer.render()
                plain_text.extend(viewer.canvas.strings)
                viewer.next()
        except PageDoesNotExist:
            # Expected: it marks the end of the document.
            pass
    return plain_text
Ejemplo n.º 25
0
def GetFrontPageText(document, ID_page=0):
    """Return the text of one page, trying PyPDF2 first and pdfreader second.

    Args:
        document: Object accepted by both ``PyPDF2.PdfFileReader`` and
            ``SimplePDFViewer``.
        ID_page: Page index passed to PyPDF2's ``getPage``; pdfreader is
            navigated to ``ID_page + 1``.

    Returns:
        The PyPDF2 text when it is non-empty, otherwise the joined strings
        of the page rendered by pdfreader.
    """
    reader = PyPDF2.PdfFileReader(document)
    extracted = reader.getPage(ID_page).extractText()
    if len(extracted) != 0:
        return extracted

    # PyPDF2 produced nothing: fall back to pdfreader.
    viewer = SimplePDFViewer(document)
    viewer.navigate(ID_page + 1)
    viewer.render()
    return ''.join(viewer.canvas.strings)
Ejemplo n.º 26
0
def uploaded_file():
    """Store an uploaded PDF and index its words in the database.

    On POST the uploaded file is saved under ``UPLOAD_FOLDER``, its creation
    date and words are extracted, and a document with ``name``,
    ``creationDate`` and ``text`` is inserted into ``collection``.

    Words are built by concatenating consecutive canvas strings; a string
    equal to a single space ends the current word.

    Returns:
        ``'file uploaded successfully'`` on POST; ``None`` for any other
        request method.
    """

    def split_on_space_items(strings):
        # Concatenate items into words, treating each ' ' item as a separator.
        words = []
        current = ''
        for item in strings:
            if item != ' ':
                current += item
            elif current:
                words.append(current)
                current = ''
        if current:
            # Flush the last word: it used to be dropped whenever the text
            # did not end with a ' ' item.
            words.append(current)
        return words

    if request.method == 'POST':
        f = request.files['file']
        safe_name = secure_filename(f.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
        f.save(filepath)

        # The with-block closes the handle, which used to be left open.
        with open(filepath, "rb") as fd:
            doc = PDFDocument(fd)
            print(doc.metadata)
            creationDate = doc.metadata.get('CreationDate')

            #data methods
            textData = []
            for canvas in SimplePDFViewer(fd):
                textData += canvas.strings

        # Tokenise once, after all pages are collected. The old code redid
        # this for every page and left ``textWords`` unbound when the PDF
        # produced no canvas at all.
        textWords = split_on_space_items(textData)

        print(safe_name)
        print(creationDate)
        print(textWords)

        fileDocument = {
            "name": safe_name,
            "creationDate": creationDate,
            "text": textWords
        }

        collection.insert_one(fileDocument)
        return 'file uploaded successfully'
Ejemplo n.º 27
0
def init_cmb_from_pdf(month, year=2020):
    """Load one month's CMB credit-card statement PDF into a DataFrame.

    The strings of every page (minus the first four per page) are
    concatenated and cut down to the span between the first '记账日' and the
    last '本期还款总额'. After skipping 11 leading entries, the remainder is
    reshaped into rows of six columns.

    Args:
        month: Month number; zero-padded to two digits and formatted into
            ``FILE_PATH``.
        year: Year prefixed to the statement's ``MM/DD`` transaction dates.
            Defaults to 2020, the value that used to be hard-coded.

    Returns:
        DataFrame with columns ``transaction_date`` (datetime, ``NaT`` where
        unparsable), ``transction_amount``, ``transaction_description`` and
        ``type`` (always ``'cmb'``).

    Raises:
        OSError: If the statement file cannot be opened.
        IndexError: If either marker string is missing.
        ValueError: If the records do not divide evenly into six columns.
    """
    filename = FILE_PATH.format(str(month).zfill(2))

    page_strings = []
    # The with-block closes the handle, which used to be left open.
    with open(filename, "rb") as fd:
        doc = PDFDocument(fd)
        page_count = len(list(doc.pages()))

        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            page_strings.extend(viewer.canvas.strings[4:])

    # Build the array once instead of re-allocating it with np.append per page.
    records = np.array(page_strings)

    head = np.where(records == '记账日')[0][0]
    tail = np.where(records == '本期还款总额')[0][-1]
    records = records[head:tail]

    # Skip the 11 leading entries (presumably the Chinese and English column
    # titles; confirm against a real statement).
    records = records[11:]

    # Statement columns in order: '交易日', '交易摘要', '人民币金额',
    # '卡号末四位', '记账日', '交易地金额'.
    column_en = ['transaction_date', 'transaction_description', 'transction_amount',
                 'card_number', 'bill_date', 'str_rmb']
    # Data: ['' '掌上生活还款' '-3,011.49' '9978' '07/24' '-3,011.49']

    df = pd.DataFrame(records.reshape(-1, 6), columns=column_en)

    df['type'] = 'cmb'

    year_prefix = f'{year}/'
    df['transaction_date'] = df['transaction_date'].apply(
        lambda day: year_prefix + day)
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y/%m/%d", errors='coerce')

    # NOTE: the misspelt column name 'transction_amount' is part of the
    # returned frame and is kept for callers.
    df['transction_amount'] = df['transction_amount'].apply(decimal_from_value)

    df = df[['transaction_date', 'transction_amount',
             'transaction_description', 'type']]

    return df
Ejemplo n.º 28
0
def Symptom_pdf():
    """Download the COVID symptom study report and save the images of page 7.

    The PDF is written to ``ss.pdf`` in the current directory. Every image
    on page 7 is saved as ``ss<image name>.png`` and its file name printed.
    """
    url = 'https://covid-assets.joinzoe.com/latest/covid_symptom_study_report.pdf'
    response = requests.get(url=url, proxies={})
    data = response.content

    # The with-block closes the file even if the write fails.
    with open('ss.pdf', 'wb') as symptom_pdf:
        symptom_pdf.write(data)

    # The viewer is fed the downloaded bytes directly, not the saved file.
    viewer = SimplePDFViewer(data)
    viewer.navigate(7)
    viewer.render()
    for k, v in viewer.canvas.images.items():
        image = v.to_Pillow()
        name = 'ss' + k + '.png'
        image.save(name)
        print(name)
Ejemplo n.º 29
0
def parse_statements(statement_files):
    """Parse PDF and CSV statements into one flat list of activities.

    Files are dispatched on their ``.pdf`` / ``.csv`` suffix (case-sensitive).
    Files with any other suffix, or yielding no activities, are skipped. The
    remaining statements are ordered by the ``trade_date`` of their first
    activity before being flattened.

    Args:
        statement_files: Iterable of statement file paths.

    Returns:
        A flat list of activities from all statements.
    """
    statements = []

    for statement_file in statement_files:
        logger.debug(f"Processing statement file[{statement_file}]")

        activities = []

        if statement_file.endswith('.pdf'):
            with open(statement_file, "rb") as fd:
                activities = extract_activities_from_pdf(SimplePDFViewer(fd))
        elif statement_file.endswith('.csv'):
            # newline="" lets the csv module handle line endings itself, so
            # quoted fields with embedded newlines are parsed correctly.
            with open(statement_file, "r", newline="") as fd:
                activities = extract_activities_from_csv(
                    csv.reader(fd, delimiter=","))

        if activities:
            statements.append(activities)

    statements.sort(key=lambda k: k[0]["trade_date"])
    return [activity for activities in statements for activity in activities]
Ejemplo n.º 30
0
def hyperlink_and_annotation_tutorial():
    """Render page 1 of a sample PDF and print its link annotations.

    Prints whether the page text contains ``'http'``, the number of
    annotations on the page, and the URI of every annotation whose subtype
    is ``'Link'``.

    Raises:
        OSError: If the sample PDF cannot be opened.
    """
    pdf_filepath = './annot-sample.pdf'

    # A with-block replaces try/finally: the old finally clause referenced
    # ``fd`` even when open() itself had failed.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)

        viewer.navigate(1)
        viewer.render()

        plain_text = ''.join(viewer.canvas.strings)
        print('"http" in plain_text = {}.'.format('http' in plain_text))

        print('len(viewer.annotations) = {}.'.format(len(viewer.annotations)))

        links = [annot.A.URI for annot in viewer.annotations if annot.Subtype == 'Link']
        print('links = {}.'.format(links))