コード例 #1
0
ファイル: xlsx_creator.py プロジェクト: Dialjini/bankDocs
def create(id, userid):
    """Fill in the requested PDF template for a user and return it as a download.

    Args:
        id: Template selector. ``1`` and ``3`` pick a fillable template
            (``VTB_anketa.pdf`` / ``VTB_spravka.pdf``); ``2`` returns the
            static ``VTB_accept.pdf`` unchanged.
        userid: Passed through to ``docChecker`` and ``docWriter``.

    Returns:
        The response produced by ``send_from_directory``, or the string
        ``'BAD ID'`` when ``id`` is not 1, 2 or 3.
    """
    files_dir = os.path.dirname(__file__) + '/files'

    if id == 1:
        pdf_path = files_dir + '/VTB_anketa.pdf'
    elif id == 2:
        return send_from_directory(directory=os.path.abspath(files_dir),
                                   filename='VTB_accept.pdf')
    elif id == 3:
        pdf_path = files_dir + '/VTB_spravka.pdf'
    else:
        return 'BAD ID'

    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        fields = pdf.getFormTextFields()

        # Read the field tree once; the previous code re-read it for every
        # key it looked at.
        all_fields = pdf.getFields()
        checkboxes = {
            name: field
            for name, field in all_fields.items()
            if 'Check Box' in name
        }

        # Both helpers receive the dictionaries that are written into the
        # pages below.
        docChecker(id=id, userid=userid, fields=checkboxes)
        docWriter(id=id, userid=userid, fields=fields)
        pdf_writer = PdfFileWriter()

        for page in range(pdf.getNumPages()):
            pdf_writer.addPage(pdf.getPage(page))
            written_page = pdf_writer.getPage(page)
            pdf_writer.updatePageFormFieldValues(page=written_page, fields=fields)
            updateCheckboxValues(page=written_page, fields=checkboxes)

        # NOTE(review): every request writes to the same 'downloaded.pdf';
        # concurrent requests could overwrite each other's output.
        with open(files_dir + '/downloaded.pdf', 'wb') as out:
            pdf_writer.write(out)

    return send_from_directory(directory=os.path.abspath(files_dir),
                               filename='downloaded.pdf')
コード例 #2
0
def test_get_form(src, expected, expected_get_fields):
    """Verify both form-reading calls against the expected dictionaries.

    ``src`` is resolved relative to ``RESOURCE_ROOT``. ``expected`` is
    compared with ``getFormTextFields()`` and ``expected_get_fields`` with
    ``getFields()``.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfFileReader(pdf_path)

    text_fields = reader.getFormTextFields()
    assert text_fields == expected

    all_fields = reader.getFields()
    assert all_fields == expected_get_fields
コード例 #3
0
ファイル: main.py プロジェクト: rowanajmarshall/PdfFields
def main(args: List[str]) -> int:
    """Print the name, type and value of every form field in a PDF.

    Args:
        args: Command-line style argument list; ``args[1]`` is the PDF path.

    Returns:
        Always ``0``.
    """
    filename = args[1]
    reader = PdfFileReader(filename)
    # getFields() returns None when the document has no interactive form;
    # treat that as "no fields" instead of crashing.
    fields = reader.getFields() or {}
    for name, field in fields.items():
        print(f"Name: '{name}', "
              f"Type: '{readable_type(field)}', "
              f"Value: {get_value(field)}")
    return 0
コード例 #4
0
ファイル: gui.py プロジェクト: smearle/form2doc
def addInTemplate(filepath):
    """Add a tab to the 'form_templates' frame listing a PDF's form fields.

    The tab and its list box are both named after the file: the last
    '/'-separated component of ``filepath`` with '.pdf' removed. The list
    box shows the sorted result of ``getFields()`` and reports changes to
    ``updateFormTemplateEdit``.
    """
    with app.tabbedFrame('form_templates'):
        basename = filepath.rsplit('/', 1)[-1]
        template_name = basename.replace('.pdf', '')
        with open(filepath, 'rb') as pdf_stream:
            reader = PdfFileReader(pdf_stream)
            with app.tab(template_name):
                app.setStretch('both')
                field_names = sorted(reader.getFields())
                app.addListBox(template_name, field_names, 0, 0, 10, 10)
                app.setListBoxGroup(template_name)
                app.setListBoxChangeFunction(template_name,
                                             updateFormTemplateEdit)
コード例 #5
0
ファイル: pdfparse.py プロジェクト: sethmund/PDFParser
def main():
    """Extract form-field values from one or more PDFs into a CSV file.

    Command line: one or more PDF paths, plus an optional ``-o`` output
    filename (default ``output.csv``). Each PDF becomes one CSV row whose
    columns are the form-field names; a field without a ``/V`` entry is
    written as an empty string.

    Exits with status 0 without writing anything when none of the PDFs
    contains a form field.
    """
    # --- Parse argv ---------------------------------------------------
    arg_parser = argparse.ArgumentParser(description='PDF field parser')

    arg_parser.add_argument('filenames', metavar='file', nargs='+',
                            help='Path to one or more PDF to parse')

    arg_parser.add_argument('-o',
                            metavar='output_file', dest='output',
                            default="output.csv",
                            help='Output filename (default: output.csv)')

    args = arg_parser.parse_args()

    # --- Read in data from all the files ------------------------------
    parsed_data = []

    for file in args.filenames:
        with open(file, "rb") as con:
            # getFields() returns None for a PDF without an interactive
            # form; treat that as an empty row rather than crashing.
            fields = PdfFileReader(con).getFields() or {}

            # Build a fresh dict instead of overwriting entries of the
            # dict that is being iterated.
            parsed_data.append({
                column: str(objects["/V"]) if "/V" in objects else ""
                for column, objects in fields.items()
            })

    # Header is the union of all field names in first-seen order, so a
    # later file with extra fields no longer makes DictWriter raise.
    fieldnames = []
    seen = set()
    for row in parsed_data:
        for column in row:
            if column not in seen:
                seen.add(column)
                fieldnames.append(column)

    # No form data in any file: nothing to write.
    if not fieldnames:
        sys.exit(0)

    # --- Write out data to the CSV file ------------------------------
    with open(args.output, "w") as outfile:
        csvwriter = DictWriter(
            outfile,
            delimiter=",",
            quotechar="\"",
            lineterminator="\n",
            quoting=QUOTE_NONNUMERIC,
            fieldnames=fieldnames,
        )

        csvwriter.writeheader()
        # Columns missing from a row are filled with DictWriter's default
        # restval (empty string).
        csvwriter.writerows(parsed_data)
コード例 #6
0
ファイル: reverser.py プロジェクト: jstarr/pdf-utility
def displaySourceData(window, fsource):
    '''Post the metadata for a pdf file.

    NOTE: setting strict to False prevents PdfReadWarning.
    See https://github.com/mstamy2/PyPDF2/issues/36

    Args:
        window: Mapping of element keys (e.g. '-NUMPAGES-') to objects with
            an ``update`` method.
        fsource: PDF source handed to ``PdfFileReader``.

    Returns:
        The ``PdfFileReader`` created for ``fsource``.
    '''
    PDFSource = PdfFileReader(fsource, strict=False)
    info = PDFSource.getDocumentInfo()

    window['-NUMPAGES-'].update(PDFSource.getNumPages())
    # getFields() takes no source argument: its first positional parameter
    # is an internal field tree, so the file must not be passed here.
    window['-FIELDS-'].update(PDFSource.getFields())
    window['-ENCRYPTED-'].update(PDFSource.isEncrypted)
    window['-AUTHOR-'].update(info.author)
    window['-CREATIONDATE-'].update(str(convertDate(info)))
    window['-CREATOR-'].update(info.creator)
    window['-PRODUCER-'].update(info.producer)
    window['-SUBJECT-'].update(info.subject)
    window['-TITLE-'].update(info.title)
    return PDFSource
コード例 #7
0
def parse_pdf_impl(filenames):
    """Read the form-field values of several PDFs into a pandas DataFrame.

    Args:
        filenames: Iterable of PDF paths.

    Returns:
        A DataFrame with one row per file and one column per form-field
        name. A field without a ``/V`` entry yields an empty string; a PDF
        without any form yields a row with no values.
    """
    # --- Read in data from all the files ------------------------------
    parsed_data = []

    for file in filenames:
        with open(file, "rb") as con:
            # getFields() returns None for a PDF without an interactive
            # form; treat that as an empty row rather than crashing.
            fields = PdfFileReader(con).getFields() or {}

            # Build a fresh dict instead of overwriting entries of the
            # dict that is being iterated.
            parsed_data.append({
                column: str(objects["/V"]) if "/V" in objects else ""
                for column, objects in fields.items()
            })

    # --- return as Pandas DataFrame ------------------------------
    return pd.DataFrame(parsed_data)
コード例 #8
0
def _get_fields_from_pdf(pdf_data_path, radio_btn_group1, radio_btn_group2):
    """Read form-field values from a PDF and convert button groups to indices.

    Args:
        pdf_data_path: Object with an ``open(mode=...)`` method
            (presumably a ``pathlib.Path`` - confirm against callers).
        radio_btn_group1: Sequence searched for the value stored under
            ``_NAME_GROUP1``.
        radio_btn_group2: Sequence searched for the value stored under
            ``_NAME_GROUP2``.

    Returns:
        The mapping produced by ``pair_fields_name_and_val``. The group 1
        and group 2 entries are replaced by their position in the matching
        sequence when found; the group 4 entry is replaced when
        ``_index_from_btn_group4`` returns a non-negative index.
    """
    # Close the stream deterministically; it used to be left open.
    with pdf_data_path.open(mode="rb") as pdf_stream:
        pdf_data_source = PdfFileReader(pdf_stream, strict=False)
        field_values = pair_fields_name_and_val(
            pdf_data_source.getFields(), True)

    groups = (
        (_NAME_GROUP1, radio_btn_group1),
        (_NAME_GROUP2, radio_btn_group2),
    )
    for group_name, group_values in groups:
        try:
            field_values[group_name] = group_values.index(
                field_values.get(group_name))
        except ValueError:
            # Value is not one of the group's options: keep it unchanged.
            pass

    group4_index = _index_from_btn_group4(field_values.get(_NAME_GROUP4))
    if group4_index >= 0:
        field_values[_NAME_GROUP4] = group4_index

    return field_values
コード例 #9
0
ファイル: compile.py プロジェクト: emwhite3/st-pdf-compiler
def read_pdf(filedir, stForms, i):
    """Read a PDF form and store its sanitized field values in ``stForms``.

    Args:
        filedir: Path of the PDF to read.
        stForms: Dict updated in place. The values are stored under '1'
            when the form has an 'employee_responsible_name' field,
            otherwise under '2' + str(i) unless a 'colleges-st10' field is
            present (in which case nothing is stored).
        i: Index used to build the '2...' key.

    A file that cannot be opened or parsed is treated as having no fields.
    """
    skipped_keys = ('sig_univ_employee', 'sig_dept_chair', 'dean_students')
    form_fields = {}
    clean = CleanUtil()
    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(filedir, "rb") as pdf_file:
            # Dictionary of field names to field objects, or None when the
            # PDF has no form.
            fields = PdfFileReader(pdf_file).getFields()
    except Exception:
        # Best effort: an unreadable PDF counts as having no fields.
        # (No longer a bare except, so KeyboardInterrupt/SystemExit
        # propagate.)
        fields = None
    if fields is None:
        fields = {}
    for key, fl in fields.items():
        # key is the form's field name, e.g. 'email'
        if key not in skipped_keys:
            form_fields[fl.name] = clean.sanitize(fl.value, fl.fieldType)
    if 'employee_responsible_name' in form_fields:
        stForms['1'] = form_fields
    elif 'colleges-st10' not in form_fields:
        stForms['2' + str(i)] = form_fields
コード例 #10
0
        return writer


if __name__ == '__main__':
    csv_filename = "EISAutoFill.csv"
    pdf_filename = "EIS 3 Certificate - Autofilled.pdf"

    csvin = os.path.normpath(os.path.join(os.getcwd(), 'in', csv_filename))
    pdfin = os.path.normpath(os.path.join(os.getcwd(), 'in', pdf_filename))
    pdfout = os.path.normpath(os.path.join(os.getcwd(), 'out'))
    data = pd.read_csv(csvin)
    pdf = PdfFileReader(open(pdfin, "rb"), strict=False)
    if "/AcroForm" in pdf.trailer["/Root"]:
        pdf.trailer["/Root"]["/AcroForm"].update(
            {NameObject("/NeedAppearances"): BooleanObject(True)})
    pdf_fields = [str(x) for x in pdf.getFields().keys()
                  ]  # List of all pdf field names
    csv_fields = data.columns.tolist()

    i = 0  #Filename numerical prefix
    for j, rows in data.iterrows():
        i += 1
        pdf2 = PdfFileWriter()
        set_need_appearances_writer(pdf2)
        if "/AcroForm" in pdf2._root_object:
            pdf2._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # Key = pdf_field_name : Value = csv_field_value
        field_dictionary_1 = {
            "Full Name":
コード例 #11
0
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader, pdf
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject
from pprint import pprint
import pandas as pd
import numpy as np
import os

# Source data and the PDF form that accompanies it.
data = pd.read_csv('WestulDatabaseOrd.csv')
pdfpath = os.getcwd()
pdf_new_name = 'Form_Final.pdf'

# The handle is deliberately left open: the reader is kept at module level.
pdfread = PdfFileReader(open(pdf_new_name, 'rb'))
page = pdfread.getFields()

# `pdf` is the PyPDF2.pdf module imported above and has no `trailer`
# attribute; the trailer belongs to the reader instance.
fields = pdfread.trailer
コード例 #12
0
    def pdf(self, fp, csv_row):
        """Collect metadata for the PDF at ``self.pdf_path`` and append it to the report CSV.

        Inserts ``[header, value]`` pairs into ``csv_row`` at positions 4-24
        (encryption, tagging, page count, outline, form fields, text
        statistics, image ratio, OCR-risk score, document info and a text
        sample). It then writes element ``[1]`` of every ``csv_row`` entry as
        one row to ``self.report_folder + self.report_name``, closes ``fp``,
        deletes ``self.pdf_path`` and logs a completion message.

        Each section is wrapped in its own try/except: on failure the error
        text is stored in the affected column(s) and processing continues.

        Args:
            fp: Open file object for the PDF; passed to ``PDFParser`` and
                closed at the end.
            csv_row: List of ``[header, value]`` pairs, extended in place.
        """
        password = ''
        extracted_text = ''
        self.parser = PDFParser(fp)
        # NOTE(review): this stores the PDFDocument class itself, not an
        # instance - presumably self.load_pdf replaces it; confirm.
        self.document_t = PDFDocument
        # Until reassigned below, pf is the PdfFileReader class (no instance).
        pf = PdfFileReader
        # isEncrypted
        try:
            i = 0
            try:
                thread = Thread(target=self.load_pdf,
                                args=(PDFDocument, password))
                thread.start()
                # join() returns silently on timeout and exceptions raised
                # inside the thread are not re-raised here, so the except
                # below only covers errors from creating/starting/joining.
                thread.join(timeout=90)
            except Exception as e:
                print('PDF I/O error: ' + e.__str__())
                row = [
                    self.line_count,
                    'PDF DOCUMENT OBJECT FAILED TO LOAD - ' + e.__str__() +
                    ': ' + self.url,
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                    '',
                ]
                # self.line_count += 1
                report_path = self.report_folder + self.report_name
                # 90 SECONDS or LOAD FAIL
                with open(report_path, 'a', encoding='utf8',
                          newline='') as csv_file:
                    writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
                    # NOTE(review): str.replace returns a new string and the
                    # result is discarded, so this line has no effect.
                    writer.dialect.lineterminator.replace('\n', '')
                    writer.writerow(row)

            # NOTE(review): stop_event is not defined in this method -
            # presumably a module-level event; confirm.
            stop_event.set()
            document = PDFDocument
            # Overrides the line above with whatever self.document_t holds now.
            document = self.document_t
            # Reads the whole file into memory; the handle returned by open()
            # is not closed explicitly.
            pf = PdfFileReader(BytesIO(open(self.pdf_path, 'rb').read()))

            # ENCRYPTION
            if self.parser.doc.encryption is not None:
                csv_row.insert(4, [self.csv_header[4], 'ENCRYPTED'])
                csv_row.insert(5, [self.csv_header[5], 'ENCRYPTED'])
            else:
                csv_row.insert(4, [self.csv_header[4], 'FALSE'])
                csv_row.insert(5, [self.csv_header[5], 'NA'])
        except Exception as e:
            csv_row.insert(4, [self.csv_header[4], 'FAILED: ' + e.__str__()])
            csv_row.insert(5, [self.csv_header[5], 'NA'])
            exit_call = e.__str__() + ' document failed!!'
            print(exit_call)
            pass

        page_count = 0
        # istagged
        # NOTE(review): `document` is only bound inside the try above; if that
        # block failed earlier, the NameError raised here is caught by this
        # section's except and reported in columns 6 and 7.
        try:
            pages = PDFPage.get_pages(document)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            page_no = 0
            istagged = 'FALSE'
            try:
                # document.catalog
                if document.catalog['MarkInfo']:
                    istagged = 'TRUE'
            except Exception as e:
                exit_call = e.__str__() + ' tagged info failed!!'
                print(exit_call)
            page_count = resolve1(document.catalog['Pages'])['Count']
            csv_row.insert(6, [self.csv_header[6], istagged])
            csv_row.insert(7, [self.csv_header[7], page_count])
        except Exception as e:
            csv_row.insert(6, [self.csv_header[6], 'IsTagged: ' + e.__str__()])
            csv_row.insert(7,
                           [self.csv_header[7], 'Page Count: ' + e.__str__()])
            exit_call = e.__str__() + ' tagged info failed!!'
            print(exit_call)
        # TOC
        try:
            if pf.outlines:
                csv_row.insert(8, [self.csv_header[8], 'TRUE'])
                '''pdf_path_toc = self.document_folder + pdf_name + '_toc.txt'
                places_list = pf.outlines

                with open(pdf_path_toc, 'w') as filehandle:
                    filehandle.writelines("%s\n" % place for place in places_list)
                filehandle.close()'''
            else:
                csv_row.insert(8, [self.csv_header[8], 'FALSE'])
        except Exception as e:
            csv_row.insert(8,
                           [self.csv_header[8], 'TOC FAILED: ' + e.__str__()])
            exit_call = e.__str__() + ' toc info failed!!'
            print(exit_call)
        # isForm, fields,
        try:
            if pf.getFields():
                csv_row.insert(9, [self.csv_header[9], 'TRUE'])
                csv_row.insert(10,
                               [self.csv_header[10],
                                pf.getFields().__len__()])
            else:
                csv_row.insert(9, [self.csv_header[9], 'FALSE'])
                csv_row.insert(10, [self.csv_header[10], 0])
        except Exception as e:
            csv_row.insert(9, [self.csv_header[9], 'FORMS: ' + e.__str__()])
            csv_row.insert(10, [self.csv_header[10], 'FIELDS: ' + e.__str__()])
            exit_call = e.__str__() + ' forms failed!!'
            print(exit_call)
        # tables
        csv_row.insert(11, [self.csv_header[11], 'NOT RUN'])
        write_clip = ''
        word_count = 0
        words_per_page = 0
        char_count = 0
        chars_per_word = 0
        image_count = 0
        # TODO: write 3 page sample and word count
        # Text is only extracted for documents with fewer than 50 pages; the
        # sample keeps the pages with index 0-3.
        try:
            if pf.getNumPages() < 50:
                for page in range(pf.getNumPages()):
                    p = pf.getPage(page)
                    text_clip = p.extractText().encode('UTF-8')
                    text_clip = BytesIO(text_clip).read().__str__()[2:]
                    count_clip = re.findall(r"[^\W_]+", text_clip,
                                            re.MULTILINE)
                    word_count += len(count_clip)
                    char_count += len(text_clip)
                    if page <= 3:
                        write_clip += '[ PAGE ' + (page +
                                                   1).__str__() + ' START ] '
                        write_clip += text_clip.replace('\n', '').replace(
                            ',', ' ').replace('"', '')
                        write_clip += '[ PAGE ' + (page +
                                                   1).__str__() + ' END ]'
            else:
                write_clip = 'OVER 50 PAGES - SAMPLE SKIPPED'
        except Exception as e:
            # On failure the counters are replaced by the error text, so the
            # metric computations below operate on strings.
            exit_call = e.__str__() + ' :: TEXT sample failed!!'
            write_clip = exit_call
            word_count = exit_call
            char_count = exit_call
            print(exit_call)
        # TODO: Words/chars per page
        try:
            if not word_count == 0:
                chars_per_word = char_count / word_count
            else:
                chars_per_word = 0
            if not page_count == 0:
                words_per_page = word_count / page_count
            else:
                words_per_page = 0
        except Exception as e:
            exit_call = e.__str__() + ' :: WORD METRICS failed!!'
            chars_per_word = exit_call
            words_per_page = exit_call
            print(exit_call)
        # TODO: Add to row
        i = 12
        try:
            csv_row.insert(i, [self.csv_header[i], word_count.__str__()])
        except Exception as e:
            csv_row.insert(i,
                           [self.csv_header[i], 'WORD_COUNT: ' + e.__str__()])
        i = 13
        try:
            csv_row.insert(i, [self.csv_header[i], char_count.__str__()])
        except Exception as e:
            csv_row.insert(i,
                           [self.csv_header[i], 'CHAR_COUNT: ' + e.__str__()])
        i = 14
        try:
            csv_row.insert(i, [self.csv_header[i], words_per_page.__str__()])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'WPP: ' + e.__str__()])
        i = 15
        try:
            csv_row.insert(i, [self.csv_header[i], chars_per_word.__str__()])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'CPP: ' + e.__str__()])

        # TODO: IMAGES
        i = 16
        # The triple-quoted block below is a disabled implementation kept as a
        # string literal: nothing is inserted at index 16 here and
        # image_count keeps its initial value of 0.
        '''try:
            pdfImages = Globals.base_folder + 'cli-tools\\pdfimages.exe'

            img_folder = self.document_folder + 'images\\'  # + pdf_name[:-4] + '\\'
            if not os.path.exists(img_folder):
                os.makedirs(img_folder)
            # cmd = pdfImages + ' -list ' + '\"' + pdf_path + '\"'
            # output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\n')
            # save images to disk
            cmd = pdfImages + ' -list \"' + self.pdf_path + '\" \"' + ' ' + '\"'
            # subprocess.Popen(cmd, stdout=subprocess.PIPE)
            os.chdir(img_folder)
            image_list = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\r\n')
            # os.remove(img_folder)
            # image_count = output.count('\n')
            image_count = image_list.__len__()
            if image_count > 2:
                # target = open(pdf_path_image, 'w')
                # target.write(image_list)
                # target.close()
                csv_row.insert(i, [self.csv_header[i], (image_count - 2).__str__()])
            elif image_count == 0:
                csv_row.insert(i, [self.csv_header[i], 0])
            else:
                csv_row.insert(i, [self.csv_header[i], 0])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], e.__str__() + ' image info failed!!'])
            exit_call = e.__str__() + ' image info failed!!'
            print(exit_call)'''
        # TODO: IMAGES per page
        i = 17
        # Placeholder: bound to the float type object until computed below.
        percent_img_per_page = float
        try:
            # NOTE(review): this parses as
            # `(not image_count == 0) or (page_count == 0)`; when page_count
            # is 0 the division raises ZeroDivisionError, which the except
            # records as 'IMG: ...'.
            if not image_count == 0 or page_count == 0:
                percent_img_per_page = (float(image_count) /
                                        float(page_count)) * 100
            else:
                percent_img_per_page = 0
            csv_row.insert(i, [self.csv_header[i], percent_img_per_page])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'IMG: ' + e.__str__()])
        # TODO: OCR risk
        # Risk score from 5 (highest) to 0, based on words per page and the
        # image percentage; the first matching threshold wins.
        i = 18
        try:
            if words_per_page == 0 or percent_img_per_page > 3000:
                ocr_risk = 5
            elif words_per_page < 15 or percent_img_per_page > 2000:
                ocr_risk = 4
            elif words_per_page < 40 or percent_img_per_page > 1000:
                ocr_risk = 3
            elif words_per_page < 70 or percent_img_per_page > 425:
                ocr_risk = 2
            elif words_per_page < 80 or percent_img_per_page > 200:
                ocr_risk = 1
            else:
                ocr_risk = 0
            csv_row.insert(i, [self.csv_header[i], ocr_risk])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], 'OCR: ' + e.__str__()])
        # author, creator, producer, subject, title,
        # di falls back to pf itself when documentInfo cannot be read.
        di = pf
        try:
            di = pf.documentInfo
        except Exception as e:
            exit_call = e.__str__() + ' :: DOCUMENT INFO LOAD failed!!'
            print(exit_call)

        # Document info
        # When di is falsy, none of the columns 19-23 are inserted.
        if di:
            # Author
            try:
                i = 19
                if di.author:
                    csv_row.insert(
                        i, [self.csv_header[i],
                            di.author.encode('UTF-8')])
                else:
                    csv_row.insert(i, [self.csv_header[i], 'NULL'])
            except Exception as e:
                csv_row.insert(i,
                               [self.csv_header[i], 'AUTHOR: ' + e.__str__()])
                exit_call = e.__str__() + ' doc info failed!!'
                print(exit_call)
            # Creator
            try:
                i = 20
                if di.creator:
                    csv_row.insert(
                        i, [self.csv_header[i],
                            di.creator.encode('UTF-8')])
                else:
                    csv_row.insert(i, [self.csv_header[i], 'NULL'])
            except Exception as e:
                csv_row.insert(i,
                               [self.csv_header[i], 'CREATOR: ' + e.__str__()])
                # NOTE(review): exit_call is not reassigned in this handler
                # (nor in the three below), so this prints the message left
                # over from an earlier section.
                print(exit_call)
                print('#5.1')
            # Producer
            try:
                i = 21
                if di.producer:
                    csv_row.insert(
                        i, [self.csv_header[i],
                            di.producer.encode('UTF-8')])
                else:
                    csv_row.insert(i, [self.csv_header[i], 'NULL'])
            except Exception as e:
                csv_row.insert(
                    i, [self.csv_header[i], 'PRODUCER: ' + e.__str__()])
                print(exit_call)
            # Subject
            try:
                i = 22
                if di.subject:
                    csv_row.insert(
                        i, [self.csv_header[i],
                            di.subject.encode('UTF-8')])
                else:
                    csv_row.insert(i, [self.csv_header[i], 'NULL'])
            except Exception as e:
                csv_row.insert(i,
                               [self.csv_header[i], 'SUBJECT: ' + e.__str__()])
                print(exit_call)
            # Title
            try:
                i = 23
                if di.title:
                    csv_row.insert(
                        i, [self.csv_header[i],
                            di.title.encode('UTF-8')])
                else:
                    csv_row.insert(i, [self.csv_header[i], 'NULL'])
            except Exception as e:
                csv_row.insert(i,
                               [self.csv_header[i], 'TITLE: ' + e.__str__()])
                print(exit_call)
        # Document clip
        i = 24
        try:
            csv_row.insert(i, [self.csv_header[i], write_clip])
        except Exception as e:
            csv_row.insert(i, [self.csv_header[i], e.__str__()])
        # Write results
        # Only the value (element 1) of each [header, value] pair is written.
        row = []
        for i in range(csv_row.__len__()):
            row.append(csv_row[i][1])
        report_path = self.report_folder + self.report_name
        # COMPLETE WRITE
        with open(report_path, 'a', encoding='utf8', newline='') as csv_file:
            writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
            # NOTE(review): result of replace() is discarded; no effect.
            writer.dialect.lineterminator.replace('\n', '')
            writer.writerow(row)
        # csv_file.close()
        fp.close()
        # The processed PDF is deleted from disk once its row is written.
        os.remove(self.pdf_path)

        # Log close
        msg = (' >>>> PDF complete:[' + self.url + '] ' +
               self.line_count.__str__() + ' ' +
               (datetime.datetime.now().__str__()[:-7]))
        print(msg)
        utils.logline(self.log, msg)
コード例 #13
0
 def process_file(self):
     """Read the PDF at ``self.fp`` and store its parsed form fields in ``self.content``."""
     # Context manager so the file handle is closed once the fields have
     # been parsed; it used to be left open.
     with open(self.fp, 'rb') as stream:
         fields = PdfFileReader(stream).getFields()
         self.content = self.parse_fields(fields)
コード例 #14
0
ファイル: PyPDF_002.py プロジェクト: snzolnikov/PDF
#!/usr/bin/python
from PyPDF2 import PdfFileReader

# Dump assorted information about a single PDF to stdout.
pdf_document = "file.pdf"

with open(pdf_document, "rb") as filehandle:
    pdf = PdfFileReader(filehandle)

    # Document-level metadata and page count.
    info = pdf.getDocumentInfo()
    pages = pdf.getNumPages()
    print('file information: ', info)
    print("number of pages: {:d}".format(pages))

    # Reader-level attributes, printed in a fixed order.
    page1 = pdf.getPage(0)
    print(pdf.getIsEncrypted())
    print(pdf.pageMode)
    print(pdf.getFields())
    print(pdf.stream)
    print(pdf.flattenedPages)

    # First page: its object representation, then its extracted text.
    print(page1)
    print(page1.extractText())
コード例 #15
0
            True)
        return writer

    except Exception as e:
        print('set_need_appearances_writer() catch : ', repr(e))
        return writer


# The doubled separator after "gitprojects" is kept so the runtime paths are
# unchanged; it is now written with valid escapes ("\\\P" contained the
# invalid escape sequence "\P").
csvin = "H:\\gitprojects\\\\PyPDF2-Pandas-PDFFieldUpdater\\in\\data.csv"
infile = "H:\\gitprojects\\\\PyPDF2-Pandas-PDFFieldUpdater\\in\\PatientIntakeForm.pdf"
data = pd.read_csv(csvin)
pdf = PdfFileReader(open(infile, "rb"), strict=False)
# Set /NeedAppearances on the source document's form.
if "/AcroForm" in pdf.trailer["/Root"]:
    pdf.trailer["/Root"]["/AcroForm"].update(
        {NameObject("/NeedAppearances"): BooleanObject(True)})
fields = pdf.getFields()  # Run in console to see Key names for field entry

i = 0  #Filename numerical prefix
for j, rows in data.iterrows():
    outfile = "H:\\gitprojects\\\\PyPDF2-Pandas-PDFFieldUpdater\\out\\"
    i += 1

    pdf2 = PdfFileWriter()
    set_need_appearances_writer(pdf2)
    # Setting the flag once is enough; the identical second block that
    # followed repeated the same update.
    if "/AcroForm" in pdf2._root_object:
        pdf2._root_object["/AcroForm"].update(
            {NameObject("/NeedAppearances"): BooleanObject(True)})
コード例 #16
0
ファイル: autofill.py プロジェクト: cgloudeman/fill_pdf
def get_headers(infile):
    """Print the form-field names of the PDF at *infile*.

    The names are printed as a list; a PDF without an interactive form
    prints an empty list.
    """
    # Context manager so the handle is closed; it used to be left open.
    with open(infile, "rb") as inputStream:
        pdf_reader = PdfFileReader(inputStream, strict=False)
        # getFields() returns None when the document has no form.
        fields = pdf_reader.getFields() or {}
    print('Headers needed for data file: {}'.format(list(fields)))
コード例 #17
0
ファイル: PDFForm2Excel.py プロジェクト: bsuerig/python
def PDFForm2Excel(mypath, outfile):
    """Collect the form-field values of every PDF under *mypath* into an Excel file.

    The first PDF found defines the master list of field names; it must be
    readable, otherwise the function exits. Every PDF then contributes one
    column of ``/V`` values (leading '=' stripped), left-joined on the field
    name. A PDF that cannot be read gets an empty column and is counted as
    an error.

    Command-line options read from the module-level ``parser``:
    ``boolean_recursive`` (also scan sub-folders) and ``boolean_col`` (one
    column per PDF instead of one row per PDF).

    Args:
        mypath: Folder containing the PDFs; converted to an absolute path.
        outfile: Output name, appended directly to the absolute *mypath*
            (no path separator is inserted).
    """
    # Parse the command line once instead of on every option lookup.
    args = parser.parse_args()
    errcount = 0
    mypath = abspath(mypath)

    # Put PDF files into the list pdffiles
    if args.boolean_recursive:
        # recursive: input folder and sub-folders are analyzed
        pdffiles = [
            join(abspath(root), name)
            for root, _dirs, names in walk(mypath)
            for name in names
            if name.endswith('.pdf')
        ]
    else:
        # non recursive: only the input folder is analyzed
        pdffiles = sorted(
            join(mypath, name) for name in listdir(mypath)
            if name.endswith('.pdf') and isfile(join(mypath, name)))

    if not pdffiles:
        # Previously this surfaced as an IndexError on pdffiles[0].
        exit("##no pdf files found## " + mypath)

    # Start with the initial PDF to create the dataframe schema.
    # The first PDF and its form fields must always be readable: they
    # create the master schema for all following PDFs, which must have the
    # same fields in the same order.
    pdf = pdffiles[0]
    # Dataframe object for results
    results = pd.DataFrame()
    # Dataframe for the field name list
    try:
        f = PdfFileReader(pdf)
        fields = f.getFields()
        df = pd.DataFrame([(k, k1, v1) for k, v in fields.items()
                           for k1, v1 in v.items()],
                          columns=['Field', 'Type', pdf])
        df2 = df.loc[df['Type'] == '/V']  # Filter for Values := '/V' only
        df2 = df2.filter(items=['Field', pdf])
        df2 = df2.reset_index(drop=True)  # Reset Row Index
        results = df2.set_index('Field')
        results = results.drop(pdf, axis='columns')
    except Exception:
        exit("##initial pdf read error## " + pdf)

    # Loop through all PDF files in the input folder
    for pdf in pdffiles:
        # Known before the try so the error message can always print it.
        producer = None
        try:
            # Opening the file and reading its metadata are inside the try
            # so that one broken PDF is counted instead of aborting the run.
            f = PdfFileReader(pdf)  # PDF file object
            info = f.getDocumentInfo()
            producer = info.producer if info is not None else None
            fields = f.getFields()
            df = pd.DataFrame([(k, k1, v1) for k, v in fields.items()
                               for k1, v1 in v.items()],
                              columns=['Field', 'Type', pdf])
            df2 = df.loc[df['Type'] == '/V']  # Filter for Values := '/V' only
            df2 = df2.filter(items=['Field',
                                    pdf])  # Add Filename as ColumnHeader
            df2 = df2.set_index('Field')  # Set 'Field' as Row Index
            df2[pdf] = df2[pdf].map(lambda x: x.lstrip(
                '='))  # remove heading '=' from future excel Cells
            results = results.merge(
                df2, on='Field', how='left')  # Write Values to 'results'
            print("read success   " + str(producer) + " " + pdf)
        except Exception:
            results[pdf] = ''  # Create empty Column in results for failed PDF
            print("##read error## " + str(producer) + " " + pdf)
            errcount = errcount + 1
    print('--------------')
    print("Summary: " + str(len(pdffiles)) + " files read with " +
          str(errcount) + " file read Errors")

    # write results to Excel
    target = mypath + outfile
    try:
        # One column per PDF when boolean_col is set, otherwise one row per
        # PDF (transposed).
        table = results if args.boolean_col else results.T
        table.to_excel(target, header=True, index=True)
        print("File " + target + " successfully written")
    except Exception:
        print("Error writing File " + target)
コード例 #18
0
ファイル: __init__.py プロジェクト: Spencer-Sleep/Programs
def cargoDoc():
    """Read shipment details out of the form fields of a cargo manifest PDF.

    First walks every object of a hard-coded manifest PDF and passes each
    one to ``extract``; then opens the PDF at ``specificPath`` and reads
    per-container values (container number, weight, consignee, shipper, ETA,
    ports, description) from its form fields.

    NOTE(review): the values collected in the loop are not used or returned
    in the code visible here - the function looks cut off; confirm against
    the full source.
    """
    # NOTE(review): this handle is never closed in the visible code.
    fp = open(r"C:\Users\ssleep\Documents\Programming\Cargo Docker\Thursday\LCBO\601331975 PARS MANIFESTS.pdf", 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    # Object ids already processed, so each object is handled once.
    visited = set()

    pars = []

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited: continue
            visited.add(objid)
            obj = doc.getobj(objid)
            if obj is None: continue
            # NOTE(review): pars is overwritten on every iteration, so only
            # the result for the last object survives the loop.
            pars = extract(objid,obj)

    # NOTE(review): specificPath is not defined in this function -
    # presumably a module-level name; confirm.
    pdfFileObj = open(specificPath, 'rb')
    pdfReader = PdfFileReader(pdfFileObj)

    fields = pdfReader.getFields()
#     print(len(fields)-15)


    # NOTE(review): the 15 subtracted here is unexplained - presumably the
    # number of top-level fields that are not per-container groups; confirm.
    for i in range(len(fields)-15):
        # Reset all per-container values for this iteration.
        containerNumber = ""
        weight = ""
        consignee = ""
        shipper = ""
        eta = ""
        portOfLoading = ""
        portOfDischarge = ""
        description = ""
        if i == 0:
            # The first entry is read directly from the top-level fields.
#             prefix = str(i) + "."
            containerNumber = fields["Container Row1"].value
            weight = float(fields["Weight KGRow1"].value)
            consignee = fields["Consignee"].value
            shipper = fields["Shipper"].value
            eta = fields["ETA DATE"].value
            portOfLoading = fields["undefined"].value
            portOfDischarge = fields["Port of Discharge"].value
            description = fields["Description of goods"].value
        else:
            # Later entries come from the field whose name equals str(i);
            # its /Kids are matched by their /T name and read via /V.
            for j in list(fields.keys()):
                if j==str(i):
                    for k in list(fields[j]["/Kids"]):
                        try:
                            if(k.getObject()['/T']=="WO"):
                                wo=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Container Row1"):
                                containerNumber=k.getObject()['/V']
                            elif(k.getObject()['/T']=="SizeRow1"):
                                size=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Weight KGRow1"):
                                weight=float(k.getObject()['/V'])
                            elif(k.getObject()['/T']=="Consignee"):
                                consignee=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Shipper"):
                                shipper=k.getObject()['/V']
                            elif(k.getObject()['/T']=="ETA DATE"):
                                eta=k.getObject()['/V']
                            elif(k.getObject()['/T']=="undefined"):
                                portOfLoading=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Port of Discharge"):
                                portOfDischarge=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Description of goods"):
                                description=k.getObject()['/V']
                        except KeyError:
                            # A kid without /T or /V is skipped; the bare
                            # `True` is a no-op expression statement.
                            True
コード例 #19
0
import calendar

now = datetime.datetime.now()
# print() call form works on Python 2 and 3 and matches the other print
# calls in this script; the statement form was a SyntaxError on Python 3.
print(str(now))
year = now.year
month = now.month
day = now.day

# Six months ahead with year rollover: plain `month + 6` produced months
# 13-18 for July-December.
months_ahead = (month - 1) + 6
year_six_later = year + months_ahead // 12
month_six_later = months_ahead % 12 + 1
# Clamp the day so e.g. 31 August maps to the last day of February.
day_six_later = min(day,
                    calendar.monthrange(year_six_later, month_six_later)[1])

date = '{}/{}/{}'.format(day, month, year)
date_six_later = '{}/{}/{}'.format(day_six_later, month_six_later,
                                   year_six_later)
print(date)

#infile = "Test-Sheet-3.pdf"
infile = askopenfilename()
pdf_reader = PdfFileReader(open(infile, "rb"))

dictionary = pdf_reader.getFormTextFields()  # returns a python dictionary
dictionary_2 = pdf_reader.getFields(tree=None, retval=None)

patient_last_name = str(dictionary['Pat_LastName'])
patient_first_name = str(dictionary['Pat_FirstName'])
patient_name = '{} {}'.format(patient_first_name, patient_last_name)
#patient_gender = str(dictionary['Pat_Gender'])
patient_DOB = str(dictionary['Pat_DOB'])

patient_gender = str(dictionary_2['Pat_Gender'])

# NOTE(review): character 102 of the field's string representation is used
# as the gender code; this depends on the exact repr layout - confirm, and
# consider reading the field's value directly instead.
if patient_gender[102] == 'F':
    patient_gender = 'Female'
elif patient_gender[102] == 'M':
    patient_gender = 'Male'

knee_for_analysis = str(dictionary_2['Pat_Side'])
コード例 #20
0
	def write(client, doc):
		"""Fill the PDF template ``doc.file_name`` for ``client`` and save the result.

		The template is read from PDF_TEMPLATE_DIR; the filled copy is written to
		PDF_GENERATED_RESULT_DIR as "<first_name> <last_name>_<doc.file_name>".
		Every check box listed below is set to '/Yes', and most text fields
		receive hard-coded placeholder strings; only a handful are taken from
		``client`` (names, SNILS, INN, passport gender, address parts).

		Both files are opened with ``with`` so the handles are released even if
		filling or writing raises.
		"""
		path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
		with open(path_in_file, 'rb') as inpt:
			# NOTE: the original marks "date or time" here -- presumably a
			# planned suffix for the generated file name.
			clients_file_name = str(client.first_name) + ' ' + str(client.last_name) + \
				'_' + str(doc.file_name)
			p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)

			reads = PdfFileReader(inpt)
			read = reads.getFormTextFields()
			checkboxes = reads.getFields()

			# Default values -- do not change.  Several of them are overwritten
			# further down once the matching check box is found to be '/Yes'.
			read['Text Field 490'] = ' '  # full name, if it was changed
			read['Text Field 473'] = ' '  # advertising
			read['Text Field 475'] = ' '  # other
			read['Text Field 493'] = ' '
			read['Text Field 494'] = ' '
			read['Text Field 492'] = ' '
			read['Text Field 496'] = ' '
			read['Text Field 497'] = ' '
			read['Text Field 498'] = ' '
			read['Text Field 50910'] = ' '
			read['Text Field 50610'] = ' '
			read['Text Field 505'] = ' '
			read['Text Field 504'] = ' '
			read['Text Field 506'] = ' '
			for suffix in range(11, 21, 1):
				read['Text Field 50' + str(suffix)] = ' '

			# Check boxes that get ticked, in the original assignment order.
			checked_boxes = (
				# Borrower / co-borrower
				'Check Box 136',  # borrower
				'Check Box 137',  # co-borrower
				'Check Box 97',  # do not remove
				# Address
				'Check Box 138',  # actual address matches the registration address
				# Basis for residence
				'Check Box 101',  # social tenancy
				'Check Box 102',  # commercial rental
				'Check Box 103',  # ownership
				'Check Box 104',  # living with relatives
				'Check Box 105',  # other, describe in the supplement sheet
				# Marital status
				'Check Box 106',  # married
				'Check Box 107',  # divorced
				'Check Box 108',  # widowed
				'Check Box 109',  # civil (common-law) marriage
				'Check Box 110',  # single
				# Prenuptial agreement
				'Check Box 111',  # exists
				'Check Box 112',  # does not exist
				# Surname was changed
				'Check Box 113',  # yes
				'Check Box 114',  # no
				# Children living with the client -- first child
				'Check Box 115',  # yes
				'Check Box 116',  # no
				# second child
				'Check Box 117',  # yes
				'Check Box 118',  # no
				# third child
				'Check Box 119',  # yes
				'Check Box 120',  # no
				# Education
				'Check Box 121',  # below secondary
				'Check Box 122',  # secondary
				'Check Box 123',  # specialized secondary
				'Check Box 124',  # incomplete higher
				'Check Box 125',  # higher
				'Check Box 126',  # several higher degrees
				'Check Box 127',  # additional higher
				'Check Box 128',  # academic degree
				'Check Box 129',  # MBA
				'Check Box 130',  # other
				# Employment
				'Check Box 131',  # is part of a payroll project
				'Check Box 132',  # is not
				# Place of work
				'Check Box 133',  # on probation
				'Check Box 134',  # not on probation
				'Check Box 139',  # employed, permanent contract
				'Check Box 140',  # employed, fixed-term contract
				'Check Box 141',  # sole proprietor
				'Check Box 142',  # business owner
				# Organization's field of activity
				'Check Box 144',  # army
				'Check Box 145',  # IT
				'Check Box 146',  # consulting
				'Check Box 147',  # medicine
				'Check Box 148',  # science
				'Check Box 149',  # education
				'Check Box 150',  # construction
				'Check Box 151',  # wholesale / retail trade
				'Check Box 152',  # government and administration bodies
				'Check Box 153',  # security services
				'Check Box 154',  # fuel and energy enterprises
				'Check Box 155',  # industry and mechanical engineering
				'Check Box 156',  # social sector
				'Check Box 157',  # transport
				'Check Box 158',  # tourism
				'Check Box 159',  # services
				'Check Box 160',  # finance, banking, insurance
				'Check Box 161',  # other industries
				# Staff size
				'Check Box 162',  # up to 10
				'Check Box 163',  # 11-50
				'Check Box 164',  # 51-100
				'Check Box 165',  # 101-500
				'Check Box 166',  # 501-1000
				'Check Box 167',  # >1000
				# Age of the organization
				'Check Box 168',  # up to 2 years
				'Check Box 169',  # 2 to 5 years
				'Check Box 170',  # over 5 years
				# Additional place of work
				'Check Box 171',  # has one
				'Check Box 172',  # has none
				# Funds (including the down payment)
				'Check Box 17310',  # has
				'Check Box 174',  # has none
				# Car
				'Check Box 175',  # yes
				'Check Box 176',  # no
				# Real estate
				'Check Box 1731011',  # yes
				'Check Box 173',  # no
				# Basis of the property title
				'Check Box 177',  # purchase
				'Check Box 178',  # privatization
				'Check Box 179',  # inheritance
				'Check Box 180',  # gift
				'Check Box 181',  # other
				# Bankruptcy procedure
				'Check Box 182',  # was applied
				'Check Box 183',  # was not applied
				# Alimony obligations
				'Check Box 184',  # yes
				'Check Box 185',  # no
				# Do not edit: accept the terms of the agreement
				'Check Box 186',  # other
				'Check Box 187',  # other
				'Check Box 189',  # consent to personal data processing
				# Representative
				'Check Box 188',  # has a representative
			)
			for box_name in checked_boxes:
				checkboxes[box_name] = '/Yes'

			# Client
			full_name = client.last_name + ' ' + client.first_name + \
				' ' + client.part_name
			read['Text Field 470'] = 'stepen rodstva s zaemschikom'
			read['Text Field 471'] = full_name
			read['Text Field 472'] = client.passport.gender  # male/female
			read['Text Field 474'] = client.snils  # SNILS
			read['Text Field 476'] = client.inn  # INN
			read['Text Field 477'] = 'index'
			read['Text Field 478'] = 'РФ'
			read['Text Field 479'] = 'oblast'
			read['Text Field 480'] = 'rayon'
			read['Text Field 481'] = client.address.city  # locality
			read['Text Field 482'] = client.address.street  # street
			read['Text Field 483'] = client.address.buildingNumber  # house number
			read['Text Field 484'] = 'korpus'
			read['Text Field 485'] = client.address.flat  # flat
			read['Text Field 486'] = 'phone'
			read['Text Field 487'] = 'home phone reg'
			read['Text Field 488'] = 'home phone prozhivanie'
			read['Text Field 489'] = 'work phone'
			read['Text Field 490'] = 'e-mail'
			if checkboxes['Check Box 113'] == '/Yes':  # was the surname changed
				read['Text Field 491'] = 'FIO'
				read['Text Field 492'] = 'god izmeneniya'
			# Children
			read['Text Field 493'] = 'data rozhdeniya 1go rebenka'
			read['Text Field 494'] = 'data rozhdeniya 2go rebenka'

			# Payroll project
			if checkboxes['Check Box 131'] == '/Yes':
				read['Text Field 496'] = 'nomer karty'
			# Job
			if checkboxes['Check Box 136'] == '/Yes':
				read['Text Field 497'] = 's'
				read['Text Field 498'] = 'do'
			if checkboxes['Check Box 138'] == '/Yes':
				read['Text Field 499'] = '% buisness'
			read['Text Field 500'] = 'должность'
			read['Text Field 501'] = 'среднемесячный доход'
			read['Text Field 502'] = "стаж работы на текущем месте, лет"
			read['Text Field 50311'] = 'Стаж по профилю, лет'
			read['Text Field 50411'] = 'Общий стаж работы общий, лет'
			read['Text Field 50510'] = 'Название организации'
			read['Text Field 50610'] = 'инн организации'
			read['Text Field 50710'] = 'фактический адрес'
			read['Text Field 50810'] = 'телефон организации'
			read['Text Field 50910'] = 'добавочный номер'
			read['Text Field 5010'] = 'сайт организации'
			if checkboxes['Check Box 151'] == '/Yes':
				read['Text Field 505'] = 'сфера розничной торговли'
			if checkboxes['Check Box 159'] == '/Yes':
				read['Text Field 504'] = 'уточните сферу'
			if checkboxes['Check Box 161'] == '/Yes':
				read['Text Field 503'] = 'Уточните'

			# Assets
			if checkboxes['Check Box 173'] == '/Yes':
				read['Text Field 5011'] = 'Наличные средства, сумма, руб'
				read['Text Field 5012'] = 'Банк №1'
				read['Text Field 5013'] = 'Банк №2'
				read['Text Field 5014'] = 'Сумма'
				read['Text Field 5015'] = 'Сумма'
			if checkboxes['Check Box 175'] == '/Yes':
				read['Text Field 5016'] = 'марка'
				read['Text Field 5017'] = 'год приобретения'
				read['Text Field 5018'] = 'стоимость по вашей оценке'
			if checkboxes['Check Box 1731011'] == '/Yes':
				read['Text Field 5019'] = 'Тип объекта недвижимости'
				read['Text Field 5020'] = 'Текущая рыночная стоимость(по вашей оценке)'
			if checkboxes['Check Box 181'] == '/Yes':
				read['Text Field 506'] = 'иное'

			# Representative
			if checkboxes['Check Box 188'] == '/Yes':
				read['Text Field 5021'] = 'Фио представителя'
			# Consent to personal data processing
			read['Text Field 5026'] = full_name

			# The output file is only created once all values are prepared.
			with open(p_file_path, 'wb') as outpt:
				writer = PdfFileWriter()
				set_need_appearances_writer(writer)
				for page_number in range(reads.getNumPages()):
					page = reads.getPage(page_number)
					writer.addPage(page)
					updateCheckboxValues(page, checkboxes)
					writer.updatePageFormFieldValues(page, read)
				writer.write(outpt)
コード例 #21
0
    # this is the smallest example of a pdf I could find from the examples at
    # https://stackoverflow.com/questions/17279712/what-is-the-smallest-possible-valid-pdf.
    # it does _not_ successfully parse with pypdf2, but it might be enough to exercise the
    # code enough to get internal imports or caches initialized before the fork. IRL you
    # might like to use a real pdf for this.
    r = PdfFileReader(
        BytesIO(
            codecs.decode(
                b"255044462D312E0D747261696C65723C"
                b"3C2F526F6F743C3C2F50616765733C3C"
                b"2F4B6964735B3C3C2F4D65646961426F"
                b"785B302030203320335D3E3E5D3E3E3E"
                b"3E3E3E",
                "hex",
            )))
except PyPdfError:
    pass

import sys
from cpytraceafl import fuzz_from_here, crashing_excepthook

# Start fuzzing from this point, installing crashing_excepthook as the
# exception hook.  NOTE(review): presumably this is where the fuzzer forks
# (the warm-up comment earlier in this script talks about initializing caches
# "before the fork") -- confirm against the cpytraceafl documentation.
fuzz_from_here(excepthook=crashing_excepthook)

# Parse the file named by the first command-line argument and exercise the
# form-field and XMP metadata readers on it.
with open(sys.argv[1], "rb") as f:
    try:
        r = PdfFileReader(f)
        r.getFields()
        r.getXmpMetadata()
    except PyPdfError:
        # PyPdfError is the library's own rejection of bad input; it is
        # swallowed here so only other exceptions propagate.
        pass
コード例 #22
0
def get_form_fields(infile):
    """Return an OrderedDict mapping each form field name to its '/V' value.

    ``infile`` is the path of the PDF to read.  Fields without a '/V' entry
    map to ''.  A document for which ``getFields()`` returns None yields an
    empty mapping.  The file is closed before returning.
    """
    with open(infile, 'rb') as stream:
        # getFields() may return None when there is no form data.
        fields = PdfFileReader(stream).getFields() or {}
        # Build the result while the stream is still open.
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
コード例 #23
0
	def write(client, doc):
		"""Fill the PDF template ``doc.file_name`` for ``client`` and save the result.

		The template is read from PDF_TEMPLATE_DIR; the filled copy is written to
		PDF_GENERATED_RESULT_DIR as "<first_name> <last_name>_<doc.file_name>".
		A fixed set of check boxes is set to '/Yes'; text fields 'str1'..'str26'
		and 'str62' receive placeholder strings or values taken from ``client``.

		Both files are opened with ``with`` so the handles are released even if
		filling or writing raises.
		"""
		path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
		with open(path_in_file, 'rb') as inpt:
			# NOTE: the original marks "date or time" here -- presumably a
			# planned suffix for the generated file name.
			clients_file_name = str(client.first_name) + ' ' + str(client.last_name) + \
				'_' + str(doc.file_name)
			p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)

			reads = PdfFileReader(inpt)
			read = reads.getFormTextFields()
			checkboxes = reads.getFields()
			checkboxes['chk0'] = '/Yes'

			# 'chk1' (co-borrower) and 'chk2' (guarantor) are intentionally
			# left untouched, as is the borrower-surname field 'str0' that
			# would depend on them.
			checked_boxes = (
				'untitled9',  # flat
				'untitled10',  # house with a land plot
				'untitled11',  # apartments
				'untitled12',  # townhouse
				'untitled13',  # insurance
				'untitled14',  # male
				'untitled15',  # female
			)
			for box_name in checked_boxes:
				checkboxes[box_name] = '/Yes'

			read['str1'] = 'Сумма кредита'
			read['str2'] = 'срок кредита'
			read['str3'] = 'сумма первоначального взноса'
			read['str4'] = 'стоимость объекта'
			read['str5'] = 'запрашиваемая сумма top up'
			read['str6'] = 'регион приобретения недвижимости'
			read['str7'] = client.last_name
			read['str8'] = client.first_name
			read['str9'] = client.part_name
			read['str10'] = 'дата рождения'
			read['str11'] = 'гражданство'
			read['str12'] = 'место рождения'
			read['str13'] = client.snils  # SNILS
			read['str14'] = client.inn  # INN
			read['str15'] = 'фио при изменении'
			read['str16'] = client.passport.serial + ' ' + client.passport.number  # passport series and number
			read['str17'] = 'дата выдачи'
			read['str18'] = 'код подразделения'
			read['str19'] = client.passport._from  # issuing authority
			read['str20'] = 'адрес регистрации'
			# NOTE(review): the original note here reads 'lj,bnm' -- presumably
			# a reminder typed with the wrong keyboard layout; meaning unconfirmed.
			read['str21'] = 'адрес проживания'
			read['str22'] = 'мобильный'
			read['str23'] = 'регистрац'
			read['str24'] = 'тел жит'
			read['str25'] = 'email'
			read['str62'] = 'кол-во детей'
			read['str26'] = 'регистрац'

			# The output file is only created once all values are prepared.
			with open(p_file_path, 'wb') as outpt:
				writer = PdfFileWriter()
				set_need_appearances_writer(writer)
				# The last page is deliberately skipped; the original author
				# notes that the reason for the "- 1" is not yet understood.
				for page_number in range(reads.getNumPages() - 1):
					page = reads.getPage(page_number)
					writer.addPage(page)
					updateCheckboxValues(page, checkboxes)
				# Text values are applied through the first page only.
				writer.updatePageFormFieldValues(reads.getPage(0), read)
				writer.write(outpt)
コード例 #24
0
def return_infile():
    """Open the hard-coded H-15 PDF and return its reader together with its fields.

    Returns a 4-tuple ``(PdfFileReader, fields, infile, pdf)``: the reader
    class itself, the result of ``pdf.getFields()``, the path string and the
    reader instance (created with ``strict=False``).
    """
    from PyPDF2 import PdfFileReader
    # Every backslash is escaped explicitly.  The original literal relied on
    # the invalid escape "\m", which newer Python versions warn about.  The
    # runtime value is unchanged, including the doubled separator before
    # "mark.nations".
    infile = "C:\\Users\\\\mark.nations\\Desktop\\H-15\\h15.pdf"
    # The file handle is intentionally left open: the returned reader is
    # backed by it.
    pdf = PdfFileReader(open(infile, "rb"), strict=False)
    fields = pdf.getFields()
    return PdfFileReader, fields, infile, pdf
コード例 #25
0
def arrive(specificPath):
    """Submit one "arrive" web form per container described in a fillable PDF.

    ``specificPath`` is the path of the PDF.  For each of the first
    ``len(fields) - 15`` indices the portal frame is selected, the container
    data is read from the PDF (top-level fields for index 0, the '/Kids' of
    the field named ``str(i)`` otherwise), typed into the form and submitted
    through the module-level selenium ``driver``.  When the page answers
    "Equipment is already on Terminal", the work order and container number
    are appended to ``testfile`` and the arrive page is re-opened from the
    menu frame; otherwise the confirmation is closed.

    NOTE(review): the constant 15 is taken from the original code --
    presumably the number of top-level fields that are not per-container
    groups; confirm against the PDF template.
    """
    # Keep the PDF open for the whole run: kid objects are fetched with
    # getObject() inside the loop, which may still need the stream.
    with open(specificPath, 'rb') as pdfFileObj:
        pdfReader = PdfFileReader(pdfFileObj)
        fields = pdfReader.getFields()

        for i in range(len(fields) - 15):
            driver.switch_to_default_content()
            driver.switch_to_frame(
                driver.find_element_by_css_selector(
                    "frame[src='portals/portal.asp']"))

            # Reset every value per container.  ``wo`` used to be left
            # unbound (or stale from the previous container) when a group
            # had no "WO" kid.
            wo = ""
            containerNumber = ""
            size = ""
            weight = ""
            otherInfo = ""
            consignee = ""
            if i == 0:
                wo = fields["WO"].value
                containerNumber = fields["Container Row1"].value
                size = fields["SizeRow1"].value
                weight = float(fields["Weight KGRow1"].value)
                otherInfo = fields["Other info"].value
                consignee = fields["Consignee"].value
            elif str(i) in fields:
                for kid in fields[str(i)]["/Kids"]:
                    try:
                        entry = kid.getObject()
                        title = entry['/T']
                        if title == "WO":
                            wo = entry['/V']
                        elif title == "Container Row1":
                            containerNumber = entry['/V']
                        elif title == "SizeRow1":
                            size = entry['/V']
                        elif title == "Weight KGRow1":
                            weight = float(entry['/V'])
                        elif title == "Other info":
                            otherInfo = entry['/V']
                        elif title == "Consignee":
                            consignee = entry['/V']
                    except KeyError:
                        # Kid without a '/T' or '/V' entry: nothing to read.
                        continue

            # The container id is split into a 4-character prefix and the
            # following 7 characters.
            elem = driver.find_element_by_name("container_prefix_dof")
            elem.send_keys(containerNumber[:4])
            elem = driver.find_element_by_name("container_number_dof")
            elem.send_keys(containerNumber[4:11])
            select = Select(driver.find_element_by_name("ddlLoadStatus_dof"))
            select.select_by_visible_text("Load")
            select = Select(driver.find_element_by_name("lineid"))
            select.select_by_visible_text("Hapag-Lloyd Container Line")
            elem = driver.find_element_by_name("ddlSzTyCnt")
            elem.send_keys(size)
            elem = driver.find_element_by_name("cargo_weight")
            elem.send_keys(str(weight))
            select = Select(driver.find_element_by_name("ddWeightUnits"))
            select.select_by_visible_text("Kgs")
            elem = driver.find_element_by_id("CkbCR")
            elem.click()
            # "CkbFR" is clicked unless the consignee is LCBO / LIQUOR
            # CONTROL; if the check itself fails (e.g. consignee is not a
            # string) the box is clicked as a fallback.
            try:
                if not ("LCBO" in consignee or "LIQUOR CONTROL" in consignee):
                    elem = driver.find_element_by_id("CkbFR")
                    elem.click()
            except Exception:
                elem = driver.find_element_by_id("CkbFR")
                elem.click()
            reservation = "import"

            if size == "20R86" or size == "40R96":
                # For these size codes, append the text between
                # "Temperature: " and the decimal part (".<digits> C"),
                # followed by "c".
                # NOTE(review): raises AttributeError when otherInfo does not
                # contain the expected temperature text -- unchanged from the
                # original; decide what the fallback should be.
                m = re.search("Temperature: ", otherInfo)
                n = re.search(r"\.\d+ C", otherInfo[m.end():])
                reservation += otherInfo[m.end():n.start() + m.end()] + "c"

            elem = driver.find_element_by_name("bkg_nbr_dof")
            elem.send_keys(reservation)
            select = Select(driver.find_element_by_name("Line"))
            select.select_by_visible_text("Hapag-Lloyd Container Line")

            elem = driver.find_element_by_name("Submit")
            elem.click()
            # Wait up to 10 s for either the "already on Terminal" message or
            # the "Close" element to show up.
            wait = WebDriverWait(driver, 10)
            wait.until(lambda drv: "Equipment is already on Terminal" in drv.
                       page_source or EC.element_to_be_clickable(
                           drv.find_element_by_name("Close")))
            if "Equipment is already on Terminal" in driver.page_source:
                # Record the rejected container, then navigate back to the
                # arrive page through the menu frame.
                with open(testfile, "a+") as log:
                    log.write("WO: " + wo + "          " + "Container: " +
                              containerNumber + "\n")
                driver.switch_to_default_content()
                driver.switch_to_frame(
                    driver.find_element_by_css_selector(
                        "frame[src='MenuNavFrame.asp?MenuID=10']"))
                elem = driver.find_element_by_css_selector(
                    'a[href*="Gate/VirtualArrive/VirtualArrive.asp"')
                elem.click()
            else:
                elem = driver.find_element_by_name("Close")
                elem.click()
コード例 #26
0
    def make_pdf(self):
        """Fill the form template once per page and concatenate the results.

        For every entry of ``self.pages`` the values in ``self.details``
        followed by each product's reference, lot, quantity and description
        are upper-cased and zipped, in order, onto the template's field
        names.  When ``self.checklist`` is truthy an end page carrying the
        client's name and the current date (Australia/Brisbane) is appended.
        The filled PDFs are concatenated into ``../dynamic/print.pdf``
        relative to THIS_FOLDER.
        """
        #Import dependencies
        from PyPDF2 import PdfFileReader
        from datetime import datetime
        import os
        import pypdftk
        import pytz

        def read_field_names(path):
            """Return the form field names of the PDF at *path*, closing the file."""
            with open(path, "rb") as handle:
                reader = PdfFileReader(handle)
                return list(
                    reader.getFields(tree=None, retval=None,
                                     fileobj=None).keys())

        template_name = os.path.join(THIS_FOLDER, "./pdf_templates/form.pdf")
        # The template is the same for every page: read its field names only
        # once, and only if there is at least one page to fill.
        field_names = None
        pdf_pages = []

        #Cycle through pages
        for page in self.pages:
            if field_names is None:
                field_names = read_field_names(template_name)

            #Make a copy of field_values
            field_values = self.details[:]

            #Add values from each page
            for product in page:
                field_values += [
                    product.reference, product.lot, product.quantity,
                    product.description
                ]

            #Pad out unused fields, zip into dict for writing
            field_values += [""] * (len(field_names) - len(field_values))
            field_dict = dict(
                zip(field_names, (value.upper() for value in field_values)))
            #Fill the template with this page's values
            pdf_pages.append(pypdftk.fill_form(template_name, field_dict))

        if self.checklist:
            end_form_template_name = os.path.join(
                THIS_FOLDER, "./pdf_templates/end_page.pdf")
            end_field_names = read_field_names(end_form_template_name)

            #Populate end field values with name and date, position depending on options
            end_field_values = [""] * 4
            index = 2 if self.new else 0
            tz = pytz.timezone("Australia/Brisbane")
            current_date = datetime.now(tz)
            # Replace exactly two slots.  The previous one-element slice
            # grew the list to five items and relied on zip() truncation.
            end_field_values[index:index + 2] = [
                self.client.first_name + " " + self.client.last_name,
                current_date.strftime("%d/%m/%Y")
            ]
            # Any further fields of the end page are explicitly left blank.
            end_field_values += [""] * (len(end_field_names) -
                                        len(end_field_values))

            #Zip end field values and names into dict
            end_field_dict = dict(zip(end_field_names, end_field_values))

            pdf_pages.append(
                pypdftk.fill_form(end_form_template_name, end_field_dict))
        pypdftk.concat(pdf_pages,
                       os.path.join(THIS_FOLDER, "../dynamic/print.pdf"))
コード例 #27
0
def _print_file_header(path):
    """Print the two-line banner that precedes each analysed file."""
    print("[*] Metadatos del archivo: %s " % path)
    print('----------------------------------------------------------')


def _print_pdf_meta(path):
    """Print document info, reader properties and XMP metadata of a PDF.

    Any failure while reading the PDF is reported and the function returns
    normally, so one unreadable file does not stop the directory walk.
    """
    xmp_attributes = (
        'dc_contributor', 'dc_identifier', 'dc_date', 'dc_source',
        'dc_subject', 'xmp_modifyDate', 'xmp_metadataDate',
        'xmpmm_documentId', 'xmpmm_instanceId', 'pdf_keywords',
        'pdf_pdfversion',
    )
    _print_file_header(path)
    try:
        with open(path, 'rb') as handle:
            pdfFile = PdfFileReader(handle)
            docInfo = pdfFile.getDocumentInfo()
            if docInfo is not None:
                for metaItem in docInfo:
                    print('[+]' + metaItem + ':' + str(docInfo[metaItem]))

            # (label, value) pairs.  Using the values as dict keys, as before,
            # fails on unhashable values (the fields dict) and merges equal
            # values such as 1 and True.
            extras = (
                ('Numero de paginas: ', pdfFile.getNumPages()),
                ('Modo de la pagina: ', pdfFile.getPageMode()),
                ('Encriptacion: ', pdfFile.isEncrypted),
                ('Campos de texto: ', pdfFile.getFields()),
            )
            for label, value in extras:
                if value is not None:
                    print('[+]/' + label + str(value))

            # Read the XMP values while the file is still open.
            xmpinfo = pdfFile.getXmpMetadata()
            if xmpinfo is not None:
                # Each attribute is checked independently; the former elif
                # chain could print at most one of them.
                for attr in xmp_attributes:
                    value = getattr(xmpinfo, attr, None)
                    if value is not None:
                        print('[+]/' + attr, value)
    except Exception as exc:
        print('[!]No se pudieron leer los metadatos del PDF:', exc)
    print("\n")


def _print_image_meta(path):
    """Print the EXIF tags of an image, skipping thumbnail/maker-note blobs."""
    skipped = ('JPEGThumbnail', 'TIFFThumbnail', 'Filename', 'EXIF MakerNote')
    _print_file_header(path)
    with open(path, 'rb') as handle:
        tags = exifread.process_file(handle)
    if not tags:
        print('[!]No hay metadatos')
    for tag, value in tags.items():
        if tag not in skipped:
            print("[+]: %s, valor %s" % (tag, value))
    print("\n")


def _print_mp3_meta(path):
    """Print artist, album, title and track number from an MP3's ID3 tag."""
    _print_file_header(path)
    tag = eyed3.id3.Tag()
    tag.parse(path)
    entries = (
        ('Artista: ', tag.artist),
        ('Album: ', tag.album),
        ('Titulo: ', tag.title),
        ('Track: ', tag.track_num[0]),
    )
    found = False
    for label, value in entries:
        if value is not None:
            print(label, value)
            found = True
    # Report "no metadata" only when nothing at all was found (it used to
    # depend on the track number alone).
    if not found:
        print('[!]No hay metadatos')


def _print_docx_meta(path):
    """Print the core properties of a Word document, one value per line."""
    _print_file_header(path)
    core_properties = docx.Document(docx=path).core_properties
    for attr in ('author', 'created', 'last_modified_by', 'last_printed',
                 'modified', 'revision', 'title', 'category', 'comments',
                 'identifier', 'keywords', 'language', 'subject', 'version',
                 'content_status'):
        print(getattr(core_properties, attr))


def printMeta(directorio):
    """Walk ``directorio`` recursively and print the metadata of known files.

    Handled extensions: pdf (document info and XMP), jpg/tiff (EXIF), mp3
    (ID3) and docx (core properties; the original spelling 'docs' is still
    accepted).  Files with any other extension, or with no extension, are
    ignored.  Returns None; all output goes to stdout.
    """
    try:
        print('Analizando el directorio: ' + directorio, '\n')
        for dirpath, _dirnames, files in os.walk(directorio):
            for name in files:
                # splitext keeps extension-less names such as "pdf" from
                # being mistaken for a file type.
                ext = os.path.splitext(name)[1].lower().lstrip('.')
                path = dirpath + os.path.sep + name
                #Documents
                if ext == 'pdf':
                    _print_pdf_meta(path)
                #Images
                elif ext in ('jpg', 'tiff'):
                    _print_image_meta(path)
                #Music
                elif ext == 'mp3':
                    _print_mp3_meta(path)
                #Word documents
                elif ext in ('docx', 'docs'):
                    _print_docx_meta(path)

        print('[+]Ejecucion finalizada')

    except (KeyboardInterrupt, SystemExit):
        print('[!]Se ha interrumpido la ejecucion')
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
コード例 #28
0
def main():
    """Print the form fields of ``tt.pdf`` from the current directory.

    The PDF is opened in non-strict mode and the result of
    ``PdfFileReader.getFields()`` is printed to stdout.

    The original call passed a one-element set as the ``tree`` argument.
    PyPDF2 documents ``tree`` as a parameter for its own recursive use,
    so the argument is omitted here.
    """
    # ``open`` inside ``with`` replaces the Python 2-only ``file()`` builtin
    # and guarantees the handle is closed even if parsing fails.
    with open('tt.pdf', 'rb') as stream:
        pdf = PdfFileReader(stream, strict=False)
        # A single-argument print() call behaves the same on Python 2 and 3.
        print(pdf.getFields())
コード例 #29
0
	def write(client, doc):
		"""Fill the PDF template named by ``doc`` and save a copy for ``client``.

		The template ``doc.file_name`` is read from ``PDF_TEMPLATE_DIR`` and the
		filled form is written to ``PDF_GENERATED_RESULT_DIR`` under the name
		``"<first_name> <last_name>_<file_name>"``.

		All field values assigned below are hard-coded placeholders; the
		trailing comments name the client attribute each one is apparently
		meant to come from.

		:param client: object providing ``first_name`` and ``last_name``.
		:param doc: object providing ``file_name``.
		"""
		path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
		clients_file_name = (
			str(client.first_name) + ' ' + str(client.last_name)
			+ '_' + str(doc.file_name)  # TODO: maybe append date or time
		)
		p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)

		# The template stays open until the output has been written, as in the
		# original code; ``with`` additionally closes it when an error occurs.
		with open(path_in_file, 'rb') as inpt:
			reads = PdfFileReader(inpt)
			read = reads.getFormTextFields()
			# To tick a checkbox, set e.g. checkboxes['chk0'] = '/Yes'.
			checkboxes = reads.getFields()

			read['gText1'] = 'Наименование компании-партнёра'
			read['gText2'] = 'ФИО сотрудника компании-партнёра'
			read['gText3'] = '*****@*****.**'
			read['Text1'] = read['Text28'] = 'Фамилия'  # client.last_name
			read['Text2'] = 'Имя'  # client.first_name
			read['Text3'] = 'Отчество'  # client.part_name
			read['gNum1'] = 9379373737  # partner's phone number
			read['Num1'] = 99999  # requested loan amount
			read['Num2'] = 122  # loan term in months
			read['Num3'] = 99999  # preliminary cost of the housing
			read['Text32'] = 'РФ'
			read['Text7'] = 'РФ'
			read['Text8'] = 'oblast'
			read['Text9'] = 'rayon'
			read['Num33'] = 'номер квартиры'  # client.address.flat
			read['Text33'] = 'ulitsa'  # client.address.street
			read['Num32'] = 123  # client.address.buildingNumber
			read['Text44'] = 44  # client.address building block (korpus)?
			read['Text35'] = 'gorod'  # client.address.city
			read['Num6'] = 433  # client.address.flat
			read['Num4'] = 443531  # client.address.index ?
			read['Num7'] = 9061264537  # landline phone
			read['email'] = '*****@*****.**'  # client.email
			read['Num14'] = 9061264536  # client.phone_number
			read['Text20'] = 'nameOfOrganiz'  # client.OrganizationInfo.full_name
			read['Text21'] = 'address_of_jobs'  # client.OrganizationInfo.address
			read['Num17'] = 'inn'  # client.OrganizationInfo.inn_number
			read['Num18'] = 45523455549  # client.OrganizationInfo.hr_number
			read['Num19'] = 45523455548  # client.OrganizationInfo.phoneJob ? (work phone)
			read['Num20'] = 99  # years worked at the current organization
			read['Num21'] = 11  # months worked at the current organization
			read['Num22'] = 24  # total work experience, years
			read['Num23'] = 11  # total work experience, months
			read['Num24'] = 555555  # client.AdditionalClientInfo.average_income
			read['Num25'] = 12222  # client.AdditionalClientInfo.aliment
			read['Num26'] = 222222  # client.AdditionalClientInfo.monetary_obligations
			read['Num27'] = 3608  # client.passport.serial
			read['Num28'] = 128333  # client.passport.number
			read['Num29'] = 640  # str(client.passport.code_of)[:3]
			read['Num30'] = 128  # str(client.passport.code_of)[4:]
			read['Text31'] = ''  # left empty for now, to be decided later

			# Named ``writer`` so the local no longer shadows this function.
			writer = PdfFileWriter()
			set_need_appearances_writer(writer)
			# NOTE: the last page is skipped on purpose; the original author's
			# comment says the reason is not yet understood.
			for i in range(reads.getNumPages() - 1):
				page = reads.getPage(i)
				writer.addPage(page)
				updateCheckboxValues(page, checkboxes)
				writer.updatePageFormFieldValues(page, read)

			# The original opened the undefined name ``out`` and never used
			# ``p_file_path``; the computed result path is the target here.
			with open(p_file_path, 'wb') as outpt:
				writer.write(outpt)