def scrape_OPR800(filename):
    """Scrape every "Total for" line and the "EI Date To:" date from an OPR800 PDF.

    The document is processed in 10-page batches, re-creating the PDFQuery
    object for each batch (and deleting it afterwards) to bound memory use
    on large files.

    :param filename: path to the PDF file to scrape
    :return: tuple ``(totals, date)`` — ``totals`` is a list of matched line
        texts, ``date`` is the extracted "EI Date To:" value or None if it
        was never found.
    """
    pdf = pdfquery.PDFQuery(filename)
    pages = pdf.doc.catalog['Pages'].resolve()['Count']
    print("pdf has %d pages" % pages)
    totals = []
    date = None
    # BUGFIX: the original used range((pages / 10) + 1); on Python 3 the
    # `/` produces a float and range() raises TypeError.  Use integer
    # ceiling division to get one iteration per 10-page batch.
    for i in range((pages + 9) // 10):
        try:
            # load the next 10 pages
            pdf = pdfquery.PDFQuery(filename)
            pdf.load(range(i * 10, min((i + 1) * 10, pages)))
            # if we don't already have the date, search for it
            if date is None:
                date_field = "EI Date To: "
                date = pdf.pq("LTTextBoxHorizontal:contains('%s')" % date_field)[0].text
                date = date[date.index(date_field) + len(date_field):]
                if " " in date:
                    date = date[:date.index(" ")]
                print("got date (%s)" % date)
            # search for the totals and append them
            total = pdf.pq('LTTextLineHorizontal:contains("Total for")')
            for t in total:
                totals.append(t.text)
            print("finished page %d" % (min((i + 1) * 10, pages)))
            del pdf
        except Exception as exc:
            # Best-effort: report the batch that failed and keep going.
            print("error in %s on page %s (%s)" % (filename, ((i + 1) * 10), exc))
    return (totals, date)
def gen_data(path, pages):
    """Assemble the extracted-field dict for one statement PDF.

    :param path: path to the PDF file
    :param pages: dict mapping section names ("main_page", "Item_L_detail")
        to lists of 1-based page numbers
    :return: dict of extracted fields, merged with the box-detail totals
    """
    doc = pdfquery.PDFQuery(path)
    doc.load(pages["main_page"][0] - 1)
    data = {
        "year": get_cy(doc),
        "fund_ein": get_fund_ein(doc),
        "fund_name": get_fund_name(doc),
        "partner_ein": get_partner_ein(doc),
        "beginning_ca": get_beginning_ca(doc),
        "capital_cont": get_cap_contr(doc),
        "cy_increase": get_cy_increase(doc),
        "withdrawls": get_withdrawals(doc),
        "ending_ca": get_ending_ca(doc),
    }
    box_data = get_box_detail(doc)
    # The Item L detail lives on a different page; reload the document there.
    item_l_page = pages['Item_L_detail'][0] - 1
    doc = pdfquery.PDFQuery(path)
    doc.load(item_l_page)
    data['summary_income_loss_item_l'] = summary_income_loss_item_l(doc, item_l_page)
    data['less_deductions_item_l'] = less_deductions_item_l(doc, item_l_page)
    # Fold the "other" categories into the box totals, then merge everything.
    box_data = calculate_other_income(data, box_data)
    box_data = calculate_other_deductions(data, box_data)
    data.update(box_data)
    return data
def pdfquery_FindText(filenamme, words, offset):
    """Record, for each search word, every page of the PDF containing it.

    :param filenamme: path of the PDF file to search
    :param words: iterable of strings to look for
    :param offset: value added to each 0-based page index before recording
    :return: defaultdict mapping word -> PreallocatedList of page numbers
    """
    start_time = time.time()
    selectors = [f'LTTextLineHorizontal:contains("{word}")' for word in words]
    res = defaultdict(lambda: PreallocatedList(1000, int))
    pdf = pdfquery.PDFQuery(filenamme, parse_tree_cacher=FileCache("tmp/"))
    page_num = 0
    while True:
        try:
            pdf.load(page_num)
        except StopIteration:
            # pdfquery signals "past the last page" this way.
            break
        for selector, word in zip(selectors, words):
            if pdf.pq(selector):
                res[word].append(page_num + offset)
        page_num += 1
        if page_num % 30 == 0:
            # Periodically drop the parser and force a GC pass so memory
            # stays bounded on large documents.
            del pdf
            collected = gc.collect()
            print(f"Garbage collector: collected {collected} objects.")
            pdf = pdfquery.PDFQuery(filenamme, parse_tree_cacher=FileCache("tmp/"))
    print("--- Batch futási idő: %s másodperc ---" % (time.time() - start_time))
    return res
def test_annot_dereferencing(self):
    """Regression test for issues #37 and #42: annotation references in
    these samples must dereference without raising during load()."""
    for sample in ("tests/samples/bug37.pdf", "tests/samples/bug42.pdf"):
        pdf = pdfquery.PDFQuery(sample)
        pdf.load()
def convert(filename, filepath=""):
    """Convert a PDF into a pretty-printed XML dump of its pdfquery tree.

    The XML file is written alongside the source, named after the PDF with
    the ".pdf" suffix replaced by ".xml".

    :param filename: name of the PDF file (e.g. "report.pdf")
    :param filepath: optional directory containing the file; when empty,
        ``filename`` is treated as a path relative to the CWD
    """
    try:
        # Build source/target paths once instead of duplicating the whole
        # conversion in two nearly identical branches.
        source = filepath + "/" + filename if filepath else filename
        base = filename.replace(".pdf", "")
        target = '{}/{}.xml'.format(filepath, base) if filepath else '{}.xml'.format(base)
        pdf = pdfquery.PDFQuery(source)
        pdf.load()
        with open(target, 'wb') as f:
            f.write(etree.tostring(pdf.tree, pretty_print=True))
    except Exception:
        # Best-effort: report and continue.  Was a bare `except:`, which
        # would also have swallowed KeyboardInterrupt/SystemExit.
        print(traceback.format_exc())
def read_cordinates1(path):
    """Read the text and bounding-box coordinates of every
    LTTextLineHorizontal element on every page of a PDF.

    :param path: path to the PDF file
    :return: tuple ``(PagePosDict, PageDict, page_num)`` where
        PagePosDict maps (page, x0, y0, x1, y1) -> line text,
        PageDict maps page index -> list of line texts, and
        page_num holds one page-index entry per extracted line.
    """
    PagePosDict = defaultdict()
    page_num = []
    PageDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    # BUGFIX: the original opened the file without ever closing it; the
    # page count is all we need, so read it inside a context manager.
    with open(path, 'rb') as fh:
        pages = PdfFileReader(fh).getNumPages()
    for i in range(0, pages):
        try:
            pdf.load(i)
            print(i)
            JQuery = pdf.pq('LTPage')
            for j in JQuery("LTTextLineHorizontal"):
                text = JQuery(j).text()
                # setdefault replaces the original try/except KeyError dance.
                PageDict.setdefault(i, []).append(text)
                page_num.append(i)
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = text
        except Exception:
            # Skip pages that fail to parse rather than aborting the scan.
            continue
    return PagePosDict, PageDict, page_num
def read_cordinates12(path, max_pages=5):
    """Read the text and bounding-box coordinates of each
    LTTextBoxHorizontal on the first ``max_pages`` pages of a PDF.

    Generalized from a hard-coded 5-page limit; calling with one argument
    behaves exactly as before.

    :param path: path to the PDF file
    :param max_pages: number of leading pages to scan (default 5)
    :return: dict mapping (page, x0, y0, x1, y1) -> box text
    """
    PagePosDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    for i in range(0, max_pages):
        try:
            pdf.load(i)
            print(i)
            JQuery = pdf.pq('LTPage')
            for j in JQuery("LTTextBoxHorizontal"):
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = JQuery(j).text()
        except Exception:
            # Skip pages that fail to parse (including pages past the end).
            continue
    return PagePosDict
def check_regexs(self, regexs, search_extensions, enable_pdf):
    """Checks the file for matching regular expressions: if a ZIP then each file in the ZIP (recursively) or the text in a document

    Dispatches on self.type ('ZIP', 'TEXT', 'SPECIAL') and records any
    failure via self.set_error() instead of raising, so a single bad file
    never aborts a scan.  Returns self.matches in every case.
    """
    if self.type == 'ZIP':
        try:
            # .docx is a ZIP container; extract its text first, then scan
            # the archive members as well.
            if get_ext(self.path) == '.docx':
                doctext = docx2txt.process(self.path)
                self.check_text_regexs(doctext, regexs, '')
            if zipfile.is_zipfile(self.path):
                zf = zipfile.ZipFile(self.path)
                self.check_zip_regexs(zf, regexs, search_extensions, enable_pdf, '')
            else:
                self.set_error('Invalid ZIP file')
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            # NOTE(review): bare except — also swallows KeyboardInterrupt;
            # kept as-is because set_error() is the established failure path.
            self.set_error(sys.exc_info()[1])
    elif self.type == 'TEXT':
        try:
            file_text = read_file(self.path, 'rb')
            self.check_text_regexs(file_text, regexs, '')
        except WindowsError:
            # WindowsError only exists on Windows; on other platforms this
            # clause would itself raise NameError if reached — presumably
            # this scanner targets Windows.  TODO confirm.
            self.set_error(sys.exc_info()[1])
        except IOError:
            self.set_error(sys.exc_info()[1])
        except:
            self.set_error(sys.exc_info()[1])
    elif self.type == 'SPECIAL':
        # Outlook .msg files.
        if get_ext(self.path) == '.msg':
            try:
                msg = msmsg.MSMSG(self.path)
                if msg.validMSG:
                    self.check_msg_regexs(msg, regexs, search_extensions, enable_pdf, '')
                else:
                    self.set_error('Invalid MSG file')
                msg.close()
            except IOError:
                self.set_error(sys.exc_info()[1])
            except:
                self.set_error(sys.exc_info()[1])
        # PDF scanning is optional (pdfquery parsing is expensive).
        if enable_pdf:
            if get_ext(self.path) == '.pdf':
                try:
                    pdf = pdfquery.PDFQuery(self.path)
                    pdf.load()
                    self.check_pdf_regexs(pdf, regexs, '')
                except:
                    self.set_error(sys.exc_info()[1])
        # Access databases.
        if get_ext(self.path) == '.mdb':
            try:
                self.check_access_regexs(self.path, 'mdb', regexs)
            except:
                self.set_error(sys.exc_info()[1])
    return self.matches
def process_pdf(pdf_link, filename, folder):
    """Extract the alert date and all http(s) story links from a news-alert PDF.

    :param pdf_link: source passed straight to PDFQuery (path or file-like)
    :param filename: unused; kept for interface compatibility
    :param folder: unused; kept for interface compatibility
    :return: DataFrame with columns ['feed-date', 'link']
    """
    print(pdf_link)
    pdf = pdfquery.PDFQuery(pdf_link)
    pdf.load()
    pdf_holder = []
    pages = pdf.pq('LTPage')
    # The date sits in the 3rd text line of the first text box on page 1,
    # after a "Label: ..." prefix; keep words 2-4 of what follows the colon.
    date_label = pdf.pq(
        pdf.pq(pdf.pq(pages[0])('LTTextBoxHorizontal')[0])(
            'LTTextLineHorizontal')[2]).text()
    date_label = date_label[date_label.find(':') + 1:].strip()
    date_label = ' '.join(date_label.split()[1:4])
    print(date_label)
    print("--------")
    # BUGFIX: the original reused loop variable `i` for both the page loop
    # and the URL loop (shadowing); neither index was actually needed.
    # Unused locals `as_i` and `news_alert_date` were removed.
    for p in pages:
        page = pdf.pq(p)
        for url in page('Annot'):
            url = str(pdf.pq(url))
            # The target URL is embedded as "...url=<target>&..." in the
            # annotation's string representation.
            formatted = url[url.find('url=') + len('url='):]
            formatted = formatted[:formatted.find('&')]
            link_to_story = formatted
            if link_to_story[0:4] == "http":
                pdf_holder.append([date_label, link_to_story])
    df = pd.DataFrame(pdf_holder, columns=['feed-date', 'link'])
    return (df)
def add_all_options_of_one_trim_to_file(range_begin, range_end):
    """For each window-sticker PDF numbered in [range_begin, range_end),
    append its optional-equipment lines and its equipment-group line to the
    module-level `options_file`, then close that file.
    """
    line_step = 9.1   # vertical distance between consecutive option lines
    box_height = 9    # height of the bbox used to grab a single line
    for sticker_idx in range(range_begin, range_end):
        print(sticker_idx)
        pdf = pdfquery.PDFQuery('windowsticker (%d).pdf' % sticker_idx,
                                parse_tree_cacher=FileCache("/tmp/"))
        pdf.load()
        equipment_group_label = pdf.pq('LTTextLineHorizontal:contains("EQUIPMENT GROUP")')
        options_label = pdf.pq('LTTextLineHorizontal:contains("OPTIONAL EQUIPMENT")')
        bottom_corner_equip_group = float(equipment_group_label.attr('y0'))
        left_corner = float(options_label.attr('x0'))
        bottom_corner_options = float(options_label.attr('y0'))
        offset = 0
        # Walk downward one line at a time until an empty read signals the
        # end of the options list.
        while True:
            options = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
                left_corner,
                bottom_corner_options - box_height - offset,
                left_corner + 300,
                bottom_corner_options - offset)).text()
            if options == "":
                break
            options_file.write(options + '\n')
            offset += line_step
        equipment = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
            left_corner,
            bottom_corner_equip_group,
            left_corner + 300,
            bottom_corner_equip_group + 9)).text()
        options_file.write(equipment + '\n')
    options_file.close()
def setUpClass(cls):
    """Load the shared sample PDF once for the whole test class.

    A FileCache-backed parse tree is used only when the test run was
    invoked with a 'cache' command-line argument.
    """
    # BUGFIX: guard the argv lookup — the original indexed sys.argv[1]
    # unconditionally and raised IndexError when the runner was started
    # without extra arguments.
    use_cache = len(sys.argv) > 1 and sys.argv[1] == 'cache'
    cls.pdf = pdfquery.PDFQuery(
        "tests/samples/IRS_1040A.pdf",
        parse_tree_cacher=FileCache("/tmp/") if use_cache else None,
    )
    cls.pdf.load()
def parse(self, response):
    """Parse a PDF response: extract its horizontal text lines and yield a FaqItem.

    NOTE(review): the FaqItem is currently yielded empty — the extracted
    text is only printed.  Populate its fields before relying on the item.
    """
    self.logger.info('Getting faq at %s', response.url)
    # Feed the PDF bytes to pdfquery through an in-memory buffer instead
    # of a temporary file (dead commented-out temp-file code removed).
    data = io.BytesIO()
    data.write(response.body)
    data.seek(0)  # rewind so the parser reads from the start of the buffer
    pdf = pdfquery.PDFQuery(data)
    pdf.load()
    text = pdf.tree.xpath('//LTTextLineHorizontal//text()')
    print(text)
    faqItem = FaqItem()
    yield faqItem
def getxml(filename):
    """Dump the pdfquery element tree of ``<filename>.pdf`` to ``<filename>.xml``.

    :param filename: path stem without extension
    """
    pdf = pdfquery.PDFQuery(filename + '.pdf')
    pdf.load()
    tree_root = pdf.tree
    # BUGFIX: etree.tostring returns bytes, so the output must be opened in
    # binary mode — text mode 'w' raised TypeError on Python 3.
    with open(filename + '.xml', 'wb') as f:
        f.write(etree.tostring(tree_root, pretty_print=True))
def processRequest(req):
    """Handle a "searchPDF" webhook request: return up to 5 lines of
    rms-160-rn.pdf containing the requested text, with their page numbers.

    :param req: parsed webhook JSON payload
    :return: {} when the request is not a searchPDF action or carries no
        searchText; otherwise the response built by makeResult().
    """
    if req.get("result").get("action") != "searchPDF":
        return {}
    result = req.get("result")
    parameters = result.get("parameters")
    searchText = parameters.get("searchText")
    if searchText is None:
        return {}
    strTxt = 'LTTextLineHorizontal:contains("' + str(searchText) + '")'
    from pdfquery.cache import FileCache
    pdf = pdfquery.PDFQuery("rms-160-rn.pdf", parse_tree_cacher=FileCache("tmp/"))
    pdf.load()
    results = []
    # enumerate replaces the manual `count` bookkeeping; cap at 5 matches.
    for count, pq1 in enumerate(pdf.pq(strTxt), 1):
        # Use just the first LTPage ancestor for the page number.
        # BUGFIX: next(...) builtin instead of the Python-2-only .next()
        # method, so this also runs on Python 3.
        page_pq = next(pq1.iterancestors('LTPage'))
        if pq1.text is None:
            # Text lives on the first child element instead.
            results.append({
                "page#": page_pq.get("pageid"),
                "txt": pq1[0].text
            })
        else:
            results.append({"page#": page_pq.get("pageid"), "txt": pq1.text})
        if count == 5:
            break
    res = makeResult(results)
    return res
def read_cordinates(path, page_no=None):
    """Read the text and bounding-box coordinates of each
    LTTextBoxHorizontal on the requested pages of a PDF.

    Pages whose text contains 'Service of Process Transmittal' are skipped.
    On any error, whatever was collected so far is returned (best-effort).

    :param path: path to the PDF file
    :param page_no: iterable of 0-based page indices; defaults to the
        first six pages
    :return: dict mapping (page, x0, y0, x1, y1) -> box text
    """
    import pdfquery
    from collections import defaultdict
    PagePosDict = defaultdict()
    pdf = pdfquery.PDFQuery(path)
    # `is None` instead of `== None`; the unused `global PLAINTIFF,
    # DEFENDANT` declaration was removed (neither name was referenced).
    if page_no is None:
        page_no = range(6)
    try:
        for i in page_no:
            pdf.load(i)
            JQuery = pdf.pq('LTPage')
            if JQuery.text().find('Service of Process Transmittal') >= 0:
                continue
            for j in JQuery("LTTextBoxHorizontal"):
                cord = JQuery(j).attr('bbox')
                for ch in ['[', ']']:
                    cord = cord.replace(ch, '')
                # Key: page index followed by the four bbox floats.
                cordinates = [i] + [float(a) for a in cord.split(', ')]
                PagePosDict[tuple(cordinates)] = JQuery(j).text()
    except Exception:
        # Best-effort: return everything gathered before the failure.
        return (PagePosDict)
    return (PagePosDict)
def test_unicode_text(self):
    """Non-ASCII text in bug18.pdf must survive the parse round-trip."""
    expected = (u'5 Hop Hing Oils and Fats (Hong Kong) Ltd \uf06c '
                u'\u7279\u5bf6\u7cbe\u88fd\u8c6c\u6cb9')
    pdf = pdfquery.PDFQuery("tests/samples/bug18.pdf")
    pdf.load()
    actual = pdf.pq('LTTextLineHorizontal:contains("Hop Hing Oils")').text()
    self.assertEqual(actual, expected)
def test_xml_conversion(self):
    """Converted XML for bug28.pdf must match the saved reference output."""
    sample = pdfquery.PDFQuery("tests/samples/bug28.pdf")
    sample.load()
    self.assertValidOutput(sample, "bug28_output")
def getUser_FromPdf(self, pdfUrl):
    """Download a PDF and extract its Model / username / password fields,
    appending them as a dict to self.listPdf.

    :param pdfUrl: URL of the PDF to fetch
    :return: None
    """
    # Download to a local temp file; the context manager and try/finally
    # guarantee both handles are closed even on error.
    web_file = urllib.request.urlopen(pdfUrl)
    try:
        with open('tempPdfFile.pdf', 'wb') as local_file:
            local_file.write(web_file.read())
    finally:
        web_file.close()
    pdf = pdfquery.PDFQuery("tempPdfFile.pdf")
    pdf.load()
    model = pdf.pq(
        'LTTextLineHorizontal:contains("Model")').text().replace(
            "Model", "")
    # BUGFIX: .text() was chained twice for username/password in the
    # original, raising AttributeError ('str' object has no attribute
    # 'text') — .text() already returns a plain string.
    userName = pdf.pq(
        'LTTextLineHorizontal:contains("username")').text().replace(
            "username", "")
    password = pdf.pq(
        'LTTextLineHorizontal:contains("password")').text().replace(
            "password", "")
    self.listPdf.append({
        'Model': model,
        'Username': userName,
        'Password': password
    })
def main():
    """Entry point: convert a PDF (argv[1]) into an XML dump (argv[2] + ".xml").

    :return: 0 on completion or on a file-open failure; None on usage error
        (matching the original control flow).
    """
    # Number of arguments must be 2, otherwise a usage message is shown.
    if len(sys.argv) < 3:
        # print(...) with a single argument behaves identically on
        # Python 2 and 3 (the original used Py2 print statements).
        print("\nUsage : python pdf_extract <file_name>.pdf <output_file>")
        print("Note : XML file will be created. No need to put .xml extension in output file name\n")
    else:
        try:
            # Takes the PDF file from command line argument
            pdf = pdfquery.PDFQuery(sys.argv[1])
        except Exception:
            # Narrowed from a bare except so Ctrl-C still works.
            print("File doesn't exists in the specified directory!")
            return 0
        # Loads page 0 into memory.
        # Use pdf.load() to load the entire file, pdf.load(1, 3, 5) for
        # selected pages.
        pdf.load(0)
        # Outputs the entire contents into the output file; use jQuery-style
        # selectors on the tree to extract data afterwards.
        pdf.tree.write(str(sys.argv[2] + ".xml"), pretty_print=True)
        return 0
def pdf_link2soup(self, link):
    """Download a PDF from `link`, convert it to XML via pdfquery, and
    return the XML parsed into a BeautifulSoup object.

    The XML is written to a per-link path derived from the link's md5 so
    repeated conversions of the same link reuse the same file name.
    """
    xml_path = '%s-%s' % (self.xml_tmp_path, md5(link).hexdigest())
    # Link -> PDF.  Context managers close the handles deterministically
    # (the original leaked both the PDF and XML file objects).
    pdf_content = urlopen(link).read()
    with open(self.pdf_tmp_path, 'wb') as f:
        f.write(pdf_content)
    # PDF -> XML
    # BUGFIX: merge_tags=('LTChar') is just the string 'LTChar' — the
    # parentheses without a trailing comma do not make a tuple.  pdfquery
    # expects a sequence of tag names here.
    pdf = pdfquery.PDFQuery(self.pdf_tmp_path,
                            merge_tags=('LTChar',),
                            round_floats=True,
                            round_digits=3,
                            input_text_formatter=None,
                            normalize_spaces=False,
                            resort=True,
                            parse_tree_cacher=None,
                            laparams={
                                'all_texts': True,
                                'detect_vertical': False
                            })
    pdf.load()
    pdf.tree.write(xml_path)
    # XML -> Soup
    with open(xml_path, 'r') as f:
        xml_content = f.read()
    return BeautifulSoup(xml_content, 'xml')
def extract(fileName, table_out):
    """Write one markdown table per entry in table_out, listing each
    professor's "total de aulas" value scraped from their PDF.

    :param fileName: list of per-report file-name suffixes
    :param table_out: list of output table names (one per suffix)
    """
    outs = [
        open('../out/' + f + '.md', 'w', encoding='utf-8') for f in table_out
    ]
    # Markdown table headers.
    for f in outs:
        print('| Nome | total de aulas |', file=f)
        print('| :---- | :---- |', file=f)
    progress = 0
    d_progress = 100 / len(profs)
    for prof in profs:
        for idx, suffix in enumerate(fileName):
            try:
                pdf = pdfquery.PDFQuery('../out/Docentes/' + prof.name + suffix)
            except FileNotFoundError:
                # Missing report: note it both on stdout and in the table.
                print(
                    f'{prof.name} não possui informaçao de {suffix[1:-3]}'
                )
                print(
                    f'| {prof.name} | Nao possui arquivo {suffix[1:-3]}|',
                    file=outs[idx])
                continue
            pdf.load(0)
            # Grab the "total de aulas" cell; the extracted string is
            # doubled by the formatter, so keep only its first half.
            extracted = pdf.extract([('with_formatter', 'text'),
                                     ('total de aulas', f':in_bbox("{square(100, 131, 137, 81)}")')])
            total = extracted["total de aulas"]
            print(
                f'| {prof.name} | {total[:len(total) // 2]} |',
                file=outs[idx])
        progress += d_progress
        print("%.2f %%" % progress)
    for f in outs:
        f.close()
def single_file_coordinates(pdf_menu_file):
    """Extract item text and coordinates for every page of a menu PDF.

    :param pdf_menu_file: path to the PDF file
    :return: DataFrame with columns
        ['items', 'height', 'x0', 'x1', 'y0', 'y1', 'page_num']
    """
    # create tree of elements
    menu_tree = pdfquery.PDFQuery(pdf_menu_file)
    menu_tree.load()
    # number of pages in the pdf_menu_file
    num_pages = len(menu_tree.tree.xpath('//*/LTPage'))
    print('number of pages', num_pages)
    menu_pd = pd.DataFrame(
        columns=['items', 'height', 'x0', 'x1', 'y0', 'y1', 'page_num'])
    for page_num in range(1, num_pages + 1):
        selector = '//LTPage[@pageid = "' + str(page_num) + '"]//*'
        treeExtract = menu_tree.tree.xpath(selector)
        menu_pd_page = single_page_coordinates(treeExtract, menu_tree)
        # BUGFIX: the original's else-branch reset menu_pd to a brand-new
        # empty DataFrame whenever a page had no rows, discarding every
        # previously collected page.  Empty pages are now simply skipped.
        if menu_pd_page.shape[0] != 0:
            menu_pd_page['page_num'] = page_num
            menu_pd = pd.concat([menu_pd, menu_pd_page])
    return menu_pd
def load(self, file):
    """Load the given PDF into self.pdf, remembering the path on self.file.

    :param file: path to the PDF (backslash-separated on Windows)
    """
    self.file = file
    # Display just the final path component (as a one-element list repr,
    # matching the original output).
    print('Loading %s' % str(self.file.split('\\')[-1:]))
    self.pdf = pdfquery.PDFQuery(self.file)
    # Load the whole document.  The unused page_count computation and the
    # commented-out page-range load it supported were removed.
    self.pdf.load()
def get_part_3():
    """Load every PDF listed in the module-level `part_3` and return the
    loaded PDFQuery objects, using a FileCache-backed parse tree."""
    loaded = []
    for path in part_3:
        doc = pdfquery.PDFQuery(path, parse_tree_cacher=FileCache("./tmp/"))
        doc.load()
        loaded.append(doc)
    print("Finished getting part 3")
    return loaded
def main():
    """Invoice extractor entry point (Python 2).

    Opens the PDF named by argv[1]; with a second argument, redirects all
    output to "<pdf>.log.txt".  Each page is matched against the invoice
    identifiers and either parsed into an Invoice or recorded as unknown,
    then everything is rendered via printToPDF().
    """
    #1. Loads up the PDF and gets the number of pages
    if len(sys.argv) >= 2:
        pdf_name = sys.argv[1]
        if len(sys.argv) > 2:
            # Second argument present: tee everything to a log file.
            log_name = pdf_name + ".log.txt"
            print "Check Logfile (" + log_name + ")"
            sys.stdout = open(log_name, 'w')
            print "About to open " + str(pdf_name)
            sys.stdout.flush()
    else:
        #pdf_name = "binder_combined copy.pdf"
        exit("Error: Please Input a PDF")
    pdf = pdfquery.PDFQuery(pdf_name)
    # Total page count from the PDF catalog.
    pdf_count = pdf.doc.catalog['Pages'].resolve()['Count']
    #2. Create Invoices Object, which A) holds an array of invoices
    # and B) holds
    invoices = Invoices()
    count = 0
    for page in range(pdf_count):
        # Py2 trailing comma: print without a newline.
        print "About to load " + str(page + 1),
        sys.stdout.flush()
        try:
            pdf.load(page)
        except:
            # NOTE(review): bare except — any load failure (including
            # Ctrl-C) marks the page unknown and moves on.
            print "ERROR: Couldn't load page " + str(page + 1)
            sys.stdout.flush()
            invoices.addUnknownInvoicePage()
            continue
        # Try each registered identifier string against the page text.
        identifiers = invoices.getIdentifiers()
        foundPage = False
        for identifier in identifiers:
            pdf_id = pdf.pq('LTTextLineHorizontal:contains("' + str(identifier) + '")')
            if (pdf_id):
                print identifier,
                searchFunction = identifiers[identifier]
                #Call the search function with a blank invoice Object
                blank_obj = Invoice()
                obj = searchFunction(blank_obj, pdf)
                # NOTE(review): searchFunction is invoked a second time
                # here with a fresh Invoice — the page is parsed twice and
                # `obj` (printed below) is not the instance stored.  Looks
                # unintentional; confirm before relying on either copy.
                invoices.add(page, searchFunction(Invoice(), pdf))
                #print "Invoice:",obj.num,"PO:",obj.po,"JOB:",obj.job
                obj.printInvoice()
                foundPage = True
                break
        if (not foundPage):
            invoices.addUnknownInvoicePage()
        print ""
    printToPDF(pdf_name, invoices)
def parse_data(filename="../data/input/4_ini_3.pdf", page=1, label="Kalenderwoche:"):
    """Print the text of the LTTextBoxHorizontal containing `label` on one
    page of a PDF.

    Generalized from hard-coded values; calling parse_data() with no
    arguments behaves exactly as before.

    :param filename: path to the PDF file
    :param page: 0-based page index to load
    :param label: text to search for inside the page's text boxes
    """
    pdf = pdfquery.PDFQuery(filename)
    pdf.load(page)
    print(
        pdf.pq('LTTextBoxHorizontal:contains("' + label + '")').text())
def load_file(file):
    """Open `file` with pdfquery; files listed in PAGE_ANOMALIES load only
    their registered page selection, all others load fully.

    :param file: path to the PDF file
    :return: the loaded PDFQuery object
    """
    global PAGE_ANOMALIES
    pdf = pdfquery.PDFQuery(file)
    if file not in PAGE_ANOMALIES:
        pdf.load()
    else:
        # Known-problematic file: restrict to the registered pages.
        pdf.load(PAGE_ANOMALIES[file])
    LOG.print("\tFile loaded")
    return pdf
def __init__(self, pdfpath, pdf=None):
    """Base statement: keep the path, load the PDF when one was not
    supplied, and initialise the parsed-field slots to None."""
    self.logger = logging.getLogger("hsbcpdf.helpers.basestatement")
    self.pdfpath = pdfpath
    if pdf is None:
        # No pre-loaded document supplied — open and parse it ourselves.
        pdf = pdfquery.PDFQuery(pdfpath)
        pdf.load()
    self.pdf = pdf
    # Filled in later by the scraping methods.
    self.page_height = None
    self.page_width = None
    self.account_number = None
    self.st_date = None
def extract_text_if_valid(filename):
    """Parses text from a pdf if it is valid; otherwise, returns an empty
    string.
    """
    if filename.endswith('.pdf'):
        # Keep space normalization and resorting off so distinct text
        # blocks on the same line stay separated.
        pdf = pdfquery.PDFQuery(filename, normalize_spaces=False, resort=False)
        pdf.load()
        return extract_transactions(pdf)
    print('Sorry, only PDF files are supported for text extraction.')
    return ""
def get_scraper(cls, pdfpath, pdf=None):
    """Return an instance of the first registered scraper whose bank and
    type probes both match the given PDF.

    :param pdfpath: path of the statement PDF
    :param pdf: optional pre-loaded PDFQuery document; when omitted the
        file is opened and loaded here
    :raises ScraperException: when pdfpath does not exist or is not a file
    :return: a scraper instance, or None when no registered scraper matches
    """
    if not os.path.exists(pdfpath):
        raise ScraperException(f'"{pdfpath}" file not found')
    if not os.path.isfile(pdfpath):
        raise ScraperException(f'"{pdfpath}" not a file')
    # BUGFIX: the original unconditionally re-parsed the file, silently
    # discarding a caller-supplied pdf (cf. the __init__ that honours it).
    if pdf is None:
        pdf = pdfquery.PDFQuery(pdfpath)
        pdf.load()
    for s in cls._scrapers:
        if s.probe_bank(pdf) and s.probe_type(pdf):
            logger.debug("pdf file matches {}.{}".format(
                s.st_bank, s.st_type))
            return s(pdfpath, pdf)