コード例 #1
0
def test_atomise_does_not_disrupt_table_finding():
    fh = open('fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf', 'rb')
    pdf_page = pdftables.get_pdf_page(fh, 4)
    table1, _ = pdftables.page_to_tables(pdf_page, atomise=True, extend_y=False)
    table2, _ = pdftables.page_to_tables(pdf_page, atomise=False, extend_y=False)
    
    
    assert_equals(table1, table2)
コード例 #2
0
def test_atomise_does_not_disrupt_table_finding():
    pdf_page = fixture(
        "13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(4)
    table1, _ = pdftables.page_to_tables(
        pdf_page, ConfigParameters(atomise=True, extend_y=False))
    table2, _ = pdftables.page_to_tables(
        pdf_page, ConfigParameters(atomise=False, extend_y=False))

    assert_equals(table1, table2)
コード例 #3
0
def test_atomise_does_not_disrupt_table_finding():
    fh = open(
        'fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf',
        'rb')
    pdf_page = pdftables.get_pdf_page(fh, 4)
    table1, _ = pdftables.page_to_tables(pdf_page,
                                         atomise=True,
                                         extend_y=False)
    table2, _ = pdftables.page_to_tables(pdf_page,
                                         atomise=False,
                                         extend_y=False)

    assert_equals(table1, table2)
コード例 #4
0
def test_atomise_does_not_disrupt_table_finding():
    pdf_page = fixture("13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(4)
    table1, _ = pdftables.page_to_tables(
            pdf_page,
            ConfigParameters(
                atomise=True,
                extend_y=False))
    table2, _ = pdftables.page_to_tables(
            pdf_page,
            ConfigParameters(
                atomise=False,
                extend_y=False))


    assert_equals(table1, table2)
コード例 #5
0
def test_it_returns_the_AlmondBoard_p2_table_by_size():
    fh = open('fixtures/sample_data/2012.01.PosRpt.pdf', 'rb')
    pdf_page = get_pdf_page(fh, 2)
    table1, _ = page_to_tables(pdf_page)
    #table1, _ = getTable(fh, 2)
    assert_equals(78, len(table1))
    assert_equals(10, len(table1[0]))
コード例 #6
0
def test_it_can_use_hints_AlmondBoard_p1():
    pdf_page = fixture("2012.01.PosRpt.pdf").get_page(1)
    table, _ = page_to_tables(
        pdf_page,
        ConfigParameters(
            atomise=False,
            table_top_hint=u"% Change",
            table_bottom_hint=u"Uncommited"))
    assert_equals(
    [[u'Salable', u'Million Lbs.', u'Kernel Wt.', u'Kernel Wt.', u'% Change'], 
     [u'1.  Carryin August 1, 2011', u'254.0', u'253,959,411', u'321,255,129', u'-20.95%'], 
     [u'2.  Crop Receipts to Date', u'1,950.0', u'1,914,471,575', u'1,548,685,417', u'23.62%'], 
     [u'3.  [3% Loss and Exempt]', u'58.5', u'57,434,147)(', u'46,460,563(', u')'], 
     [u'4.  New Crop Marketable (2-3)', u'1,891.5', u'1,857,037,428', u'1,502,224,854', u'23.62%'], 
     [u'5.  [Reserve]', u'n/a', u'0', u'0', u''],
     [u'6.  Total Supply (1+4-5)Shipments by Handlers', u'2,145.5', u'2,110,996,839', u'1,823,479,983', u'15.77%'], 
     [u'7.  Domestic', u'555.0', u'265,796,698', u'255,785,794', u'3.91%'], 
     [u'8.  Export', u'1,295.0', u'755,447,255', u'664,175,807', u'13.74%'], 
     [u'9.  Total Shipments', u'1,850.0', u'1,021,243,953', u'919,961,601', u'11.01%'], 
     [u'10.  Forecasted Carryout', u'295.5', u'', u'', u''], 
     [u'11.  Computed Inventory (6-9)Commitments (sold, not delivered)**', u'', u'1,089,752,886', u'903,518,382', u'20.61%'], 
     [u'12.  Domestic', u'', u'214,522,238', u'187,492,263', u'14.42%'], [u'13.  Export', u'', u'226,349,446', u'155,042,764', u'45.99%'],
     [u'14.  Total Commited Shipments', u'', u'440,871,684', u'342,535,027', u'28.71%'], 
     [u'15.  Uncommited Inventory (11-14)', u'', u'648,881,202', u'560,983,355', u'15.67%']]
    , table)
コード例 #7
0
def test_it_exits_gracefully_when_no_tables_found():
    fh = open("fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf", "rb")
    pdf_page = get_pdf_page(fh, 5)
    table, table_diagnostic_data = page_to_tables(pdf_page)

    assert_equals([], table)
    assert isinstance(table_diagnostic_data, TableDiagnosticData)
コード例 #8
0
def test_it_can_use_hints_AlmondBoard_p1():
    fh = open("fixtures/sample_data/2012.01.PosRpt.pdf", "rb")
    pdf_page = get_pdf_page(fh, 1)
    table, _ = page_to_tables(pdf_page, hints=[u"% Change", u"Uncommited"])
    assert_equals(
        [
            [u"", u"Million Lbs.", u"Kernel Wt.", u"Kernel Wt.", u"% Change"],
            [u"1.  Carryin August 1, 2011", u"254.0", u"253,959,411", u"321,255,129", u"-20.95%"],
            [u"2.  Crop Receipts to Date", u"1,950.0", u"1,914,471,575", u"1,548,685,417", u"23.62%"],
            [u"3.  [3% Loss and Exempt]", u"58.5", u"57,434,147)(", u"46,460,563()", u""],
            [u"4.  New Crop Marketable (2-3)", u"1,891.5", u"1,857,037,428", u"1,502,224,854", u"23.62%"],
            [u"5.  [Reserve]", u"n/a", u"0", u"0", u""],
            [u"6.  Total Supply (1+4-5)", u"2,145.5", u"2,110,996,839", u"1,823,479,983", u"15.77%"],
            [u"Shipments by Handlers7.  Domestic", u"555.0", u"265,796,698", u"255,785,794", u"3.91%"],
            [u"8.  Export", u"1,295.0", u"755,447,255", u"664,175,807", u"13.74%"],
            [u"9.  Total Shipments", u"1,850.0", u"1,021,243,953", u"919,961,601", u"11.01%"],
            [u"10.  Forecasted Carryout", u"295.5", u"", u"", u""],
            [u"11.  Computed Inventory (6-9)", u"", u"1,089,752,886", u"903,518,382", u"20.61%"],
            [u"Commitments (sold, not delivered)**12.  Domestic", u"", u"214,522,238", u"187,492,263", u"14.42%"],
            [u"13.  Export", u"", u"226,349,446", u"155,042,764", u"45.99%"],
            [u"14.  Total Commited Shipments", u"", u"440,871,684", u"342,535,027", u"28.71%"],
            [u"15.  Uncommited Inventory (11-14)", u"", u"648,881,202", u"560,983,355", u"15.67%"],
        ],
        table,
    )
コード例 #9
0
def test_it_can_use_one_hint_argentina_by_size():
    pdf_page = fixture("argentina_diputados_voting_record.pdf").get_page(1)
    table1, _ = page_to_tables(
        pdf_page, ConfigParameters(atomise=False, table_top_hint='Apellido'))
    #table1,_ = getTable(fh, 2)
    assert_equals(32, len(table1))
    assert_equals(4, len(table1[0]))
コード例 #10
0
def test_it_can_use_one_hint_argentina_by_size():
    fh = open("fixtures/sample_data/argentina_diputados_voting_record.pdf", "rb")
    pdf_page = get_pdf_page(fh, 1)
    table1, _ = page_to_tables(pdf_page, hints=["Apellido", ""])
    # table1,_ = getTable(fh, 2)
    assert_equals(32, len(table1))
    assert_equals(4, len(table1[0]))
コード例 #11
0
def test_it_exits_gracefully_when_no_tables_found():
    pdf_page = fixture(
        "13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(5)
    table, table_diagnostic_data = page_to_tables(pdf_page)

    assert_equals([], table)
    assert (isinstance(table_diagnostic_data, TableDiagnosticData))
コード例 #12
0
def test_it_returns_the_AlmondBoard_p2_table_by_size():
    fh = open("fixtures/sample_data/2012.01.PosRpt.pdf", "rb")
    pdf_page = get_pdf_page(fh, 2)
    table1, _ = page_to_tables(pdf_page)
    # table1, _ = getTable(fh, 2)
    assert_equals(78, len(table1))
    assert_equals(10, len(table1[0]))
コード例 #13
0
def test_the_atomise_option_works_on_coceral_p1_by_size():
    pdf_page = fixture(
        "1359397366Final_Coceral grain estimate_2012_December.pdf").get_page(1)
    table, _ = page_to_tables(pdf_page, ConfigParameters(atomise=True))
    #table1, _ = getTable(fh, 2)
    assert_equals(43, len(table))
    assert_equals(31, len(table[0]))
コード例 #14
0
def test_the_atomise_option_works_on_coceral_p1_by_size():
    fh = open("fixtures/sample_data/1359397366Final_Coceral grain estimate_2012_December.pdf", "rb")
    pdf_page = get_pdf_page(fh, 1)
    table, _ = page_to_tables(pdf_page, atomise=True)
    # table1, _ = getTable(fh, 2)
    assert_equals(43, len(table))
    assert_equals(31, len(table[0]))
コード例 #15
0
def test_it_can_use_one_hint_argentina_by_size():
    fh = open('fixtures/sample_data/argentina_diputados_voting_record.pdf',
              'rb')
    pdf_page = get_pdf_page(fh, 1)
    table1, _ = page_to_tables(pdf_page, hints=['Apellido', ''])
    #table1,_ = getTable(fh, 2)
    assert_equals(32, len(table1))
    assert_equals(4, len(table1[0]))
コード例 #16
0
def test_the_atomise_option_works_on_coceral_p1_by_size():
    pdf_page = fixture("1359397366Final_Coceral grain estimate_2012_December.pdf").get_page(1)
    table, _ = page_to_tables(pdf_page,
            ConfigParameters(
                atomise=True))
    #table1, _ = getTable(fh, 2)
    assert_equals(43, len(table))
    assert_equals(31, len(table[0]))
コード例 #17
0
def test_it_exits_gracefully_when_no_tables_found():
    fh = open(
        'fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf',
        'rb')
    pdf_page = get_pdf_page(fh, 5)
    table, table_diagnostic_data = page_to_tables(pdf_page)

    assert_equals([], table)
    assert (isinstance(table_diagnostic_data, TableDiagnosticData))
コード例 #18
0
ファイル: page_table.py プロジェクト: akmiller01/alexm-util
def page():
    basename = os.path.basename(options.input)
    inputname, inputextension = os.path.splitext(basename)
    fh = open(options.input, 'rb')
    pdf_page = pdftables.get_pdf_page(fh, int(options.page))
    table1, _ = pdftables.page_to_tables(pdf_page)
    data = pd.DataFrame(table1)
    data.to_csv(options.output+inputname+"-"+options.page+".csv", encoding='utf-8',index=False)
    print("Done.")
コード例 #19
0
def test_the_atomise_option_works_on_coceral_p1_by_size():
    fh = open(
        'fixtures/sample_data/1359397366Final_Coceral grain estimate_2012_December.pdf',
        'rb')
    pdf_page = get_pdf_page(fh, 1)
    table, _ = page_to_tables(pdf_page, atomise=True)
    #table1, _ = getTable(fh, 2)
    assert_equals(43, len(table))
    assert_equals(31, len(table[0]))
コード例 #20
0
def test_it_can_use_one_hint_argentina_by_size():
    pdf_page = fixture("argentina_diputados_voting_record.pdf").get_page(1)
    table1, _ = page_to_tables(
            pdf_page,
            ConfigParameters(
                atomise=False,
                table_top_hint='Apellido'))
    #table1,_ = getTable(fh, 2)
    assert_equals(32, len(table1))
    assert_equals(4, len(table1[0]))
コード例 #21
0
	def start_reading(self, check):
		total = self.doc.numPages
		for num in range(0, total):
			page = self.doc.getPage(num)
			text = page.extractText().strip().replace('\n','')
			if text.find("(SCHEME OF EXAMINATIONS)") == -1:
				self.handle_result_page(text, check)
			else:
				table = page_to_tables(get_pdf_page(self.f, num+1))
				self.subjects = self.handle_heading_page(text, table)
コード例 #22
0
def test_it_can_use_hints_AlmondBoard_p1():
    fh = open('fixtures/sample_data/2012.01.PosRpt.pdf', 'rb')
    pdf_page = get_pdf_page(fh, 1)
    table, _ = page_to_tables(pdf_page, hints=[u"% Change", u"Uncommited"])
    assert_equals([
        [u'', u'Million Lbs.', u'Kernel Wt.', u'Kernel Wt.', u'% Change'],
        [
            u'1.  Carryin August 1, 2011', u'254.0', u'253,959,411',
            u'321,255,129', u'-20.95%'
        ],
        [
            u'2.  Crop Receipts to Date', u'1,950.0', u'1,914,471,575',
            u'1,548,685,417', u'23.62%'
        ],
        [
            u'3.  [3% Loss and Exempt]', u'58.5', u'57,434,147)(',
            u'46,460,563()', u''
        ],
        [
            u'4.  New Crop Marketable (2-3)', u'1,891.5', u'1,857,037,428',
            u'1,502,224,854', u'23.62%'
        ], [u'5.  [Reserve]', u'n/a', u'0', u'0', u''],
        [
            u'6.  Total Supply (1+4-5)', u'2,145.5', u'2,110,996,839',
            u'1,823,479,983', u'15.77%'
        ],
        [
            u'Shipments by Handlers7.  Domestic', u'555.0', u'265,796,698',
            u'255,785,794', u'3.91%'
        ],
        [u'8.  Export', u'1,295.0', u'755,447,255', u'664,175,807', u'13.74%'],
        [
            u'9.  Total Shipments', u'1,850.0', u'1,021,243,953',
            u'919,961,601', u'11.01%'
        ], [u'10.  Forecasted Carryout', u'295.5', u'', u'', u''],
        [
            u'11.  Computed Inventory (6-9)', u'', u'1,089,752,886',
            u'903,518,382', u'20.61%'
        ],
        [
            u'Commitments (sold, not delivered)**12.  Domestic', u'',
            u'214,522,238', u'187,492,263', u'14.42%'
        ], [u'13.  Export', u'', u'226,349,446', u'155,042,764', u'45.99%'],
        [
            u'14.  Total Commited Shipments', u'', u'440,871,684',
            u'342,535,027', u'28.71%'
        ],
        [
            u'15.  Uncommited Inventory (11-14)', u'', u'648,881,202',
            u'560,983,355', u'15.67%'
        ]
    ], table)
コード例 #23
0
def main((i,inputPath)):
    fh = open(inputPath,'rb')
    pdf_page = pdftables.get_pdf_page(fh, i)
    bs = "\b"
    sys.stdout.write(bs*4)
    sys.stdout.write("%03d " % i)
    sys.stdout.flush()
    try:
        table1, _ = pdftables.page_to_tables(pdf_page)
    except:
        table1 = []
        print("read error")
    if len(table1)>0:
        data = pd.DataFrame(table1)
        data.to_csv(outputPath+"%03d" % i+".csv", encoding='utf-8')
    else:
        sys.stdout.write("    ")
コード例 #24
0
def test_it_returns_the_AlmondBoard_p4_table():
    pdf_page = fixture("2012.01.PosRpt.pdf").get_page(4)
    table, _ = page_to_tables(
        pdf_page,
        ConfigParameters(
            atomise=False,
            extend_y=False))
    assert_equals(
    [[u'Variety Name', u'Total Receipts', u'Total Receipts', u'Total Inedibles', u'Receipts', u'% Rejects'], 
     [u'Aldrich', u'48,455,454', u'49,181,261', u'405,555', u'2.53%', u'0.82%'], 
     [u'Avalon', u'7,920,179', u'8,032,382', u'91,733', u'0.41%', u'1.14%'], 
     [u'Butte', u'151,830,761', u'150,799,510', u'1,054,567', u'7.93%', u'0.70%'], 
     [u'Butte/Padre', u'215,114,812', u'218,784,885', u'1,145,000', u'11.24%', u'0.52%'], 
     [u'Carmel', u'179,525,234', u'178,912,935', u'1,213,790', u'9.38%', u'0.68%'], 
     [u'Carrion', u'507,833', u'358,580', u'2,693', u'0.03%', u'0.75%'], 
     [u'Fritz', u'105,479,433', u'106,650,571', u'1,209,192', u'5.51%', u'1.13%'], 
     [u'Harvey', u'58,755', u'58,755', u'1,416', u'0.00%', u'2.41%'], 
     [u'Hashem', u'430,319', u'430,014', u'1,887', u'0.02%', u'0.44%'], 
     [u'Le Grand', u'0', u'0', u'0', u'0.00%', u'0.00%'], 
     [u'Livingston', u'7,985,535', u'7,926,910', u'186,238', u'0.42%', u'2.35%'], 
     [u'Marchini', u'363,887', u'391,965', u'3,675', u'0.02%', u'0.94%'], 
     [u'Merced', u'65,422', u'66,882', u'1,167', u'0.00%', u'1.74%'], 
     [u'Mission', u'19,097,034', u'18,851,071', u'110,323', u'1.00%', u'0.59%'], 
     [u'Mixed', u'36,358,011', u'36,926,337', u'952,264', u'1.90%', u'2.58%'], 
     [u'Mono', u'757,637', u'689,552', u'6,785', u'0.04%', u'0.98%'], 
     [u'Monterey', u'220,713,436', u'212,746,409', u'2,293,892', u'11.53%', u'1.08%'], 
     [u'Morley', u'822,529', u'825,738', u'6,264', u'0.04%', u'0.76%'], 
     [u'N43', u'156,488', u'85,832', u'340', u'0.01%', u'0.40%'], 
     [u'Neplus', u'1,279,599', u'1,237,532', u'17,388', u'0.07%', u'1.41%'], 
     [u'Nonpareil', u'741,809,844', u'727,286,104', u'5,121,465', u'38.75%', u'0.70%'], 
     [u'Padre', u'62,905,358', u'62,417,565', u'193,168', u'3.29%', u'0.31%'], 
     [u'Peerless', u'5,113,472', u'5,101,245', u'20,792', u'0.27%', u'0.41%'], 
     [u'Price', u'25,312,529', u'25,124,463', u'143,983', u'1.32%', u'0.57%'], 
     [u'Ruby', u'4,163,237', u'4,057,470', u'35,718', u'0.22%', u'0.88%'], 
     [u'Sauret', u'55,864', u'55,864', u'517', u'0.00%', u'0.93%'], 
     [u'Savana', u'389,317', u'390,585', u'2,049', u'0.02%', u'0.52%'], 
     [u'Sonora', u'31,832,025', u'33,184,703', u'387,848', u'1.66%', u'1.17%'], 
     [u'Thompson', u'491,026', u'487,926', u'8,382', u'0.03%', u'1.72%'], 
     [u'Tokyo', u'783,494', u'794,699', u'4,511', u'0.04%', u'0.57%'], 
     [u'Winters', u'5,780,183', u'5,756,167', u'46,211', u'0.30%', u'0.80%'], 
     [u'Wood Colony', u'37,458,735', u'36,331,907', u'189,967', u'1.96%', u'0.52%'], 
     [u'Major Varieties Sub Total:', u'1,913,017,442', u'1,893,945,819', u'14,858,780', u'99.92%', u'0.78%'], 
     [u'Minor Varieties Total:', u'1,454,133', u'1,480,800', u'34,997', u'0.08%', u'2.36%'], 
     [u'Grand Total All Varieties', u'1,914,471,575', u'1,895,426,619', u'14,893,777', u'100.00%', u'0.79%']]
    , table
    )
コード例 #25
0
def test_it_returns_the_AlmondBoard_p4_table():
    fh = open("fixtures/sample_data/2012.01.PosRpt.pdf", "rb")
    pdf_page = get_pdf_page(fh, 4)
    table, _ = page_to_tables(pdf_page, extend_y=False)
    assert_equals(
        [
            [u"Variety Name", u"Total Receipts", u"Total Receipts", u"Total Inedibles", u"Receipts", u"% Rejects"],
            [u"Aldrich", u"48,455,454", u"49,181,261", u"405,555", u"2.53%", u"0.82%"],
            [u"Avalon", u"7,920,179", u"8,032,382", u"91,733", u"0.41%", u"1.14%"],
            [u"Butte", u"151,830,761", u"150,799,510", u"1,054,567", u"7.93%", u"0.70%"],
            [u"Butte/Padre", u"215,114,812", u"218,784,885", u"1,145,000", u"11.24%", u"0.52%"],
            [u"Carmel", u"179,525,234", u"178,912,935", u"1,213,790", u"9.38%", u"0.68%"],
            [u"Carrion", u"507,833", u"358,580", u"2,693", u"0.03%", u"0.75%"],
            [u"Fritz", u"105,479,433", u"106,650,571", u"1,209,192", u"5.51%", u"1.13%"],
            [u"Harvey", u"58,755", u"58,755", u"1,416", u"0.00%", u"2.41%"],
            [u"Hashem", u"430,319", u"430,014", u"1,887", u"0.02%", u"0.44%"],
            [u"Le Grand", u"0", u"0", u"0", u"0.00%", u"0.00%"],
            [u"Livingston", u"7,985,535", u"7,926,910", u"186,238", u"0.42%", u"2.35%"],
            [u"Marchini", u"363,887", u"391,965", u"3,675", u"0.02%", u"0.94%"],
            [u"Merced", u"65,422", u"66,882", u"1,167", u"0.00%", u"1.74%"],
            [u"Mission", u"19,097,034", u"18,851,071", u"110,323", u"1.00%", u"0.59%"],
            [u"Mixed", u"36,358,011", u"36,926,337", u"952,264", u"1.90%", u"2.58%"],
            [u"Mono", u"757,637", u"689,552", u"6,785", u"0.04%", u"0.98%"],
            [u"Monterey", u"220,713,436", u"212,746,409", u"2,293,892", u"11.53%", u"1.08%"],
            [u"Morley", u"822,529", u"825,738", u"6,264", u"0.04%", u"0.76%"],
            [u"N43", u"156,488", u"85,832", u"340", u"0.01%", u"0.40%"],
            [u"Neplus", u"1,279,599", u"1,237,532", u"17,388", u"0.07%", u"1.41%"],
            [u"Nonpareil", u"741,809,844", u"727,286,104", u"5,121,465", u"38.75%", u"0.70%"],
            [u"Padre", u"62,905,358", u"62,417,565", u"193,168", u"3.29%", u"0.31%"],
            [u"Peerless", u"5,113,472", u"5,101,245", u"20,792", u"0.27%", u"0.41%"],
            [u"Price", u"25,312,529", u"25,124,463", u"143,983", u"1.32%", u"0.57%"],
            [u"Ruby", u"4,163,237", u"4,057,470", u"35,718", u"0.22%", u"0.88%"],
            [u"Sauret", u"55,864", u"55,864", u"517", u"0.00%", u"0.93%"],
            [u"Savana", u"389,317", u"390,585", u"2,049", u"0.02%", u"0.52%"],
            [u"Sonora", u"31,832,025", u"33,184,703", u"387,848", u"1.66%", u"1.17%"],
            [u"Thompson", u"491,026", u"487,926", u"8,382", u"0.03%", u"1.72%"],
            [u"Tokyo", u"783,494", u"794,699", u"4,511", u"0.04%", u"0.57%"],
            [u"Winters", u"5,780,183", u"5,756,167", u"46,211", u"0.30%", u"0.80%"],
            [u"Wood Colony", u"37,458,735", u"36,331,907", u"189,967", u"1.96%", u"0.52%"],
            [u"Major Varieties Sub Total:", u"1,913,017,442", u"1,893,945,819", u"14,858,780", u"99.92%", u"0.78%"],
            [u"Minor Varieties Total:", u"1,454,133", u"1,480,800", u"34,997", u"0.08%", u"2.36%"],
            [u"Grand Total All Varieties", u"1,914,471,575", u"1,895,426,619", u"14,893,777", u"100.00%", u"0.79%"],
        ],
        table,
    )
コード例 #26
0
def page():
    basename = os.path.basename(options.input)
    inputname, inputextension = os.path.splitext(basename)
    fh = open(options.input, 'rb')
    pdf_page = pdftables.get_pdf_page(fh, int(options.page))
    table1, _ = pdftables.page_to_tables(pdf_page)
    table = []
    for row in table1:
        if uni(row[1])!='' and uni(row[1])!='Wage':
            newrow = []
            #Append the column description
            newrow.append(uni(row[1]))
            for cell in row[1:]:
                naIndex = [m.start() for m in re.finditer('N/A', uni(cell))]
                dotIndex = [m.start() for m in re.finditer('\.', uni(cell))]
                if len(naIndex)+len(dotIndex)==1:
                    #It's fine, it's just one value
                    newrow.append(uni(cell))
                else:
                    #We need to separate things...
                    sliceIndicies = []
                    #Slice 3 characters after N/A
                    for indx in naIndex:
                        sliceIndicies.append(indx+3)
                    #Slice 2 characters after .
                    for indx in dotIndex:
                        sliceIndicies.append(indx+3)
                    #Sort
                    sliceIndicies.sort()
                    #Add zero for the beginning
                    sliceIndicies.insert(0,0)
                    #Slice
                    indxLen = len(sliceIndicies)
                    for i in range(1,indxLen):
                        start = sliceIndicies[i-1]
                        end = sliceIndicies[i]
                        strSlice = uni(cell)[start:end]
                        newrow.append(strSlice)
            table.append(newrow)
    data = pd.DataFrame(table)
    data.to_csv(options.output+inputname+"-"+options.page+".csv", encoding='utf-8',index=False)
    print("Done.")
コード例 #27
0
ファイル: runtables.py プロジェクト: ziweizhou/pdftables
# pagenumber = 1

# SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
# pagenumber = 1

# SelectedPDF = "commodity-prices_en.pdf"
# pagenumber = 1

SelectedPDF = "AnimalExampleTables.pdf"  # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
fh = open(filepath, "rb")
# pta.plotAllPages(fh)


pdf_page = get_pdf_page(fh, pagenumber)

table, diagnosticData = page_to_tables(pdf_page, extend_y=False, hints=hints, atomise=False)

fig, ax1 = pta.plotpage(diagnosticData)

result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))

print to_string(table)


# BoxList = plotAllPages(open(filepath, 'rb'))
コード例 #28
0
def test_it_does_not_crash_on_m30_p5():
    fh = open("fixtures/sample_data/m30-JDent36s15-20.pdf", "rb")
    pdf_page = get_pdf_page(fh, 5)
    table, _ = page_to_tables(pdf_page)
    """Put this in for more aggressive test"""
コード例 #29
0
def test_it_copes_with_CONAB_p8():
    pdf_page = fixture(
        "13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(8)
    table, _ = page_to_tables(pdf_page, ConfigParameters(atomise=True))
コード例 #30
0
def test_it_does_not_crash_on_m30_p5():
    pdf_page = fixture("m30-JDent36s15-20.pdf").get_page(5)
    table, _ = page_to_tables(pdf_page)
    """Put this in for more aggressive test"""
コード例 #31
0
def test_it_copes_with_CONAB_p8():
    fh = open("fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf", "rb")
    pdf_page = get_pdf_page(fh, 8)
    table, _ = page_to_tables(pdf_page, atomise=True)
コード例 #32
0
def test_it_returns_the_AlmondBoard_p2_table_by_size():
    pdf_page = fixture("2012.01.PosRpt.pdf").get_page(2)
    table1, _ = page_to_tables(pdf_page, ConfigParameters(atomise=False))
    #table1, _ = getTable(fh, 2)
    assert_equals(78, len(table1))
    assert_equals(9, len(table1[0]))
コード例 #33
0
def test_it_does_not_crash_on_m30_p5():
    fh = open('fixtures/sample_data/m30-JDent36s15-20.pdf', 'rb')
    pdf_page = get_pdf_page(fh, 5)
    table, _ = page_to_tables(pdf_page)
    """Put this in for more aggressive test"""
コード例 #34
0
def test_it_returns_the_AlmondBoard_p2_table_by_size():
    pdf_page = fixture("2012.01.PosRpt.pdf").get_page(2)
    table1, _ = page_to_tables(pdf_page, ConfigParameters(atomise=False))
    #table1, _ = getTable(fh, 2)
    assert_equals(78, len(table1))
    assert_equals(9, len(table1[0]))
コード例 #35
0
def test_it_returns_the_AlmondBoard_p4_table():
    fh = open('fixtures/sample_data/2012.01.PosRpt.pdf', 'rb')
    pdf_page = get_pdf_page(fh, 4)
    table, _ = page_to_tables(pdf_page, extend_y=False)
    assert_equals([
        [
            u'Variety Name', u'Total Receipts', u'Total Receipts',
            u'Total Inedibles', u'Receipts', u'% Rejects'
        ],
        [
            u'Aldrich', u'48,455,454', u'49,181,261', u'405,555', u'2.53%',
            u'0.82%'
        ],
        [u'Avalon', u'7,920,179', u'8,032,382', u'91,733', u'0.41%', u'1.14%'],
        [
            u'Butte', u'151,830,761', u'150,799,510', u'1,054,567', u'7.93%',
            u'0.70%'
        ],
        [
            u'Butte/Padre', u'215,114,812', u'218,784,885', u'1,145,000',
            u'11.24%', u'0.52%'
        ],
        [
            u'Carmel', u'179,525,234', u'178,912,935', u'1,213,790', u'9.38%',
            u'0.68%'
        ], [u'Carrion', u'507,833', u'358,580', u'2,693', u'0.03%', u'0.75%'],
        [
            u'Fritz', u'105,479,433', u'106,650,571', u'1,209,192', u'5.51%',
            u'1.13%'
        ], [u'Harvey', u'58,755', u'58,755', u'1,416', u'0.00%', u'2.41%'],
        [u'Hashem', u'430,319', u'430,014', u'1,887', u'0.02%', u'0.44%'],
        [u'Le Grand', u'0', u'0', u'0', u'0.00%', u'0.00%'],
        [
            u'Livingston', u'7,985,535', u'7,926,910', u'186,238', u'0.42%',
            u'2.35%'
        ], [u'Marchini', u'363,887', u'391,965', u'3,675', u'0.02%', u'0.94%'],
        [u'Merced', u'65,422', u'66,882', u'1,167', u'0.00%', u'1.74%'],
        [
            u'Mission', u'19,097,034', u'18,851,071', u'110,323', u'1.00%',
            u'0.59%'
        ],
        [
            u'Mixed', u'36,358,011', u'36,926,337', u'952,264', u'1.90%',
            u'2.58%'
        ], [u'Mono', u'757,637', u'689,552', u'6,785', u'0.04%', u'0.98%'],
        [
            u'Monterey', u'220,713,436', u'212,746,409', u'2,293,892',
            u'11.53%', u'1.08%'
        ], [u'Morley', u'822,529', u'825,738', u'6,264', u'0.04%', u'0.76%'],
        [u'N43', u'156,488', u'85,832', u'340', u'0.01%', u'0.40%'],
        [u'Neplus', u'1,279,599', u'1,237,532', u'17,388', u'0.07%', u'1.41%'],
        [
            u'Nonpareil', u'741,809,844', u'727,286,104', u'5,121,465',
            u'38.75%', u'0.70%'
        ],
        [
            u'Padre', u'62,905,358', u'62,417,565', u'193,168', u'3.29%',
            u'0.31%'
        ],
        [
            u'Peerless', u'5,113,472', u'5,101,245', u'20,792', u'0.27%',
            u'0.41%'
        ],
        [
            u'Price', u'25,312,529', u'25,124,463', u'143,983', u'1.32%',
            u'0.57%'
        ], [
            u'Ruby', u'4,163,237', u'4,057,470', u'35,718', u'0.22%', u'0.88%'
        ], [u'Sauret', u'55,864', u'55,864', u'517', u'0.00%', u'0.93%'],
        [u'Savana', u'389,317', u'390,585', u'2,049', u'0.02%', u'0.52%'],
        [
            u'Sonora', u'31,832,025', u'33,184,703', u'387,848', u'1.66%',
            u'1.17%'
        ], [u'Thompson', u'491,026', u'487,926', u'8,382', u'0.03%', u'1.72%'],
        [u'Tokyo', u'783,494', u'794,699', u'4,511', u'0.04%', u'0.57%'],
        [
            u'Winters', u'5,780,183', u'5,756,167', u'46,211', u'0.30%',
            u'0.80%'
        ],
        [
            u'Wood Colony', u'37,458,735', u'36,331,907', u'189,967', u'1.96%',
            u'0.52%'
        ],
        [
            u'Major Varieties Sub Total:', u'1,913,017,442', u'1,893,945,819',
            u'14,858,780', u'99.92%', u'0.78%'
        ],
        [
            u'Minor Varieties Total:', u'1,454,133', u'1,480,800', u'34,997',
            u'0.08%', u'2.36%'
        ],
        [
            u'Grand Total All Varieties', u'1,914,471,575', u'1,895,426,619',
            u'14,893,777', u'100.00%', u'0.79%'
        ]
    ], table)
コード例 #36
0
ファイル: runtables.py プロジェクト: MikeTrenfield/pdftables
#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

#SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
pta.plotAllPages(fh)


doc = PDFDocument(open(filepath, 'rb'))
pdf_page = doc.get_page(pagenumber)

table, diagnosticData = page_to_tables(
    pdf_page, ConfigParameters(
        extend_y=False,
        table_top_hint=table_top_hint,
        table_bottom_hint=table_bottom_hint,
        atomise=False))

fig, ax1 = pta.plotpage(diagnosticData)

result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))

print to_string(table)




# BoxList = plotAllPages(open(filepath, 'rb'))
コード例 #37
0
from pdftables import PDFDocument
from pdftables import page_to_tables

fh = open('tests/20140627.pdf', 'rb')
doc = PDFDocument.from_fileobj(fh)
page - doc.get_page(1)
tables = page_to_tables(page)
コード例 #38
0
#pagenumber = 1

#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

#SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
pta.plotAllPages(fh)

doc = PDFDocument(open(filepath, 'rb'))
pdf_page = doc.get_page(pagenumber)

table, diagnosticData = page_to_tables(
    pdf_page,
    ConfigParameters(extend_y=False,
                     table_top_hint=table_top_hint,
                     table_bottom_hint=table_bottom_hint,
                     atomise=False))

fig, ax1 = pta.plotpage(diagnosticData)

result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))

print to_string(table)

# BoxList = plotAllPages(open(filepath, 'rb'))
コード例 #39
0
def test_it_copes_with_CONAB_p8():
    fh = open(
        'fixtures/sample_data/13_06_12_10_36_58_boletim_ingles_junho_2013.pdf',
        'rb')
    pdf_page = get_pdf_page(fh, 8)
    table, _ = page_to_tables(pdf_page, atomise=True)
コード例 #40
0
ファイル: render.py プロジェクト: yashica-gupta/pdftables
def check(path):
    fileobj = open(path, 'rb')
    doc = PDFDocument.from_fileobj(fileobj)
    tables = pdftables.page_to_tables(doc.get_page(0))
    print tables
コード例 #41
0
import pdftables

my_pdf = open('data/WEF_GlobalCompetitivenessReport_2014-15.pdf', 'rb')
chart_page = pdftables.get_pdf_page(my_pdf, 29)

table = pdftables.page_to_tables(chart_page)


titles = zip(table[0][0], table[0][1])[:5]
titles = [''.join([title[0], title[1]]) for title in titles]
print(titles)

all_rows = []
for row_data in table[0][2:]:
    all_rows.extend([row_data[:5], row_data[5:]])

print(all_rows)
コード例 #42
0
ファイル: runtables.py プロジェクト: wsniper/pdftables-1
#SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
#pagenumber = 1

#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

SelectedPDF = "AnimalExampleTables.pdf"  # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
fh = open(filepath, 'rb')
#pta.plotAllPages(fh)

pdf_page = get_pdf_page(fh, pagenumber)

table, diagnosticData = page_to_tables(pdf_page,
                                       extend_y=False,
                                       hints=hints,
                                       atomise=False)

fig, ax1 = pta.plotpage(diagnosticData)

result = StringIO()
(columns, rows) = get_dimensions(table)
result.write("     {} columns, {} rows\n".format(columns, rows))

print to_string(table)

# BoxList = plotAllPages(open(filepath, 'rb'))
コード例 #43
0
def test_it_copes_with_CONAB_p8():
    pdf_page = fixture("13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(8)
    table, _ = page_to_tables(pdf_page, ConfigParameters(atomise=True))
コード例 #44
0
ファイル: render.py プロジェクト: arifsaeed/pdftables
def check(path):
    fileobj = open(path, "rb")
    doc = PDFDocument.from_fileobj(fileobj)
    tables = pdftables.page_to_tables(doc.get_page(0))
    print tables
コード例 #45
0
import pdftables

my_pdf = open('data/WEF_GlobalCompetitivenessReport_2014-15.pdf', 'rb')
chart_page = pdftables.get_pdf_page(my_pdf, 29)

table = pdftables.page_to_tables(chart_page)

titles = zip(table[0][0], table[0][1])[:5]
titles = [''.join([title[0], title[1]]) for title in titles]
print(titles)

all_rows = []
for row_data in table[0][2:]:
    all_rows.extend([row_data[:5], row_data[5:]])

print(all_rows)
コード例 #46
0
def test_it_exits_gracefully_when_no_tables_found():
    pdf_page = fixture("13_06_12_10_36_58_boletim_ingles_junho_2013.pdf").get_page(5)
    table, table_diagnostic_data = page_to_tables(pdf_page)

    assert_equals([],table)
    assert(isinstance(table_diagnostic_data, TableDiagnosticData))
コード例 #47
0
def test_it_does_not_crash_on_m30_p5():
    pdf_page = fixture("m30-JDent36s15-20.pdf").get_page(5)
    table, _ = page_to_tables(pdf_page)
    """Put this in for more aggressive test"""