Esempio n. 1
0
# Manual test script: read one page of a PDF from PDF_TEST_FILES, run
# page_to_tables on it, plot the diagnostic data and print the table as text.
#
# Alternative inputs -- uncomment one pair to run against a different file.
#SelectedPDF = "pdf_prc_prod_1_7_1288_acucar-vhp-vendido-mercado-externo_sao-paulo_mensal.pdf"
#pagenumber = 1

#SelectedPDF = "commodity-prices_en.pdf"
#pagenumber = 1

SelectedPDF = "AnimalExampleTables.pdf"  # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)

# Open the PDF in a `with` block so the handle is closed even if extraction
# fails. Everything that might still read from the file stays inside it.
with open(filepath, 'rb') as fh:
    #pta.plotAllPages(fh)

    pdf_page = get_pdf_page(fh, pagenumber)

    # NOTE(review): `hints` is not assigned in the visible part of this
    # script -- confirm it is defined before this point.
    table, diagnosticData = page_to_tables(pdf_page,
                                           extend_y=False,
                                           hints=hints,
                                           atomise=False)

    fig, ax1 = pta.plotpage(diagnosticData)

    # The dimensions summary is collected in `result`; it is not printed here.
    result = StringIO()
    (columns, rows) = get_dimensions(table)
    result.write("     {} columns, {} rows\n".format(columns, rows))

    # Function-call form of print works on both Python 2 and Python 3.
    print(to_string(table))

# BoxList = plotAllPages(open(filepath, 'rb'))
Esempio n. 2
0

def calculate_modal_height(box_list):
    """Return the most frequent rounded height among text boxes.

    Only boxes whose ``classname`` is 'LTTextLineHorizontal' or 'LTChar'
    are considered; a box's height is ``bbox[TOP] - bbox[BOTTOM]`` rounded
    with ``round``.

    Raises IndexError when no box in ``box_list`` has a matching classname.
    """
    text_classes = ('LTTextLineHorizontal', 'LTChar')
    height_counts = Counter(
        round(item.bbox[TOP] - item.bbox[BOTTOM])
        for item in box_list
        if item.classname in text_classes
    )
    # most_common(1) yields [(height, count)], or [] if nothing was counted
    return height_counts.most_common(1)[0][0]


def file_handle_from_url(URL):
    """Fetch ``URL`` with ``requests.get`` and return its body in a StringIO.

    The response's ``content`` is wrapped as-is; the HTTP status is not
    checked.
    """
    # TODO: move this function to a helper library
    return StringIO(requests.get(URL).content)


if __name__ == '__main__':
    # Command-line entry point: print every table found in the PDF named by
    # the first argument, or a usage line when no argument is given.

    # Route stdout through a UTF-8 stream writer before printing anything.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    if len(sys.argv) <= 1:
        print("Usage: {} <file.pdf>".format(sys.argv[0]))
    else:
        from display import to_string
        pdf_path = sys.argv[1]
        with open(pdf_path, 'rb') as pdf_file:
            # Number the tables from 1 in the printed headers.
            for number, table in enumerate(get_tables(pdf_file), 1):
                print("---- TABLE {} ----".format(number))
                print(to_string(table))
Esempio n. 3
0
# Manual test script: load one page of a PDF through PDFDocument, run
# page_to_tables on it, plot the diagnostic data and print the table as text.
#
# NOTE(review): SelectedPDF and pagenumber are only present as comments in
# this part of the script -- confirm they are assigned before this point.
#SelectedPDF = "AnimalExampleTables.pdf" # 7 pages works fine in pdfminer, 4 for first test 2012.01.PosRpt.pdf
#pagenumber = 2

filepath = os.path.join(PDF_TEST_FILES, SelectedPDF)
# NOTE(review): `fh` is not assigned in the visible part of this script --
# confirm an open file handle exists under that name before this call.
pta.plotAllPages(fh)


# Open the PDF in a `with` block so the handle is closed even if extraction
# fails. Everything that might still read from the file stays inside it.
with open(filepath, 'rb') as pdf_file:
    doc = PDFDocument(pdf_file)
    pdf_page = doc.get_page(pagenumber)

    # NOTE(review): table_top_hint / table_bottom_hint are not assigned in
    # the visible part of this script -- confirm they are defined earlier.
    table, diagnosticData = page_to_tables(
        pdf_page, ConfigParameters(
            extend_y=False,
            table_top_hint=table_top_hint,
            table_bottom_hint=table_bottom_hint,
            atomise=False))

    fig, ax1 = pta.plotpage(diagnosticData)

    # The dimensions summary is collected in `result`; it is not printed here.
    result = StringIO()
    (columns, rows) = get_dimensions(table)
    result.write("     {} columns, {} rows\n".format(columns, rows))

    # Function-call form of print works on both Python 2 and Python 3.
    print(to_string(table))


# BoxList = plotAllPages(open(filepath, 'rb'))
Esempio n. 4
0

def calculate_modal_height(box_list):
    """Find the modal (most common) rounded height of the text boxes.

    A box counts only if its ``classname`` is 'LTTextLineHorizontal' or
    'LTChar'. Its height is ``round(bbox[TOP] - bbox[BOTTOM])``.

    Raises IndexError if ``box_list`` contains no such box.
    """
    tally = Counter()
    for candidate in box_list:
        # Skip anything that is not a horizontal text line or a character
        if candidate.classname not in ('LTTextLineHorizontal', 'LTChar'):
            continue
        tally[round(candidate.bbox[TOP] - candidate.bbox[BOTTOM])] += 1

    # An empty tally gives [] here, so the indexing raises IndexError
    return tally.most_common(1)[0][0]


def file_handle_from_url(URL):
    """Download ``URL`` and return a StringIO wrapping the response content.

    The request is made with ``requests.get``; no status check is performed
    on the response.
    """
    # TODO: move this function to a helper library
    payload = requests.get(URL).content
    return StringIO(payload)


if __name__ == '__main__':
    # Script mode: with a PDF path as first argument, print each extracted
    # table under a numbered header; otherwise print a usage line.

    # Replace stdout with a UTF-8 stream writer before any output is made.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    have_path = len(sys.argv) > 1
    if not have_path:
        print("Usage: {} <file.pdf>".format(sys.argv[0]))
    else:
        from display import to_string
        with open(sys.argv[1], 'rb') as source:
            found = get_tables(source)
            # Headers are numbered from 1.
            for position, table in enumerate(found, 1):
                print("---- TABLE {} ----".format(position))
                print(to_string(table))