Ejemplo n.º 1
0
def test_pdf_to_text_all():
    '''Check that converting the sample PDF yields a dict.

    The PDF is looked up under ``<cwd>/pdf2textbox/data``, so the test
    depends on the directory it is run from.
    '''
    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    pdf_path = '{}/Id=MMP15%2F57_5694_5696.pdf'.format(data_dir)

    # The file is opened in binary mode and handed over as a file object.
    with open(pdf_path, 'br') as handle:
        result = _pdf_to_text_all(handle, verbose=False)

    assert isinstance(result, dict)
Ejemplo n.º 2
0
def test_convert_text_only():
    '''Convert the text-only sample and check each page's 'column' entry.

    Pages 1 and 2 of the result must both hold a truthy 'column' value.
    The PDF is looked up under ``<cwd>/pdf2textbox/data``.

    NOTE(review): the sample is described as having no header, but the
    absence of a header is not asserted here.
    '''
    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    sample = '{}/01a_only_text.pdf'.format(data_dir)

    source = _get_pdf_file(sample, verbose=False)
    pages = _pdf_to_text_all(source, verbose=False)

    for page_no in (1, 2):
        assert pages[page_no]['column']
Ejemplo n.º 3
0
def test_convert_two_cols():
    '''Convert the two-column sample and check the per-page column keys.

    Page 1 must have truthy 'left_column' and 'right_column' entries;
    page 2 must have a truthy 'column' entry. The PDF is looked up under
    ``<cwd>/pdf2textbox/data``.
    '''
    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    sample = '{}/02a_two_cols.pdf'.format(data_dir)

    source = _get_pdf_file(sample, verbose=False)
    pages = _pdf_to_text_all(source, verbose=False)

    # (page number, key) pairs, checked in this order.
    expected = ((1, 'left_column'), (1, 'right_column'), (2, 'column'))
    for page_no, key in expected:
        assert pages[page_no][key]
Ejemplo n.º 4
0
def test_textbox_structure():
    '''Check the shape of the mapping returned for the sample PDF.

    Every top-level key must be an int mapping to a dict. Every key of
    that inner dict must be one of 'header', 'left_column' or
    'right_column', and its value must be a list. The PDF is looked up
    under ``<cwd>/pdf2textbox/data``.
    '''
    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    pdf_path = '{}/Id=MMP15%2F57_5694_5696.pdf'.format(data_dir)

    with open(pdf_path, 'br') as handle:
        result = _pdf_to_text_all(handle, verbose=False)

    allowed_sections = ['header', 'left_column', 'right_column']
    for page_no, page in result.items():
        assert isinstance(page_no, int)
        assert isinstance(page, dict)
        for section, entries in page.items():
            assert section in allowed_sections
            assert isinstance(entries, list)
Ejemplo n.º 5
0
def test_convert_three_cols_and_header():
    '''Convert the three-column sample and check the per-page keys.

    Page 1 must have truthy 'header', 'left_column', 'center_column' and
    'right_column' entries; page 2 must have truthy 'header',
    'left_column' and 'right_column' entries. The PDF is looked up under
    ``<cwd>/pdf2textbox/data``.
    '''
    data_dir = '{}/pdf2textbox/data'.format(os.getcwd())
    sample = '{}/06a_three_cols_and_header.pdf'.format(data_dir)

    source = _get_pdf_file(sample, verbose=False)
    pages = _pdf_to_text_all(source, verbose=False)

    # Ordered (page number, required keys) pairs; page 2 has no
    # 'center_column' check.
    expected = (
        (1, ('header', 'left_column', 'center_column', 'right_column')),
        (2, ('header', 'left_column', 'right_column')),
    )
    for page_no, keys in expected:
        for key in keys:
            assert pages[page_no][key]