def test_pdf_to_text_all():
    '''Check that converting the sample PDF yields a dict.

    The fixture path is built from the current working directory, so the
    test expects to be started from the directory that contains
    ``pdf2textbox/``.
    '''
    pdf_path = '{}/pdf2textbox/data/Id=MMP15%2F57_5694_5696.pdf'.format(
        os.getcwd())

    with open(pdf_path, 'br') as handle:
        result = _pdf_to_text_all(handle, verbose=False)
        assert isinstance(result, dict)
def test_convert_text_only():
    '''Check that pages 1 and 2 of the text-only sample have a column.

    Only the presence of a non-empty ``column`` entry is verified; the
    absence of a header is not asserted here.
    '''
    source = '{}/pdf2textbox/data/01a_only_text.pdf'.format(os.getcwd())
    boxes = _pdf_to_text_all(
        _get_pdf_file(source, verbose=False),
        verbose=False,
    )

    # Checked in page order so the first failing page is reported first.
    for page in (1, 2):
        assert boxes[page]['column']
def test_convert_two_cols():
    '''Check the column layout of the two-column sample.

    Page 1 must have non-empty ``left_column`` and ``right_column``
    entries; page 2 must have a non-empty ``column`` entry.
    '''
    source = '{}/pdf2textbox/data/02a_two_cols.pdf'.format(os.getcwd())
    boxes = _pdf_to_text_all(
        _get_pdf_file(source, verbose=False),
        verbose=False,
    )

    # (page, section) pairs, evaluated in this exact order.
    expected = (
        (1, 'left_column'),
        (1, 'right_column'),
        (2, 'column'),
    )
    for page, section in expected:
        assert boxes[page][section]
def test_textbox_structure():
    '''Check the shape of the mapping returned for the sample PDF.

    Every top-level key must be an int mapped to a dict. Inside each of
    those dicts, every key must be one of ``header``, ``left_column`` or
    ``right_column``, and every value must be a list.
    '''
    allowed_sections = ('header', 'left_column', 'right_column')
    pdf_path = '{}/pdf2textbox/data/Id=MMP15%2F57_5694_5696.pdf'.format(
        os.getcwd())

    with open(pdf_path, 'br') as handle:
        textbox = _pdf_to_text_all(handle, verbose=False)

        for page_no, page in textbox.items():
            assert isinstance(page_no, int)
            assert isinstance(page, dict)

            for section, entries in page.items():
                assert section in allowed_sections
                assert isinstance(entries, list)
def test_convert_three_cols_and_header():
    '''Check the layout of the three-column sample with a header.

    Page 1 must have non-empty ``header``, ``left_column``,
    ``center_column`` and ``right_column`` entries; page 2 must have
    non-empty ``header``, ``left_column`` and ``right_column`` entries.
    '''
    source = '{}/pdf2textbox/data/06a_three_cols_and_header.pdf'.format(
        os.getcwd())
    boxes = _pdf_to_text_all(
        _get_pdf_file(source, verbose=False),
        verbose=False,
    )

    # (page, section) pairs, evaluated in this exact order.
    expected = (
        (1, 'header'),
        (1, 'left_column'),
        (1, 'center_column'),
        (1, 'right_column'),
        (2, 'header'),
        (2, 'left_column'),
        (2, 'right_column'),
    )
    for page, section in expected:
        assert boxes[page][section]